{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:09:11.410034', 'step': 0, 'epoch': 0} {'type': 'pplx', 'content': 226674977.87649825, 'timestamp': '2025-09-30 22:09:11.415731', 'step': 0, 'epoch': 0} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:11.493702', 'step': 0, 'epoch': 1} {'type': 'loss', 'content': 0.7057779431343079, 'timestamp': '2025-09-30 22:09:11.497137', 'step': 1, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:11.558276', 'step': 1, 'epoch': 1} {'type': 'loss', 'content': 0.6982383131980896, 'timestamp': '2025-09-30 22:09:11.561370', 'step': 2, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:11.608783', 'step': 2, 'epoch': 1} {'type': 'loss', 'content': 0.7418850064277649, 'timestamp': '2025-09-30 22:09:11.611339', 'step': 3, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:11.652352', 'step': 3, 'epoch': 1} {'type': 'loss', 'content': 0.7169809341430664, 'timestamp': '2025-09-30 22:09:11.740181', 'step': 4, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:11.776307', 'step': 4, 'epoch': 1} {'type': 'loss', 'content': 0.1316017359495163, 'timestamp': '2025-09-30 22:09:11.786063', 'step': 5, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:11.819994', 'step': 5, 'epoch': 1} {'type': 'loss', 'content': 0.1368536502122879, 'timestamp': '2025-09-30 22:09:11.828623', 'step': 6, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:11.862287', 'step': 6, 'epoch': 1} {'type': 'loss', 'content': 0.13541023433208466, 'timestamp': '2025-09-30 22:09:11.866684', 'step': 7, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:11.905650', 'step': 7, 'epoch': 1} {'type': 'loss', 'content': 0.1483638435602188, 'timestamp': '2025-09-30 22:09:11.930916', 'step': 8, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:11.966263', 'step': 8, 'epoch': 1} {'type': 'loss', 'content': 0.005413126666098833, 'timestamp': '2025-09-30 22:09:11.969115', 'step': 9, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.002222', 'step': 9, 'epoch': 1} {'type': 'loss', 'content': 0.05680262669920921, 'timestamp': '2025-09-30 22:09:12.005936', 'step': 10, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.053094', 'step': 10, 'epoch': 1} {'type': 'loss', 'content': 0.025625307112932205, 'timestamp': '2025-09-30 22:09:12.056458', 'step': 11, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.090015', 'step': 11, 'epoch': 1} {'type': 'loss', 'content': 0.006495438050478697, 'timestamp': '2025-09-30 22:09:12.115606', 'step': 12, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.154093', 'step': 12, 'epoch': 1} {'type': 'loss', 'content': 0.02278040535748005, 'timestamp': '2025-09-30 22:09:12.156747', 'step': 13, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.190335', 'step': 13, 'epoch': 1} {'type': 'loss', 'content': 0.03420593962073326, 'timestamp': '2025-09-30 22:09:12.198560', 'step': 14, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.241662', 'step': 14, 'epoch': 1} {'type': 'loss', 'content': 0.026615411043167114, 'timestamp': '2025-09-30 22:09:12.251123', 'step': 15, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.289251', 'step': 15, 'epoch': 1} {'type': 'loss', 'content': 0.02100050076842308, 'timestamp': '2025-09-30 22:09:12.319233', 'step': 16, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.350524', 'step': 16, 'epoch': 1} {'type': 'loss', 'content': 0.05153878405690193, 'timestamp': '2025-09-30 22:09:12.353778', 'step': 17, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.389672', 'step': 17, 'epoch': 1} {'type': 'loss', 'content': 0.04646531492471695, 'timestamp': '2025-09-30 22:09:12.394417', 'step': 18, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:12.438804', 'step': 18, 'epoch': 1} {'type': 'loss', 'content': 0.03298034518957138, 'timestamp': '2025-09-30 22:09:12.441980', 'step': 19, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.476772', 'step': 19, 'epoch': 1} {'type': 'loss', 'content': 0.030850352719426155, 'timestamp': '2025-09-30 22:09:12.502012', 'step': 20, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:12.558330', 'step': 20, 'epoch': 1} {'type': 'loss', 'content': 0.0381910540163517, 'timestamp': '2025-09-30 22:09:12.561275', 'step': 21, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.594243', 'step': 21, 'epoch': 1} {'type': 'loss', 'content': 0.04014469310641289, 'timestamp': '2025-09-30 22:09:12.598636', 'step': 22, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.637101', 'step': 22, 'epoch': 1} {'type': 'loss', 'content': 0.02601078525185585, 'timestamp': '2025-09-30 22:09:12.639563', 'step': 23, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.674795', 'step': 23, 'epoch': 1} {'type': 'loss', 'content': 0.021312592551112175, 'timestamp': '2025-09-30 22:09:12.707547', 'step': 24, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.753705', 'step': 24, 'epoch': 1} {'type': 'loss', 'content': 0.024147259071469307, 'timestamp': '2025-09-30 22:09:12.757263', 'step': 25, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:12.813553', 'step': 25, 'epoch': 1} {'type': 'loss', 'content': 0.03936280682682991, 'timestamp': '2025-09-30 22:09:12.818761', 'step': 26, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:12.861309', 'step': 26, 'epoch': 1} {'type': 'loss', 'content': 0.02394765429198742, 'timestamp': '2025-09-30 22:09:12.865361', 'step': 27, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.909284', 'step': 27, 'epoch': 1} {'type': 'loss', 'content': 0.023970751091837883, 'timestamp': '2025-09-30 22:09:12.934619', 'step': 28, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:12.972121', 'step': 28, 'epoch': 1} {'type': 'loss', 'content': 0.03464008867740631, 'timestamp': '2025-09-30 22:09:12.975501', 'step': 29, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.019468', 'step': 29, 'epoch': 1} {'type': 'loss', 'content': 0.022118892520666122, 'timestamp': '2025-09-30 22:09:13.022793', 'step': 30, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.057302', 'step': 30, 'epoch': 1} {'type': 'loss', 'content': 0.02154598757624626, 'timestamp': '2025-09-30 22:09:13.060719', 'step': 31, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.096414', 'step': 31, 'epoch': 1} {'type': 'loss', 'content': 0.021323371678590775, 'timestamp': '2025-09-30 22:09:13.121348', 'step': 32, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.165154', 'step': 32, 'epoch': 1} {'type': 'loss', 'content': 0.018217450007796288, 'timestamp': '2025-09-30 22:09:13.168824', 'step': 33, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.213240', 'step': 33, 'epoch': 1} {'type': 'loss', 'content': 0.031157325953245163, 'timestamp': '2025-09-30 22:09:13.224499', 'step': 34, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.266485', 'step': 34, 'epoch': 1} {'type': 'loss', 'content': 0.016447557136416435, 'timestamp': '2025-09-30 22:09:13.271677', 'step': 35, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.304897', 'step': 35, 'epoch': 1} {'type': 'loss', 'content': 0.02151617407798767, 'timestamp': '2025-09-30 22:09:13.337755', 'step': 36, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.372624', 'step': 36, 'epoch': 1} {'type': 'loss', 'content': 0.017450835555791855, 'timestamp': '2025-09-30 22:09:13.375245', 'step': 37, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.407363', 'step': 37, 'epoch': 1} {'type': 'loss', 'content': 0.02037977986037731, 'timestamp': '2025-09-30 22:09:13.410733', 'step': 38, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:13.445573', 'step': 38, 'epoch': 1} {'type': 'loss', 'content': 0.016287634149193764, 'timestamp': '2025-09-30 22:09:13.450012', 'step': 39, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.492970', 'step': 39, 'epoch': 1} {'type': 'loss', 'content': 0.016756046563386917, 'timestamp': '2025-09-30 22:09:13.517671', 'step': 40, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.556627', 'step': 40, 'epoch': 1} {'type': 'loss', 'content': 0.009105638600885868, 'timestamp': '2025-09-30 22:09:13.560667', 'step': 41, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:13.595034', 'step': 41, 'epoch': 1} {'type': 'loss', 'content': 0.018572982400655746, 'timestamp': '2025-09-30 22:09:13.605675', 'step': 42, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.654795', 'step': 42, 'epoch': 1} {'type': 'loss', 'content': 0.03447669371962547, 'timestamp': '2025-09-30 22:09:13.665573', 'step': 43, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:13.715391', 'step': 43, 'epoch': 1} {'type': 'loss', 'content': 0.00934014655649662, 'timestamp': '2025-09-30 22:09:13.740834', 'step': 44, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.786780', 'step': 44, 'epoch': 1} {'type': 'loss', 'content': 0.0063931881450116634, 'timestamp': '2025-09-30 22:09:13.789911', 'step': 45, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.826157', 'step': 45, 'epoch': 1} {'type': 'loss', 'content': 0.006094901356846094, 'timestamp': '2025-09-30 22:09:13.829873', 'step': 46, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.864380', 'step': 46, 'epoch': 1} {'type': 'loss', 'content': 0.03661187365651131, 'timestamp': '2025-09-30 22:09:13.873107', 'step': 47, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.911041', 'step': 47, 'epoch': 1} {'type': 'loss', 'content': 0.0199322160333395, 'timestamp': '2025-09-30 22:09:13.935829', 'step': 48, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:13.969844', 'step': 48, 'epoch': 1} {'type': 'loss', 'content': 0.003671335754916072, 'timestamp': '2025-09-30 22:09:13.972300', 'step': 49, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:14.004074', 'step': 49, 'epoch': 1} {'type': 'loss', 'content': 0.043977946043014526, 'timestamp': '2025-09-30 22:09:14.006632', 'step': 50, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:14.042244', 'step': 50, 'epoch': 1} {'type': 'loss', 'content': 0.0037198185455054045, 'timestamp': '2025-09-30 22:09:14.044979', 'step': 51, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:14.080140', 'step': 51, 'epoch': 1} {'type': 'loss', 'content': 0.0373804084956646, 'timestamp': '2025-09-30 22:09:14.104935', 'step': 52, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:14.137673', 'step': 52, 'epoch': 1} {'type': 'loss', 'content': 0.03821820765733719, 'timestamp': '2025-09-30 22:09:14.140018', 'step': 53, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:14.171155', 'step': 53, 'epoch': 1} {'type': 'loss', 'content': 0.04410596936941147, 'timestamp': '2025-09-30 22:09:14.180328', 'step': 54, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:14.214887', 'step': 54, 'epoch': 1} {'type': 'loss', 'content': 0.024004271253943443, 'timestamp': '2025-09-30 22:09:14.223302', 'step': 55, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:14.261834', 'step': 55, 'epoch': 1} {'type': 'loss', 'content': 0.042835354804992676, 'timestamp': '2025-09-30 22:09:14.287079', 'step': 56, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:14.322256', 'step': 56, 'epoch': 1} {'type': 'loss', 'content': 0.02183734066784382, 'timestamp': '2025-09-30 22:09:14.324668', 'step': 57, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:09:15.277577', 'step': 57, 'epoch': 1} {'type': 'pplx', 'content': 50960626.10407889, 'timestamp': '2025-09-30 22:09:15.280389', 'step': 57, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:15.313721', 'step': 57, 'epoch': 1} {'type': 'loss', 'content': 0.020511453971266747, 'timestamp': '2025-09-30 22:09:15.320925', 'step': 58, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:15.358210', 'step': 58, 'epoch': 1} {'type': 'loss', 'content': 0.015296747907996178, 'timestamp': '2025-09-30 22:09:15.364228', 'step': 59, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:15.400614', 'step': 59, 'epoch': 1} {'type': 'loss', 'content': 0.019934866577386856, 'timestamp': '2025-09-30 22:09:15.424173', 'step': 60, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:15.460132', 'step': 60, 'epoch': 1} {'type': 'loss', 'content': 0.018061354756355286, 'timestamp': '2025-09-30 22:09:15.467461', 'step': 61, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:15.508520', 'step': 61, 'epoch': 1} {'type': 'loss', 'content': 0.02194632776081562, 'timestamp': '2025-09-30 22:09:15.510931', 'step': 62, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:15.556104', 'step': 62, 'epoch': 1} {'type': 'loss', 'content': 0.009745917282998562, 'timestamp': '2025-09-30 22:09:15.560689', 'step': 63, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:15.596293', 'step': 63, 'epoch': 1} {'type': 'loss', 'content': 0.009664694778621197, 'timestamp': '2025-09-30 22:09:15.620651', 'step': 64, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:15.654839', 'step': 64, 'epoch': 1} {'type': 'loss', 'content': 0.01291805598884821, 'timestamp': '2025-09-30 22:09:15.657181', 'step': 65, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:15.696461', 'step': 65, 'epoch': 1} {'type': 'loss', 'content': 0.026677941903471947, 'timestamp': '2025-09-30 22:09:15.698887', 'step': 66, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:15.733022', 'step': 66, 'epoch': 1} {'type': 'loss', 'content': 0.014336864463984966, 'timestamp': '2025-09-30 22:09:15.739349', 'step': 67, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:15.778181', 'step': 67, 'epoch': 1} {'type': 'loss', 'content': 0.025222107768058777, 'timestamp': '2025-09-30 22:09:15.802313', 'step': 68, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:15.840896', 'step': 68, 'epoch': 1} {'type': 'loss', 'content': 0.028957298025488853, 'timestamp': '2025-09-30 22:09:15.843184', 'step': 69, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:15.880458', 'step': 69, 'epoch': 1} {'type': 'loss', 'content': 0.01850477233529091, 'timestamp': '2025-09-30 22:09:15.886068', 'step': 70, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:15.918627', 'step': 70, 'epoch': 1} {'type': 'loss', 'content': 0.01517259981483221, 'timestamp': '2025-09-30 22:09:15.924044', 'step': 71, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:15.973451', 'step': 71, 'epoch': 1} {'type': 'loss', 'content': 0.02591550536453724, 'timestamp': '2025-09-30 22:09:15.999792', 'step': 72, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:16.035417', 'step': 72, 'epoch': 1} {'type': 'loss', 'content': 0.01952301897108555, 'timestamp': '2025-09-30 22:09:16.038792', 'step': 73, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.071567', 'step': 73, 'epoch': 1} {'type': 'loss', 'content': 0.02474440075457096, 'timestamp': '2025-09-30 22:09:16.074385', 'step': 74, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.105987', 'step': 74, 'epoch': 1} {'type': 'loss', 'content': 0.01817135512828827, 'timestamp': '2025-09-30 22:09:16.113633', 'step': 75, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:16.144323', 'step': 75, 'epoch': 1} {'type': 'loss', 'content': 0.021403932943940163, 'timestamp': '2025-09-30 22:09:16.168243', 'step': 76, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.203430', 'step': 76, 'epoch': 1} {'type': 'loss', 'content': 0.02099829539656639, 'timestamp': '2025-09-30 22:09:16.208170', 'step': 77, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.243817', 'step': 77, 'epoch': 1} {'type': 'loss', 'content': 0.02719767577946186, 'timestamp': '2025-09-30 22:09:16.246103', 'step': 78, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.280828', 'step': 78, 'epoch': 1} {'type': 'loss', 'content': 0.02752767875790596, 'timestamp': '2025-09-30 22:09:16.283641', 'step': 79, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:16.325063', 'step': 79, 'epoch': 1} {'type': 'loss', 'content': 0.022269679233431816, 'timestamp': '2025-09-30 22:09:16.353743', 'step': 80, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.389524', 'step': 80, 'epoch': 1} {'type': 'loss', 'content': 0.021567845717072487, 'timestamp': '2025-09-30 22:09:16.393149', 'step': 81, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.432841', 'step': 81, 'epoch': 1} {'type': 'loss', 'content': 0.023687588050961494, 'timestamp': '2025-09-30 22:09:16.437550', 'step': 82, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.473060', 'step': 82, 'epoch': 1} {'type': 'loss', 'content': 0.03083338961005211, 'timestamp': '2025-09-30 22:09:16.475688', 'step': 83, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.508283', 'step': 83, 'epoch': 1} {'type': 'loss', 'content': 0.027233019471168518, 'timestamp': '2025-09-30 22:09:16.535391', 'step': 84, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.569498', 'step': 84, 'epoch': 1} {'type': 'loss', 'content': 0.024067306891083717, 'timestamp': '2025-09-30 22:09:16.572035', 'step': 85, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.609005', 'step': 85, 'epoch': 1} {'type': 'loss', 'content': 0.01529394369572401, 'timestamp': '2025-09-30 22:09:16.612011', 'step': 86, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.655316', 'step': 86, 'epoch': 1} {'type': 'loss', 'content': 0.01770094782114029, 'timestamp': '2025-09-30 22:09:16.658191', 'step': 87, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.696119', 'step': 87, 'epoch': 1} {'type': 'loss', 'content': 0.017579296603798866, 'timestamp': '2025-09-30 22:09:16.722340', 'step': 88, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.754011', 'step': 88, 'epoch': 1} {'type': 'loss', 'content': 0.016920411959290504, 'timestamp': '2025-09-30 22:09:16.756733', 'step': 89, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:16.790972', 'step': 89, 'epoch': 1} {'type': 'loss', 'content': 0.01924826018512249, 'timestamp': '2025-09-30 22:09:16.793910', 'step': 90, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.835103', 'step': 90, 'epoch': 1} {'type': 'loss', 'content': 0.020246148109436035, 'timestamp': '2025-09-30 22:09:16.838536', 'step': 91, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:16.877822', 'step': 91, 'epoch': 1} {'type': 'loss', 'content': 0.026985352858901024, 'timestamp': '2025-09-30 22:09:16.902800', 'step': 92, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.946383', 'step': 92, 'epoch': 1} {'type': 'loss', 'content': 0.015785491093993187, 'timestamp': '2025-09-30 22:09:16.954224', 'step': 93, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:16.996404', 'step': 93, 'epoch': 1} {'type': 'loss', 'content': 0.024172216653823853, 'timestamp': '2025-09-30 22:09:17.002136', 'step': 94, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.050024', 'step': 94, 'epoch': 1} {'type': 'loss', 'content': 0.01940486952662468, 'timestamp': '2025-09-30 22:09:17.054239', 'step': 95, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.086262', 'step': 95, 'epoch': 1} {'type': 'loss', 'content': 0.023085501044988632, 'timestamp': '2025-09-30 22:09:17.116579', 'step': 96, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.157370', 'step': 96, 'epoch': 1} {'type': 'loss', 'content': 0.01945091411471367, 'timestamp': '2025-09-30 22:09:17.160912', 'step': 97, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.199023', 'step': 97, 'epoch': 1} {'type': 'loss', 'content': 0.020526302978396416, 'timestamp': '2025-09-30 22:09:17.202637', 'step': 98, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.241885', 'step': 98, 'epoch': 1} {'type': 'loss', 'content': 0.017805086448788643, 'timestamp': '2025-09-30 22:09:17.244654', 'step': 99, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.285187', 'step': 99, 'epoch': 1} {'type': 'loss', 'content': 0.017751798033714294, 'timestamp': '2025-09-30 22:09:17.309846', 'step': 100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.349427', 'step': 100, 'epoch': 1} {'type': 'loss', 'content': 0.01985841430723667, 'timestamp': '2025-09-30 22:09:17.352096', 'step': 101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.383952', 'step': 101, 'epoch': 1} {'type': 'loss', 'content': 0.018005359917879105, 'timestamp': '2025-09-30 22:09:17.387053', 'step': 102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.422484', 'step': 102, 'epoch': 1} {'type': 'loss', 'content': 0.01930832304060459, 'timestamp': '2025-09-30 22:09:17.424941', 'step': 103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.456526', 'step': 103, 'epoch': 1} {'type': 'loss', 'content': 0.028776034712791443, 'timestamp': '2025-09-30 22:09:17.480383', 'step': 104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.515503', 'step': 104, 'epoch': 1} {'type': 'loss', 'content': 0.030385851860046387, 'timestamp': '2025-09-30 22:09:17.518190', 'step': 105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.555704', 'step': 105, 'epoch': 1} {'type': 'loss', 'content': 0.009547047317028046, 'timestamp': '2025-09-30 22:09:17.558826', 'step': 106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:17.590456', 'step': 106, 'epoch': 1} {'type': 'loss', 'content': 0.021585334092378616, 'timestamp': '2025-09-30 22:09:17.594339', 'step': 107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:17.633081', 'step': 107, 'epoch': 1} {'type': 'loss', 'content': 0.04027863219380379, 'timestamp': '2025-09-30 22:09:17.663088', 'step': 108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.706977', 'step': 108, 'epoch': 1} {'type': 'loss', 'content': 0.04367966577410698, 'timestamp': '2025-09-30 22:09:17.710272', 'step': 109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:09:17.754783', 'step': 109, 'epoch': 1} {'type': 'loss', 'content': 0.03343404084444046, 'timestamp': '2025-09-30 22:09:17.758998', 'step': 110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.799151', 'step': 110, 'epoch': 1} {'type': 'loss', 'content': 0.009330673143267632, 'timestamp': '2025-09-30 22:09:17.806239', 'step': 111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.842615', 'step': 111, 'epoch': 1} {'type': 'loss', 'content': 0.028857093304395676, 'timestamp': '2025-09-30 22:09:17.871630', 'step': 112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:17.903221', 'step': 112, 'epoch': 1} {'type': 'loss', 'content': 0.025730669498443604, 'timestamp': '2025-09-30 22:09:17.905760', 'step': 113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:17.943633', 'step': 113, 'epoch': 1} {'type': 'loss', 'content': 0.021670332178473473, 'timestamp': '2025-09-30 22:09:17.947613', 'step': 114, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:09:18.896142', 'step': 114, 'epoch': 1} {'type': 'pplx', 'content': 58534139.80761209, 'timestamp': '2025-09-30 22:09:18.898883', 'step': 114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:18.927855', 'step': 114, 'epoch': 1} {'type': 'loss', 'content': 0.009823307394981384, 'timestamp': '2025-09-30 22:09:18.929993', 'step': 115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:18.967139', 'step': 115, 'epoch': 1} {'type': 'loss', 'content': 0.028453484177589417, 'timestamp': '2025-09-30 22:09:18.991551', 'step': 116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:19.026563', 'step': 116, 'epoch': 1} {'type': 'loss', 'content': 0.023278359323740005, 'timestamp': '2025-09-30 22:09:19.030352', 'step': 117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:19.067552', 'step': 117, 'epoch': 1} {'type': 'loss', 'content': 0.017218248918652534, 'timestamp': '2025-09-30 22:09:19.069876', 'step': 118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:19.104189', 'step': 118, 'epoch': 1} {'type': 'loss', 'content': 0.015407579019665718, 'timestamp': '2025-09-30 22:09:19.111364', 'step': 119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:19.156579', 'step': 119, 'epoch': 1} {'type': 'loss', 'content': 0.02096530608832836, 'timestamp': '2025-09-30 22:09:19.181183', 'step': 120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:19.222936', 'step': 120, 'epoch': 1} {'type': 'loss', 'content': 0.030719464644789696, 'timestamp': '2025-09-30 22:09:19.225745', 'step': 121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:19.256834', 'step': 121, 'epoch': 1} {'type': 'loss', 'content': 0.023789221420884132, 'timestamp': '2025-09-30 22:09:19.259053', 'step': 122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:19.293113', 'step': 122, 'epoch': 1} {'type': 'loss', 'content': 0.02422069013118744, 'timestamp': '2025-09-30 22:09:19.303236', 'step': 123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:19.352509', 'step': 123, 'epoch': 1} {'type': 'loss', 'content': 0.015484781935811043, 'timestamp': '2025-09-30 22:09:19.378591', 'step': 124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:19.414774', 'step': 124, 'epoch': 1} {'type': 'loss', 'content': 0.021923230960965157, 'timestamp': '2025-09-30 22:09:19.418211', 'step': 125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:19.450426', 'step': 125, 'epoch': 1} {'type': 'loss', 'content': 0.03109300322830677, 'timestamp': '2025-09-30 22:09:19.454284', 'step': 126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:19.492377', 'step': 126, 'epoch': 1} {'type': 'loss', 'content': 0.02031245455145836, 'timestamp': '2025-09-30 22:09:19.495342', 'step': 127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:19.527494', 'step': 127, 'epoch': 1} {'type': 'loss', 'content': 0.022137144580483437, 'timestamp': '2025-09-30 22:09:19.552142', 'step': 128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:19.583975', 'step': 128, 'epoch': 1} {'type': 'loss', 'content': 0.020822782069444656, 'timestamp': '2025-09-30 22:09:19.586482', 'step': 129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:19.625574', 'step': 129, 'epoch': 1} {'type': 'loss', 'content': 0.027242343872785568, 'timestamp': '2025-09-30 22:09:19.628308', 'step': 130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:19.662693', 'step': 130, 'epoch': 1} {'type': 'loss', 'content': 0.027840612456202507, 'timestamp': '2025-09-30 22:09:19.665395', 'step': 131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:19.697783', 'step': 131, 'epoch': 1} {'type': 'loss', 'content': 0.020926009863615036, 'timestamp': '2025-09-30 22:09:19.725080', 'step': 132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:19.756602', 'step': 132, 'epoch': 1} {'type': 'loss', 'content': 0.014202489517629147, 'timestamp': '2025-09-30 22:09:19.758795', 'step': 133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:19.789976', 'step': 133, 'epoch': 1} {'type': 'loss', 'content': 0.024952108040452003, 'timestamp': '2025-09-30 22:09:19.792420', 'step': 134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:19.826685', 'step': 134, 'epoch': 1} {'type': 'loss', 'content': 0.023683322593569756, 'timestamp': '2025-09-30 22:09:19.828977', 'step': 135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:19.863113', 'step': 135, 'epoch': 1} {'type': 'loss', 'content': 0.02011030726134777, 'timestamp': '2025-09-30 22:09:19.887275', 'step': 136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:19.928020', 'step': 136, 'epoch': 1} {'type': 'loss', 'content': 0.022830024361610413, 'timestamp': '2025-09-30 22:09:19.931512', 'step': 137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:19.963089', 'step': 137, 'epoch': 1} {'type': 'loss', 'content': 0.02508159540593624, 'timestamp': '2025-09-30 22:09:19.967458', 'step': 138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:20.000658', 'step': 138, 'epoch': 1} {'type': 'loss', 'content': 0.016803130507469177, 'timestamp': '2025-09-30 22:09:20.004268', 'step': 139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:20.040017', 'step': 139, 'epoch': 1} {'type': 'loss', 'content': 0.026307707652449608, 'timestamp': '2025-09-30 22:09:20.066332', 'step': 140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:20.103807', 'step': 140, 'epoch': 1} {'type': 'loss', 'content': 0.01853339932858944, 'timestamp': '2025-09-30 22:09:20.107421', 'step': 141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:20.146944', 'step': 141, 'epoch': 1} {'type': 'loss', 'content': 0.01986740343272686, 'timestamp': '2025-09-30 22:09:20.149468', 'step': 142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:20.180455', 'step': 142, 'epoch': 1} {'type': 'loss', 'content': 0.026442429050803185, 'timestamp': '2025-09-30 22:09:20.183589', 'step': 143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:20.224127', 'step': 143, 'epoch': 1} {'type': 'loss', 'content': 0.013860334642231464, 'timestamp': '2025-09-30 22:09:20.248424', 'step': 144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:20.279970', 'step': 144, 'epoch': 1} {'type': 'loss', 'content': 0.01697557047009468, 'timestamp': '2025-09-30 22:09:20.285375', 'step': 145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:20.321143', 'step': 145, 'epoch': 1} {'type': 'loss', 'content': 0.01053065899759531, 'timestamp': '2025-09-30 22:09:20.323946', 'step': 146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:20.357847', 'step': 146, 'epoch': 1} {'type': 'loss', 'content': 0.023069296032190323, 'timestamp': '2025-09-30 22:09:20.365149', 'step': 147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:20.408388', 'step': 147, 'epoch': 1} {'type': 'loss', 'content': 0.031005268916487694, 'timestamp': '2025-09-30 22:09:20.432411', 'step': 148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:20.463781', 'step': 148, 'epoch': 1} {'type': 'loss', 'content': 0.04410288482904434, 'timestamp': '2025-09-30 22:09:20.466537', 'step': 149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:20.500020', 'step': 149, 'epoch': 1} {'type': 'loss', 'content': 0.04395507648587227, 'timestamp': '2025-09-30 22:09:20.502770', 'step': 150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:20.544615', 'step': 150, 'epoch': 1} {'type': 'loss', 'content': 0.017706802114844322, 'timestamp': '2025-09-30 22:09:20.557460', 'step': 151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:20.599912', 'step': 151, 'epoch': 1} {'type': 'loss', 'content': 0.007974297739565372, 'timestamp': '2025-09-30 22:09:20.623923', 'step': 152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:20.674167', 'step': 152, 'epoch': 1} {'type': 'loss', 'content': 0.010354185476899147, 'timestamp': '2025-09-30 22:09:20.682383', 'step': 153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:20.735253', 'step': 153, 'epoch': 1} {'type': 'loss', 'content': 0.02493741549551487, 'timestamp': '2025-09-30 22:09:20.741201', 'step': 154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:20.782802', 'step': 154, 'epoch': 1} {'type': 'loss', 'content': 0.030498506501317024, 'timestamp': '2025-09-30 22:09:20.792049', 'step': 155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:20.829033', 'step': 155, 'epoch': 1} {'type': 'loss', 'content': 0.020751025527715683, 'timestamp': '2025-09-30 22:09:20.855163', 'step': 156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:20.892618', 'step': 156, 'epoch': 1} {'type': 'loss', 'content': 0.04443346709012985, 'timestamp': '2025-09-30 22:09:20.895268', 'step': 157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:20.928220', 'step': 157, 'epoch': 1} {'type': 'loss', 'content': 0.06002501770853996, 'timestamp': '2025-09-30 22:09:20.936899', 'step': 158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:20.972521', 'step': 158, 'epoch': 1} {'type': 'loss', 'content': 0.04527200013399124, 'timestamp': '2025-09-30 22:09:20.977524', 'step': 159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:21.009917', 'step': 159, 'epoch': 1} {'type': 'loss', 'content': 0.03547225147485733, 'timestamp': '2025-09-30 22:09:21.039356', 'step': 160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:21.073410', 'step': 160, 'epoch': 1} {'type': 'loss', 'content': 0.02295391820371151, 'timestamp': '2025-09-30 22:09:21.082083', 'step': 161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:21.116796', 'step': 161, 'epoch': 1} {'type': 'loss', 'content': 0.03174513950943947, 'timestamp': '2025-09-30 22:09:21.122115', 'step': 162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:21.167665', 'step': 162, 'epoch': 1} {'type': 'loss', 'content': 0.02424296736717224, 'timestamp': '2025-09-30 22:09:21.172311', 'step': 163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:21.219074', 'step': 163, 'epoch': 1} {'type': 'loss', 'content': 0.017598729580640793, 'timestamp': '2025-09-30 22:09:21.244379', 'step': 164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:21.286836', 'step': 164, 'epoch': 1} {'type': 'loss', 'content': 0.02205030620098114, 'timestamp': '2025-09-30 22:09:21.289414', 'step': 165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:21.322458', 'step': 165, 'epoch': 1} {'type': 'loss', 'content': 0.027921559289097786, 'timestamp': '2025-09-30 22:09:21.325980', 'step': 166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:21.365573', 'step': 166, 'epoch': 1} {'type': 'loss', 'content': 0.029713410884141922, 'timestamp': '2025-09-30 22:09:21.368535', 'step': 167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:21.405306', 'step': 167, 'epoch': 1} {'type': 'loss', 'content': 0.017205597832798958, 'timestamp': '2025-09-30 22:09:21.430538', 'step': 168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:21.476090', 'step': 168, 'epoch': 1} {'type': 'loss', 'content': 0.02522202394902706, 'timestamp': '2025-09-30 22:09:21.479456', 'step': 169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:21.512103', 'step': 169, 'epoch': 1} {'type': 'loss', 'content': 0.029750773683190346, 'timestamp': '2025-09-30 22:09:21.520054', 'step': 170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:21.560214', 'step': 170, 'epoch': 1} {'type': 'loss', 'content': 0.02134212851524353, 'timestamp': '2025-09-30 22:09:21.567431', 'step': 171, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:09:22.542027', 'step': 171, 'epoch': 1} {'type': 'pplx', 'content': 63135953.81431602, 'timestamp': '2025-09-30 22:09:22.549503', 'step': 171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:22.579778', 'step': 171, 'epoch': 1} {'type': 'loss', 'content': 0.025313351303339005, 'timestamp': '2025-09-30 22:09:22.609048', 'step': 172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:22.644211', 'step': 172, 'epoch': 1} {'type': 'loss', 'content': 0.02162635698914528, 'timestamp': '2025-09-30 22:09:22.648087', 'step': 173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:22.685698', 'step': 173, 'epoch': 1} {'type': 'loss', 'content': 0.023029791191220284, 'timestamp': '2025-09-30 22:09:22.700922', 'step': 174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:22.737162', 'step': 174, 'epoch': 1} {'type': 'loss', 'content': 0.02027418650686741, 'timestamp': '2025-09-30 22:09:22.739647', 'step': 175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:22.773725', 'step': 175, 'epoch': 1} {'type': 'loss', 'content': 0.02678442932665348, 'timestamp': '2025-09-30 22:09:22.798748', 'step': 176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:22.833449', 'step': 176, 'epoch': 1} {'type': 'loss', 'content': 0.019500982016324997, 'timestamp': '2025-09-30 22:09:22.836804', 'step': 177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:22.869460', 'step': 177, 'epoch': 1} {'type': 'loss', 'content': 0.018234090879559517, 'timestamp': '2025-09-30 22:09:22.873359', 'step': 178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:22.914959', 'step': 178, 'epoch': 1} {'type': 'loss', 'content': 0.01549634337425232, 'timestamp': '2025-09-30 22:09:22.916908', 'step': 179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:22.949079', 'step': 179, 'epoch': 1} {'type': 'loss', 'content': 0.023833613842725754, 'timestamp': '2025-09-30 22:09:22.972976', 'step': 180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:23.011745', 'step': 180, 'epoch': 1} {'type': 'loss', 'content': 0.020760536193847656, 'timestamp': '2025-09-30 22:09:23.016768', 'step': 181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:23.055342', 'step': 181, 'epoch': 1} {'type': 'loss', 'content': 0.02735050581395626, 'timestamp': '2025-09-30 22:09:23.057868', 'step': 182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:23.090773', 'step': 182, 'epoch': 1} {'type': 'loss', 'content': 0.04049814119935036, 'timestamp': '2025-09-30 22:09:23.093111', 'step': 183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:23.125859', 'step': 183, 'epoch': 1} {'type': 'loss', 'content': 0.03238435834646225, 'timestamp': '2025-09-30 22:09:23.150246', 'step': 184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:23.184750', 'step': 184, 'epoch': 1} {'type': 'loss', 'content': 0.02755420282483101, 'timestamp': '2025-09-30 22:09:23.187523', 'step': 185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:23.219075', 'step': 185, 'epoch': 1} {'type': 'loss', 'content': 0.021524375304579735, 'timestamp': '2025-09-30 22:09:23.221725', 'step': 186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:23.256297', 'step': 186, 'epoch': 1} {'type': 'loss', 'content': 0.028563236817717552, 'timestamp': '2025-09-30 22:09:23.260646', 'step': 187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:23.298047', 'step': 187, 'epoch': 1} {'type': 'loss', 'content': 0.020461514592170715, 'timestamp': '2025-09-30 22:09:23.321944', 'step': 188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:23.359970', 'step': 188, 'epoch': 1} {'type': 'loss', 'content': 0.017072943970561028, 'timestamp': '2025-09-30 22:09:23.362327', 'step': 189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:23.398504', 'step': 189, 'epoch': 1} {'type': 'loss', 'content': 0.023945823311805725, 'timestamp': '2025-09-30 22:09:23.402855', 'step': 190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:23.440466', 'step': 190, 'epoch': 1} {'type': 'loss', 'content': 0.018736975267529488, 'timestamp': '2025-09-30 22:09:23.442991', 'step': 191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:23.482508', 'step': 191, 'epoch': 1} {'type': 'loss', 'content': 0.02092103660106659, 'timestamp': '2025-09-30 22:09:23.508844', 'step': 192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:23.548356', 'step': 192, 'epoch': 1} {'type': 'loss', 'content': 0.011695189401507378, 'timestamp': '2025-09-30 22:09:23.561921', 'step': 193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:23.604456', 'step': 193, 'epoch': 1} {'type': 'loss', 'content': 0.012270850129425526, 'timestamp': '2025-09-30 22:09:23.607780', 'step': 194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:23.641746', 'step': 194, 'epoch': 1} {'type': 'loss', 'content': 0.017633767798542976, 'timestamp': '2025-09-30 22:09:23.645103', 'step': 195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:23.681614', 'step': 195, 'epoch': 1} {'type': 'loss', 'content': 0.02073209546506405, 'timestamp': '2025-09-30 22:09:23.706313', 'step': 196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:23.745909', 'step': 196, 'epoch': 1} {'type': 'loss', 'content': 0.011094124056398869, 'timestamp': '2025-09-30 22:09:23.755657', 'step': 197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:23.788928', 'step': 197, 'epoch': 1} {'type': 'loss', 'content': 0.02200642041862011, 'timestamp': '2025-09-30 22:09:23.791820', 'step': 198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:23.828375', 'step': 198, 'epoch': 1} {'type': 'loss', 'content': 0.03361174091696739, 'timestamp': '2025-09-30 22:09:23.830747', 'step': 199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:23.872473', 'step': 199, 'epoch': 1} {'type': 'loss', 'content': 0.032245974987745285, 'timestamp': '2025-09-30 22:09:23.897029', 'step': 200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:23.929554', 'step': 200, 'epoch': 1} {'type': 'loss', 'content': 0.017100024968385696, 'timestamp': '2025-09-30 22:09:23.935873', 'step': 201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:23.975347', 'step': 201, 'epoch': 1} {'type': 'loss', 'content': 0.019537942484021187, 'timestamp': '2025-09-30 22:09:23.977741', 'step': 202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.022466', 'step': 202, 'epoch': 1} {'type': 'loss', 'content': 0.02034500427544117, 'timestamp': '2025-09-30 22:09:24.024897', 'step': 203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:24.057311', 'step': 203, 'epoch': 1} {'type': 'loss', 'content': 0.02110188640654087, 'timestamp': '2025-09-30 22:09:24.080940', 'step': 204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:24.116185', 'step': 204, 'epoch': 1} {'type': 'loss', 'content': 0.02113529108464718, 'timestamp': '2025-09-30 22:09:24.123925', 'step': 205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.170996', 'step': 205, 'epoch': 1} {'type': 'loss', 'content': 0.010029876604676247, 'timestamp': '2025-09-30 22:09:24.177332', 'step': 206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.211497', 'step': 206, 'epoch': 1} {'type': 'loss', 'content': 0.030450621619820595, 'timestamp': '2025-09-30 22:09:24.220174', 'step': 207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.261166', 'step': 207, 'epoch': 1} {'type': 'loss', 'content': 0.019182788208127022, 'timestamp': '2025-09-30 22:09:24.285128', 'step': 208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.318848', 'step': 208, 'epoch': 1} {'type': 'loss', 'content': 0.030275586992502213, 'timestamp': '2025-09-30 22:09:24.323077', 'step': 209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.364047', 'step': 209, 'epoch': 1} {'type': 'loss', 'content': 0.03572225570678711, 'timestamp': '2025-09-30 22:09:24.377896', 'step': 210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.416921', 'step': 210, 'epoch': 1} {'type': 'loss', 'content': 0.018198251724243164, 'timestamp': '2025-09-30 22:09:24.421098', 'step': 211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.457468', 'step': 211, 'epoch': 1} {'type': 'loss', 'content': 0.04005300626158714, 'timestamp': '2025-09-30 22:09:24.482710', 'step': 212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.520417', 'step': 212, 'epoch': 1} {'type': 'loss', 'content': 0.029761286452412605, 'timestamp': '2025-09-30 22:09:24.523772', 'step': 213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.566134', 'step': 213, 'epoch': 1} {'type': 'loss', 'content': 0.018643589690327644, 'timestamp': '2025-09-30 22:09:24.569736', 'step': 214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.626292', 'step': 214, 'epoch': 1} {'type': 'loss', 'content': 0.026240238919854164, 'timestamp': '2025-09-30 22:09:24.630332', 'step': 215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.679963', 'step': 215, 'epoch': 1} {'type': 'loss', 'content': 0.0247743409126997, 'timestamp': '2025-09-30 22:09:24.705726', 'step': 216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:24.742582', 'step': 216, 'epoch': 1} {'type': 'loss', 'content': 0.02074492536485195, 'timestamp': '2025-09-30 22:09:24.748216', 'step': 217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:24.804052', 'step': 217, 'epoch': 1} {'type': 'loss', 'content': 0.024284886196255684, 'timestamp': '2025-09-30 22:09:24.815954', 'step': 218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.849931', 'step': 218, 'epoch': 1} {'type': 'loss', 'content': 0.022740570828318596, 'timestamp': '2025-09-30 22:09:24.854642', 'step': 219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.894445', 'step': 219, 'epoch': 1} {'type': 'loss', 'content': 0.01697150431573391, 'timestamp': '2025-09-30 22:09:24.919994', 'step': 220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:24.960812', 'step': 220, 'epoch': 1} {'type': 'loss', 'content': 0.021439313888549805, 'timestamp': '2025-09-30 22:09:24.965122', 'step': 221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:25.011331', 'step': 221, 'epoch': 1} {'type': 'loss', 'content': 0.023758551105856895, 'timestamp': '2025-09-30 22:09:25.014195', 'step': 222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:25.057607', 'step': 222, 'epoch': 1} {'type': 'loss', 'content': 0.019697774201631546, 'timestamp': '2025-09-30 22:09:25.070052', 'step': 223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:25.119111', 'step': 223, 'epoch': 1} {'type': 'loss', 'content': 0.02828163094818592, 'timestamp': '2025-09-30 22:09:25.144906', 'step': 224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:25.179325', 'step': 224, 'epoch': 1} {'type': 'loss', 'content': 0.01745227910578251, 'timestamp': '2025-09-30 22:09:25.182684', 'step': 225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:25.216373', 'step': 225, 'epoch': 1} {'type': 'loss', 'content': 0.020431501790881157, 'timestamp': '2025-09-30 22:09:25.219420', 'step': 226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:25.262506', 'step': 226, 'epoch': 1} {'type': 'loss', 'content': 0.02470734715461731, 'timestamp': '2025-09-30 22:09:25.266184', 'step': 227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:25.300680', 'step': 227, 'epoch': 1} {'type': 'loss', 'content': 0.020329369232058525, 'timestamp': '2025-09-30 22:09:25.334365', 'step': 228, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:09:26.396318', 'step': 228, 'epoch': 1} {'type': 'pplx', 'content': 64583602.40184191, 'timestamp': '2025-09-30 22:09:26.400761', 'step': 228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:26.436859', 'step': 228, 'epoch': 1} {'type': 'loss', 'content': 0.020061250776052475, 'timestamp': '2025-09-30 22:09:26.445307', 'step': 229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:26.482340', 'step': 229, 'epoch': 1} {'type': 'loss', 'content': 0.023501979187130928, 'timestamp': '2025-09-30 22:09:26.486298', 'step': 230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:26.520038', 'step': 230, 'epoch': 1} {'type': 'loss', 'content': 0.022963250055909157, 'timestamp': '2025-09-30 22:09:26.523926', 'step': 231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:26.557188', 'step': 231, 'epoch': 1} {'type': 'loss', 'content': 0.026661524549126625, 'timestamp': '2025-09-30 22:09:26.583351', 'step': 232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:26.617885', 'step': 232, 'epoch': 1} {'type': 'loss', 'content': 0.02811359241604805, 'timestamp': '2025-09-30 22:09:26.622318', 'step': 233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:26.656864', 'step': 233, 'epoch': 1} {'type': 'loss', 'content': 0.020175501704216003, 'timestamp': '2025-09-30 22:09:26.660057', 'step': 234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:26.704221', 'step': 234, 'epoch': 1} {'type': 'loss', 'content': 0.021329758688807487, 'timestamp': '2025-09-30 22:09:26.707883', 'step': 235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:26.744204', 'step': 235, 'epoch': 1} {'type': 'loss', 'content': 0.023740265518426895, 'timestamp': '2025-09-30 22:09:26.769331', 'step': 236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:26.802865', 'step': 236, 'epoch': 1} {'type': 'loss', 'content': 0.02140076644718647, 'timestamp': '2025-09-30 22:09:26.811659', 'step': 237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:26.849074', 'step': 237, 'epoch': 1} {'type': 'loss', 'content': 0.024655591696500778, 'timestamp': '2025-09-30 22:09:26.853322', 'step': 238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:26.897500', 'step': 238, 'epoch': 1} {'type': 'loss', 'content': 0.02812766842544079, 'timestamp': '2025-09-30 22:09:26.900902', 'step': 239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:26.937171', 'step': 239, 'epoch': 1} {'type': 'loss', 'content': 0.021700652316212654, 'timestamp': '2025-09-30 22:09:26.961785', 'step': 240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:26.993967', 'step': 240, 'epoch': 1} {'type': 'loss', 'content': 0.017045430839061737, 'timestamp': '2025-09-30 22:09:26.997221', 'step': 241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.029087', 'step': 241, 'epoch': 1} {'type': 'loss', 'content': 0.014452247880399227, 'timestamp': '2025-09-30 22:09:27.035776', 'step': 242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.083171', 'step': 242, 'epoch': 1} {'type': 'loss', 'content': 0.01355509739369154, 'timestamp': '2025-09-30 22:09:27.086116', 'step': 243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:27.117351', 'step': 243, 'epoch': 1} {'type': 'loss', 'content': 0.01483946107327938, 'timestamp': '2025-09-30 22:09:27.141796', 'step': 244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.184555', 'step': 244, 'epoch': 1} {'type': 'loss', 'content': 0.03674229606986046, 'timestamp': '2025-09-30 22:09:27.193271', 'step': 245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.240463', 'step': 245, 'epoch': 1} {'type': 'loss', 'content': 0.03379470854997635, 'timestamp': '2025-09-30 22:09:27.249237', 'step': 246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:27.291190', 'step': 246, 'epoch': 1} {'type': 'loss', 'content': 0.031083837151527405, 'timestamp': '2025-09-30 22:09:27.294330', 'step': 247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.327714', 'step': 247, 'epoch': 1} {'type': 'loss', 'content': 0.02411106415092945, 'timestamp': '2025-09-30 22:09:27.351673', 'step': 248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.392985', 'step': 248, 'epoch': 1} {'type': 'loss', 'content': 0.03847876191139221, 'timestamp': '2025-09-30 22:09:27.397613', 'step': 249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:27.434230', 'step': 249, 'epoch': 1} {'type': 'loss', 'content': 0.013434567488729954, 'timestamp': '2025-09-30 22:09:27.437393', 'step': 250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.470533', 'step': 250, 'epoch': 1} {'type': 'loss', 'content': 0.0336221382021904, 'timestamp': '2025-09-30 22:09:27.482598', 'step': 251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.518775', 'step': 251, 'epoch': 1} {'type': 'loss', 'content': 0.011063228361308575, 'timestamp': '2025-09-30 22:09:27.545257', 'step': 252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:27.587336', 'step': 252, 'epoch': 1} {'type': 'loss', 'content': 0.009829501621425152, 'timestamp': '2025-09-30 22:09:27.590919', 'step': 253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:27.625865', 'step': 253, 'epoch': 1} {'type': 'loss', 'content': 0.02949223481118679, 'timestamp': '2025-09-30 22:09:27.628946', 'step': 254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:27.669542', 'step': 254, 'epoch': 1} {'type': 'loss', 'content': 0.01204278226941824, 'timestamp': '2025-09-30 22:09:27.673618', 'step': 255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:09:27.709938', 'step': 255, 'epoch': 1} {'type': 'loss', 'content': 0.016949284821748734, 'timestamp': '2025-09-30 22:09:27.735954', 'step': 256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.774681', 'step': 256, 'epoch': 1} {'type': 'loss', 'content': 0.01266380213201046, 'timestamp': '2025-09-30 22:09:27.778898', 'step': 257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.812145', 'step': 257, 'epoch': 1} {'type': 'loss', 'content': 0.03958168253302574, 'timestamp': '2025-09-30 22:09:27.818913', 'step': 258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.859520', 'step': 258, 'epoch': 1} {'type': 'loss', 'content': 0.02375471219420433, 'timestamp': '2025-09-30 22:09:27.862962', 'step': 259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:27.902512', 'step': 259, 'epoch': 1} {'type': 'loss', 'content': 0.022940808907151222, 'timestamp': '2025-09-30 22:09:27.926849', 'step': 260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.959007', 'step': 260, 'epoch': 1} {'type': 'loss', 'content': 0.023124128580093384, 'timestamp': '2025-09-30 22:09:27.963644', 'step': 261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:27.996992', 'step': 261, 'epoch': 1} {'type': 'loss', 'content': 0.020926760509610176, 'timestamp': '2025-09-30 22:09:28.000115', 'step': 262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:28.033434', 'step': 262, 'epoch': 1} {'type': 'loss', 'content': 0.020220564678311348, 'timestamp': '2025-09-30 22:09:28.036473', 'step': 263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:28.073437', 'step': 263, 'epoch': 1} {'type': 'loss', 'content': 0.011818875558674335, 'timestamp': '2025-09-30 22:09:28.097489', 'step': 264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:28.130951', 'step': 264, 'epoch': 1} {'type': 'loss', 'content': 0.022864850237965584, 'timestamp': '2025-09-30 22:09:28.135352', 'step': 265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:28.169449', 'step': 265, 'epoch': 1} {'type': 'loss', 'content': 0.021252285689115524, 'timestamp': '2025-09-30 22:09:28.174293', 'step': 266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:28.214276', 'step': 266, 'epoch': 1} {'type': 'loss', 'content': 0.021793512627482414, 'timestamp': '2025-09-30 22:09:28.222872', 'step': 267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:28.261060', 'step': 267, 'epoch': 1} {'type': 'loss', 'content': 0.03829879313707352, 'timestamp': '2025-09-30 22:09:28.285559', 'step': 268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:28.317907', 'step': 268, 'epoch': 1} {'type': 'loss', 'content': 0.016349567100405693, 'timestamp': '2025-09-30 22:09:28.324735', 'step': 269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:28.356375', 'step': 269, 'epoch': 1} {'type': 'loss', 'content': 0.021319296211004257, 'timestamp': '2025-09-30 22:09:28.364055', 'step': 270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:28.397989', 'step': 270, 'epoch': 1} {'type': 'loss', 'content': 0.023110728710889816, 'timestamp': '2025-09-30 22:09:28.405917', 'step': 271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:28.446429', 'step': 271, 'epoch': 1} {'type': 'loss', 'content': 0.01968984305858612, 'timestamp': '2025-09-30 22:09:28.473040', 'step': 272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:28.505796', 'step': 272, 'epoch': 1} {'type': 'loss', 'content': 0.024730121716856956, 'timestamp': '2025-09-30 22:09:28.508918', 'step': 273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:28.549681', 'step': 273, 'epoch': 1} {'type': 'loss', 'content': 0.014643555507063866, 'timestamp': '2025-09-30 22:09:28.556887', 'step': 274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:28.589634', 'step': 274, 'epoch': 1} {'type': 'loss', 'content': 0.019117284566164017, 'timestamp': '2025-09-30 22:09:28.592910', 'step': 275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:28.627191', 'step': 275, 'epoch': 1} {'type': 'loss', 'content': 0.027165459468960762, 'timestamp': '2025-09-30 22:09:28.655612', 'step': 276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:28.692616', 'step': 276, 'epoch': 1} {'type': 'loss', 'content': 0.0140123525634408, 'timestamp': '2025-09-30 22:09:28.695435', 'step': 277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:28.730800', 'step': 277, 'epoch': 1} {'type': 'loss', 'content': 0.02771705947816372, 'timestamp': '2025-09-30 22:09:28.734046', 'step': 278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:28.767972', 'step': 278, 'epoch': 1} {'type': 'loss', 'content': 0.022224754095077515, 'timestamp': '2025-09-30 22:09:28.771316', 'step': 279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:28.808793', 'step': 279, 'epoch': 1} {'type': 'loss', 'content': 0.04482335224747658, 'timestamp': '2025-09-30 22:09:28.835645', 'step': 280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:28.871446', 'step': 280, 'epoch': 1} {'type': 'loss', 'content': 0.017204681411385536, 'timestamp': '2025-09-30 22:09:28.876235', 'step': 281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:28.912127', 'step': 281, 'epoch': 1} {'type': 'loss', 'content': 0.017169682309031487, 'timestamp': '2025-09-30 22:09:28.915984', 'step': 282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:28.956136', 'step': 282, 'epoch': 1} {'type': 'loss', 'content': 0.022637654095888138, 'timestamp': '2025-09-30 22:09:28.960484', 'step': 283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:28.994509', 'step': 283, 'epoch': 1} {'type': 'loss', 'content': 0.03536595404148102, 'timestamp': '2025-09-30 22:09:29.026264', 'step': 284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:29.071959', 'step': 284, 'epoch': 1} {'type': 'loss', 'content': 0.024568825960159302, 'timestamp': '2025-09-30 22:09:29.074404', 'step': 285, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:09:30.063079', 'step': 285, 'epoch': 1} {'type': 'pplx', 'content': 67541395.63657059, 'timestamp': '2025-09-30 22:09:30.068836', 'step': 285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:30.098905', 'step': 285, 'epoch': 1} {'type': 'loss', 'content': 0.014133117161691189, 'timestamp': '2025-09-30 22:09:30.101945', 'step': 286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:09:30.133810', 'step': 286, 'epoch': 1} {'type': 'loss', 'content': 0.019359812140464783, 'timestamp': '2025-09-30 22:09:30.136460', 'step': 287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:30.173794', 'step': 287, 'epoch': 1} {'type': 'loss', 'content': 0.02667580358684063, 'timestamp': '2025-09-30 22:09:30.198910', 'step': 288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:30.231598', 'step': 288, 'epoch': 1} {'type': 'loss', 'content': 0.026296624913811684, 'timestamp': '2025-09-30 22:09:30.236977', 'step': 289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:30.274855', 'step': 289, 'epoch': 1} {'type': 'loss', 'content': 0.016411451622843742, 'timestamp': '2025-09-30 22:09:30.278524', 'step': 290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:30.314050', 'step': 290, 'epoch': 1} {'type': 'loss', 'content': 0.020339542999863625, 'timestamp': '2025-09-30 22:09:30.319097', 'step': 291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:30.352661', 'step': 291, 'epoch': 1} {'type': 'loss', 'content': 0.02564086951315403, 'timestamp': '2025-09-30 22:09:30.381112', 'step': 292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:30.422462', 'step': 292, 'epoch': 1} {'type': 'loss', 'content': 0.015429101884365082, 'timestamp': '2025-09-30 22:09:30.425828', 'step': 293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:30.461677', 'step': 293, 'epoch': 1} {'type': 'loss', 'content': 0.029271453619003296, 'timestamp': '2025-09-30 22:09:30.469204', 'step': 294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:30.505070', 'step': 294, 'epoch': 1} {'type': 'loss', 'content': 0.020868074148893356, 'timestamp': '2025-09-30 22:09:30.507786', 'step': 295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:30.542341', 'step': 295, 'epoch': 1} {'type': 'loss', 'content': 0.014817570336163044, 'timestamp': '2025-09-30 22:09:30.569318', 'step': 296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:30.613647', 'step': 296, 'epoch': 1} {'type': 'loss', 'content': 0.01824035681784153, 'timestamp': '2025-09-30 22:09:30.616991', 'step': 297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:30.649677', 'step': 297, 'epoch': 1} {'type': 'loss', 'content': 0.02374071255326271, 'timestamp': '2025-09-30 22:09:30.652636', 'step': 298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:30.698100', 'step': 298, 'epoch': 1} {'type': 'loss', 'content': 0.01639840006828308, 'timestamp': '2025-09-30 22:09:30.701630', 'step': 299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:30.748179', 'step': 299, 'epoch': 1} {'type': 'loss', 'content': 0.018694445490837097, 'timestamp': '2025-09-30 22:09:30.773482', 'step': 300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:30.818274', 'step': 300, 'epoch': 1} {'type': 'loss', 'content': 0.024248849600553513, 'timestamp': '2025-09-30 22:09:30.824282', 'step': 301, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:30.865970', 'step': 301, 'epoch': 1} {'type': 'loss', 'content': 0.01262893807142973, 'timestamp': '2025-09-30 22:09:30.868640', 'step': 302, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:30.903772', 'step': 302, 'epoch': 1} {'type': 'loss', 'content': 0.011411392129957676, 'timestamp': '2025-09-30 22:09:30.910795', 'step': 303, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:30.957868', 'step': 303, 'epoch': 1} {'type': 'loss', 'content': 0.015114856883883476, 'timestamp': '2025-09-30 22:09:30.983434', 'step': 304, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.020505', 'step': 304, 'epoch': 1} {'type': 'loss', 'content': 0.020919784903526306, 'timestamp': '2025-09-30 22:09:31.023106', 'step': 305, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.058977', 'step': 305, 'epoch': 1} {'type': 'loss', 'content': 0.030423318967223167, 'timestamp': '2025-09-30 22:09:31.062077', 'step': 306, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.094352', 'step': 306, 'epoch': 1} {'type': 'loss', 'content': 0.029984524473547935, 'timestamp': '2025-09-30 22:09:31.097811', 'step': 307, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.129591', 'step': 307, 'epoch': 1} {'type': 'loss', 'content': 0.018007751554250717, 'timestamp': '2025-09-30 22:09:31.158882', 'step': 308, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.191676', 'step': 308, 'epoch': 1} {'type': 'loss', 'content': 0.005845424719154835, 'timestamp': '2025-09-30 22:09:31.194782', 'step': 309, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.229860', 'step': 309, 'epoch': 1} {'type': 'loss', 'content': 0.033217113465070724, 'timestamp': '2025-09-30 22:09:31.238443', 'step': 310, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.272512', 'step': 310, 'epoch': 1} {'type': 'loss', 'content': 0.03235980495810509, 'timestamp': '2025-09-30 22:09:31.275483', 'step': 311, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.309284', 'step': 311, 'epoch': 1} {'type': 'loss', 'content': 0.019056664779782295, 'timestamp': '2025-09-30 22:09:31.333963', 'step': 312, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.378766', 'step': 312, 'epoch': 1} {'type': 'loss', 'content': 0.02388598583638668, 'timestamp': '2025-09-30 22:09:31.393057', 'step': 313, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.431437', 'step': 313, 'epoch': 1} {'type': 'loss', 'content': 0.015021202154457569, 'timestamp': '2025-09-30 22:09:31.436657', 'step': 314, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.472069', 'step': 314, 'epoch': 1} {'type': 'loss', 'content': 0.03160003945231438, 'timestamp': '2025-09-30 22:09:31.477990', 'step': 315, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.513901', 'step': 315, 'epoch': 1} {'type': 'loss', 'content': 0.006040097679942846, 'timestamp': '2025-09-30 22:09:31.540087', 'step': 316, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.576512', 'step': 316, 'epoch': 1} {'type': 'loss', 'content': 0.010645993985235691, 'timestamp': '2025-09-30 22:09:31.581099', 'step': 317, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:31.617568', 'step': 317, 'epoch': 1} {'type': 'loss', 'content': 0.028969671577215195, 'timestamp': '2025-09-30 22:09:31.621272', 'step': 318, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.656645', 'step': 318, 'epoch': 1} {'type': 'loss', 'content': 0.015102268196642399, 'timestamp': '2025-09-30 22:09:31.662613', 'step': 319, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:31.698770', 'step': 319, 'epoch': 1} {'type': 'loss', 'content': 0.020058704540133476, 'timestamp': '2025-09-30 22:09:31.723844', 'step': 320, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:31.769420', 'step': 320, 'epoch': 1} {'type': 'loss', 'content': 0.01846270076930523, 'timestamp': '2025-09-30 22:09:31.774034', 'step': 321, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.822909', 'step': 321, 'epoch': 1} {'type': 'loss', 'content': 0.035393718630075455, 'timestamp': '2025-09-30 22:09:31.828845', 'step': 322, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:31.867154', 'step': 322, 'epoch': 1} {'type': 'loss', 'content': 0.0352255143225193, 'timestamp': '2025-09-30 22:09:31.877932', 'step': 323, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:31.925281', 'step': 323, 'epoch': 1} {'type': 'loss', 'content': 0.025907455012202263, 'timestamp': '2025-09-30 22:09:31.950942', 'step': 324, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:31.995930', 'step': 324, 'epoch': 1} {'type': 'loss', 'content': 0.02590997889637947, 'timestamp': '2025-09-30 22:09:32.000413', 'step': 325, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:32.047117', 'step': 325, 'epoch': 1} {'type': 'loss', 'content': 0.02406112290918827, 'timestamp': '2025-09-30 22:09:32.050366', 'step': 326, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:32.086862', 'step': 326, 'epoch': 1} {'type': 'loss', 'content': 0.026598971337080002, 'timestamp': '2025-09-30 22:09:32.091614', 'step': 327, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:09:32.127983', 'step': 327, 'epoch': 1} {'type': 'loss', 'content': 0.024215033277869225, 'timestamp': '2025-09-30 22:09:32.152703', 'step': 328, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:32.205176', 'step': 328, 'epoch': 1} {'type': 'loss', 'content': 0.01977970078587532, 'timestamp': '2025-09-30 22:09:32.216556', 'step': 329, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:32.281426', 'step': 329, 'epoch': 1} {'type': 'loss', 'content': 0.02125716581940651, 'timestamp': '2025-09-30 22:09:32.289404', 'step': 330, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:32.333052', 'step': 330, 'epoch': 1} {'type': 'loss', 'content': 0.020354611799120903, 'timestamp': '2025-09-30 22:09:32.337434', 'step': 331, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:32.378729', 'step': 331, 'epoch': 1} {'type': 'loss', 'content': 0.033674128353595734, 'timestamp': '2025-09-30 22:09:32.410082', 'step': 332, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:32.450701', 'step': 332, 'epoch': 1} {'type': 'loss', 'content': 0.030016472563147545, 'timestamp': '2025-09-30 22:09:32.462638', 'step': 333, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:32.510816', 'step': 333, 'epoch': 1} {'type': 'loss', 'content': 0.025528879836201668, 'timestamp': '2025-09-30 22:09:32.514079', 'step': 334, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:32.562892', 'step': 334, 'epoch': 1} {'type': 'loss', 'content': 0.022933050990104675, 'timestamp': '2025-09-30 22:09:32.574319', 'step': 335, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:09:32.610613', 'step': 335, 'epoch': 1} {'type': 'loss', 'content': 0.02693336084485054, 'timestamp': '2025-09-30 22:09:32.636625', 'step': 336, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:32.677899', 'step': 336, 'epoch': 1} {'type': 'loss', 'content': 0.0301218144595623, 'timestamp': '2025-09-30 22:09:32.681008', 'step': 337, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:32.722275', 'step': 337, 'epoch': 1} {'type': 'loss', 'content': 0.02498083934187889, 'timestamp': '2025-09-30 22:09:32.729951', 'step': 338, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:32.771925', 'step': 338, 'epoch': 1} {'type': 'loss', 'content': 0.020787309855222702, 'timestamp': '2025-09-30 22:09:32.775976', 'step': 339, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:32.812922', 'step': 339, 'epoch': 1} {'type': 'loss', 'content': 0.020002322271466255, 'timestamp': '2025-09-30 22:09:32.839529', 'step': 340, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:32.903630', 'step': 340, 'epoch': 1} {'type': 'loss', 'content': 0.023302387446165085, 'timestamp': '2025-09-30 22:09:32.907785', 'step': 341, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:32.945673', 'step': 341, 'epoch': 1} {'type': 'loss', 'content': 0.030038101598620415, 'timestamp': '2025-09-30 22:09:32.953338', 'step': 342, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:09:34.031111', 'step': 342, 'epoch': 1} {'type': 'pplx', 'content': 69919293.60206336, 'timestamp': '2025-09-30 22:09:34.042131', 'step': 342, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:34.080251', 'step': 342, 'epoch': 1} {'type': 'loss', 'content': 0.02141713909804821, 'timestamp': '2025-09-30 22:09:34.083537', 'step': 343, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:34.128037', 'step': 343, 'epoch': 1} {'type': 'loss', 'content': 0.04290292412042618, 'timestamp': '2025-09-30 22:09:34.153547', 'step': 344, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:34.187199', 'step': 344, 'epoch': 1} {'type': 'loss', 'content': 0.023067476227879524, 'timestamp': '2025-09-30 22:09:34.191669', 'step': 345, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:34.226177', 'step': 345, 'epoch': 1} {'type': 'loss', 'content': 0.030794357880949974, 'timestamp': '2025-09-30 22:09:34.229473', 'step': 346, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:34.279983', 'step': 346, 'epoch': 1} {'type': 'loss', 'content': 0.021632444113492966, 'timestamp': '2025-09-30 22:09:34.284325', 'step': 347, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:34.334928', 'step': 347, 'epoch': 1} {'type': 'loss', 'content': 0.024647142738103867, 'timestamp': '2025-09-30 22:09:34.361104', 'step': 348, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:34.398676', 'step': 348, 'epoch': 1} {'type': 'loss', 'content': 0.01926472969353199, 'timestamp': '2025-09-30 22:09:34.401633', 'step': 349, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:34.434687', 'step': 349, 'epoch': 1} {'type': 'loss', 'content': 0.029012974351644516, 'timestamp': '2025-09-30 22:09:34.438084', 'step': 350, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:34.475765', 'step': 350, 'epoch': 1} {'type': 'loss', 'content': 0.0349954217672348, 'timestamp': '2025-09-30 22:09:34.478355', 'step': 351, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:34.513407', 'step': 351, 'epoch': 1} {'type': 'loss', 'content': 0.024386536329984665, 'timestamp': '2025-09-30 22:09:34.543507', 'step': 352, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:34.583125', 'step': 352, 'epoch': 1} {'type': 'loss', 'content': 0.037951208651065826, 'timestamp': '2025-09-30 22:09:34.588211', 'step': 353, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:34.637639', 'step': 353, 'epoch': 1} {'type': 'loss', 'content': 0.028125781565904617, 'timestamp': '2025-09-30 22:09:34.641137', 'step': 354, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:34.683930', 'step': 354, 'epoch': 1} {'type': 'loss', 'content': 0.029399218037724495, 'timestamp': '2025-09-30 22:09:34.687425', 'step': 355, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:34.728408', 'step': 355, 'epoch': 1} {'type': 'loss', 'content': 0.012775695882737637, 'timestamp': '2025-09-30 22:09:34.753795', 'step': 356, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:34.804617', 'step': 356, 'epoch': 1} {'type': 'loss', 'content': 0.017647113651037216, 'timestamp': '2025-09-30 22:09:34.808689', 'step': 357, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:34.853194', 'step': 357, 'epoch': 1} {'type': 'loss', 'content': 0.01233199518173933, 'timestamp': '2025-09-30 22:09:34.864538', 'step': 358, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:34.907648', 'step': 358, 'epoch': 1} {'type': 'loss', 'content': 0.009669276885688305, 'timestamp': '2025-09-30 22:09:34.911690', 'step': 359, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:34.946291', 'step': 359, 'epoch': 1} {'type': 'loss', 'content': 0.019029730930924416, 'timestamp': '2025-09-30 22:09:34.970626', 'step': 360, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:35.006672', 'step': 360, 'epoch': 1} {'type': 'loss', 'content': 0.02731066755950451, 'timestamp': '2025-09-30 22:09:35.009767', 'step': 361, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.050270', 'step': 361, 'epoch': 1} {'type': 'loss', 'content': 0.032158952206373215, 'timestamp': '2025-09-30 22:09:35.054516', 'step': 362, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:35.102949', 'step': 362, 'epoch': 1} {'type': 'loss', 'content': 0.022303972393274307, 'timestamp': '2025-09-30 22:09:35.108215', 'step': 363, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.148425', 'step': 363, 'epoch': 1} {'type': 'loss', 'content': 0.010140285827219486, 'timestamp': '2025-09-30 22:09:35.173555', 'step': 364, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.212952', 'step': 364, 'epoch': 1} {'type': 'loss', 'content': 0.02385084703564644, 'timestamp': '2025-09-30 22:09:35.225443', 'step': 365, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:35.271733', 'step': 365, 'epoch': 1} {'type': 'loss', 'content': 0.013454384170472622, 'timestamp': '2025-09-30 22:09:35.277010', 'step': 366, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:09:35.312917', 'step': 366, 'epoch': 1} {'type': 'loss', 'content': 0.017787154763936996, 'timestamp': '2025-09-30 22:09:35.323639', 'step': 367, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.360561', 'step': 367, 'epoch': 1} {'type': 'loss', 'content': 0.02700085937976837, 'timestamp': '2025-09-30 22:09:35.385136', 'step': 368, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.421380', 'step': 368, 'epoch': 1} {'type': 'loss', 'content': 0.02195524238049984, 'timestamp': '2025-09-30 22:09:35.430490', 'step': 369, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:35.468443', 'step': 369, 'epoch': 1} {'type': 'loss', 'content': 0.0264048483222723, 'timestamp': '2025-09-30 22:09:35.472904', 'step': 370, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.511783', 'step': 370, 'epoch': 1} {'type': 'loss', 'content': 0.012875868938863277, 'timestamp': '2025-09-30 22:09:35.516881', 'step': 371, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.556326', 'step': 371, 'epoch': 1} {'type': 'loss', 'content': 0.03059910610318184, 'timestamp': '2025-09-30 22:09:35.581083', 'step': 372, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:35.623872', 'step': 372, 'epoch': 1} {'type': 'loss', 'content': 0.026967348530888557, 'timestamp': '2025-09-30 22:09:35.627985', 'step': 373, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.664828', 'step': 373, 'epoch': 1} {'type': 'loss', 'content': 0.025812586769461632, 'timestamp': '2025-09-30 22:09:35.677022', 'step': 374, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.715645', 'step': 374, 'epoch': 1} {'type': 'loss', 'content': 0.017953310161828995, 'timestamp': '2025-09-30 22:09:35.718931', 'step': 375, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.779443', 'step': 375, 'epoch': 1} {'type': 'loss', 'content': 0.01472870446741581, 'timestamp': '2025-09-30 22:09:35.804726', 'step': 376, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.839732', 'step': 376, 'epoch': 1} {'type': 'loss', 'content': 0.015016913414001465, 'timestamp': '2025-09-30 22:09:35.844058', 'step': 377, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.888207', 'step': 377, 'epoch': 1} {'type': 'loss', 'content': 0.02110576257109642, 'timestamp': '2025-09-30 22:09:35.892136', 'step': 378, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:35.943435', 'step': 378, 'epoch': 1} {'type': 'loss', 'content': 0.020046228542923927, 'timestamp': '2025-09-30 22:09:35.948441', 'step': 379, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:35.991476', 'step': 379, 'epoch': 1} {'type': 'loss', 'content': 0.020357858389616013, 'timestamp': '2025-09-30 22:09:36.017437', 'step': 380, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.052483', 'step': 380, 'epoch': 1} {'type': 'loss', 'content': 0.02067236602306366, 'timestamp': '2025-09-30 22:09:36.056689', 'step': 381, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:36.096194', 'step': 381, 'epoch': 1} {'type': 'loss', 'content': 0.021537670865654945, 'timestamp': '2025-09-30 22:09:36.100337', 'step': 382, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.141962', 'step': 382, 'epoch': 1} {'type': 'loss', 'content': 0.017676519230008125, 'timestamp': '2025-09-30 22:09:36.145376', 'step': 383, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:36.179449', 'step': 383, 'epoch': 1} {'type': 'loss', 'content': 0.02317376248538494, 'timestamp': '2025-09-30 22:09:36.204712', 'step': 384, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:36.245389', 'step': 384, 'epoch': 1} {'type': 'loss', 'content': 0.016081545501947403, 'timestamp': '2025-09-30 22:09:36.261000', 'step': 385, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.296757', 'step': 385, 'epoch': 1} {'type': 'loss', 'content': 0.01612349972128868, 'timestamp': '2025-09-30 22:09:36.309147', 'step': 386, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.352038', 'step': 386, 'epoch': 1} {'type': 'loss', 'content': 0.01686694473028183, 'timestamp': '2025-09-30 22:09:36.359009', 'step': 387, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.398587', 'step': 387, 'epoch': 1} {'type': 'loss', 'content': 0.023269539698958397, 'timestamp': '2025-09-30 22:09:36.423721', 'step': 388, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:36.462294', 'step': 388, 'epoch': 1} {'type': 'loss', 'content': 0.02148287370800972, 'timestamp': '2025-09-30 22:09:36.466876', 'step': 389, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.507808', 'step': 389, 'epoch': 1} {'type': 'loss', 'content': 0.02776309847831726, 'timestamp': '2025-09-30 22:09:36.511686', 'step': 390, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.548916', 'step': 390, 'epoch': 1} {'type': 'loss', 'content': 0.029952269047498703, 'timestamp': '2025-09-30 22:09:36.553771', 'step': 391, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.588283', 'step': 391, 'epoch': 1} {'type': 'loss', 'content': 0.015080233104526997, 'timestamp': '2025-09-30 22:09:36.613242', 'step': 392, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.647960', 'step': 392, 'epoch': 1} {'type': 'loss', 'content': 0.01509413868188858, 'timestamp': '2025-09-30 22:09:36.651767', 'step': 393, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:09:36.685129', 'step': 393, 'epoch': 1} {'type': 'loss', 'content': 0.02030949667096138, 'timestamp': '2025-09-30 22:09:36.688359', 'step': 394, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.737919', 'step': 394, 'epoch': 1} {'type': 'loss', 'content': 0.010384783148765564, 'timestamp': '2025-09-30 22:09:36.745003', 'step': 395, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.782177', 'step': 395, 'epoch': 1} {'type': 'loss', 'content': 0.01926104538142681, 'timestamp': '2025-09-30 22:09:36.806865', 'step': 396, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.846794', 'step': 396, 'epoch': 1} {'type': 'loss', 'content': 0.007770257536321878, 'timestamp': '2025-09-30 22:09:36.852637', 'step': 397, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:36.903386', 'step': 397, 'epoch': 1} {'type': 'loss', 'content': 0.012745345011353493, 'timestamp': '2025-09-30 22:09:36.907521', 'step': 398, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:36.945406', 'step': 398, 'epoch': 1} {'type': 'loss', 'content': 0.020830152556300163, 'timestamp': '2025-09-30 22:09:36.952080', 'step': 399, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:09:37.985435', 'step': 399, 'epoch': 1} {'type': 'pplx', 'content': 69766047.56409389, 'timestamp': '2025-09-30 22:09:37.989273', 'step': 399, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.023315', 'step': 399, 'epoch': 1} {'type': 'loss', 'content': 0.015908202156424522, 'timestamp': '2025-09-30 22:09:38.049697', 'step': 400, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.094255', 'step': 400, 'epoch': 1} {'type': 'loss', 'content': 0.00617224583402276, 'timestamp': '2025-09-30 22:09:38.097537', 'step': 401, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:38.134714', 'step': 401, 'epoch': 1} {'type': 'loss', 'content': 0.02191239409148693, 'timestamp': '2025-09-30 22:09:38.138873', 'step': 402, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:38.174380', 'step': 402, 'epoch': 1} {'type': 'loss', 'content': 0.020212730392813683, 'timestamp': '2025-09-30 22:09:38.180307', 'step': 403, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.218317', 'step': 403, 'epoch': 1} {'type': 'loss', 'content': 0.035332221537828445, 'timestamp': '2025-09-30 22:09:38.250151', 'step': 404, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:38.291319', 'step': 404, 'epoch': 1} {'type': 'loss', 'content': 0.028063569217920303, 'timestamp': '2025-09-30 22:09:38.303187', 'step': 405, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:38.339867', 'step': 405, 'epoch': 1} {'type': 'loss', 'content': 0.037404220551252365, 'timestamp': '2025-09-30 22:09:38.344974', 'step': 406, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.384853', 'step': 406, 'epoch': 1} {'type': 'loss', 'content': 0.009203077293932438, 'timestamp': '2025-09-30 22:09:38.390086', 'step': 407, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:38.429683', 'step': 407, 'epoch': 1} {'type': 'loss', 'content': 0.006914178375154734, 'timestamp': '2025-09-30 22:09:38.455627', 'step': 408, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.492632', 'step': 408, 'epoch': 1} {'type': 'loss', 'content': 0.01954054646193981, 'timestamp': '2025-09-30 22:09:38.495612', 'step': 409, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.529636', 'step': 409, 'epoch': 1} {'type': 'loss', 'content': 0.021575123071670532, 'timestamp': '2025-09-30 22:09:38.532507', 'step': 410, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:38.574228', 'step': 410, 'epoch': 1} {'type': 'loss', 'content': 0.01723899319767952, 'timestamp': '2025-09-30 22:09:38.577214', 'step': 411, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:38.620060', 'step': 411, 'epoch': 1} {'type': 'loss', 'content': 0.038678593933582306, 'timestamp': '2025-09-30 22:09:38.650713', 'step': 412, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.683786', 'step': 412, 'epoch': 1} {'type': 'loss', 'content': 0.013338501565158367, 'timestamp': '2025-09-30 22:09:38.687134', 'step': 413, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.724089', 'step': 413, 'epoch': 1} {'type': 'loss', 'content': 0.03637400269508362, 'timestamp': '2025-09-30 22:09:38.736940', 'step': 414, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.771853', 'step': 414, 'epoch': 1} {'type': 'loss', 'content': 0.04411586374044418, 'timestamp': '2025-09-30 22:09:38.786289', 'step': 415, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.819623', 'step': 415, 'epoch': 1} {'type': 'loss', 'content': 0.00751527538523078, 'timestamp': '2025-09-30 22:09:38.844258', 'step': 416, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.882058', 'step': 416, 'epoch': 1} {'type': 'loss', 'content': 0.016765791922807693, 'timestamp': '2025-09-30 22:09:38.894642', 'step': 417, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.932004', 'step': 417, 'epoch': 1} {'type': 'loss', 'content': 0.028791921213269234, 'timestamp': '2025-09-30 22:09:38.936323', 'step': 418, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:38.976353', 'step': 418, 'epoch': 1} {'type': 'loss', 'content': 0.022568225860595703, 'timestamp': '2025-09-30 22:09:38.979880', 'step': 419, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:39.013998', 'step': 419, 'epoch': 1} {'type': 'loss', 'content': 0.014296877197921276, 'timestamp': '2025-09-30 22:09:39.044562', 'step': 420, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:39.081846', 'step': 420, 'epoch': 1} {'type': 'loss', 'content': 0.012356564402580261, 'timestamp': '2025-09-30 22:09:39.086430', 'step': 421, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:39.130132', 'step': 421, 'epoch': 1} {'type': 'loss', 'content': 0.010573080740869045, 'timestamp': '2025-09-30 22:09:39.135029', 'step': 422, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:39.176056', 'step': 422, 'epoch': 1} {'type': 'loss', 'content': 0.013825618661940098, 'timestamp': '2025-09-30 22:09:39.179347', 'step': 423, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:39.212189', 'step': 423, 'epoch': 1} {'type': 'loss', 'content': 0.015153718180954456, 'timestamp': '2025-09-30 22:09:39.236550', 'step': 424, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:39.270159', 'step': 424, 'epoch': 1} {'type': 'loss', 'content': 0.019143089652061462, 'timestamp': '2025-09-30 22:09:39.274183', 'step': 425, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:39.306859', 'step': 425, 'epoch': 1} {'type': 'loss', 'content': 0.025414396077394485, 'timestamp': '2025-09-30 22:09:39.309563', 'step': 426, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:39.344109', 'step': 426, 'epoch': 1} {'type': 'loss', 'content': 0.018626336008310318, 'timestamp': '2025-09-30 22:09:39.346665', 'step': 427, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:39.386968', 'step': 427, 'epoch': 1} {'type': 'loss', 'content': 0.013591468334197998, 'timestamp': '2025-09-30 22:09:39.411056', 'step': 428, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:39.443796', 'step': 428, 'epoch': 1} {'type': 'loss', 'content': 0.02437545731663704, 'timestamp': '2025-09-30 22:09:39.450852', 'step': 429, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:39.485737', 'step': 429, 'epoch': 1} {'type': 'loss', 'content': 0.020426731556653976, 'timestamp': '2025-09-30 22:09:39.489024', 'step': 430, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:39.528782', 'step': 430, 'epoch': 1} {'type': 'loss', 'content': 0.01356650609523058, 'timestamp': '2025-09-30 22:09:39.536063', 'step': 431, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:39.580730', 'step': 431, 'epoch': 1} {'type': 'loss', 'content': 0.019357116892933846, 'timestamp': '2025-09-30 22:09:39.606081', 'step': 432, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:39.647934', 'step': 432, 'epoch': 1} {'type': 'loss', 'content': 0.014607379212975502, 'timestamp': '2025-09-30 22:09:39.657392', 'step': 433, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:09:39.705398', 'step': 433, 'epoch': 1} {'type': 'loss', 'content': 0.02458411268889904, 'timestamp': '2025-09-30 22:09:39.716036', 'step': 434, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:39.753100', 'step': 434, 'epoch': 1} {'type': 'loss', 'content': 0.01863814704120159, 'timestamp': '2025-09-30 22:09:39.761228', 'step': 435, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:39.794958', 'step': 435, 'epoch': 1} {'type': 'loss', 'content': 0.015778204426169395, 'timestamp': '2025-09-30 22:09:39.820506', 'step': 436, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:39.858954', 'step': 436, 'epoch': 1} {'type': 'loss', 'content': 0.026689749211072922, 'timestamp': '2025-09-30 22:09:39.861771', 'step': 437, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:39.903709', 'step': 437, 'epoch': 1} {'type': 'loss', 'content': 0.028620868921279907, 'timestamp': '2025-09-30 22:09:39.912094', 'step': 438, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:39.950297', 'step': 438, 'epoch': 1} {'type': 'loss', 'content': 0.038862477988004684, 'timestamp': '2025-09-30 22:09:39.953880', 'step': 439, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:39.991357', 'step': 439, 'epoch': 1} {'type': 'loss', 'content': 0.01892365887761116, 'timestamp': '2025-09-30 22:09:40.016499', 'step': 440, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:40.058678', 'step': 440, 'epoch': 1} {'type': 'loss', 'content': 0.02564559504389763, 'timestamp': '2025-09-30 22:09:40.068642', 'step': 441, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:40.109996', 'step': 441, 'epoch': 1} {'type': 'loss', 'content': 0.02982628531754017, 'timestamp': '2025-09-30 22:09:40.114870', 'step': 442, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:40.155375', 'step': 442, 'epoch': 1} {'type': 'loss', 'content': 0.012708432041108608, 'timestamp': '2025-09-30 22:09:40.167290', 'step': 443, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:40.227142', 'step': 443, 'epoch': 1} {'type': 'loss', 'content': 0.03156870976090431, 'timestamp': '2025-09-30 22:09:40.262332', 'step': 444, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:40.307457', 'step': 444, 'epoch': 1} {'type': 'loss', 'content': 0.012731562368571758, 'timestamp': '2025-09-30 22:09:40.320228', 'step': 445, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:40.373129', 'step': 445, 'epoch': 1} {'type': 'loss', 'content': 0.019928231835365295, 'timestamp': '2025-09-30 22:09:40.377514', 'step': 446, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:40.424312', 'step': 446, 'epoch': 1} {'type': 'loss', 'content': 0.010161194019019604, 'timestamp': '2025-09-30 22:09:40.438210', 'step': 447, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:40.484577', 'step': 447, 'epoch': 1} {'type': 'loss', 'content': 0.017352495342493057, 'timestamp': '2025-09-30 22:09:40.521010', 'step': 448, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:40.564598', 'step': 448, 'epoch': 1} {'type': 'loss', 'content': 0.009393865242600441, 'timestamp': '2025-09-30 22:09:40.569244', 'step': 449, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:40.612354', 'step': 449, 'epoch': 1} {'type': 'loss', 'content': 0.02728438377380371, 'timestamp': '2025-09-30 22:09:40.615217', 'step': 450, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:40.655775', 'step': 450, 'epoch': 1} {'type': 'loss', 'content': 0.020611116662621498, 'timestamp': '2025-09-30 22:09:40.659271', 'step': 451, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:40.693792', 'step': 451, 'epoch': 1} {'type': 'loss', 'content': 0.009651792235672474, 'timestamp': '2025-09-30 22:09:40.723988', 'step': 452, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:40.776475', 'step': 452, 'epoch': 1} {'type': 'loss', 'content': 0.016400078311562538, 'timestamp': '2025-09-30 22:09:40.788329', 'step': 453, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:09:40.840089', 'step': 453, 'epoch': 1} {'type': 'loss', 'content': 0.018265044316649437, 'timestamp': '2025-09-30 22:09:40.843522', 'step': 454, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:40.885816', 'step': 454, 'epoch': 1} {'type': 'loss', 'content': 0.02419288642704487, 'timestamp': '2025-09-30 22:09:40.889386', 'step': 455, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:40.927807', 'step': 455, 'epoch': 1} {'type': 'loss', 'content': 0.013079372234642506, 'timestamp': '2025-09-30 22:09:40.960208', 'step': 456, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:09:41.993404', 'step': 456, 'epoch': 1} {'type': 'pplx', 'content': 71624224.58457558, 'timestamp': '2025-09-30 22:09:41.996470', 'step': 456, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.026109', 'step': 456, 'epoch': 1} {'type': 'loss', 'content': 0.022408118471503258, 'timestamp': '2025-09-30 22:09:42.028771', 'step': 457, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:42.065959', 'step': 457, 'epoch': 1} {'type': 'loss', 'content': 0.03150170296430588, 'timestamp': '2025-09-30 22:09:42.068419', 'step': 458, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.103334', 'step': 458, 'epoch': 1} {'type': 'loss', 'content': 0.010739601217210293, 'timestamp': '2025-09-30 22:09:42.108529', 'step': 459, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:42.140776', 'step': 459, 'epoch': 1} {'type': 'loss', 'content': 0.03207072988152504, 'timestamp': '2025-09-30 22:09:42.165701', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.197356', 'step': 460, 'epoch': 1} {'type': 'loss', 'content': 0.017702434211969376, 'timestamp': '2025-09-30 22:09:42.200545', 'step': 461, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.233174', 'step': 461, 'epoch': 1} {'type': 'loss', 'content': 0.009795568883419037, 'timestamp': '2025-09-30 22:09:42.237262', 'step': 462, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.276405', 'step': 462, 'epoch': 1} {'type': 'loss', 'content': 0.008488679304718971, 'timestamp': '2025-09-30 22:09:42.285896', 'step': 463, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.318785', 'step': 463, 'epoch': 1} {'type': 'loss', 'content': 0.021367086097598076, 'timestamp': '2025-09-30 22:09:42.343436', 'step': 464, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.376115', 'step': 464, 'epoch': 1} {'type': 'loss', 'content': 0.01849384978413582, 'timestamp': '2025-09-30 22:09:42.379287', 'step': 465, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.419389', 'step': 465, 'epoch': 1} {'type': 'loss', 'content': 0.015463622286915779, 'timestamp': '2025-09-30 22:09:42.425901', 'step': 466, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.463041', 'step': 466, 'epoch': 1} {'type': 'loss', 'content': 0.018109837546944618, 'timestamp': '2025-09-30 22:09:42.466051', 'step': 467, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.498020', 'step': 467, 'epoch': 1} {'type': 'loss', 'content': 0.027336323633790016, 'timestamp': '2025-09-30 22:09:42.524118', 'step': 468, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.556592', 'step': 468, 'epoch': 1} {'type': 'loss', 'content': 0.023516599088907242, 'timestamp': '2025-09-30 22:09:42.559441', 'step': 469, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.602347', 'step': 469, 'epoch': 1} {'type': 'loss', 'content': 0.03789094090461731, 'timestamp': '2025-09-30 22:09:42.606093', 'step': 470, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.644859', 'step': 470, 'epoch': 1} {'type': 'loss', 'content': 0.006722516845911741, 'timestamp': '2025-09-30 22:09:42.647902', 'step': 471, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.681530', 'step': 471, 'epoch': 1} {'type': 'loss', 'content': 0.018469030037522316, 'timestamp': '2025-09-30 22:09:42.707026', 'step': 472, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:42.746588', 'step': 472, 'epoch': 1} {'type': 'loss', 'content': 0.019120389595627785, 'timestamp': '2025-09-30 22:09:42.749953', 'step': 473, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:42.784143', 'step': 473, 'epoch': 1} {'type': 'loss', 'content': 0.013059214688837528, 'timestamp': '2025-09-30 22:09:42.788133', 'step': 474, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:42.822558', 'step': 474, 'epoch': 1} {'type': 'loss', 'content': 0.011780438013374805, 'timestamp': '2025-09-30 22:09:42.825250', 'step': 475, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.869895', 'step': 475, 'epoch': 1} {'type': 'loss', 'content': 0.01069071888923645, 'timestamp': '2025-09-30 22:09:42.894335', 'step': 476, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.935117', 'step': 476, 'epoch': 1} {'type': 'loss', 'content': 0.012630014680325985, 'timestamp': '2025-09-30 22:09:42.941017', 'step': 477, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:42.973933', 'step': 477, 'epoch': 1} {'type': 'loss', 'content': 0.014345736242830753, 'timestamp': '2025-09-30 22:09:42.977658', 'step': 478, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.016170', 'step': 478, 'epoch': 1} {'type': 'loss', 'content': 0.013579734601080418, 'timestamp': '2025-09-30 22:09:43.022549', 'step': 479, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.056113', 'step': 479, 'epoch': 1} {'type': 'loss', 'content': 0.005495802033692598, 'timestamp': '2025-09-30 22:09:43.080369', 'step': 480, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.112060', 'step': 480, 'epoch': 1} {'type': 'loss', 'content': 0.024428632110357285, 'timestamp': '2025-09-30 22:09:43.115005', 'step': 481, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.157534', 'step': 481, 'epoch': 1} {'type': 'loss', 'content': 0.027178974822163582, 'timestamp': '2025-09-30 22:09:43.160113', 'step': 482, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:43.193019', 'step': 482, 'epoch': 1} {'type': 'loss', 'content': 0.011072909459471703, 'timestamp': '2025-09-30 22:09:43.199496', 'step': 483, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.230455', 'step': 483, 'epoch': 1} {'type': 'loss', 'content': 0.015952181071043015, 'timestamp': '2025-09-30 22:09:43.255671', 'step': 484, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:43.299126', 'step': 484, 'epoch': 1} {'type': 'loss', 'content': 0.01662355288863182, 'timestamp': '2025-09-30 22:09:43.301958', 'step': 485, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.341828', 'step': 485, 'epoch': 1} {'type': 'loss', 'content': 0.013828770257532597, 'timestamp': '2025-09-30 22:09:43.344889', 'step': 486, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:43.380170', 'step': 486, 'epoch': 1} {'type': 'loss', 'content': 0.024804027751088142, 'timestamp': '2025-09-30 22:09:43.383012', 'step': 487, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.417512', 'step': 487, 'epoch': 1} {'type': 'loss', 'content': 0.024769103154540062, 'timestamp': '2025-09-30 22:09:43.442612', 'step': 488, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.476936', 'step': 488, 'epoch': 1} {'type': 'loss', 'content': 0.014705442823469639, 'timestamp': '2025-09-30 22:09:43.485386', 'step': 489, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:43.520192', 'step': 489, 'epoch': 1} {'type': 'loss', 'content': 0.012051516212522984, 'timestamp': '2025-09-30 22:09:43.524132', 'step': 490, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.559375', 'step': 490, 'epoch': 1} {'type': 'loss', 'content': 0.022848201915621758, 'timestamp': '2025-09-30 22:09:43.562399', 'step': 491, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:43.596133', 'step': 491, 'epoch': 1} {'type': 'loss', 'content': 0.008396407589316368, 'timestamp': '2025-09-30 22:09:43.620436', 'step': 492, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.654644', 'step': 492, 'epoch': 1} {'type': 'loss', 'content': 0.006951562594622374, 'timestamp': '2025-09-30 22:09:43.658010', 'step': 493, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.697030', 'step': 493, 'epoch': 1} {'type': 'loss', 'content': 0.03694300726056099, 'timestamp': '2025-09-30 22:09:43.700477', 'step': 494, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.742657', 'step': 494, 'epoch': 1} {'type': 'loss', 'content': 0.011894827708601952, 'timestamp': '2025-09-30 22:09:43.750525', 'step': 495, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.788152', 'step': 495, 'epoch': 1} {'type': 'loss', 'content': 0.018705202266573906, 'timestamp': '2025-09-30 22:09:43.822061', 'step': 496, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:43.862987', 'step': 496, 'epoch': 1} {'type': 'loss', 'content': 0.008354590274393559, 'timestamp': '2025-09-30 22:09:43.867623', 'step': 497, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:43.909660', 'step': 497, 'epoch': 1} {'type': 'loss', 'content': 0.012969491071999073, 'timestamp': '2025-09-30 22:09:43.912524', 'step': 498, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.947799', 'step': 498, 'epoch': 1} {'type': 'loss', 'content': 0.021853569895029068, 'timestamp': '2025-09-30 22:09:43.950860', 'step': 499, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:43.987077', 'step': 499, 'epoch': 1} {'type': 'loss', 'content': 0.0062960060313344, 'timestamp': '2025-09-30 22:09:44.011190', 'step': 500, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 500', 'timestamp': '2025-09-30 22:09:50.738730', 'step': 500, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:50.780347', 'step': 500, 'epoch': 1} {'type': 'loss', 'content': 0.005837480071932077, 'timestamp': '2025-09-30 22:09:50.784736', 'step': 501, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:50.818683', 'step': 501, 'epoch': 1} {'type': 'loss', 'content': 0.018935924395918846, 'timestamp': '2025-09-30 22:09:50.822618', 'step': 502, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:50.858601', 'step': 502, 'epoch': 1} {'type': 'loss', 'content': 0.014775346033275127, 'timestamp': '2025-09-30 22:09:50.862412', 'step': 503, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:50.897092', 'step': 503, 'epoch': 1} {'type': 'loss', 'content': 0.015353151597082615, 'timestamp': '2025-09-30 22:09:50.921935', 'step': 504, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:50.970503', 'step': 504, 'epoch': 1} {'type': 'loss', 'content': 0.0022692049387842417, 'timestamp': '2025-09-30 22:09:50.973217', 'step': 505, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:51.006814', 'step': 505, 'epoch': 1} {'type': 'loss', 'content': 0.060447730123996735, 'timestamp': '2025-09-30 22:09:51.010466', 'step': 506, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:51.045170', 'step': 506, 'epoch': 1} {'type': 'loss', 'content': 0.030348746106028557, 'timestamp': '2025-09-30 22:09:51.048114', 'step': 507, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:51.092241', 'step': 507, 'epoch': 1} {'type': 'loss', 'content': 0.037154439836740494, 'timestamp': '2025-09-30 22:09:51.118561', 'step': 508, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:51.161099', 'step': 508, 'epoch': 1} {'type': 'loss', 'content': 0.0052850088104605675, 'timestamp': '2025-09-30 22:09:51.177958', 'step': 509, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:51.231146', 'step': 509, 'epoch': 1} {'type': 'loss', 'content': 0.00752244982868433, 'timestamp': '2025-09-30 22:09:51.235644', 'step': 510, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:51.297841', 'step': 510, 'epoch': 1} {'type': 'loss', 'content': 0.036073219031095505, 'timestamp': '2025-09-30 22:09:51.302005', 'step': 511, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:51.346641', 'step': 511, 'epoch': 1} {'type': 'loss', 'content': 0.003323480486869812, 'timestamp': '2025-09-30 22:09:51.371866', 'step': 512, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:51.415960', 'step': 512, 'epoch': 1} {'type': 'loss', 'content': 0.0270464438945055, 'timestamp': '2025-09-30 22:09:51.430159', 'step': 513, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:09:52.498120', 'step': 513, 'epoch': 1} {'type': 'pplx', 'content': 69181075.1968092, 'timestamp': '2025-09-30 22:09:52.501210', 'step': 513, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:52.531996', 'step': 513, 'epoch': 1} {'type': 'loss', 'content': 0.003395517822355032, 'timestamp': '2025-09-30 22:09:52.542774', 'step': 514, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:52.578982', 'step': 514, 'epoch': 1} {'type': 'loss', 'content': 0.025082603096961975, 'timestamp': '2025-09-30 22:09:52.583259', 'step': 515, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:52.620589', 'step': 515, 'epoch': 1} {'type': 'loss', 'content': 0.018759986385703087, 'timestamp': '2025-09-30 22:09:52.653747', 'step': 516, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:52.693029', 'step': 516, 'epoch': 1} {'type': 'loss', 'content': 0.032633136957883835, 'timestamp': '2025-09-30 22:09:52.696843', 'step': 517, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:52.732376', 'step': 517, 'epoch': 1} {'type': 'loss', 'content': 0.02764914371073246, 'timestamp': '2025-09-30 22:09:52.736492', 'step': 518, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:52.771273', 'step': 518, 'epoch': 1} {'type': 'loss', 'content': 0.01991136558353901, 'timestamp': '2025-09-30 22:09:52.776288', 'step': 519, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:52.810695', 'step': 519, 'epoch': 1} {'type': 'loss', 'content': 0.00627879286184907, 'timestamp': '2025-09-30 22:09:52.834895', 'step': 520, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:52.876942', 'step': 520, 'epoch': 1} {'type': 'loss', 'content': 0.022504499182105064, 'timestamp': '2025-09-30 22:09:52.879858', 'step': 521, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:52.931814', 'step': 521, 'epoch': 1} {'type': 'loss', 'content': 0.020745843648910522, 'timestamp': '2025-09-30 22:09:52.935751', 'step': 522, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:52.977983', 'step': 522, 'epoch': 1} {'type': 'loss', 'content': 0.010214862413704395, 'timestamp': '2025-09-30 22:09:52.981858', 'step': 523, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:53.018030', 'step': 523, 'epoch': 1} {'type': 'loss', 'content': 0.00609058141708374, 'timestamp': '2025-09-30 22:09:53.043069', 'step': 524, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:53.082381', 'step': 524, 'epoch': 1} {'type': 'loss', 'content': 0.021598542109131813, 'timestamp': '2025-09-30 22:09:53.085908', 'step': 525, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:53.133503', 'step': 525, 'epoch': 1} {'type': 'loss', 'content': 0.03276056423783302, 'timestamp': '2025-09-30 22:09:53.136848', 'step': 526, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:53.178781', 'step': 526, 'epoch': 1} {'type': 'loss', 'content': 0.015305069275200367, 'timestamp': '2025-09-30 22:09:53.182574', 'step': 527, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:53.214935', 'step': 527, 'epoch': 1} {'type': 'loss', 'content': 0.013995721936225891, 'timestamp': '2025-09-30 22:09:53.239870', 'step': 528, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:53.273616', 'step': 528, 'epoch': 1} {'type': 'loss', 'content': 0.008622610941529274, 'timestamp': '2025-09-30 22:09:53.277669', 'step': 529, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:53.318583', 'step': 529, 'epoch': 1} {'type': 'loss', 'content': 0.007376163732260466, 'timestamp': '2025-09-30 22:09:53.329815', 'step': 530, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:53.370223', 'step': 530, 'epoch': 1} {'type': 'loss', 'content': 0.02018904872238636, 'timestamp': '2025-09-30 22:09:53.374649', 'step': 531, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:53.413421', 'step': 531, 'epoch': 1} {'type': 'loss', 'content': 0.015136617235839367, 'timestamp': '2025-09-30 22:09:53.437519', 'step': 532, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:53.472547', 'step': 532, 'epoch': 1} {'type': 'loss', 'content': 0.02124469354748726, 'timestamp': '2025-09-30 22:09:53.476684', 'step': 533, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:53.514396', 'step': 533, 'epoch': 1} {'type': 'loss', 'content': 0.03296453133225441, 'timestamp': '2025-09-30 22:09:53.518042', 'step': 534, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:53.560333', 'step': 534, 'epoch': 1} {'type': 'loss', 'content': 0.018277723342180252, 'timestamp': '2025-09-30 22:09:53.564731', 'step': 535, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:53.602170', 'step': 535, 'epoch': 1} {'type': 'loss', 'content': 0.006135314702987671, 'timestamp': '2025-09-30 22:09:53.634980', 'step': 536, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:53.688603', 'step': 536, 'epoch': 1} {'type': 'loss', 'content': 0.032480981200933456, 'timestamp': '2025-09-30 22:09:53.691849', 'step': 537, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:53.725993', 'step': 537, 'epoch': 1} {'type': 'loss', 'content': 0.013781941495835781, 'timestamp': '2025-09-30 22:09:53.729468', 'step': 538, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:53.763671', 'step': 538, 'epoch': 1} {'type': 'loss', 'content': 0.060867954045534134, 'timestamp': '2025-09-30 22:09:53.768282', 'step': 539, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:53.812288', 'step': 539, 'epoch': 1} {'type': 'loss', 'content': 0.019219841808080673, 'timestamp': '2025-09-30 22:09:53.837721', 'step': 540, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:53.883827', 'step': 540, 'epoch': 1} {'type': 'loss', 'content': 0.04094391316175461, 'timestamp': '2025-09-30 22:09:53.898395', 'step': 541, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:53.947781', 'step': 541, 'epoch': 1} {'type': 'loss', 'content': 0.01698930561542511, 'timestamp': '2025-09-30 22:09:53.954204', 'step': 542, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:54.018268', 'step': 542, 'epoch': 1} {'type': 'loss', 'content': 0.005156234372407198, 'timestamp': '2025-09-30 22:09:54.022377', 'step': 543, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:54.083498', 'step': 543, 'epoch': 1} {'type': 'loss', 'content': 0.025457771494984627, 'timestamp': '2025-09-30 22:09:54.119462', 'step': 544, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:54.164984', 'step': 544, 'epoch': 1} {'type': 'loss', 'content': 0.026326140388846397, 'timestamp': '2025-09-30 22:09:54.169457', 'step': 545, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:54.204557', 'step': 545, 'epoch': 1} {'type': 'loss', 'content': 0.03105790540575981, 'timestamp': '2025-09-30 22:09:54.208175', 'step': 546, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:54.249537', 'step': 546, 'epoch': 1} {'type': 'loss', 'content': 0.03859322890639305, 'timestamp': '2025-09-30 22:09:54.263448', 'step': 547, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:54.306427', 'step': 547, 'epoch': 1} {'type': 'loss', 'content': 0.02075488492846489, 'timestamp': '2025-09-30 22:09:54.331639', 'step': 548, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:54.372706', 'step': 548, 'epoch': 1} {'type': 'loss', 'content': 0.0369565524160862, 'timestamp': '2025-09-30 22:09:54.384731', 'step': 549, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:54.439782', 'step': 549, 'epoch': 1} {'type': 'loss', 'content': 0.026859017089009285, 'timestamp': '2025-09-30 22:09:54.443146', 'step': 550, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:54.477483', 'step': 550, 'epoch': 1} {'type': 'loss', 'content': 0.027645906433463097, 'timestamp': '2025-09-30 22:09:54.489100', 'step': 551, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:54.532929', 'step': 551, 'epoch': 1} {'type': 'loss', 'content': 0.035172879695892334, 'timestamp': '2025-09-30 22:09:54.557550', 'step': 552, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:54.591364', 'step': 552, 'epoch': 1} {'type': 'loss', 'content': 0.03344567492604256, 'timestamp': '2025-09-30 22:09:54.594414', 'step': 553, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:54.635780', 'step': 553, 'epoch': 1} {'type': 'loss', 'content': 0.025778179988265038, 'timestamp': '2025-09-30 22:09:54.638541', 'step': 554, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:54.672505', 'step': 554, 'epoch': 1} {'type': 'loss', 'content': 0.00893770344555378, 'timestamp': '2025-09-30 22:09:54.675913', 'step': 555, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:54.720654', 'step': 555, 'epoch': 1} {'type': 'loss', 'content': 0.02957429364323616, 'timestamp': '2025-09-30 22:09:54.744864', 'step': 556, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:54.781349', 'step': 556, 'epoch': 1} {'type': 'loss', 'content': 0.028988074511289597, 'timestamp': '2025-09-30 22:09:54.787979', 'step': 557, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:54.830351', 'step': 557, 'epoch': 1} {'type': 'loss', 'content': 0.017317553982138634, 'timestamp': '2025-09-30 22:09:54.833830', 'step': 558, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:54.866807', 'step': 558, 'epoch': 1} {'type': 'loss', 'content': 0.029713379219174385, 'timestamp': '2025-09-30 22:09:54.874652', 'step': 559, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:54.916014', 'step': 559, 'epoch': 1} {'type': 'loss', 'content': 0.010365837253630161, 'timestamp': '2025-09-30 22:09:54.940306', 'step': 560, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:54.980306', 'step': 560, 'epoch': 1} {'type': 'loss', 'content': 0.021086541935801506, 'timestamp': '2025-09-30 22:09:54.983397', 'step': 561, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:55.026773', 'step': 561, 'epoch': 1} {'type': 'loss', 'content': 0.01718318462371826, 'timestamp': '2025-09-30 22:09:55.034896', 'step': 562, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:55.069910', 'step': 562, 'epoch': 1} {'type': 'loss', 'content': 0.01298757828772068, 'timestamp': '2025-09-30 22:09:55.073087', 'step': 563, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:55.118024', 'step': 563, 'epoch': 1} {'type': 'loss', 'content': 0.0263203177601099, 'timestamp': '2025-09-30 22:09:55.149975', 'step': 564, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:55.182194', 'step': 564, 'epoch': 1} {'type': 'loss', 'content': 0.024742284789681435, 'timestamp': '2025-09-30 22:09:55.185155', 'step': 565, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:55.220844', 'step': 565, 'epoch': 1} {'type': 'loss', 'content': 0.021866092458367348, 'timestamp': '2025-09-30 22:09:55.228652', 'step': 566, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:55.270924', 'step': 566, 'epoch': 1} {'type': 'loss', 'content': 0.013656304217875004, 'timestamp': '2025-09-30 22:09:55.273579', 'step': 567, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:09:55.306996', 'step': 567, 'epoch': 1} {'type': 'loss', 'content': 0.021317383274435997, 'timestamp': '2025-09-30 22:09:55.335191', 'step': 568, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:55.375128', 'step': 568, 'epoch': 1} {'type': 'loss', 'content': 0.02845599129796028, 'timestamp': '2025-09-30 22:09:55.380309', 'step': 569, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:55.416485', 'step': 569, 'epoch': 1} {'type': 'loss', 'content': 0.014579109847545624, 'timestamp': '2025-09-30 22:09:55.419575', 'step': 570, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:09:56.359081', 'step': 570, 'epoch': 1} {'type': 'pplx', 'content': 77694531.26741207, 'timestamp': '2025-09-30 22:09:56.362304', 'step': 570, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:56.397136', 'step': 570, 'epoch': 1} {'type': 'loss', 'content': 0.014638873748481274, 'timestamp': '2025-09-30 22:09:56.405918', 'step': 571, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:56.446234', 'step': 571, 'epoch': 1} {'type': 'loss', 'content': 0.016072118654847145, 'timestamp': '2025-09-30 22:09:56.470593', 'step': 572, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:56.503615', 'step': 572, 'epoch': 1} {'type': 'loss', 'content': 0.026216475293040276, 'timestamp': '2025-09-30 22:09:56.507736', 'step': 573, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:56.541132', 'step': 573, 'epoch': 1} {'type': 'loss', 'content': 0.016313405707478523, 'timestamp': '2025-09-30 22:09:56.543511', 'step': 574, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:56.575571', 'step': 574, 'epoch': 1} {'type': 'loss', 'content': 0.017599981278181076, 'timestamp': '2025-09-30 22:09:56.577989', 'step': 575, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:56.610692', 'step': 575, 'epoch': 1} {'type': 'loss', 'content': 0.012232857756316662, 'timestamp': '2025-09-30 22:09:56.641116', 'step': 576, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:56.673337', 'step': 576, 'epoch': 1} {'type': 'loss', 'content': 0.021130342036485672, 'timestamp': '2025-09-30 22:09:56.679142', 'step': 577, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:09:56.710451', 'step': 577, 'epoch': 1} {'type': 'loss', 'content': 0.02429911494255066, 'timestamp': '2025-09-30 22:09:56.713187', 'step': 578, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:56.746106', 'step': 578, 'epoch': 1} {'type': 'loss', 'content': 0.01091067399829626, 'timestamp': '2025-09-30 22:09:56.750525', 'step': 579, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:56.781104', 'step': 579, 'epoch': 1} {'type': 'loss', 'content': 0.00966098252683878, 'timestamp': '2025-09-30 22:09:56.805207', 'step': 580, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:56.836940', 'step': 580, 'epoch': 1} {'type': 'loss', 'content': 0.015667086467146873, 'timestamp': '2025-09-30 22:09:56.841298', 'step': 581, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:56.883250', 'step': 581, 'epoch': 1} {'type': 'loss', 'content': 0.007937231101095676, 'timestamp': '2025-09-30 22:09:56.885852', 'step': 582, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:56.917925', 'step': 582, 'epoch': 1} {'type': 'loss', 'content': 0.022754153236746788, 'timestamp': '2025-09-30 22:09:56.922117', 'step': 583, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:56.962312', 'step': 583, 'epoch': 1} {'type': 'loss', 'content': 0.013802661560475826, 'timestamp': '2025-09-30 22:09:56.986630', 'step': 584, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.019297', 'step': 584, 'epoch': 1} {'type': 'loss', 'content': 0.01588551141321659, 'timestamp': '2025-09-30 22:09:57.022169', 'step': 585, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.053100', 'step': 585, 'epoch': 1} {'type': 'loss', 'content': 0.007105494383722544, 'timestamp': '2025-09-30 22:09:57.061645', 'step': 586, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.096839', 'step': 586, 'epoch': 1} {'type': 'loss', 'content': 0.00457183551043272, 'timestamp': '2025-09-30 22:09:57.099156', 'step': 587, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:57.132381', 'step': 587, 'epoch': 1} {'type': 'loss', 'content': 0.036326173692941666, 'timestamp': '2025-09-30 22:09:57.159837', 'step': 588, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.201453', 'step': 588, 'epoch': 1} {'type': 'loss', 'content': 0.031208250671625137, 'timestamp': '2025-09-30 22:09:57.209571', 'step': 589, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.253806', 'step': 589, 'epoch': 1} {'type': 'loss', 'content': 0.026373323053121567, 'timestamp': '2025-09-30 22:09:57.261750', 'step': 590, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:09:57.311706', 'step': 590, 'epoch': 1} {'type': 'loss', 'content': 0.029552871361374855, 'timestamp': '2025-09-30 22:09:57.314904', 'step': 591, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.348917', 'step': 591, 'epoch': 1} {'type': 'loss', 'content': 0.015717869624495506, 'timestamp': '2025-09-30 22:09:57.372907', 'step': 592, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.413967', 'step': 592, 'epoch': 1} {'type': 'loss', 'content': 0.0234203040599823, 'timestamp': '2025-09-30 22:09:57.416505', 'step': 593, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:57.449886', 'step': 593, 'epoch': 1} {'type': 'loss', 'content': 0.006487022154033184, 'timestamp': '2025-09-30 22:09:57.452631', 'step': 594, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.488007', 'step': 594, 'epoch': 1} {'type': 'loss', 'content': 0.018183846026659012, 'timestamp': '2025-09-30 22:09:57.497039', 'step': 595, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.540000', 'step': 595, 'epoch': 1} {'type': 'loss', 'content': 0.03163054212927818, 'timestamp': '2025-09-30 22:09:57.564703', 'step': 596, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:57.602692', 'step': 596, 'epoch': 1} {'type': 'loss', 'content': 0.01731395348906517, 'timestamp': '2025-09-30 22:09:57.605511', 'step': 597, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.642712', 'step': 597, 'epoch': 1} {'type': 'loss', 'content': 0.016153180971741676, 'timestamp': '2025-09-30 22:09:57.645756', 'step': 598, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:57.684776', 'step': 598, 'epoch': 1} {'type': 'loss', 'content': 0.03578946739435196, 'timestamp': '2025-09-30 22:09:57.687752', 'step': 599, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.719187', 'step': 599, 'epoch': 1} {'type': 'loss', 'content': 0.018138926476240158, 'timestamp': '2025-09-30 22:09:57.755394', 'step': 600, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.786383', 'step': 600, 'epoch': 1} {'type': 'loss', 'content': 0.0256302822381258, 'timestamp': '2025-09-30 22:09:57.788419', 'step': 601, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.819218', 'step': 601, 'epoch': 1} {'type': 'loss', 'content': 0.019077766686677933, 'timestamp': '2025-09-30 22:09:57.821834', 'step': 602, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.859076', 'step': 602, 'epoch': 1} {'type': 'loss', 'content': 0.015015407465398312, 'timestamp': '2025-09-30 22:09:57.861984', 'step': 603, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.894027', 'step': 603, 'epoch': 1} {'type': 'loss', 'content': 0.022305699065327644, 'timestamp': '2025-09-30 22:09:57.919835', 'step': 604, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:57.953265', 'step': 604, 'epoch': 1} {'type': 'loss', 'content': 0.017405861988663673, 'timestamp': '2025-09-30 22:09:57.956514', 'step': 605, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:57.989323', 'step': 605, 'epoch': 1} {'type': 'loss', 'content': 0.02004510723054409, 'timestamp': '2025-09-30 22:09:57.991412', 'step': 606, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.022784', 'step': 606, 'epoch': 1} {'type': 'loss', 'content': 0.022168070077896118, 'timestamp': '2025-09-30 22:09:58.025373', 'step': 607, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:09:58.056916', 'step': 607, 'epoch': 1} {'type': 'loss', 'content': 0.015638450160622597, 'timestamp': '2025-09-30 22:09:58.081625', 'step': 608, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.113926', 'step': 608, 'epoch': 1} {'type': 'loss', 'content': 0.017927611246705055, 'timestamp': '2025-09-30 22:09:58.130234', 'step': 609, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:58.171678', 'step': 609, 'epoch': 1} {'type': 'loss', 'content': 0.01894911751151085, 'timestamp': '2025-09-30 22:09:58.174991', 'step': 610, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.215274', 'step': 610, 'epoch': 1} {'type': 'loss', 'content': 0.02456963248550892, 'timestamp': '2025-09-30 22:09:58.217971', 'step': 611, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.250551', 'step': 611, 'epoch': 1} {'type': 'loss', 'content': 0.03773067519068718, 'timestamp': '2025-09-30 22:09:58.275222', 'step': 612, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.310140', 'step': 612, 'epoch': 1} {'type': 'loss', 'content': 0.030875859782099724, 'timestamp': '2025-09-30 22:09:58.312482', 'step': 613, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.345460', 'step': 613, 'epoch': 1} {'type': 'loss', 'content': 0.012332675978541374, 'timestamp': '2025-09-30 22:09:58.351572', 'step': 614, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.384197', 'step': 614, 'epoch': 1} {'type': 'loss', 'content': 0.03169454261660576, 'timestamp': '2025-09-30 22:09:58.387041', 'step': 615, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.419955', 'step': 615, 'epoch': 1} {'type': 'loss', 'content': 0.025642435997724533, 'timestamp': '2025-09-30 22:09:58.444713', 'step': 616, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:58.480313', 'step': 616, 'epoch': 1} {'type': 'loss', 'content': 0.03705728054046631, 'timestamp': '2025-09-30 22:09:58.483990', 'step': 617, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.515874', 'step': 617, 'epoch': 1} {'type': 'loss', 'content': 0.01200761180371046, 'timestamp': '2025-09-30 22:09:58.519461', 'step': 618, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.550113', 'step': 618, 'epoch': 1} {'type': 'loss', 'content': 0.025091679766774178, 'timestamp': '2025-09-30 22:09:58.552635', 'step': 619, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.583437', 'step': 619, 'epoch': 1} {'type': 'loss', 'content': 0.014052960090339184, 'timestamp': '2025-09-30 22:09:58.606813', 'step': 620, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:58.644661', 'step': 620, 'epoch': 1} {'type': 'loss', 'content': 0.013598896563053131, 'timestamp': '2025-09-30 22:09:58.646757', 'step': 621, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:58.678094', 'step': 621, 'epoch': 1} {'type': 'loss', 'content': 0.015057659707963467, 'timestamp': '2025-09-30 22:09:58.680288', 'step': 622, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.711648', 'step': 622, 'epoch': 1} {'type': 'loss', 'content': 0.013657798990607262, 'timestamp': '2025-09-30 22:09:58.714492', 'step': 623, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.747139', 'step': 623, 'epoch': 1} {'type': 'loss', 'content': 0.04932836815714836, 'timestamp': '2025-09-30 22:09:58.774265', 'step': 624, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.814607', 'step': 624, 'epoch': 1} {'type': 'loss', 'content': 0.021991120651364326, 'timestamp': '2025-09-30 22:09:58.817022', 'step': 625, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.852026', 'step': 625, 'epoch': 1} {'type': 'loss', 'content': 0.01033029519021511, 'timestamp': '2025-09-30 22:09:58.865897', 'step': 626, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:58.899075', 'step': 626, 'epoch': 1} {'type': 'loss', 'content': 0.004971671849489212, 'timestamp': '2025-09-30 22:09:58.901055', 'step': 627, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:09:59.686406', 'step': 627, 'epoch': 1} {'type': 'pplx', 'content': 75762044.56936751, 'timestamp': '2025-09-30 22:09:59.689148', 'step': 627, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:59.717607', 'step': 627, 'epoch': 1} {'type': 'loss', 'content': 0.005967786069959402, 'timestamp': '2025-09-30 22:09:59.742173', 'step': 628, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:59.773138', 'step': 628, 'epoch': 1} {'type': 'loss', 'content': 0.04627048969268799, 'timestamp': '2025-09-30 22:09:59.775314', 'step': 629, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:59.805781', 'step': 629, 'epoch': 1} {'type': 'loss', 'content': 0.03623666614294052, 'timestamp': '2025-09-30 22:09:59.807782', 'step': 630, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:59.837806', 'step': 630, 'epoch': 1} {'type': 'loss', 'content': 0.016331830993294716, 'timestamp': '2025-09-30 22:09:59.840250', 'step': 631, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:59.872032', 'step': 631, 'epoch': 1} {'type': 'loss', 'content': 0.015092739835381508, 'timestamp': '2025-09-30 22:09:59.895811', 'step': 632, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:59.925652', 'step': 632, 'epoch': 1} {'type': 'loss', 'content': 0.004001949448138475, 'timestamp': '2025-09-30 22:09:59.927516', 'step': 633, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:09:59.958001', 'step': 633, 'epoch': 1} {'type': 'loss', 'content': 0.008666311390697956, 'timestamp': '2025-09-30 22:09:59.960060', 'step': 634, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:09:59.990041', 'step': 634, 'epoch': 1} {'type': 'loss', 'content': 0.030506502836942673, 'timestamp': '2025-09-30 22:09:59.992155', 'step': 635, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.022023', 'step': 635, 'epoch': 1} {'type': 'loss', 'content': 0.006849688943475485, 'timestamp': '2025-09-30 22:10:00.045475', 'step': 636, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:00.076045', 'step': 636, 'epoch': 1} {'type': 'loss', 'content': 0.002905628876760602, 'timestamp': '2025-09-30 22:10:00.078323', 'step': 637, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.112261', 'step': 637, 'epoch': 1} {'type': 'loss', 'content': 0.0353107824921608, 'timestamp': '2025-09-30 22:10:00.114919', 'step': 638, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.145532', 'step': 638, 'epoch': 1} {'type': 'loss', 'content': 0.03190189599990845, 'timestamp': '2025-09-30 22:10:00.147962', 'step': 639, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.181810', 'step': 639, 'epoch': 1} {'type': 'loss', 'content': 0.002591110300272703, 'timestamp': '2025-09-30 22:10:00.209648', 'step': 640, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.241939', 'step': 640, 'epoch': 1} {'type': 'loss', 'content': 0.010863580740988255, 'timestamp': '2025-09-30 22:10:00.245521', 'step': 641, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:00.285889', 'step': 641, 'epoch': 1} {'type': 'loss', 'content': 0.01622161455452442, 'timestamp': '2025-09-30 22:10:00.288459', 'step': 642, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.324166', 'step': 642, 'epoch': 1} {'type': 'loss', 'content': 0.008066744543612003, 'timestamp': '2025-09-30 22:10:00.329827', 'step': 643, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:00.362746', 'step': 643, 'epoch': 1} {'type': 'loss', 'content': 0.03653540462255478, 'timestamp': '2025-09-30 22:10:00.392762', 'step': 644, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.424716', 'step': 644, 'epoch': 1} {'type': 'loss', 'content': 0.008595902472734451, 'timestamp': '2025-09-30 22:10:00.427010', 'step': 645, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.458102', 'step': 645, 'epoch': 1} {'type': 'loss', 'content': 0.03257131204009056, 'timestamp': '2025-09-30 22:10:00.460636', 'step': 646, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:00.492091', 'step': 646, 'epoch': 1} {'type': 'loss', 'content': 0.007714335806667805, 'timestamp': '2025-09-30 22:10:00.494391', 'step': 647, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:00.529366', 'step': 647, 'epoch': 1} {'type': 'loss', 'content': 0.00786947924643755, 'timestamp': '2025-09-30 22:10:00.555784', 'step': 648, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:00.593070', 'step': 648, 'epoch': 1} {'type': 'loss', 'content': 0.009578948840498924, 'timestamp': '2025-09-30 22:10:00.596750', 'step': 649, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:00.631079', 'step': 649, 'epoch': 1} {'type': 'loss', 'content': 0.007144716568291187, 'timestamp': '2025-09-30 22:10:00.635260', 'step': 650, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.668063', 'step': 650, 'epoch': 1} {'type': 'loss', 'content': 0.01738644763827324, 'timestamp': '2025-09-30 22:10:00.670564', 'step': 651, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.700774', 'step': 651, 'epoch': 1} {'type': 'loss', 'content': 0.013808409683406353, 'timestamp': '2025-09-30 22:10:00.725001', 'step': 652, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.755689', 'step': 652, 'epoch': 1} {'type': 'loss', 'content': 0.02000904455780983, 'timestamp': '2025-09-30 22:10:00.762451', 'step': 653, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.799705', 'step': 653, 'epoch': 1} {'type': 'loss', 'content': 0.023145198822021484, 'timestamp': '2025-09-30 22:10:00.813065', 'step': 654, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:00.844954', 'step': 654, 'epoch': 1} {'type': 'loss', 'content': 0.006938356440514326, 'timestamp': '2025-09-30 22:10:00.847578', 'step': 655, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.879227', 'step': 655, 'epoch': 1} {'type': 'loss', 'content': 0.006100603379309177, 'timestamp': '2025-09-30 22:10:00.902911', 'step': 656, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:00.935058', 'step': 656, 'epoch': 1} {'type': 'loss', 'content': 0.004185267724096775, 'timestamp': '2025-09-30 22:10:00.937749', 'step': 657, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:00.971195', 'step': 657, 'epoch': 1} {'type': 'loss', 'content': 0.01654222048819065, 'timestamp': '2025-09-30 22:10:00.974159', 'step': 658, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:01.007289', 'step': 658, 'epoch': 1} {'type': 'loss', 'content': 0.0077790203504264355, 'timestamp': '2025-09-30 22:10:01.009528', 'step': 659, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.053381', 'step': 659, 'epoch': 1} {'type': 'loss', 'content': 0.011989668942987919, 'timestamp': '2025-09-30 22:10:01.077026', 'step': 660, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.107400', 'step': 660, 'epoch': 1} {'type': 'loss', 'content': 0.010614539496600628, 'timestamp': '2025-09-30 22:10:01.112680', 'step': 661, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.159568', 'step': 661, 'epoch': 1} {'type': 'loss', 'content': 0.0053671300411224365, 'timestamp': '2025-09-30 22:10:01.168495', 'step': 662, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.200395', 'step': 662, 'epoch': 1} {'type': 'loss', 'content': 0.016355205327272415, 'timestamp': '2025-09-30 22:10:01.203041', 'step': 663, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:01.237777', 'step': 663, 'epoch': 1} {'type': 'loss', 'content': 0.019035687670111656, 'timestamp': '2025-09-30 22:10:01.263115', 'step': 664, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.298938', 'step': 664, 'epoch': 1} {'type': 'loss', 'content': 0.009006036445498466, 'timestamp': '2025-09-30 22:10:01.318806', 'step': 665, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.349258', 'step': 665, 'epoch': 1} {'type': 'loss', 'content': 0.00685217697173357, 'timestamp': '2025-09-30 22:10:01.351550', 'step': 666, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.384878', 'step': 666, 'epoch': 1} {'type': 'loss', 'content': 0.01326062437146902, 'timestamp': '2025-09-30 22:10:01.387328', 'step': 667, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.419025', 'step': 667, 'epoch': 1} {'type': 'loss', 'content': 0.013570294715464115, 'timestamp': '2025-09-30 22:10:01.443509', 'step': 668, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.475504', 'step': 668, 'epoch': 1} {'type': 'loss', 'content': 0.039509546011686325, 'timestamp': '2025-09-30 22:10:01.478771', 'step': 669, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:01.525788', 'step': 669, 'epoch': 1} {'type': 'loss', 'content': 0.04997308924794197, 'timestamp': '2025-09-30 22:10:01.528750', 'step': 670, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.560112', 'step': 670, 'epoch': 1} {'type': 'loss', 'content': 0.005994971841573715, 'timestamp': '2025-09-30 22:10:01.563800', 'step': 671, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.595169', 'step': 671, 'epoch': 1} {'type': 'loss', 'content': 0.018137672916054726, 'timestamp': '2025-09-30 22:10:01.622396', 'step': 672, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.654248', 'step': 672, 'epoch': 1} {'type': 'loss', 'content': 0.02233460359275341, 'timestamp': '2025-09-30 22:10:01.658163', 'step': 673, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.689437', 'step': 673, 'epoch': 1} {'type': 'loss', 'content': 0.026135217398405075, 'timestamp': '2025-09-30 22:10:01.692105', 'step': 674, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.729466', 'step': 674, 'epoch': 1} {'type': 'loss', 'content': 0.04665851593017578, 'timestamp': '2025-09-30 22:10:01.734628', 'step': 675, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.766347', 'step': 675, 'epoch': 1} {'type': 'loss', 'content': 0.012036020867526531, 'timestamp': '2025-09-30 22:10:01.790289', 'step': 676, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.826971', 'step': 676, 'epoch': 1} {'type': 'loss', 'content': 0.01776084490120411, 'timestamp': '2025-09-30 22:10:01.829109', 'step': 677, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:01.858951', 'step': 677, 'epoch': 1} {'type': 'loss', 'content': 0.00692532816901803, 'timestamp': '2025-09-30 22:10:01.863459', 'step': 678, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.904224', 'step': 678, 'epoch': 1} {'type': 'loss', 'content': 0.008399528451263905, 'timestamp': '2025-09-30 22:10:01.906615', 'step': 679, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:01.938699', 'step': 679, 'epoch': 1} {'type': 'loss', 'content': 0.012349965050816536, 'timestamp': '2025-09-30 22:10:01.962845', 'step': 680, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:01.995586', 'step': 680, 'epoch': 1} {'type': 'loss', 'content': 0.010101139545440674, 'timestamp': '2025-09-30 22:10:01.999394', 'step': 681, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:02.029697', 'step': 681, 'epoch': 1} {'type': 'loss', 'content': 0.012034410610795021, 'timestamp': '2025-09-30 22:10:02.031565', 'step': 682, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:02.063467', 'step': 682, 'epoch': 1} {'type': 'loss', 'content': 0.01522676832973957, 'timestamp': '2025-09-30 22:10:02.065813', 'step': 683, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:02.095464', 'step': 683, 'epoch': 1} {'type': 'loss', 'content': 0.007156494073569775, 'timestamp': '2025-09-30 22:10:02.119029', 'step': 684, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:03.026146', 'step': 684, 'epoch': 1} {'type': 'pplx', 'content': 75040225.03656973, 'timestamp': '2025-09-30 22:10:03.027940', 'step': 684, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:03.057877', 'step': 684, 'epoch': 1} {'type': 'loss', 'content': 0.027570990845561028, 'timestamp': '2025-09-30 22:10:03.059859', 'step': 685, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:03.096902', 'step': 685, 'epoch': 1} {'type': 'loss', 'content': 0.018303800374269485, 'timestamp': '2025-09-30 22:10:03.098881', 'step': 686, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:03.128674', 'step': 686, 'epoch': 1} {'type': 'loss', 'content': 0.005918011534959078, 'timestamp': '2025-09-30 22:10:03.131031', 'step': 687, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:03.162133', 'step': 687, 'epoch': 1} {'type': 'loss', 'content': 0.00888880342245102, 'timestamp': '2025-09-30 22:10:03.185819', 'step': 688, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:03.216448', 'step': 688, 'epoch': 1} {'type': 'loss', 'content': 0.03235236182808876, 'timestamp': '2025-09-30 22:10:03.219377', 'step': 689, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:03.250220', 'step': 689, 'epoch': 1} {'type': 'loss', 'content': 0.007580969948321581, 'timestamp': '2025-09-30 22:10:03.252494', 'step': 690, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:03.283394', 'step': 690, 'epoch': 1} {'type': 'loss', 'content': 0.011107278056442738, 'timestamp': '2025-09-30 22:10:03.286750', 'step': 691, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:03.329475', 'step': 691, 'epoch': 1} {'type': 'loss', 'content': 0.010450707748532295, 'timestamp': '2025-09-30 22:10:03.353238', 'step': 692, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:03.389345', 'step': 692, 'epoch': 1} {'type': 'loss', 'content': 0.021500239148736, 'timestamp': '2025-09-30 22:10:03.391531', 'step': 693, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:03.422398', 'step': 693, 'epoch': 1} {'type': 'loss', 'content': 0.04603993892669678, 'timestamp': '2025-09-30 22:10:03.425868', 'step': 694, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:03.457563', 'step': 694, 'epoch': 1} {'type': 'loss', 'content': 0.005174047313630581, 'timestamp': '2025-09-30 22:10:03.466641', 'step': 695, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:03.501262', 'step': 695, 'epoch': 1} {'type': 'loss', 'content': 0.02627837099134922, 'timestamp': '2025-09-30 22:10:03.524896', 'step': 696, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:03.561635', 'step': 696, 'epoch': 1} {'type': 'loss', 'content': 0.01084921695291996, 'timestamp': '2025-09-30 22:10:03.566186', 'step': 697, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:03.600203', 'step': 697, 'epoch': 1} {'type': 'loss', 'content': 0.011157850734889507, 'timestamp': '2025-09-30 22:10:03.605230', 'step': 698, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:03.639902', 'step': 698, 'epoch': 1} {'type': 'loss', 'content': 0.023838777095079422, 'timestamp': '2025-09-30 22:10:03.651726', 'step': 699, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:03.694541', 'step': 699, 'epoch': 1} {'type': 'loss', 'content': 0.02645074389874935, 'timestamp': '2025-09-30 22:10:03.718493', 'step': 700, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:03.760015', 'step': 700, 'epoch': 1} {'type': 'loss', 'content': 0.006150428671389818, 'timestamp': '2025-09-30 22:10:03.762389', 'step': 701, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:03.793712', 'step': 701, 'epoch': 1} {'type': 'loss', 'content': 0.018974557518959045, 'timestamp': '2025-09-30 22:10:03.796231', 'step': 702, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:03.827927', 'step': 702, 'epoch': 1} {'type': 'loss', 'content': 0.018012331798672676, 'timestamp': '2025-09-30 22:10:03.833070', 'step': 703, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:03.872360', 'step': 703, 'epoch': 1} {'type': 'loss', 'content': 0.014314976520836353, 'timestamp': '2025-09-30 22:10:03.897893', 'step': 704, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:03.929370', 'step': 704, 'epoch': 1} {'type': 'loss', 'content': 0.020005246624350548, 'timestamp': '2025-09-30 22:10:03.933187', 'step': 705, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:03.964732', 'step': 705, 'epoch': 1} {'type': 'loss', 'content': 0.006193004548549652, 'timestamp': '2025-09-30 22:10:03.966911', 'step': 706, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.000306', 'step': 706, 'epoch': 1} {'type': 'loss', 'content': 0.006298999767750502, 'timestamp': '2025-09-30 22:10:04.002478', 'step': 707, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.033057', 'step': 707, 'epoch': 1} {'type': 'loss', 'content': 0.012022623792290688, 'timestamp': '2025-09-30 22:10:04.056790', 'step': 708, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.096985', 'step': 708, 'epoch': 1} {'type': 'loss', 'content': 0.010289990343153477, 'timestamp': '2025-09-30 22:10:04.099617', 'step': 709, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.130289', 'step': 709, 'epoch': 1} {'type': 'loss', 'content': 0.04707222059369087, 'timestamp': '2025-09-30 22:10:04.133120', 'step': 710, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:10:04.163461', 'step': 710, 'epoch': 1} {'type': 'loss', 'content': 0.02272258698940277, 'timestamp': '2025-09-30 22:10:04.166094', 'step': 711, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.206302', 'step': 711, 'epoch': 1} {'type': 'loss', 'content': 0.034390226006507874, 'timestamp': '2025-09-30 22:10:04.230759', 'step': 712, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.261763', 'step': 712, 'epoch': 1} {'type': 'loss', 'content': 0.014662191271781921, 'timestamp': '2025-09-30 22:10:04.263917', 'step': 713, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.294300', 'step': 713, 'epoch': 1} {'type': 'loss', 'content': 0.011226105503737926, 'timestamp': '2025-09-30 22:10:04.296253', 'step': 714, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:04.327259', 'step': 714, 'epoch': 1} {'type': 'loss', 'content': 0.02475506253540516, 'timestamp': '2025-09-30 22:10:04.330232', 'step': 715, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.360791', 'step': 715, 'epoch': 1} {'type': 'loss', 'content': 0.035920243710279465, 'timestamp': '2025-09-30 22:10:04.384648', 'step': 716, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.415677', 'step': 716, 'epoch': 1} {'type': 'loss', 'content': 0.01731456071138382, 'timestamp': '2025-09-30 22:10:04.417852', 'step': 717, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.448702', 'step': 717, 'epoch': 1} {'type': 'loss', 'content': 0.015075340867042542, 'timestamp': '2025-09-30 22:10:04.451638', 'step': 718, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.482199', 'step': 718, 'epoch': 1} {'type': 'loss', 'content': 0.017238110303878784, 'timestamp': '2025-09-30 22:10:04.484997', 'step': 719, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:04.516673', 'step': 719, 'epoch': 1} {'type': 'loss', 'content': 0.028648529201745987, 'timestamp': '2025-09-30 22:10:04.541086', 'step': 720, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.571548', 'step': 720, 'epoch': 1} {'type': 'loss', 'content': 0.005524549167603254, 'timestamp': '2025-09-30 22:10:04.573834', 'step': 721, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.604370', 'step': 721, 'epoch': 1} {'type': 'loss', 'content': 0.006129828747361898, 'timestamp': '2025-09-30 22:10:04.606408', 'step': 722, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.643225', 'step': 722, 'epoch': 1} {'type': 'loss', 'content': 0.010930589400231838, 'timestamp': '2025-09-30 22:10:04.645764', 'step': 723, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.677728', 'step': 723, 'epoch': 1} {'type': 'loss', 'content': 0.02778017707169056, 'timestamp': '2025-09-30 22:10:04.702056', 'step': 724, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.733280', 'step': 724, 'epoch': 1} {'type': 'loss', 'content': 0.03585416451096535, 'timestamp': '2025-09-30 22:10:04.737077', 'step': 725, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:04.771163', 'step': 725, 'epoch': 1} {'type': 'loss', 'content': 0.021228177472949028, 'timestamp': '2025-09-30 22:10:04.773894', 'step': 726, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:04.811822', 'step': 726, 'epoch': 1} {'type': 'loss', 'content': 0.01692606322467327, 'timestamp': '2025-09-30 22:10:04.815307', 'step': 727, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.856289', 'step': 727, 'epoch': 1} {'type': 'loss', 'content': 0.009032380767166615, 'timestamp': '2025-09-30 22:10:04.880861', 'step': 728, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.913131', 'step': 728, 'epoch': 1} {'type': 'loss', 'content': 0.0076371668837964535, 'timestamp': '2025-09-30 22:10:04.915737', 'step': 729, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.956413', 'step': 729, 'epoch': 1} {'type': 'loss', 'content': 0.02644680067896843, 'timestamp': '2025-09-30 22:10:04.959566', 'step': 730, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:04.990070', 'step': 730, 'epoch': 1} {'type': 'loss', 'content': 0.010779125615954399, 'timestamp': '2025-09-30 22:10:04.993057', 'step': 731, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:05.027996', 'step': 731, 'epoch': 1} {'type': 'loss', 'content': 0.023335661739110947, 'timestamp': '2025-09-30 22:10:05.051825', 'step': 732, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:05.081923', 'step': 732, 'epoch': 1} {'type': 'loss', 'content': 0.01767139323055744, 'timestamp': '2025-09-30 22:10:05.084638', 'step': 733, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:05.115460', 'step': 733, 'epoch': 1} {'type': 'loss', 'content': 0.011058658361434937, 'timestamp': '2025-09-30 22:10:05.118036', 'step': 734, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:05.148161', 'step': 734, 'epoch': 1} {'type': 'loss', 'content': 0.014833243563771248, 'timestamp': '2025-09-30 22:10:05.150681', 'step': 735, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:05.186507', 'step': 735, 'epoch': 1} {'type': 'loss', 'content': 0.012499203905463219, 'timestamp': '2025-09-30 22:10:05.210091', 'step': 736, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:05.241766', 'step': 736, 'epoch': 1} {'type': 'loss', 'content': 0.027693988755345345, 'timestamp': '2025-09-30 22:10:05.243953', 'step': 737, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:05.274186', 'step': 737, 'epoch': 1} {'type': 'loss', 'content': 0.012153781950473785, 'timestamp': '2025-09-30 22:10:05.276161', 'step': 738, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:05.306651', 'step': 738, 'epoch': 1} {'type': 'loss', 'content': 0.020603006705641747, 'timestamp': '2025-09-30 22:10:05.308770', 'step': 739, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:05.339346', 'step': 739, 'epoch': 1} {'type': 'loss', 'content': 0.0332343615591526, 'timestamp': '2025-09-30 22:10:05.363220', 'step': 740, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:05.394412', 'step': 740, 'epoch': 1} {'type': 'loss', 'content': 0.026437604799866676, 'timestamp': '2025-09-30 22:10:05.397857', 'step': 741, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:06.220183', 'step': 741, 'epoch': 1} {'type': 'pplx', 'content': 75644240.30694193, 'timestamp': '2025-09-30 22:10:06.222609', 'step': 741, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:06.252650', 'step': 741, 'epoch': 1} {'type': 'loss', 'content': 0.01217217929661274, 'timestamp': '2025-09-30 22:10:06.255053', 'step': 742, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:06.287688', 'step': 742, 'epoch': 1} {'type': 'loss', 'content': 0.014330956153571606, 'timestamp': '2025-09-30 22:10:06.289835', 'step': 743, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:06.320607', 'step': 743, 'epoch': 1} {'type': 'loss', 'content': 0.034744616597890854, 'timestamp': '2025-09-30 22:10:06.344390', 'step': 744, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:06.377075', 'step': 744, 'epoch': 1} {'type': 'loss', 'content': 0.015021897852420807, 'timestamp': '2025-09-30 22:10:06.379763', 'step': 745, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:06.410778', 'step': 745, 'epoch': 1} {'type': 'loss', 'content': 0.011597558856010437, 'timestamp': '2025-09-30 22:10:06.413225', 'step': 746, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:06.445145', 'step': 746, 'epoch': 1} {'type': 'loss', 'content': 0.009742917492985725, 'timestamp': '2025-09-30 22:10:06.447374', 'step': 747, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:06.478670', 'step': 747, 'epoch': 1} {'type': 'loss', 'content': 0.029180850833654404, 'timestamp': '2025-09-30 22:10:06.508907', 'step': 748, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:06.539174', 'step': 748, 'epoch': 1} {'type': 'loss', 'content': 0.01945856213569641, 'timestamp': '2025-09-30 22:10:06.541367', 'step': 749, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:06.571909', 'step': 749, 'epoch': 1} {'type': 'loss', 'content': 0.014946605078876019, 'timestamp': '2025-09-30 22:10:06.574379', 'step': 750, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:06.606384', 'step': 750, 'epoch': 1} {'type': 'loss', 'content': 0.020322533324360847, 'timestamp': '2025-09-30 22:10:06.609089', 'step': 751, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:06.639777', 'step': 751, 'epoch': 1} {'type': 'loss', 'content': 0.03409485146403313, 'timestamp': '2025-09-30 22:10:06.663743', 'step': 752, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:06.696549', 'step': 752, 'epoch': 1} {'type': 'loss', 'content': 0.018665427342057228, 'timestamp': '2025-09-30 22:10:06.699250', 'step': 753, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:06.731448', 'step': 753, 'epoch': 1} {'type': 'loss', 'content': 0.01316206157207489, 'timestamp': '2025-09-30 22:10:06.733644', 'step': 754, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:06.764588', 'step': 754, 'epoch': 1} {'type': 'loss', 'content': 0.017740074545145035, 'timestamp': '2025-09-30 22:10:06.768527', 'step': 755, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:06.801537', 'step': 755, 'epoch': 1} {'type': 'loss', 'content': 0.022789278998970985, 'timestamp': '2025-09-30 22:10:06.825231', 'step': 756, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:06.855432', 'step': 756, 'epoch': 1} {'type': 'loss', 'content': 0.006472249049693346, 'timestamp': '2025-09-30 22:10:06.857852', 'step': 757, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:06.889451', 'step': 757, 'epoch': 1} {'type': 'loss', 'content': 0.0057207318022847176, 'timestamp': '2025-09-30 22:10:06.891417', 'step': 758, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:06.921905', 'step': 758, 'epoch': 1} {'type': 'loss', 'content': 0.010666334070265293, 'timestamp': '2025-09-30 22:10:06.924240', 'step': 759, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:06.954759', 'step': 759, 'epoch': 1} {'type': 'loss', 'content': 0.012298032641410828, 'timestamp': '2025-09-30 22:10:06.979562', 'step': 760, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.010569', 'step': 760, 'epoch': 1} {'type': 'loss', 'content': 0.004690289031714201, 'timestamp': '2025-09-30 22:10:07.014921', 'step': 761, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:10:07.045047', 'step': 761, 'epoch': 1} {'type': 'loss', 'content': 0.008399398066103458, 'timestamp': '2025-09-30 22:10:07.047650', 'step': 762, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.077913', 'step': 762, 'epoch': 1} {'type': 'loss', 'content': 0.03833383321762085, 'timestamp': '2025-09-30 22:10:07.088868', 'step': 763, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.125384', 'step': 763, 'epoch': 1} {'type': 'loss', 'content': 0.011823548004031181, 'timestamp': '2025-09-30 22:10:07.149206', 'step': 764, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.180770', 'step': 764, 'epoch': 1} {'type': 'loss', 'content': 0.008272853679955006, 'timestamp': '2025-09-30 22:10:07.184020', 'step': 765, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.215207', 'step': 765, 'epoch': 1} {'type': 'loss', 'content': 0.01553357858210802, 'timestamp': '2025-09-30 22:10:07.218279', 'step': 766, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.252165', 'step': 766, 'epoch': 1} {'type': 'loss', 'content': 0.008146891370415688, 'timestamp': '2025-09-30 22:10:07.255040', 'step': 767, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.286700', 'step': 767, 'epoch': 1} {'type': 'loss', 'content': 0.04488224908709526, 'timestamp': '2025-09-30 22:10:07.311102', 'step': 768, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.341855', 'step': 768, 'epoch': 1} {'type': 'loss', 'content': 0.014228535816073418, 'timestamp': '2025-09-30 22:10:07.343860', 'step': 769, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.373883', 'step': 769, 'epoch': 1} {'type': 'loss', 'content': 0.03609326854348183, 'timestamp': '2025-09-30 22:10:07.375919', 'step': 770, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:07.410372', 'step': 770, 'epoch': 1} {'type': 'loss', 'content': 0.02611478976905346, 'timestamp': '2025-09-30 22:10:07.413795', 'step': 771, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:07.444880', 'step': 771, 'epoch': 1} {'type': 'loss', 'content': 0.004257719498127699, 'timestamp': '2025-09-30 22:10:07.468544', 'step': 772, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.499657', 'step': 772, 'epoch': 1} {'type': 'loss', 'content': 0.05273055285215378, 'timestamp': '2025-09-30 22:10:07.501850', 'step': 773, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.532096', 'step': 773, 'epoch': 1} {'type': 'loss', 'content': 0.0034468381199985743, 'timestamp': '2025-09-30 22:10:07.534575', 'step': 774, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:07.567005', 'step': 774, 'epoch': 1} {'type': 'loss', 'content': 0.0350152887403965, 'timestamp': '2025-09-30 22:10:07.569682', 'step': 775, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:07.600602', 'step': 775, 'epoch': 1} {'type': 'loss', 'content': 0.029106054455041885, 'timestamp': '2025-09-30 22:10:07.624757', 'step': 776, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:07.656333', 'step': 776, 'epoch': 1} {'type': 'loss', 'content': 0.005838477518409491, 'timestamp': '2025-09-30 22:10:07.658869', 'step': 777, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.690631', 'step': 777, 'epoch': 1} {'type': 'loss', 'content': 0.0022769535426050425, 'timestamp': '2025-09-30 22:10:07.693302', 'step': 778, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:07.724407', 'step': 778, 'epoch': 1} {'type': 'loss', 'content': 0.026938190683722496, 'timestamp': '2025-09-30 22:10:07.726716', 'step': 779, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.758892', 'step': 779, 'epoch': 1} {'type': 'loss', 'content': 0.0065045664086937904, 'timestamp': '2025-09-30 22:10:07.784913', 'step': 780, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.815369', 'step': 780, 'epoch': 1} {'type': 'loss', 'content': 0.021315261721611023, 'timestamp': '2025-09-30 22:10:07.817962', 'step': 781, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:07.856826', 'step': 781, 'epoch': 1} {'type': 'loss', 'content': 0.012665718793869019, 'timestamp': '2025-09-30 22:10:07.859628', 'step': 782, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.890650', 'step': 782, 'epoch': 1} {'type': 'loss', 'content': 0.023632079362869263, 'timestamp': '2025-09-30 22:10:07.892741', 'step': 783, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:07.924441', 'step': 783, 'epoch': 1} {'type': 'loss', 'content': 0.015980595722794533, 'timestamp': '2025-09-30 22:10:07.948390', 'step': 784, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:07.978929', 'step': 784, 'epoch': 1} {'type': 'loss', 'content': 0.005975284148007631, 'timestamp': '2025-09-30 22:10:07.980917', 'step': 785, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:08.013879', 'step': 785, 'epoch': 1} {'type': 'loss', 'content': 0.00365698104724288, 'timestamp': '2025-09-30 22:10:08.016923', 'step': 786, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:08.048097', 'step': 786, 'epoch': 1} {'type': 'loss', 'content': 0.005156506318598986, 'timestamp': '2025-09-30 22:10:08.050351', 'step': 787, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:08.081140', 'step': 787, 'epoch': 1} {'type': 'loss', 'content': 0.02221284620463848, 'timestamp': '2025-09-30 22:10:08.104662', 'step': 788, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:08.138633', 'step': 788, 'epoch': 1} {'type': 'loss', 'content': 0.011932899244129658, 'timestamp': '2025-09-30 22:10:08.141098', 'step': 789, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:08.173020', 'step': 789, 'epoch': 1} {'type': 'loss', 'content': 0.005140057764947414, 'timestamp': '2025-09-30 22:10:08.175327', 'step': 790, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:08.207913', 'step': 790, 'epoch': 1} {'type': 'loss', 'content': 0.015069144777953625, 'timestamp': '2025-09-30 22:10:08.210280', 'step': 791, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:08.242050', 'step': 791, 'epoch': 1} {'type': 'loss', 'content': 0.01991919055581093, 'timestamp': '2025-09-30 22:10:08.265688', 'step': 792, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:08.297469', 'step': 792, 'epoch': 1} {'type': 'loss', 'content': 0.011542352847754955, 'timestamp': '2025-09-30 22:10:08.300828', 'step': 793, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:08.334971', 'step': 793, 'epoch': 1} {'type': 'loss', 'content': 0.03699946776032448, 'timestamp': '2025-09-30 22:10:08.343172', 'step': 794, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:08.374120', 'step': 794, 'epoch': 1} {'type': 'loss', 'content': 0.011802605353295803, 'timestamp': '2025-09-30 22:10:08.376597', 'step': 795, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:08.415508', 'step': 795, 'epoch': 1} {'type': 'loss', 'content': 0.0063971565105021, 'timestamp': '2025-09-30 22:10:08.440533', 'step': 796, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:08.472313', 'step': 796, 'epoch': 1} {'type': 'loss', 'content': 0.010037742555141449, 'timestamp': '2025-09-30 22:10:08.475351', 'step': 797, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:08.509320', 'step': 797, 'epoch': 1} {'type': 'loss', 'content': 0.01431556511670351, 'timestamp': '2025-09-30 22:10:08.511646', 'step': 798, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:09.292369', 'step': 798, 'epoch': 1} {'type': 'pplx', 'content': 76406729.60806397, 'timestamp': '2025-09-30 22:10:09.294385', 'step': 798, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:09.324329', 'step': 798, 'epoch': 1} {'type': 'loss', 'content': 0.006856878288090229, 'timestamp': '2025-09-30 22:10:09.326823', 'step': 799, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:09.360363', 'step': 799, 'epoch': 1} {'type': 'loss', 'content': 0.037827786058187485, 'timestamp': '2025-09-30 22:10:09.384515', 'step': 800, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:09.433939', 'step': 800, 'epoch': 1} {'type': 'loss', 'content': 0.025704285129904747, 'timestamp': '2025-09-30 22:10:09.437939', 'step': 801, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:09.479720', 'step': 801, 'epoch': 1} {'type': 'loss', 'content': 0.0315730981528759, 'timestamp': '2025-09-30 22:10:09.485052', 'step': 802, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:09.535661', 'step': 802, 'epoch': 1} {'type': 'loss', 'content': 0.026073981076478958, 'timestamp': '2025-09-30 22:10:09.540638', 'step': 803, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:09.588412', 'step': 803, 'epoch': 1} {'type': 'loss', 'content': 0.009723112918436527, 'timestamp': '2025-09-30 22:10:09.614821', 'step': 804, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:09.652465', 'step': 804, 'epoch': 1} {'type': 'loss', 'content': 0.02134951762855053, 'timestamp': '2025-09-30 22:10:09.657368', 'step': 805, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:09.712952', 'step': 805, 'epoch': 1} {'type': 'loss', 'content': 0.02399459108710289, 'timestamp': '2025-09-30 22:10:09.716284', 'step': 806, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:09.752627', 'step': 806, 'epoch': 1} {'type': 'loss', 'content': 0.012168281711637974, 'timestamp': '2025-09-30 22:10:09.755850', 'step': 807, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:09.791768', 'step': 807, 'epoch': 1} {'type': 'loss', 'content': 0.01669606752693653, 'timestamp': '2025-09-30 22:10:09.819829', 'step': 808, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:09.857837', 'step': 808, 'epoch': 1} {'type': 'loss', 'content': 0.008840478025376797, 'timestamp': '2025-09-30 22:10:09.863110', 'step': 809, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:09.902502', 'step': 809, 'epoch': 1} {'type': 'loss', 'content': 0.015332608483731747, 'timestamp': '2025-09-30 22:10:09.907137', 'step': 810, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:09.945104', 'step': 810, 'epoch': 1} {'type': 'loss', 'content': 0.02074304223060608, 'timestamp': '2025-09-30 22:10:09.948780', 'step': 811, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:09.987526', 'step': 811, 'epoch': 1} {'type': 'loss', 'content': 0.012077617458999157, 'timestamp': '2025-09-30 22:10:10.013670', 'step': 812, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:10.054173', 'step': 812, 'epoch': 1} {'type': 'loss', 'content': 0.02191738598048687, 'timestamp': '2025-09-30 22:10:10.061034', 'step': 813, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:10.096190', 'step': 813, 'epoch': 1} {'type': 'loss', 'content': 0.0168896596878767, 'timestamp': '2025-09-30 22:10:10.099881', 'step': 814, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:10.139451', 'step': 814, 'epoch': 1} {'type': 'loss', 'content': 0.00681196479126811, 'timestamp': '2025-09-30 22:10:10.142371', 'step': 815, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:10.179859', 'step': 815, 'epoch': 1} {'type': 'loss', 'content': 0.0110024968162179, 'timestamp': '2025-09-30 22:10:10.203904', 'step': 816, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:10.241164', 'step': 816, 'epoch': 1} {'type': 'loss', 'content': 0.031019117683172226, 'timestamp': '2025-09-30 22:10:10.244077', 'step': 817, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:10.284673', 'step': 817, 'epoch': 1} {'type': 'loss', 'content': 0.014357957057654858, 'timestamp': '2025-09-30 22:10:10.288383', 'step': 818, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:10.332432', 'step': 818, 'epoch': 1} {'type': 'loss', 'content': 0.0071715316735208035, 'timestamp': '2025-09-30 22:10:10.335084', 'step': 819, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:10.374737', 'step': 819, 'epoch': 1} {'type': 'loss', 'content': 0.01265205442905426, 'timestamp': '2025-09-30 22:10:10.399174', 'step': 820, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:10.445249', 'step': 820, 'epoch': 1} {'type': 'loss', 'content': 0.011570468544960022, 'timestamp': '2025-09-30 22:10:10.448305', 'step': 821, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:10.488167', 'step': 821, 'epoch': 1} {'type': 'loss', 'content': 0.012007759883999825, 'timestamp': '2025-09-30 22:10:10.496619', 'step': 822, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:10.529828', 'step': 822, 'epoch': 1} {'type': 'loss', 'content': 0.01156451366841793, 'timestamp': '2025-09-30 22:10:10.533774', 'step': 823, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:10.566844', 'step': 823, 'epoch': 1} {'type': 'loss', 'content': 0.027540789917111397, 'timestamp': '2025-09-30 22:10:10.591111', 'step': 824, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:10.629161', 'step': 824, 'epoch': 1} {'type': 'loss', 'content': 0.009911253117024899, 'timestamp': '2025-09-30 22:10:10.632734', 'step': 825, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:10.668861', 'step': 825, 'epoch': 1} {'type': 'loss', 'content': 0.016379550099372864, 'timestamp': '2025-09-30 22:10:10.671556', 'step': 826, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:10.708865', 'step': 826, 'epoch': 1} {'type': 'loss', 'content': 0.02639281377196312, 'timestamp': '2025-09-30 22:10:10.712158', 'step': 827, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:10.756565', 'step': 827, 'epoch': 1} {'type': 'loss', 'content': 0.012039894238114357, 'timestamp': '2025-09-30 22:10:10.781013', 'step': 828, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:10.812058', 'step': 828, 'epoch': 1} {'type': 'loss', 'content': 0.014376694336533546, 'timestamp': '2025-09-30 22:10:10.814848', 'step': 829, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:10.846815', 'step': 829, 'epoch': 1} {'type': 'loss', 'content': 0.004688502289354801, 'timestamp': '2025-09-30 22:10:10.849426', 'step': 830, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:10.882726', 'step': 830, 'epoch': 1} {'type': 'loss', 'content': 0.007329548709094524, 'timestamp': '2025-09-30 22:10:10.885788', 'step': 831, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:10.917811', 'step': 831, 'epoch': 1} {'type': 'loss', 'content': 0.006236738059669733, 'timestamp': '2025-09-30 22:10:10.950301', 'step': 832, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:10.984538', 'step': 832, 'epoch': 1} {'type': 'loss', 'content': 0.028410309925675392, 'timestamp': '2025-09-30 22:10:10.987981', 'step': 833, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:11.022303', 'step': 833, 'epoch': 1} {'type': 'loss', 'content': 0.011889264918863773, 'timestamp': '2025-09-30 22:10:11.026246', 'step': 834, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:11.062382', 'step': 834, 'epoch': 1} {'type': 'loss', 'content': 0.0037209205329418182, 'timestamp': '2025-09-30 22:10:11.066107', 'step': 835, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:11.107789', 'step': 835, 'epoch': 1} {'type': 'loss', 'content': 0.00844674650579691, 'timestamp': '2025-09-30 22:10:11.132646', 'step': 836, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:11.191113', 'step': 836, 'epoch': 1} {'type': 'loss', 'content': 0.03872412443161011, 'timestamp': '2025-09-30 22:10:11.195191', 'step': 837, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:11.231328', 'step': 837, 'epoch': 1} {'type': 'loss', 'content': 0.0209845881909132, 'timestamp': '2025-09-30 22:10:11.235063', 'step': 838, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:11.271720', 'step': 838, 'epoch': 1} {'type': 'loss', 'content': 0.03015008568763733, 'timestamp': '2025-09-30 22:10:11.276000', 'step': 839, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:11.317072', 'step': 839, 'epoch': 1} {'type': 'loss', 'content': 0.012239743955433369, 'timestamp': '2025-09-30 22:10:11.342734', 'step': 840, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:11.383789', 'step': 840, 'epoch': 1} {'type': 'loss', 'content': 0.014199174009263515, 'timestamp': '2025-09-30 22:10:11.388549', 'step': 841, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:11.423823', 'step': 841, 'epoch': 1} {'type': 'loss', 'content': 0.02206958830356598, 'timestamp': '2025-09-30 22:10:11.426955', 'step': 842, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:11.467847', 'step': 842, 'epoch': 1} {'type': 'loss', 'content': 0.017280694097280502, 'timestamp': '2025-09-30 22:10:11.471294', 'step': 843, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:11.504526', 'step': 843, 'epoch': 1} {'type': 'loss', 'content': 0.01756494864821434, 'timestamp': '2025-09-30 22:10:11.529370', 'step': 844, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:11.569818', 'step': 844, 'epoch': 1} {'type': 'loss', 'content': 0.006199831608682871, 'timestamp': '2025-09-30 22:10:11.572811', 'step': 845, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:11.606421', 'step': 845, 'epoch': 1} {'type': 'loss', 'content': 0.001826465129852295, 'timestamp': '2025-09-30 22:10:11.609304', 'step': 846, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:11.645536', 'step': 846, 'epoch': 1} {'type': 'loss', 'content': 0.02917005680501461, 'timestamp': '2025-09-30 22:10:11.648355', 'step': 847, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:11.681622', 'step': 847, 'epoch': 1} {'type': 'loss', 'content': 0.02054743655025959, 'timestamp': '2025-09-30 22:10:11.706800', 'step': 848, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:11.751607', 'step': 848, 'epoch': 1} {'type': 'loss', 'content': 0.003462132764980197, 'timestamp': '2025-09-30 22:10:11.754761', 'step': 849, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:11.799828', 'step': 849, 'epoch': 1} {'type': 'loss', 'content': 0.0037231442984193563, 'timestamp': '2025-09-30 22:10:11.802639', 'step': 850, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:11.844274', 'step': 850, 'epoch': 1} {'type': 'loss', 'content': 0.0038203117437660694, 'timestamp': '2025-09-30 22:10:11.857370', 'step': 851, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:11.898968', 'step': 851, 'epoch': 1} {'type': 'loss', 'content': 0.012988201342523098, 'timestamp': '2025-09-30 22:10:11.923772', 'step': 852, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:11.960516', 'step': 852, 'epoch': 1} {'type': 'loss', 'content': 0.005469611845910549, 'timestamp': '2025-09-30 22:10:11.963453', 'step': 853, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:12.003814', 'step': 853, 'epoch': 1} {'type': 'loss', 'content': 0.02971557341516018, 'timestamp': '2025-09-30 22:10:12.006786', 'step': 854, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:12.049344', 'step': 854, 'epoch': 1} {'type': 'loss', 'content': 0.004186688922345638, 'timestamp': '2025-09-30 22:10:12.051818', 'step': 855, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:13.067796', 'step': 855, 'epoch': 1} {'type': 'pplx', 'content': 76245623.29374737, 'timestamp': '2025-09-30 22:10:13.071528', 'step': 855, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.103783', 'step': 855, 'epoch': 1} {'type': 'loss', 'content': 0.004449806176126003, 'timestamp': '2025-09-30 22:10:13.136091', 'step': 856, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.170275', 'step': 856, 'epoch': 1} {'type': 'loss', 'content': 0.014926557429134846, 'timestamp': '2025-09-30 22:10:13.179783', 'step': 857, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:13.212624', 'step': 857, 'epoch': 1} {'type': 'loss', 'content': 0.009851276874542236, 'timestamp': '2025-09-30 22:10:13.215334', 'step': 858, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.253845', 'step': 858, 'epoch': 1} {'type': 'loss', 'content': 0.012923565693199635, 'timestamp': '2025-09-30 22:10:13.264991', 'step': 859, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.308503', 'step': 859, 'epoch': 1} {'type': 'loss', 'content': 0.026185166090726852, 'timestamp': '2025-09-30 22:10:13.338000', 'step': 860, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.392677', 'step': 860, 'epoch': 1} {'type': 'loss', 'content': 0.014344602823257446, 'timestamp': '2025-09-30 22:10:13.395173', 'step': 861, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:13.441472', 'step': 861, 'epoch': 1} {'type': 'loss', 'content': 0.006821572780609131, 'timestamp': '2025-09-30 22:10:13.444125', 'step': 862, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.483678', 'step': 862, 'epoch': 1} {'type': 'loss', 'content': 0.012688002549111843, 'timestamp': '2025-09-30 22:10:13.494659', 'step': 863, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.530177', 'step': 863, 'epoch': 1} {'type': 'loss', 'content': 0.008426748216152191, 'timestamp': '2025-09-30 22:10:13.554357', 'step': 864, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.587828', 'step': 864, 'epoch': 1} {'type': 'loss', 'content': 0.02354290522634983, 'timestamp': '2025-09-30 22:10:13.591844', 'step': 865, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:13.627896', 'step': 865, 'epoch': 1} {'type': 'loss', 'content': 0.0057960678823292255, 'timestamp': '2025-09-30 22:10:13.631410', 'step': 866, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.666119', 'step': 866, 'epoch': 1} {'type': 'loss', 'content': 0.0035158873070031404, 'timestamp': '2025-09-30 22:10:13.668876', 'step': 867, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.702186', 'step': 867, 'epoch': 1} {'type': 'loss', 'content': 0.01108395867049694, 'timestamp': '2025-09-30 22:10:13.727805', 'step': 868, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:13.770813', 'step': 868, 'epoch': 1} {'type': 'loss', 'content': 0.016525190323591232, 'timestamp': '2025-09-30 22:10:13.773329', 'step': 869, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.812572', 'step': 869, 'epoch': 1} {'type': 'loss', 'content': 0.04078942909836769, 'timestamp': '2025-09-30 22:10:13.816809', 'step': 870, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.849193', 'step': 870, 'epoch': 1} {'type': 'loss', 'content': 0.006157420575618744, 'timestamp': '2025-09-30 22:10:13.851316', 'step': 871, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.885135', 'step': 871, 'epoch': 1} {'type': 'loss', 'content': 0.0009152033017016947, 'timestamp': '2025-09-30 22:10:13.909702', 'step': 872, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.945406', 'step': 872, 'epoch': 1} {'type': 'loss', 'content': 0.012125137262046337, 'timestamp': '2025-09-30 22:10:13.948015', 'step': 873, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:13.985528', 'step': 873, 'epoch': 1} {'type': 'loss', 'content': 0.006319773383438587, 'timestamp': '2025-09-30 22:10:13.995904', 'step': 874, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.030647', 'step': 874, 'epoch': 1} {'type': 'loss', 'content': 0.023734863847494125, 'timestamp': '2025-09-30 22:10:14.033912', 'step': 875, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.070585', 'step': 875, 'epoch': 1} {'type': 'loss', 'content': 0.03549584373831749, 'timestamp': '2025-09-30 22:10:14.096078', 'step': 876, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:14.137320', 'step': 876, 'epoch': 1} {'type': 'loss', 'content': 0.003112002043053508, 'timestamp': '2025-09-30 22:10:14.140961', 'step': 877, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.175720', 'step': 877, 'epoch': 1} {'type': 'loss', 'content': 0.0011412525782361627, 'timestamp': '2025-09-30 22:10:14.180402', 'step': 878, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.214505', 'step': 878, 'epoch': 1} {'type': 'loss', 'content': 0.0346827395260334, 'timestamp': '2025-09-30 22:10:14.218881', 'step': 879, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:14.268462', 'step': 879, 'epoch': 1} {'type': 'loss', 'content': 0.05454268679022789, 'timestamp': '2025-09-30 22:10:14.293747', 'step': 880, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.328093', 'step': 880, 'epoch': 1} {'type': 'loss', 'content': 0.023915329948067665, 'timestamp': '2025-09-30 22:10:14.331235', 'step': 881, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.368028', 'step': 881, 'epoch': 1} {'type': 'loss', 'content': 0.03430422767996788, 'timestamp': '2025-09-30 22:10:14.370830', 'step': 882, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.412088', 'step': 882, 'epoch': 1} {'type': 'loss', 'content': 0.025995483621954918, 'timestamp': '2025-09-30 22:10:14.415882', 'step': 883, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:14.454009', 'step': 883, 'epoch': 1} {'type': 'loss', 'content': 0.0020150647033005953, 'timestamp': '2025-09-30 22:10:14.479840', 'step': 884, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.514068', 'step': 884, 'epoch': 1} {'type': 'loss', 'content': 0.02836998924612999, 'timestamp': '2025-09-30 22:10:14.517649', 'step': 885, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.550930', 'step': 885, 'epoch': 1} {'type': 'loss', 'content': 0.011780476197600365, 'timestamp': '2025-09-30 22:10:14.553230', 'step': 886, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:14.587832', 'step': 886, 'epoch': 1} {'type': 'loss', 'content': 0.02879740297794342, 'timestamp': '2025-09-30 22:10:14.591053', 'step': 887, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.640613', 'step': 887, 'epoch': 1} {'type': 'loss', 'content': 0.03621939569711685, 'timestamp': '2025-09-30 22:10:14.666454', 'step': 888, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.700269', 'step': 888, 'epoch': 1} {'type': 'loss', 'content': 0.019999349489808083, 'timestamp': '2025-09-30 22:10:14.711139', 'step': 889, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.746012', 'step': 889, 'epoch': 1} {'type': 'loss', 'content': 0.05889752879738808, 'timestamp': '2025-09-30 22:10:14.748870', 'step': 890, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.786331', 'step': 890, 'epoch': 1} {'type': 'loss', 'content': 0.011419662274420261, 'timestamp': '2025-09-30 22:10:14.790768', 'step': 891, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:14.825971', 'step': 891, 'epoch': 1} {'type': 'loss', 'content': 0.020933421328663826, 'timestamp': '2025-09-30 22:10:14.851145', 'step': 892, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:14.889528', 'step': 892, 'epoch': 1} {'type': 'loss', 'content': 0.02317097596824169, 'timestamp': '2025-09-30 22:10:14.893180', 'step': 893, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.929331', 'step': 893, 'epoch': 1} {'type': 'loss', 'content': 0.04982530325651169, 'timestamp': '2025-09-30 22:10:14.932743', 'step': 894, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:14.970798', 'step': 894, 'epoch': 1} {'type': 'loss', 'content': 0.014119095169007778, 'timestamp': '2025-09-30 22:10:14.979403', 'step': 895, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:15.013199', 'step': 895, 'epoch': 1} {'type': 'loss', 'content': 0.02401905320584774, 'timestamp': '2025-09-30 22:10:15.037581', 'step': 896, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:15.072554', 'step': 896, 'epoch': 1} {'type': 'loss', 'content': 0.01428347546607256, 'timestamp': '2025-09-30 22:10:15.076634', 'step': 897, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:15.116302', 'step': 897, 'epoch': 1} {'type': 'loss', 'content': 0.016454854980111122, 'timestamp': '2025-09-30 22:10:15.121019', 'step': 898, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:15.154705', 'step': 898, 'epoch': 1} {'type': 'loss', 'content': 0.016421593725681305, 'timestamp': '2025-09-30 22:10:15.157974', 'step': 899, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:15.190313', 'step': 899, 'epoch': 1} {'type': 'loss', 'content': 0.0145410830155015, 'timestamp': '2025-09-30 22:10:15.214308', 'step': 900, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:15.255178', 'step': 900, 'epoch': 1} {'type': 'loss', 'content': 0.014178029261529446, 'timestamp': '2025-09-30 22:10:15.258194', 'step': 901, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:15.297815', 'step': 901, 'epoch': 1} {'type': 'loss', 'content': 0.00803608912974596, 'timestamp': '2025-09-30 22:10:15.300276', 'step': 902, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:15.334865', 'step': 902, 'epoch': 1} {'type': 'loss', 'content': 0.015483580529689789, 'timestamp': '2025-09-30 22:10:15.337619', 'step': 903, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:15.377575', 'step': 903, 'epoch': 1} {'type': 'loss', 'content': 0.026707107201218605, 'timestamp': '2025-09-30 22:10:15.401993', 'step': 904, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:15.435159', 'step': 904, 'epoch': 1} {'type': 'loss', 'content': 0.0035559723619371653, 'timestamp': '2025-09-30 22:10:15.437922', 'step': 905, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:15.473476', 'step': 905, 'epoch': 1} {'type': 'loss', 'content': 0.006338414270430803, 'timestamp': '2025-09-30 22:10:15.476934', 'step': 906, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:15.511633', 'step': 906, 'epoch': 1} {'type': 'loss', 'content': 0.039536114782094955, 'timestamp': '2025-09-30 22:10:15.517593', 'step': 907, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:15.553172', 'step': 907, 'epoch': 1} {'type': 'loss', 'content': 0.02256901189684868, 'timestamp': '2025-09-30 22:10:15.584110', 'step': 908, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:15.625112', 'step': 908, 'epoch': 1} {'type': 'loss', 'content': 0.02677801065146923, 'timestamp': '2025-09-30 22:10:15.627782', 'step': 909, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:15.663550', 'step': 909, 'epoch': 1} {'type': 'loss', 'content': 0.0022334277164191008, 'timestamp': '2025-09-30 22:10:15.670463', 'step': 910, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:15.716567', 'step': 910, 'epoch': 1} {'type': 'loss', 'content': 0.015281999483704567, 'timestamp': '2025-09-30 22:10:15.719979', 'step': 911, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:15.763388', 'step': 911, 'epoch': 1} {'type': 'loss', 'content': 0.0033361141104251146, 'timestamp': '2025-09-30 22:10:15.793277', 'step': 912, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:16.748223', 'step': 912, 'epoch': 1} {'type': 'pplx', 'content': 62558746.91204297, 'timestamp': '2025-09-30 22:10:16.751089', 'step': 912, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:16.792362', 'step': 912, 'epoch': 1} {'type': 'loss', 'content': 0.040188103914260864, 'timestamp': '2025-09-30 22:10:16.794586', 'step': 913, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:16.827625', 'step': 913, 'epoch': 1} {'type': 'loss', 'content': 0.008862501941621304, 'timestamp': '2025-09-30 22:10:16.834192', 'step': 914, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:16.866079', 'step': 914, 'epoch': 1} {'type': 'loss', 'content': 0.004693312104791403, 'timestamp': '2025-09-30 22:10:16.870428', 'step': 915, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:16.905282', 'step': 915, 'epoch': 1} {'type': 'loss', 'content': 0.005864806938916445, 'timestamp': '2025-09-30 22:10:16.928897', 'step': 916, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:16.963601', 'step': 916, 'epoch': 1} {'type': 'loss', 'content': 0.030392948538064957, 'timestamp': '2025-09-30 22:10:16.967509', 'step': 917, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:17.029177', 'step': 917, 'epoch': 2} {'type': 'loss', 'content': 0.04122446849942207, 'timestamp': '2025-09-30 22:10:17.033503', 'step': 918, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.068472', 'step': 918, 'epoch': 2} {'type': 'loss', 'content': 0.017605489119887352, 'timestamp': '2025-09-30 22:10:17.073010', 'step': 919, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.108665', 'step': 919, 'epoch': 2} {'type': 'loss', 'content': 0.043698132038116455, 'timestamp': '2025-09-30 22:10:17.133330', 'step': 920, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.171458', 'step': 920, 'epoch': 2} {'type': 'loss', 'content': 0.01505519263446331, 'timestamp': '2025-09-30 22:10:17.174223', 'step': 921, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.213216', 'step': 921, 'epoch': 2} {'type': 'loss', 'content': 0.03668801113963127, 'timestamp': '2025-09-30 22:10:17.218407', 'step': 922, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.255918', 'step': 922, 'epoch': 2} {'type': 'loss', 'content': 0.02062925696372986, 'timestamp': '2025-09-30 22:10:17.259624', 'step': 923, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:17.298606', 'step': 923, 'epoch': 2} {'type': 'loss', 'content': 0.031664784997701645, 'timestamp': '2025-09-30 22:10:17.324678', 'step': 924, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.362897', 'step': 924, 'epoch': 2} {'type': 'loss', 'content': 0.015291580930352211, 'timestamp': '2025-09-30 22:10:17.369368', 'step': 925, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.403591', 'step': 925, 'epoch': 2} {'type': 'loss', 'content': 0.01260668970644474, 'timestamp': '2025-09-30 22:10:17.415503', 'step': 926, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.461527', 'step': 926, 'epoch': 2} {'type': 'loss', 'content': 0.020603179931640625, 'timestamp': '2025-09-30 22:10:17.464519', 'step': 927, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.506816', 'step': 927, 'epoch': 2} {'type': 'loss', 'content': 0.018412886187434196, 'timestamp': '2025-09-30 22:10:17.531909', 'step': 928, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.565854', 'step': 928, 'epoch': 2} {'type': 'loss', 'content': 0.014865756034851074, 'timestamp': '2025-09-30 22:10:17.572333', 'step': 929, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.613432', 'step': 929, 'epoch': 2} {'type': 'loss', 'content': 0.019409114494919777, 'timestamp': '2025-09-30 22:10:17.616608', 'step': 930, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.655128', 'step': 930, 'epoch': 2} {'type': 'loss', 'content': 0.016556749120354652, 'timestamp': '2025-09-30 22:10:17.657337', 'step': 931, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.689814', 'step': 931, 'epoch': 2} {'type': 'loss', 'content': 0.025179879739880562, 'timestamp': '2025-09-30 22:10:17.716195', 'step': 932, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.754860', 'step': 932, 'epoch': 2} {'type': 'loss', 'content': 0.019688468426465988, 'timestamp': '2025-09-30 22:10:17.757557', 'step': 933, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.791526', 'step': 933, 'epoch': 2} {'type': 'loss', 'content': 0.028649795800447464, 'timestamp': '2025-09-30 22:10:17.795370', 'step': 934, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.834687', 'step': 934, 'epoch': 2} {'type': 'loss', 'content': 0.013120375573635101, 'timestamp': '2025-09-30 22:10:17.845972', 'step': 935, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:17.890825', 'step': 935, 'epoch': 2} {'type': 'loss', 'content': 0.024192016571760178, 'timestamp': '2025-09-30 22:10:17.915480', 'step': 936, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:17.954377', 'step': 936, 'epoch': 2} {'type': 'loss', 'content': 0.018348783254623413, 'timestamp': '2025-09-30 22:10:17.960764', 'step': 937, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:18.013060', 'step': 937, 'epoch': 2} {'type': 'loss', 'content': 0.021533623337745667, 'timestamp': '2025-09-30 22:10:18.017018', 'step': 938, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.050118', 'step': 938, 'epoch': 2} {'type': 'loss', 'content': 0.026502570137381554, 'timestamp': '2025-09-30 22:10:18.054381', 'step': 939, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.093759', 'step': 939, 'epoch': 2} {'type': 'loss', 'content': 0.018051905557513237, 'timestamp': '2025-09-30 22:10:18.120524', 'step': 940, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.168520', 'step': 940, 'epoch': 2} {'type': 'loss', 'content': 0.02367454580962658, 'timestamp': '2025-09-30 22:10:18.173392', 'step': 941, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.211846', 'step': 941, 'epoch': 2} {'type': 'loss', 'content': 0.01433511357754469, 'timestamp': '2025-09-30 22:10:18.215841', 'step': 942, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:18.265855', 'step': 942, 'epoch': 2} {'type': 'loss', 'content': 0.013081178069114685, 'timestamp': '2025-09-30 22:10:18.268716', 'step': 943, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:18.302341', 'step': 943, 'epoch': 2} {'type': 'loss', 'content': 0.015170074068009853, 'timestamp': '2025-09-30 22:10:18.326621', 'step': 944, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.374861', 'step': 944, 'epoch': 2} {'type': 'loss', 'content': 0.023984912782907486, 'timestamp': '2025-09-30 22:10:18.388290', 'step': 945, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.437354', 'step': 945, 'epoch': 2} {'type': 'loss', 'content': 0.016848895698785782, 'timestamp': '2025-09-30 22:10:18.440282', 'step': 946, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.482779', 'step': 946, 'epoch': 2} {'type': 'loss', 'content': 0.017187591642141342, 'timestamp': '2025-09-30 22:10:18.486116', 'step': 947, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.524752', 'step': 947, 'epoch': 2} {'type': 'loss', 'content': 0.023166237398982048, 'timestamp': '2025-09-30 22:10:18.550134', 'step': 948, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.586866', 'step': 948, 'epoch': 2} {'type': 'loss', 'content': 0.018485546112060547, 'timestamp': '2025-09-30 22:10:18.590276', 'step': 949, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.625690', 'step': 949, 'epoch': 2} {'type': 'loss', 'content': 0.0056577036157250404, 'timestamp': '2025-09-30 22:10:18.629779', 'step': 950, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.676779', 'step': 950, 'epoch': 2} {'type': 'loss', 'content': 0.03402547910809517, 'timestamp': '2025-09-30 22:10:18.681353', 'step': 951, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.719368', 'step': 951, 'epoch': 2} {'type': 'loss', 'content': 0.007531954441219568, 'timestamp': '2025-09-30 22:10:18.751320', 'step': 952, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.784831', 'step': 952, 'epoch': 2} {'type': 'loss', 'content': 0.01770077273249626, 'timestamp': '2025-09-30 22:10:18.788229', 'step': 953, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.824869', 'step': 953, 'epoch': 2} {'type': 'loss', 'content': 0.013906643725931644, 'timestamp': '2025-09-30 22:10:18.837494', 'step': 954, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.878918', 'step': 954, 'epoch': 2} {'type': 'loss', 'content': 0.011043167673051357, 'timestamp': '2025-09-30 22:10:18.882603', 'step': 955, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:18.918145', 'step': 955, 'epoch': 2} {'type': 'loss', 'content': 0.006359127350151539, 'timestamp': '2025-09-30 22:10:18.946118', 'step': 956, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:18.980317', 'step': 956, 'epoch': 2} {'type': 'loss', 'content': 0.008600293658673763, 'timestamp': '2025-09-30 22:10:18.984720', 'step': 957, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:19.029161', 'step': 957, 'epoch': 2} {'type': 'loss', 'content': 0.004958875942975283, 'timestamp': '2025-09-30 22:10:19.035639', 'step': 958, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:19.073838', 'step': 958, 'epoch': 2} {'type': 'loss', 'content': 0.020593535155057907, 'timestamp': '2025-09-30 22:10:19.084805', 'step': 959, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:19.141702', 'step': 959, 'epoch': 2} {'type': 'loss', 'content': 0.017098385840654373, 'timestamp': '2025-09-30 22:10:19.166911', 'step': 960, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:19.200471', 'step': 960, 'epoch': 2} {'type': 'loss', 'content': 0.0009686704725027084, 'timestamp': '2025-09-30 22:10:19.203051', 'step': 961, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:19.243207', 'step': 961, 'epoch': 2} {'type': 'loss', 'content': 0.002368885325267911, 'timestamp': '2025-09-30 22:10:19.246636', 'step': 962, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:19.280102', 'step': 962, 'epoch': 2} {'type': 'loss', 'content': 0.0030674913432449102, 'timestamp': '2025-09-30 22:10:19.284029', 'step': 963, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:19.318671', 'step': 963, 'epoch': 2} {'type': 'loss', 'content': 0.015550940297544003, 'timestamp': '2025-09-30 22:10:19.344411', 'step': 964, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:19.413774', 'step': 964, 'epoch': 2} {'type': 'loss', 'content': 0.005516111385077238, 'timestamp': '2025-09-30 22:10:19.416740', 'step': 965, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:19.456668', 'step': 965, 'epoch': 2} {'type': 'loss', 'content': 0.0011931475019082427, 'timestamp': '2025-09-30 22:10:19.460544', 'step': 966, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:19.495950', 'step': 966, 'epoch': 2} {'type': 'loss', 'content': 0.02176191285252571, 'timestamp': '2025-09-30 22:10:19.499482', 'step': 967, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:19.545594', 'step': 967, 'epoch': 2} {'type': 'loss', 'content': 0.0032643016893416643, 'timestamp': '2025-09-30 22:10:19.569621', 'step': 968, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:19.602255', 'step': 968, 'epoch': 2} {'type': 'loss', 'content': 0.04372279718518257, 'timestamp': '2025-09-30 22:10:19.618747', 'step': 969, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:20.657409', 'step': 969, 'epoch': 2} {'type': 'pplx', 'content': 65788041.33313765, 'timestamp': '2025-09-30 22:10:20.661987', 'step': 969, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:20.693631', 'step': 969, 'epoch': 2} {'type': 'loss', 'content': 0.033423688262701035, 'timestamp': '2025-09-30 22:10:20.698865', 'step': 970, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:20.738094', 'step': 970, 'epoch': 2} {'type': 'loss', 'content': 0.030283639207482338, 'timestamp': '2025-09-30 22:10:20.744640', 'step': 971, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:20.779689', 'step': 971, 'epoch': 2} {'type': 'loss', 'content': 0.015880217775702477, 'timestamp': '2025-09-30 22:10:20.803928', 'step': 972, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:20.844820', 'step': 972, 'epoch': 2} {'type': 'loss', 'content': 0.022766422480344772, 'timestamp': '2025-09-30 22:10:20.848562', 'step': 973, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:20.883473', 'step': 973, 'epoch': 2} {'type': 'loss', 'content': 0.013303340412676334, 'timestamp': '2025-09-30 22:10:20.887367', 'step': 974, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:20.921327', 'step': 974, 'epoch': 2} {'type': 'loss', 'content': 0.012033392675220966, 'timestamp': '2025-09-30 22:10:20.926664', 'step': 975, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:20.963256', 'step': 975, 'epoch': 2} {'type': 'loss', 'content': 0.020994653925299644, 'timestamp': '2025-09-30 22:10:20.990087', 'step': 976, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.028001', 'step': 976, 'epoch': 2} {'type': 'loss', 'content': 0.035628240555524826, 'timestamp': '2025-09-30 22:10:21.033008', 'step': 977, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.067850', 'step': 977, 'epoch': 2} {'type': 'loss', 'content': 0.008654872886836529, 'timestamp': '2025-09-30 22:10:21.072430', 'step': 978, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:21.110669', 'step': 978, 'epoch': 2} {'type': 'loss', 'content': 0.016928749158978462, 'timestamp': '2025-09-30 22:10:21.115382', 'step': 979, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:21.159459', 'step': 979, 'epoch': 2} {'type': 'loss', 'content': 0.002923868829384446, 'timestamp': '2025-09-30 22:10:21.185317', 'step': 980, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.220838', 'step': 980, 'epoch': 2} {'type': 'loss', 'content': 0.005316472612321377, 'timestamp': '2025-09-30 22:10:21.224686', 'step': 981, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.270451', 'step': 981, 'epoch': 2} {'type': 'loss', 'content': 0.005148599855601788, 'timestamp': '2025-09-30 22:10:21.280768', 'step': 982, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:21.316564', 'step': 982, 'epoch': 2} {'type': 'loss', 'content': 0.03270730748772621, 'timestamp': '2025-09-30 22:10:21.320244', 'step': 983, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.357069', 'step': 983, 'epoch': 2} {'type': 'loss', 'content': 0.005614656023681164, 'timestamp': '2025-09-30 22:10:21.382552', 'step': 984, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.420065', 'step': 984, 'epoch': 2} {'type': 'loss', 'content': 0.01812359131872654, 'timestamp': '2025-09-30 22:10:21.423886', 'step': 985, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:21.465694', 'step': 985, 'epoch': 2} {'type': 'loss', 'content': 0.02315768599510193, 'timestamp': '2025-09-30 22:10:21.470914', 'step': 986, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.508553', 'step': 986, 'epoch': 2} {'type': 'loss', 'content': 0.030391912907361984, 'timestamp': '2025-09-30 22:10:21.512422', 'step': 987, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.548808', 'step': 987, 'epoch': 2} {'type': 'loss', 'content': 0.0035188833717256784, 'timestamp': '2025-09-30 22:10:21.573594', 'step': 988, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.613076', 'step': 988, 'epoch': 2} {'type': 'loss', 'content': 0.03694489970803261, 'timestamp': '2025-09-30 22:10:21.618536', 'step': 989, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:21.661314', 'step': 989, 'epoch': 2} {'type': 'loss', 'content': 0.00450171111151576, 'timestamp': '2025-09-30 22:10:21.666952', 'step': 990, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.705355', 'step': 990, 'epoch': 2} {'type': 'loss', 'content': 0.041242681443691254, 'timestamp': '2025-09-30 22:10:21.711881', 'step': 991, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.751963', 'step': 991, 'epoch': 2} {'type': 'loss', 'content': 0.019213315099477768, 'timestamp': '2025-09-30 22:10:21.778208', 'step': 992, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:21.816531', 'step': 992, 'epoch': 2} {'type': 'loss', 'content': 0.021348316222429276, 'timestamp': '2025-09-30 22:10:21.819610', 'step': 993, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.862860', 'step': 993, 'epoch': 2} {'type': 'loss', 'content': 0.025651155039668083, 'timestamp': '2025-09-30 22:10:21.866330', 'step': 994, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.902766', 'step': 994, 'epoch': 2} {'type': 'loss', 'content': 0.017335962504148483, 'timestamp': '2025-09-30 22:10:21.906886', 'step': 995, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:21.942458', 'step': 995, 'epoch': 2} {'type': 'loss', 'content': 0.02041892521083355, 'timestamp': '2025-09-30 22:10:21.968595', 'step': 996, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:22.002063', 'step': 996, 'epoch': 2} {'type': 'loss', 'content': 0.019564051181077957, 'timestamp': '2025-09-30 22:10:22.006678', 'step': 997, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:22.040636', 'step': 997, 'epoch': 2} {'type': 'loss', 'content': 0.028469962999224663, 'timestamp': '2025-09-30 22:10:22.044203', 'step': 998, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:22.081715', 'step': 998, 'epoch': 2} {'type': 'loss', 'content': 0.02833707444369793, 'timestamp': '2025-09-30 22:10:22.084474', 'step': 999, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:22.119282', 'step': 999, 'epoch': 2} {'type': 'loss', 'content': 0.0041335872374475, 'timestamp': '2025-09-30 22:10:22.144197', 'step': 1000, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1000', 'timestamp': '2025-09-30 22:10:28.959668', 'step': 1000, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.015366', 'step': 1000, 'epoch': 2} {'type': 'loss', 'content': 0.02058243192732334, 'timestamp': '2025-09-30 22:10:29.018041', 'step': 1001, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.057796', 'step': 1001, 'epoch': 2} {'type': 'loss', 'content': 0.019590478390455246, 'timestamp': '2025-09-30 22:10:29.062434', 'step': 1002, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.100356', 'step': 1002, 'epoch': 2} {'type': 'loss', 'content': 0.02721039578318596, 'timestamp': '2025-09-30 22:10:29.102644', 'step': 1003, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.138940', 'step': 1003, 'epoch': 2} {'type': 'loss', 'content': 0.008601582609117031, 'timestamp': '2025-09-30 22:10:29.163786', 'step': 1004, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.194502', 'step': 1004, 'epoch': 2} {'type': 'loss', 'content': 0.01570257730782032, 'timestamp': '2025-09-30 22:10:29.197243', 'step': 1005, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.229643', 'step': 1005, 'epoch': 2} {'type': 'loss', 'content': 0.024934740737080574, 'timestamp': '2025-09-30 22:10:29.235385', 'step': 1006, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:29.267684', 'step': 1006, 'epoch': 2} {'type': 'loss', 'content': 0.02274438552558422, 'timestamp': '2025-09-30 22:10:29.273751', 'step': 1007, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.314877', 'step': 1007, 'epoch': 2} {'type': 'loss', 'content': 0.03345928713679314, 'timestamp': '2025-09-30 22:10:29.340342', 'step': 1008, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:29.372546', 'step': 1008, 'epoch': 2} {'type': 'loss', 'content': 0.023872241377830505, 'timestamp': '2025-09-30 22:10:29.374923', 'step': 1009, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.410290', 'step': 1009, 'epoch': 2} {'type': 'loss', 'content': 0.013263803906738758, 'timestamp': '2025-09-30 22:10:29.412737', 'step': 1010, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.449315', 'step': 1010, 'epoch': 2} {'type': 'loss', 'content': 0.008637587539851665, 'timestamp': '2025-09-30 22:10:29.452054', 'step': 1011, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.483839', 'step': 1011, 'epoch': 2} {'type': 'loss', 'content': 0.025991391390562057, 'timestamp': '2025-09-30 22:10:29.507270', 'step': 1012, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.539243', 'step': 1012, 'epoch': 2} {'type': 'loss', 'content': 0.03899989277124405, 'timestamp': '2025-09-30 22:10:29.541750', 'step': 1013, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.572992', 'step': 1013, 'epoch': 2} {'type': 'loss', 'content': 0.023212125524878502, 'timestamp': '2025-09-30 22:10:29.576482', 'step': 1014, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.608541', 'step': 1014, 'epoch': 2} {'type': 'loss', 'content': 0.016156161203980446, 'timestamp': '2025-09-30 22:10:29.610741', 'step': 1015, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.642214', 'step': 1015, 'epoch': 2} {'type': 'loss', 'content': 0.005735450424253941, 'timestamp': '2025-09-30 22:10:29.666732', 'step': 1016, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.699084', 'step': 1016, 'epoch': 2} {'type': 'loss', 'content': 0.022412346675992012, 'timestamp': '2025-09-30 22:10:29.701194', 'step': 1017, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.731552', 'step': 1017, 'epoch': 2} {'type': 'loss', 'content': 0.02042771875858307, 'timestamp': '2025-09-30 22:10:29.733969', 'step': 1018, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.766593', 'step': 1018, 'epoch': 2} {'type': 'loss', 'content': 0.011804310604929924, 'timestamp': '2025-09-30 22:10:29.770584', 'step': 1019, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.803087', 'step': 1019, 'epoch': 2} {'type': 'loss', 'content': 0.018140995875000954, 'timestamp': '2025-09-30 22:10:29.831209', 'step': 1020, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.862791', 'step': 1020, 'epoch': 2} {'type': 'loss', 'content': 0.026703549548983574, 'timestamp': '2025-09-30 22:10:29.867303', 'step': 1021, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.904426', 'step': 1021, 'epoch': 2} {'type': 'loss', 'content': 0.038636982440948486, 'timestamp': '2025-09-30 22:10:29.907011', 'step': 1022, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:29.938166', 'step': 1022, 'epoch': 2} {'type': 'loss', 'content': 0.0022895862348377705, 'timestamp': '2025-09-30 22:10:29.940260', 'step': 1023, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:29.972463', 'step': 1023, 'epoch': 2} {'type': 'loss', 'content': 0.010432879440486431, 'timestamp': '2025-09-30 22:10:29.996867', 'step': 1024, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:30.029207', 'step': 1024, 'epoch': 2} {'type': 'loss', 'content': 0.04155639931559563, 'timestamp': '2025-09-30 22:10:30.034069', 'step': 1025, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:30.064974', 'step': 1025, 'epoch': 2} {'type': 'loss', 'content': 0.05263324826955795, 'timestamp': '2025-09-30 22:10:30.067364', 'step': 1026, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:30.910493', 'step': 1026, 'epoch': 2} {'type': 'pplx', 'content': 60215314.01973439, 'timestamp': '2025-09-30 22:10:30.913813', 'step': 1026, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:10:30.945897', 'step': 1026, 'epoch': 2} {'type': 'loss', 'content': 0.0612885057926178, 'timestamp': '2025-09-30 22:10:30.948322', 'step': 1027, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:30.982165', 'step': 1027, 'epoch': 2} {'type': 'loss', 'content': 0.002322312444448471, 'timestamp': '2025-09-30 22:10:31.005970', 'step': 1028, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:31.040105', 'step': 1028, 'epoch': 2} {'type': 'loss', 'content': 0.01675250194966793, 'timestamp': '2025-09-30 22:10:31.042778', 'step': 1029, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:31.075684', 'step': 1029, 'epoch': 2} {'type': 'loss', 'content': 0.012638731859624386, 'timestamp': '2025-09-30 22:10:31.079022', 'step': 1030, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:31.123037', 'step': 1030, 'epoch': 2} {'type': 'loss', 'content': 0.012278775684535503, 'timestamp': '2025-09-30 22:10:31.130874', 'step': 1031, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:31.165879', 'step': 1031, 'epoch': 2} {'type': 'loss', 'content': 0.0036765080876648426, 'timestamp': '2025-09-30 22:10:31.191433', 'step': 1032, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:31.227516', 'step': 1032, 'epoch': 2} {'type': 'loss', 'content': 0.017122618854045868, 'timestamp': '2025-09-30 22:10:31.231045', 'step': 1033, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:31.264383', 'step': 1033, 'epoch': 2} {'type': 'loss', 'content': 0.01866624876856804, 'timestamp': '2025-09-30 22:10:31.267352', 'step': 1034, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:31.309810', 'step': 1034, 'epoch': 2} {'type': 'loss', 'content': 0.017900409176945686, 'timestamp': '2025-09-30 22:10:31.312897', 'step': 1035, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:31.346729', 'step': 1035, 'epoch': 2} {'type': 'loss', 'content': 0.006202004384249449, 'timestamp': '2025-09-30 22:10:31.371977', 'step': 1036, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:31.407889', 'step': 1036, 'epoch': 2} {'type': 'loss', 'content': 0.0101151829585433, 'timestamp': '2025-09-30 22:10:31.412188', 'step': 1037, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:31.444937', 'step': 1037, 'epoch': 2} {'type': 'loss', 'content': 0.038425859063863754, 'timestamp': '2025-09-30 22:10:31.448352', 'step': 1038, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:31.480931', 'step': 1038, 'epoch': 2} {'type': 'loss', 'content': 0.01929587312042713, 'timestamp': '2025-09-30 22:10:31.483725', 'step': 1039, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:31.518714', 'step': 1039, 'epoch': 2} {'type': 'loss', 'content': 0.024310678243637085, 'timestamp': '2025-09-30 22:10:31.543127', 'step': 1040, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:31.578292', 'step': 1040, 'epoch': 2} {'type': 'loss', 'content': 0.018113570287823677, 'timestamp': '2025-09-30 22:10:31.581908', 'step': 1041, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:31.623671', 'step': 1041, 'epoch': 2} {'type': 'loss', 'content': 0.009870938025414944, 'timestamp': '2025-09-30 22:10:31.626544', 'step': 1042, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:31.661505', 'step': 1042, 'epoch': 2} {'type': 'loss', 'content': 0.03137834742665291, 'timestamp': '2025-09-30 22:10:31.669968', 'step': 1043, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:31.705016', 'step': 1043, 'epoch': 2} {'type': 'loss', 'content': 0.016941692680120468, 'timestamp': '2025-09-30 22:10:31.730020', 'step': 1044, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:31.765093', 'step': 1044, 'epoch': 2} {'type': 'loss', 'content': 0.008219278417527676, 'timestamp': '2025-09-30 22:10:31.768958', 'step': 1045, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:31.804876', 'step': 1045, 'epoch': 2} {'type': 'loss', 'content': 0.012652714736759663, 'timestamp': '2025-09-30 22:10:31.807304', 'step': 1046, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:31.854911', 'step': 1046, 'epoch': 2} {'type': 'loss', 'content': 0.03563131019473076, 'timestamp': '2025-09-30 22:10:31.858971', 'step': 1047, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:31.894960', 'step': 1047, 'epoch': 2} {'type': 'loss', 'content': 0.01216198317706585, 'timestamp': '2025-09-30 22:10:31.920815', 'step': 1048, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:31.955629', 'step': 1048, 'epoch': 2} {'type': 'loss', 'content': 0.010561487637460232, 'timestamp': '2025-09-30 22:10:31.958977', 'step': 1049, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:31.992295', 'step': 1049, 'epoch': 2} {'type': 'loss', 'content': 0.011876196600496769, 'timestamp': '2025-09-30 22:10:31.995172', 'step': 1050, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:32.027651', 'step': 1050, 'epoch': 2} {'type': 'loss', 'content': 0.030352991074323654, 'timestamp': '2025-09-30 22:10:32.030753', 'step': 1051, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:32.071016', 'step': 1051, 'epoch': 2} {'type': 'loss', 'content': 0.009673316963016987, 'timestamp': '2025-09-30 22:10:32.096968', 'step': 1052, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:32.140199', 'step': 1052, 'epoch': 2} {'type': 'loss', 'content': 0.020116552710533142, 'timestamp': '2025-09-30 22:10:32.143690', 'step': 1053, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:32.181024', 'step': 1053, 'epoch': 2} {'type': 'loss', 'content': 0.02689552865922451, 'timestamp': '2025-09-30 22:10:32.190727', 'step': 1054, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:32.232982', 'step': 1054, 'epoch': 2} {'type': 'loss', 'content': 0.02992871031165123, 'timestamp': '2025-09-30 22:10:32.239639', 'step': 1055, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:32.283318', 'step': 1055, 'epoch': 2} {'type': 'loss', 'content': 0.03253794461488724, 'timestamp': '2025-09-30 22:10:32.310398', 'step': 1056, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:32.346640', 'step': 1056, 'epoch': 2} {'type': 'loss', 'content': 0.020867738872766495, 'timestamp': '2025-09-30 22:10:32.350082', 'step': 1057, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:32.382053', 'step': 1057, 'epoch': 2} {'type': 'loss', 'content': 0.02346712537109852, 'timestamp': '2025-09-30 22:10:32.384750', 'step': 1058, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:32.423038', 'step': 1058, 'epoch': 2} {'type': 'loss', 'content': 0.03169684857130051, 'timestamp': '2025-09-30 22:10:32.427230', 'step': 1059, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:32.470540', 'step': 1059, 'epoch': 2} {'type': 'loss', 'content': 0.014368179254233837, 'timestamp': '2025-09-30 22:10:32.513006', 'step': 1060, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:32.568627', 'step': 1060, 'epoch': 2} {'type': 'loss', 'content': 0.01384419109672308, 'timestamp': '2025-09-30 22:10:32.571411', 'step': 1061, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:32.608139', 'step': 1061, 'epoch': 2} {'type': 'loss', 'content': 0.01648283377289772, 'timestamp': '2025-09-30 22:10:32.612430', 'step': 1062, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:32.651956', 'step': 1062, 'epoch': 2} {'type': 'loss', 'content': 0.006230255123227835, 'timestamp': '2025-09-30 22:10:32.655386', 'step': 1063, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:32.689999', 'step': 1063, 'epoch': 2} {'type': 'loss', 'content': 0.010819241404533386, 'timestamp': '2025-09-30 22:10:32.714781', 'step': 1064, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:32.746809', 'step': 1064, 'epoch': 2} {'type': 'loss', 'content': 0.03453322499990463, 'timestamp': '2025-09-30 22:10:32.751294', 'step': 1065, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:32.787396', 'step': 1065, 'epoch': 2} {'type': 'loss', 'content': 0.02853448875248432, 'timestamp': '2025-09-30 22:10:32.790825', 'step': 1066, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:32.839603', 'step': 1066, 'epoch': 2} {'type': 'loss', 'content': 0.027251912280917168, 'timestamp': '2025-09-30 22:10:32.842832', 'step': 1067, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:32.882368', 'step': 1067, 'epoch': 2} {'type': 'loss', 'content': 0.009377564303576946, 'timestamp': '2025-09-30 22:10:32.906767', 'step': 1068, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:32.951650', 'step': 1068, 'epoch': 2} {'type': 'loss', 'content': 0.004282251000404358, 'timestamp': '2025-09-30 22:10:32.954283', 'step': 1069, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:32.989830', 'step': 1069, 'epoch': 2} {'type': 'loss', 'content': 0.011062629520893097, 'timestamp': '2025-09-30 22:10:33.000125', 'step': 1070, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:33.042147', 'step': 1070, 'epoch': 2} {'type': 'loss', 'content': 0.02192043885588646, 'timestamp': '2025-09-30 22:10:33.045852', 'step': 1071, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:33.079439', 'step': 1071, 'epoch': 2} {'type': 'loss', 'content': 0.028622159734368324, 'timestamp': '2025-09-30 22:10:33.103224', 'step': 1072, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:33.139353', 'step': 1072, 'epoch': 2} {'type': 'loss', 'content': 0.010093754157423973, 'timestamp': '2025-09-30 22:10:33.142699', 'step': 1073, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:33.179786', 'step': 1073, 'epoch': 2} {'type': 'loss', 'content': 0.0439641959965229, 'timestamp': '2025-09-30 22:10:33.193079', 'step': 1074, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:33.243822', 'step': 1074, 'epoch': 2} {'type': 'loss', 'content': 0.0537438802421093, 'timestamp': '2025-09-30 22:10:33.252776', 'step': 1075, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:33.291132', 'step': 1075, 'epoch': 2} {'type': 'loss', 'content': 0.050223346799612045, 'timestamp': '2025-09-30 22:10:33.316240', 'step': 1076, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:33.355997', 'step': 1076, 'epoch': 2} {'type': 'loss', 'content': 0.02958056330680847, 'timestamp': '2025-09-30 22:10:33.360699', 'step': 1077, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:33.394024', 'step': 1077, 'epoch': 2} {'type': 'loss', 'content': 0.020796921104192734, 'timestamp': '2025-09-30 22:10:33.399000', 'step': 1078, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:33.434126', 'step': 1078, 'epoch': 2} {'type': 'loss', 'content': 0.016962138935923576, 'timestamp': '2025-09-30 22:10:33.437605', 'step': 1079, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:33.478263', 'step': 1079, 'epoch': 2} {'type': 'loss', 'content': 0.01815619505941868, 'timestamp': '2025-09-30 22:10:33.502407', 'step': 1080, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:33.541546', 'step': 1080, 'epoch': 2} {'type': 'loss', 'content': 0.01795378513634205, 'timestamp': '2025-09-30 22:10:33.544428', 'step': 1081, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:33.588742', 'step': 1081, 'epoch': 2} {'type': 'loss', 'content': 0.028307702392339706, 'timestamp': '2025-09-30 22:10:33.600113', 'step': 1082, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:33.641511', 'step': 1082, 'epoch': 2} {'type': 'loss', 'content': 0.020032325759530067, 'timestamp': '2025-09-30 22:10:33.645522', 'step': 1083, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:34.585328', 'step': 1083, 'epoch': 2} {'type': 'pplx', 'content': 50986278.02673556, 'timestamp': '2025-09-30 22:10:34.588298', 'step': 1083, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:34.625695', 'step': 1083, 'epoch': 2} {'type': 'loss', 'content': 0.014734785072505474, 'timestamp': '2025-09-30 22:10:34.649595', 'step': 1084, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:34.683388', 'step': 1084, 'epoch': 2} {'type': 'loss', 'content': 0.005796567536890507, 'timestamp': '2025-09-30 22:10:34.686190', 'step': 1085, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:34.729315', 'step': 1085, 'epoch': 2} {'type': 'loss', 'content': 0.013309342786669731, 'timestamp': '2025-09-30 22:10:34.737398', 'step': 1086, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:34.773113', 'step': 1086, 'epoch': 2} {'type': 'loss', 'content': 0.02818778157234192, 'timestamp': '2025-09-30 22:10:34.776889', 'step': 1087, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:34.813976', 'step': 1087, 'epoch': 2} {'type': 'loss', 'content': 0.009719455614686012, 'timestamp': '2025-09-30 22:10:34.839046', 'step': 1088, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:34.878814', 'step': 1088, 'epoch': 2} {'type': 'loss', 'content': 0.032015398144721985, 'timestamp': '2025-09-30 22:10:34.882086', 'step': 1089, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:34.922540', 'step': 1089, 'epoch': 2} {'type': 'loss', 'content': 0.017986981198191643, 'timestamp': '2025-09-30 22:10:34.927130', 'step': 1090, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:34.959720', 'step': 1090, 'epoch': 2} {'type': 'loss', 'content': 0.041368067264556885, 'timestamp': '2025-09-30 22:10:34.962550', 'step': 1091, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:35.001883', 'step': 1091, 'epoch': 2} {'type': 'loss', 'content': 0.007110454607754946, 'timestamp': '2025-09-30 22:10:35.034183', 'step': 1092, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:35.067238', 'step': 1092, 'epoch': 2} {'type': 'loss', 'content': 0.01783628761768341, 'timestamp': '2025-09-30 22:10:35.070875', 'step': 1093, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:35.116127', 'step': 1093, 'epoch': 2} {'type': 'loss', 'content': 0.013988708145916462, 'timestamp': '2025-09-30 22:10:35.119130', 'step': 1094, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:35.155784', 'step': 1094, 'epoch': 2} {'type': 'loss', 'content': 0.010679191909730434, 'timestamp': '2025-09-30 22:10:35.158039', 'step': 1095, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:35.193822', 'step': 1095, 'epoch': 2} {'type': 'loss', 'content': 0.006078497972339392, 'timestamp': '2025-09-30 22:10:35.222861', 'step': 1096, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:35.254498', 'step': 1096, 'epoch': 2} {'type': 'loss', 'content': 0.015893442556262016, 'timestamp': '2025-09-30 22:10:35.257176', 'step': 1097, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:35.288882', 'step': 1097, 'epoch': 2} {'type': 'loss', 'content': 0.022868171334266663, 'timestamp': '2025-09-30 22:10:35.292145', 'step': 1098, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:35.324755', 'step': 1098, 'epoch': 2} {'type': 'loss', 'content': 0.01892166957259178, 'timestamp': '2025-09-30 22:10:35.327699', 'step': 1099, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:35.361135', 'step': 1099, 'epoch': 2} {'type': 'loss', 'content': 0.024770379066467285, 'timestamp': '2025-09-30 22:10:35.385763', 'step': 1100, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:35.429158', 'step': 1100, 'epoch': 2} {'type': 'loss', 'content': 0.025294585153460503, 'timestamp': '2025-09-30 22:10:35.432535', 'step': 1101, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:35.465816', 'step': 1101, 'epoch': 2} {'type': 'loss', 'content': 0.010897427797317505, 'timestamp': '2025-09-30 22:10:35.468891', 'step': 1102, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:35.501198', 'step': 1102, 'epoch': 2} {'type': 'loss', 'content': 0.018574992194771767, 'timestamp': '2025-09-30 22:10:35.503695', 'step': 1103, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:35.534900', 'step': 1103, 'epoch': 2} {'type': 'loss', 'content': 0.015447559766471386, 'timestamp': '2025-09-30 22:10:35.559792', 'step': 1104, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:35.593465', 'step': 1104, 'epoch': 2} {'type': 'loss', 'content': 0.011934633366763592, 'timestamp': '2025-09-30 22:10:35.596557', 'step': 1105, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:35.633372', 'step': 1105, 'epoch': 2} {'type': 'loss', 'content': 0.009000704623758793, 'timestamp': '2025-09-30 22:10:35.635916', 'step': 1106, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:35.677150', 'step': 1106, 'epoch': 2} {'type': 'loss', 'content': 0.02742808684706688, 'timestamp': '2025-09-30 22:10:35.679880', 'step': 1107, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:35.714544', 'step': 1107, 'epoch': 2} {'type': 'loss', 'content': 0.02327750436961651, 'timestamp': '2025-09-30 22:10:35.739124', 'step': 1108, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:35.773463', 'step': 1108, 'epoch': 2} {'type': 'loss', 'content': 0.011305660009384155, 'timestamp': '2025-09-30 22:10:35.776833', 'step': 1109, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:35.814782', 'step': 1109, 'epoch': 2} {'type': 'loss', 'content': 0.020335091277956963, 'timestamp': '2025-09-30 22:10:35.817105', 'step': 1110, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:35.850752', 'step': 1110, 'epoch': 2} {'type': 'loss', 'content': 0.019419783726334572, 'timestamp': '2025-09-30 22:10:35.853649', 'step': 1111, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:35.890700', 'step': 1111, 'epoch': 2} {'type': 'loss', 'content': 0.029895620420575142, 'timestamp': '2025-09-30 22:10:35.914372', 'step': 1112, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:35.948609', 'step': 1112, 'epoch': 2} {'type': 'loss', 'content': 0.02316124178469181, 'timestamp': '2025-09-30 22:10:35.951953', 'step': 1113, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:35.984088', 'step': 1113, 'epoch': 2} {'type': 'loss', 'content': 0.011370803229510784, 'timestamp': '2025-09-30 22:10:35.987455', 'step': 1114, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:36.021144', 'step': 1114, 'epoch': 2} {'type': 'loss', 'content': 0.020221199840307236, 'timestamp': '2025-09-30 22:10:36.024007', 'step': 1115, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:36.056691', 'step': 1115, 'epoch': 2} {'type': 'loss', 'content': 0.016114765778183937, 'timestamp': '2025-09-30 22:10:36.081163', 'step': 1116, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.113278', 'step': 1116, 'epoch': 2} {'type': 'loss', 'content': 0.023773958906531334, 'timestamp': '2025-09-30 22:10:36.115790', 'step': 1117, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:36.147247', 'step': 1117, 'epoch': 2} {'type': 'loss', 'content': 0.012325162068009377, 'timestamp': '2025-09-30 22:10:36.150566', 'step': 1118, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.190399', 'step': 1118, 'epoch': 2} {'type': 'loss', 'content': 0.018607692793011665, 'timestamp': '2025-09-30 22:10:36.192791', 'step': 1119, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.225227', 'step': 1119, 'epoch': 2} {'type': 'loss', 'content': 0.027039309963583946, 'timestamp': '2025-09-30 22:10:36.249980', 'step': 1120, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:36.282406', 'step': 1120, 'epoch': 2} {'type': 'loss', 'content': 0.025002550333738327, 'timestamp': '2025-09-30 22:10:36.284766', 'step': 1121, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:36.315779', 'step': 1121, 'epoch': 2} {'type': 'loss', 'content': 0.00902754720300436, 'timestamp': '2025-09-30 22:10:36.321046', 'step': 1122, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.353064', 'step': 1122, 'epoch': 2} {'type': 'loss', 'content': 0.012607453390955925, 'timestamp': '2025-09-30 22:10:36.358797', 'step': 1123, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.390125', 'step': 1123, 'epoch': 2} {'type': 'loss', 'content': 0.015060718171298504, 'timestamp': '2025-09-30 22:10:36.413933', 'step': 1124, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.445249', 'step': 1124, 'epoch': 2} {'type': 'loss', 'content': 0.006960070692002773, 'timestamp': '2025-09-30 22:10:36.447616', 'step': 1125, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.478822', 'step': 1125, 'epoch': 2} {'type': 'loss', 'content': 0.024447765201330185, 'timestamp': '2025-09-30 22:10:36.482178', 'step': 1126, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.522154', 'step': 1126, 'epoch': 2} {'type': 'loss', 'content': 0.030384311452507973, 'timestamp': '2025-09-30 22:10:36.524127', 'step': 1127, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.558656', 'step': 1127, 'epoch': 2} {'type': 'loss', 'content': 0.006873700302094221, 'timestamp': '2025-09-30 22:10:36.582615', 'step': 1128, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.613650', 'step': 1128, 'epoch': 2} {'type': 'loss', 'content': 0.04473824054002762, 'timestamp': '2025-09-30 22:10:36.616690', 'step': 1129, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.650084', 'step': 1129, 'epoch': 2} {'type': 'loss', 'content': 0.03146163001656532, 'timestamp': '2025-09-30 22:10:36.652845', 'step': 1130, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.685683', 'step': 1130, 'epoch': 2} {'type': 'loss', 'content': 0.012721200473606586, 'timestamp': '2025-09-30 22:10:36.694070', 'step': 1131, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.728145', 'step': 1131, 'epoch': 2} {'type': 'loss', 'content': 0.011350330896675587, 'timestamp': '2025-09-30 22:10:36.753076', 'step': 1132, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.785770', 'step': 1132, 'epoch': 2} {'type': 'loss', 'content': 0.023074982687830925, 'timestamp': '2025-09-30 22:10:36.788665', 'step': 1133, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:36.829646', 'step': 1133, 'epoch': 2} {'type': 'loss', 'content': 0.01878601871430874, 'timestamp': '2025-09-30 22:10:36.838847', 'step': 1134, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:36.876824', 'step': 1134, 'epoch': 2} {'type': 'loss', 'content': 0.021561546251177788, 'timestamp': '2025-09-30 22:10:36.879271', 'step': 1135, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.921248', 'step': 1135, 'epoch': 2} {'type': 'loss', 'content': 0.010309815406799316, 'timestamp': '2025-09-30 22:10:36.949775', 'step': 1136, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:36.987081', 'step': 1136, 'epoch': 2} {'type': 'loss', 'content': 0.006379503291100264, 'timestamp': '2025-09-30 22:10:36.992987', 'step': 1137, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:37.033910', 'step': 1137, 'epoch': 2} {'type': 'loss', 'content': 0.01991991326212883, 'timestamp': '2025-09-30 22:10:37.036186', 'step': 1138, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:37.075130', 'step': 1138, 'epoch': 2} {'type': 'loss', 'content': 0.011105488054454327, 'timestamp': '2025-09-30 22:10:37.078528', 'step': 1139, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:37.111543', 'step': 1139, 'epoch': 2} {'type': 'loss', 'content': 0.0210857093334198, 'timestamp': '2025-09-30 22:10:37.135257', 'step': 1140, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:38.006888', 'step': 1140, 'epoch': 2} {'type': 'pplx', 'content': 53247525.687543064, 'timestamp': '2025-09-30 22:10:38.011543', 'step': 1140, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.041302', 'step': 1140, 'epoch': 2} {'type': 'loss', 'content': 0.025197362527251244, 'timestamp': '2025-09-30 22:10:38.043813', 'step': 1141, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.076848', 'step': 1141, 'epoch': 2} {'type': 'loss', 'content': 0.009555136784911156, 'timestamp': '2025-09-30 22:10:38.079531', 'step': 1142, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.113845', 'step': 1142, 'epoch': 2} {'type': 'loss', 'content': 0.019450949504971504, 'timestamp': '2025-09-30 22:10:38.116681', 'step': 1143, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.149134', 'step': 1143, 'epoch': 2} {'type': 'loss', 'content': 0.031687263399362564, 'timestamp': '2025-09-30 22:10:38.173690', 'step': 1144, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:38.205600', 'step': 1144, 'epoch': 2} {'type': 'loss', 'content': 0.0040702177211642265, 'timestamp': '2025-09-30 22:10:38.210686', 'step': 1145, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:38.242816', 'step': 1145, 'epoch': 2} {'type': 'loss', 'content': 0.0202677883207798, 'timestamp': '2025-09-30 22:10:38.244984', 'step': 1146, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.276778', 'step': 1146, 'epoch': 2} {'type': 'loss', 'content': 0.010932053439319134, 'timestamp': '2025-09-30 22:10:38.283354', 'step': 1147, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.317640', 'step': 1147, 'epoch': 2} {'type': 'loss', 'content': 0.00964405108243227, 'timestamp': '2025-09-30 22:10:38.342124', 'step': 1148, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.384688', 'step': 1148, 'epoch': 2} {'type': 'loss', 'content': 0.02433297224342823, 'timestamp': '2025-09-30 22:10:38.387023', 'step': 1149, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:38.426732', 'step': 1149, 'epoch': 2} {'type': 'loss', 'content': 0.01564393751323223, 'timestamp': '2025-09-30 22:10:38.432005', 'step': 1150, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.477250', 'step': 1150, 'epoch': 2} {'type': 'loss', 'content': 0.03086140565574169, 'timestamp': '2025-09-30 22:10:38.481359', 'step': 1151, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.523262', 'step': 1151, 'epoch': 2} {'type': 'loss', 'content': 0.01326081994920969, 'timestamp': '2025-09-30 22:10:38.547718', 'step': 1152, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.580312', 'step': 1152, 'epoch': 2} {'type': 'loss', 'content': 0.015160572715103626, 'timestamp': '2025-09-30 22:10:38.582645', 'step': 1153, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:38.615338', 'step': 1153, 'epoch': 2} {'type': 'loss', 'content': 0.009467801079154015, 'timestamp': '2025-09-30 22:10:38.617796', 'step': 1154, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.653777', 'step': 1154, 'epoch': 2} {'type': 'loss', 'content': 0.025078266859054565, 'timestamp': '2025-09-30 22:10:38.657845', 'step': 1155, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.691646', 'step': 1155, 'epoch': 2} {'type': 'loss', 'content': 0.02187494933605194, 'timestamp': '2025-09-30 22:10:38.715282', 'step': 1156, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.746226', 'step': 1156, 'epoch': 2} {'type': 'loss', 'content': 0.013467269018292427, 'timestamp': '2025-09-30 22:10:38.748824', 'step': 1157, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:38.780337', 'step': 1157, 'epoch': 2} {'type': 'loss', 'content': 0.008198106661438942, 'timestamp': '2025-09-30 22:10:38.782715', 'step': 1158, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.817716', 'step': 1158, 'epoch': 2} {'type': 'loss', 'content': 0.012998079881072044, 'timestamp': '2025-09-30 22:10:38.826299', 'step': 1159, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.857572', 'step': 1159, 'epoch': 2} {'type': 'loss', 'content': 0.009026200510561466, 'timestamp': '2025-09-30 22:10:38.882266', 'step': 1160, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:38.913813', 'step': 1160, 'epoch': 2} {'type': 'loss', 'content': 0.009720182977616787, 'timestamp': '2025-09-30 22:10:38.916040', 'step': 1161, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.952656', 'step': 1161, 'epoch': 2} {'type': 'loss', 'content': 0.025005189701914787, 'timestamp': '2025-09-30 22:10:38.955306', 'step': 1162, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:38.992268', 'step': 1162, 'epoch': 2} {'type': 'loss', 'content': 0.01020416896790266, 'timestamp': '2025-09-30 22:10:38.998210', 'step': 1163, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:39.032297', 'step': 1163, 'epoch': 2} {'type': 'loss', 'content': 0.020846812054514885, 'timestamp': '2025-09-30 22:10:39.055949', 'step': 1164, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:39.092365', 'step': 1164, 'epoch': 2} {'type': 'loss', 'content': 0.014476138167083263, 'timestamp': '2025-09-30 22:10:39.095173', 'step': 1165, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:39.127332', 'step': 1165, 'epoch': 2} {'type': 'loss', 'content': 0.016266265884041786, 'timestamp': '2025-09-30 22:10:39.130837', 'step': 1166, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:39.165870', 'step': 1166, 'epoch': 2} {'type': 'loss', 'content': 0.009876398369669914, 'timestamp': '2025-09-30 22:10:39.168650', 'step': 1167, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:39.204173', 'step': 1167, 'epoch': 2} {'type': 'loss', 'content': 0.03922674432396889, 'timestamp': '2025-09-30 22:10:39.228783', 'step': 1168, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:39.260215', 'step': 1168, 'epoch': 2} {'type': 'loss', 'content': 0.016972580924630165, 'timestamp': '2025-09-30 22:10:39.262676', 'step': 1169, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:39.295458', 'step': 1169, 'epoch': 2} {'type': 'loss', 'content': 0.010892154648900032, 'timestamp': '2025-09-30 22:10:39.298615', 'step': 1170, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:39.331862', 'step': 1170, 'epoch': 2} {'type': 'loss', 'content': 0.01617332175374031, 'timestamp': '2025-09-30 22:10:39.335230', 'step': 1171, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:39.372021', 'step': 1171, 'epoch': 2} {'type': 'loss', 'content': 0.00916491262614727, 'timestamp': '2025-09-30 22:10:39.396075', 'step': 1172, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:10:39.430431', 'step': 1172, 'epoch': 2} {'type': 'loss', 'content': 0.013127158395946026, 'timestamp': '2025-09-30 22:10:39.433742', 'step': 1173, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:39.475734', 'step': 1173, 'epoch': 2} {'type': 'loss', 'content': 0.011959279887378216, 'timestamp': '2025-09-30 22:10:39.485008', 'step': 1174, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:39.518082', 'step': 1174, 'epoch': 2} {'type': 'loss', 'content': 0.02032916434109211, 'timestamp': '2025-09-30 22:10:39.521480', 'step': 1175, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:39.558384', 'step': 1175, 'epoch': 2} {'type': 'loss', 'content': 0.008914561942219734, 'timestamp': '2025-09-30 22:10:39.583458', 'step': 1176, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:39.616504', 'step': 1176, 'epoch': 2} {'type': 'loss', 'content': 0.0263129323720932, 'timestamp': '2025-09-30 22:10:39.619293', 'step': 1177, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:39.652131', 'step': 1177, 'epoch': 2} {'type': 'loss', 'content': 0.014628012664616108, 'timestamp': '2025-09-30 22:10:39.660690', 'step': 1178, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:39.697143', 'step': 1178, 'epoch': 2} {'type': 'loss', 'content': 0.03191584721207619, 'timestamp': '2025-09-30 22:10:39.704771', 'step': 1179, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:39.739703', 'step': 1179, 'epoch': 2} {'type': 'loss', 'content': 0.012460844591259956, 'timestamp': '2025-09-30 22:10:39.770718', 'step': 1180, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:39.813104', 'step': 1180, 'epoch': 2} {'type': 'loss', 'content': 0.030424175783991814, 'timestamp': '2025-09-30 22:10:39.816536', 'step': 1181, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:39.851579', 'step': 1181, 'epoch': 2} {'type': 'loss', 'content': 0.03738018870353699, 'timestamp': '2025-09-30 22:10:39.855520', 'step': 1182, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:39.891473', 'step': 1182, 'epoch': 2} {'type': 'loss', 'content': 0.01585381105542183, 'timestamp': '2025-09-30 22:10:39.894459', 'step': 1183, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:39.928384', 'step': 1183, 'epoch': 2} {'type': 'loss', 'content': 0.005708039738237858, 'timestamp': '2025-09-30 22:10:39.952491', 'step': 1184, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:39.983259', 'step': 1184, 'epoch': 2} {'type': 'loss', 'content': 0.024410611018538475, 'timestamp': '2025-09-30 22:10:39.986286', 'step': 1185, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:40.019579', 'step': 1185, 'epoch': 2} {'type': 'loss', 'content': 0.013256765902042389, 'timestamp': '2025-09-30 22:10:40.022452', 'step': 1186, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:40.060396', 'step': 1186, 'epoch': 2} {'type': 'loss', 'content': 0.0037524027284234762, 'timestamp': '2025-09-30 22:10:40.063713', 'step': 1187, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:40.095383', 'step': 1187, 'epoch': 2} {'type': 'loss', 'content': 0.013813267461955547, 'timestamp': '2025-09-30 22:10:40.119098', 'step': 1188, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:40.151785', 'step': 1188, 'epoch': 2} {'type': 'loss', 'content': 0.008720348589122295, 'timestamp': '2025-09-30 22:10:40.154135', 'step': 1189, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:40.194273', 'step': 1189, 'epoch': 2} {'type': 'loss', 'content': 0.03764233738183975, 'timestamp': '2025-09-30 22:10:40.197284', 'step': 1190, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:40.229457', 'step': 1190, 'epoch': 2} {'type': 'loss', 'content': 0.008992317132651806, 'timestamp': '2025-09-30 22:10:40.231659', 'step': 1191, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:40.270538', 'step': 1191, 'epoch': 2} {'type': 'loss', 'content': 0.022100094705820084, 'timestamp': '2025-09-30 22:10:40.294475', 'step': 1192, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:40.336864', 'step': 1192, 'epoch': 2} {'type': 'loss', 'content': 0.014298759400844574, 'timestamp': '2025-09-30 22:10:40.339011', 'step': 1193, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:40.369955', 'step': 1193, 'epoch': 2} {'type': 'loss', 'content': 0.0014098555548116565, 'timestamp': '2025-09-30 22:10:40.372422', 'step': 1194, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:40.405316', 'step': 1194, 'epoch': 2} {'type': 'loss', 'content': 0.04216473549604416, 'timestamp': '2025-09-30 22:10:40.407930', 'step': 1195, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:40.444291', 'step': 1195, 'epoch': 2} {'type': 'loss', 'content': 0.01196068711578846, 'timestamp': '2025-09-30 22:10:40.467886', 'step': 1196, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:40.502332', 'step': 1196, 'epoch': 2} {'type': 'loss', 'content': 0.052920062094926834, 'timestamp': '2025-09-30 22:10:40.504536', 'step': 1197, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:41.305801', 'step': 1197, 'epoch': 2} {'type': 'pplx', 'content': 59698730.953466825, 'timestamp': '2025-09-30 22:10:41.308000', 'step': 1197, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:41.338068', 'step': 1197, 'epoch': 2} {'type': 'loss', 'content': 0.008483153767883778, 'timestamp': '2025-09-30 22:10:41.340400', 'step': 1198, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:41.370878', 'step': 1198, 'epoch': 2} {'type': 'loss', 'content': 0.014434531331062317, 'timestamp': '2025-09-30 22:10:41.372898', 'step': 1199, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:41.406684', 'step': 1199, 'epoch': 2} {'type': 'loss', 'content': 0.024134354665875435, 'timestamp': '2025-09-30 22:10:41.430663', 'step': 1200, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:41.461622', 'step': 1200, 'epoch': 2} {'type': 'loss', 'content': 0.04929163679480553, 'timestamp': '2025-09-30 22:10:41.463650', 'step': 1201, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:41.494393', 'step': 1201, 'epoch': 2} {'type': 'loss', 'content': 0.003844610182568431, 'timestamp': '2025-09-30 22:10:41.496374', 'step': 1202, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:41.527507', 'step': 1202, 'epoch': 2} {'type': 'loss', 'content': 0.002458305796608329, 'timestamp': '2025-09-30 22:10:41.529486', 'step': 1203, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:10:41.560332', 'step': 1203, 'epoch': 2} {'type': 'loss', 'content': 0.004756701644510031, 'timestamp': '2025-09-30 22:10:41.584038', 'step': 1204, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:41.619849', 'step': 1204, 'epoch': 2} {'type': 'loss', 'content': 0.03479059413075447, 'timestamp': '2025-09-30 22:10:41.622446', 'step': 1205, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:41.656968', 'step': 1205, 'epoch': 2} {'type': 'loss', 'content': 0.03355806693434715, 'timestamp': '2025-09-30 22:10:41.660136', 'step': 1206, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:41.691619', 'step': 1206, 'epoch': 2} {'type': 'loss', 'content': 0.005543780978769064, 'timestamp': '2025-09-30 22:10:41.695076', 'step': 1207, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:41.727961', 'step': 1207, 'epoch': 2} {'type': 'loss', 'content': 0.015537229366600513, 'timestamp': '2025-09-30 22:10:41.751862', 'step': 1208, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:41.782765', 'step': 1208, 'epoch': 2} {'type': 'loss', 'content': 0.015018069185316563, 'timestamp': '2025-09-30 22:10:41.786493', 'step': 1209, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:41.816754', 'step': 1209, 'epoch': 2} {'type': 'loss', 'content': 0.021458139643073082, 'timestamp': '2025-09-30 22:10:41.818847', 'step': 1210, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:41.850116', 'step': 1210, 'epoch': 2} {'type': 'loss', 'content': 0.02499069646000862, 'timestamp': '2025-09-30 22:10:41.852460', 'step': 1211, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:41.886423', 'step': 1211, 'epoch': 2} {'type': 'loss', 'content': 0.018161950632929802, 'timestamp': '2025-09-30 22:10:41.910214', 'step': 1212, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:41.941920', 'step': 1212, 'epoch': 2} {'type': 'loss', 'content': 0.011252237483859062, 'timestamp': '2025-09-30 22:10:41.944264', 'step': 1213, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:41.975427', 'step': 1213, 'epoch': 2} {'type': 'loss', 'content': 0.012444576248526573, 'timestamp': '2025-09-30 22:10:41.977920', 'step': 1214, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.012716', 'step': 1214, 'epoch': 2} {'type': 'loss', 'content': 0.017678599804639816, 'timestamp': '2025-09-30 22:10:42.015014', 'step': 1215, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:42.051354', 'step': 1215, 'epoch': 2} {'type': 'loss', 'content': 0.0251413993537426, 'timestamp': '2025-09-30 22:10:42.075701', 'step': 1216, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:42.115875', 'step': 1216, 'epoch': 2} {'type': 'loss', 'content': 0.016345879063010216, 'timestamp': '2025-09-30 22:10:42.118125', 'step': 1217, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.148326', 'step': 1217, 'epoch': 2} {'type': 'loss', 'content': 0.004925866145640612, 'timestamp': '2025-09-30 22:10:42.150898', 'step': 1218, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:42.182156', 'step': 1218, 'epoch': 2} {'type': 'loss', 'content': 0.010515101253986359, 'timestamp': '2025-09-30 22:10:42.184457', 'step': 1219, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.216338', 'step': 1219, 'epoch': 2} {'type': 'loss', 'content': 0.012195057235658169, 'timestamp': '2025-09-30 22:10:42.240431', 'step': 1220, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.271192', 'step': 1220, 'epoch': 2} {'type': 'loss', 'content': 0.010490444488823414, 'timestamp': '2025-09-30 22:10:42.273297', 'step': 1221, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.303573', 'step': 1221, 'epoch': 2} {'type': 'loss', 'content': 0.00895109586417675, 'timestamp': '2025-09-30 22:10:42.312752', 'step': 1222, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.343995', 'step': 1222, 'epoch': 2} {'type': 'loss', 'content': 0.00880126841366291, 'timestamp': '2025-09-30 22:10:42.346324', 'step': 1223, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.377020', 'step': 1223, 'epoch': 2} {'type': 'loss', 'content': 0.00783101562410593, 'timestamp': '2025-09-30 22:10:42.400496', 'step': 1224, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.434686', 'step': 1224, 'epoch': 2} {'type': 'loss', 'content': 0.017458317801356316, 'timestamp': '2025-09-30 22:10:42.436971', 'step': 1225, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.467577', 'step': 1225, 'epoch': 2} {'type': 'loss', 'content': 0.004970931448042393, 'timestamp': '2025-09-30 22:10:42.469901', 'step': 1226, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.499557', 'step': 1226, 'epoch': 2} {'type': 'loss', 'content': 0.008084835484623909, 'timestamp': '2025-09-30 22:10:42.501741', 'step': 1227, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.531672', 'step': 1227, 'epoch': 2} {'type': 'loss', 'content': 0.01121762115508318, 'timestamp': '2025-09-30 22:10:42.555461', 'step': 1228, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.585346', 'step': 1228, 'epoch': 2} {'type': 'loss', 'content': 0.00550410570576787, 'timestamp': '2025-09-30 22:10:42.587240', 'step': 1229, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.620449', 'step': 1229, 'epoch': 2} {'type': 'loss', 'content': 0.031140608713030815, 'timestamp': '2025-09-30 22:10:42.625597', 'step': 1230, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.656422', 'step': 1230, 'epoch': 2} {'type': 'loss', 'content': 0.009130788035690784, 'timestamp': '2025-09-30 22:10:42.658985', 'step': 1231, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.690752', 'step': 1231, 'epoch': 2} {'type': 'loss', 'content': 0.018223153427243233, 'timestamp': '2025-09-30 22:10:42.714475', 'step': 1232, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.745899', 'step': 1232, 'epoch': 2} {'type': 'loss', 'content': 0.002786520402878523, 'timestamp': '2025-09-30 22:10:42.749478', 'step': 1233, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.781570', 'step': 1233, 'epoch': 2} {'type': 'loss', 'content': 0.008198206312954426, 'timestamp': '2025-09-30 22:10:42.783841', 'step': 1234, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:42.813392', 'step': 1234, 'epoch': 2} {'type': 'loss', 'content': 0.019522110000252724, 'timestamp': '2025-09-30 22:10:42.815411', 'step': 1235, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.845737', 'step': 1235, 'epoch': 2} {'type': 'loss', 'content': 0.009420646354556084, 'timestamp': '2025-09-30 22:10:42.869305', 'step': 1236, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:42.900018', 'step': 1236, 'epoch': 2} {'type': 'loss', 'content': 0.013096547685563564, 'timestamp': '2025-09-30 22:10:42.903026', 'step': 1237, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:42.933896', 'step': 1237, 'epoch': 2} {'type': 'loss', 'content': 0.01162264309823513, 'timestamp': '2025-09-30 22:10:42.935993', 'step': 1238, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.967244', 'step': 1238, 'epoch': 2} {'type': 'loss', 'content': 0.027476562187075615, 'timestamp': '2025-09-30 22:10:42.969326', 'step': 1239, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:42.999832', 'step': 1239, 'epoch': 2} {'type': 'loss', 'content': 0.023600177839398384, 'timestamp': '2025-09-30 22:10:43.023629', 'step': 1240, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:43.056349', 'step': 1240, 'epoch': 2} {'type': 'loss', 'content': 0.008278830908238888, 'timestamp': '2025-09-30 22:10:43.058887', 'step': 1241, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:43.089761', 'step': 1241, 'epoch': 2} {'type': 'loss', 'content': 0.01494657527655363, 'timestamp': '2025-09-30 22:10:43.092061', 'step': 1242, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:43.123047', 'step': 1242, 'epoch': 2} {'type': 'loss', 'content': 0.021537726745009422, 'timestamp': '2025-09-30 22:10:43.125760', 'step': 1243, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:43.155387', 'step': 1243, 'epoch': 2} {'type': 'loss', 'content': 0.030687665566802025, 'timestamp': '2025-09-30 22:10:43.179595', 'step': 1244, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:10:43.211432', 'step': 1244, 'epoch': 2} {'type': 'loss', 'content': 0.025700166821479797, 'timestamp': '2025-09-30 22:10:43.218187', 'step': 1245, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:43.254092', 'step': 1245, 'epoch': 2} {'type': 'loss', 'content': 0.024236468598246574, 'timestamp': '2025-09-30 22:10:43.255983', 'step': 1246, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:43.286476', 'step': 1246, 'epoch': 2} {'type': 'loss', 'content': 0.030260343104600906, 'timestamp': '2025-09-30 22:10:43.288651', 'step': 1247, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:43.320263', 'step': 1247, 'epoch': 2} {'type': 'loss', 'content': 0.010783525183796883, 'timestamp': '2025-09-30 22:10:43.343949', 'step': 1248, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:43.374180', 'step': 1248, 'epoch': 2} {'type': 'loss', 'content': 0.02348669432103634, 'timestamp': '2025-09-30 22:10:43.376269', 'step': 1249, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:43.406746', 'step': 1249, 'epoch': 2} {'type': 'loss', 'content': 0.029539231210947037, 'timestamp': '2025-09-30 22:10:43.408829', 'step': 1250, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:43.445946', 'step': 1250, 'epoch': 2} {'type': 'loss', 'content': 0.022312434390187263, 'timestamp': '2025-09-30 22:10:43.449359', 'step': 1251, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:43.481052', 'step': 1251, 'epoch': 2} {'type': 'loss', 'content': 0.009221461601555347, 'timestamp': '2025-09-30 22:10:43.504805', 'step': 1252, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:10:43.535884', 'step': 1252, 'epoch': 2} {'type': 'loss', 'content': 0.0020505916327238083, 'timestamp': '2025-09-30 22:10:43.538140', 'step': 1253, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:43.570161', 'step': 1253, 'epoch': 2} {'type': 'loss', 'content': 0.008239880204200745, 'timestamp': '2025-09-30 22:10:43.572483', 'step': 1254, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:44.406119', 'step': 1254, 'epoch': 2} {'type': 'pplx', 'content': 60760063.312181026, 'timestamp': '2025-09-30 22:10:44.408997', 'step': 1254, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:44.438030', 'step': 1254, 'epoch': 2} {'type': 'loss', 'content': 0.005631845910102129, 'timestamp': '2025-09-30 22:10:44.441744', 'step': 1255, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:44.473214', 'step': 1255, 'epoch': 2} {'type': 'loss', 'content': 0.022505655884742737, 'timestamp': '2025-09-30 22:10:44.501650', 'step': 1256, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:44.533046', 'step': 1256, 'epoch': 2} {'type': 'loss', 'content': 0.01569953002035618, 'timestamp': '2025-09-30 22:10:44.535803', 'step': 1257, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:44.568670', 'step': 1257, 'epoch': 2} {'type': 'loss', 'content': 0.023941362276673317, 'timestamp': '2025-09-30 22:10:44.573306', 'step': 1258, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:44.605137', 'step': 1258, 'epoch': 2} {'type': 'loss', 'content': 0.007991237565875053, 'timestamp': '2025-09-30 22:10:44.607342', 'step': 1259, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:44.641178', 'step': 1259, 'epoch': 2} {'type': 'loss', 'content': 0.01361407432705164, 'timestamp': '2025-09-30 22:10:44.665513', 'step': 1260, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:44.709863', 'step': 1260, 'epoch': 2} {'type': 'loss', 'content': 0.05445224419236183, 'timestamp': '2025-09-30 22:10:44.712354', 'step': 1261, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:44.744995', 'step': 1261, 'epoch': 2} {'type': 'loss', 'content': 0.010495486669242382, 'timestamp': '2025-09-30 22:10:44.747463', 'step': 1262, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:44.781899', 'step': 1262, 'epoch': 2} {'type': 'loss', 'content': 0.018684295937418938, 'timestamp': '2025-09-30 22:10:44.784744', 'step': 1263, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:44.819523', 'step': 1263, 'epoch': 2} {'type': 'loss', 'content': 0.016452034935355186, 'timestamp': '2025-09-30 22:10:44.843542', 'step': 1264, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:44.877758', 'step': 1264, 'epoch': 2} {'type': 'loss', 'content': 0.0076084258034825325, 'timestamp': '2025-09-30 22:10:44.882246', 'step': 1265, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:44.916878', 'step': 1265, 'epoch': 2} {'type': 'loss', 'content': 0.012740017846226692, 'timestamp': '2025-09-30 22:10:44.920311', 'step': 1266, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:44.956443', 'step': 1266, 'epoch': 2} {'type': 'loss', 'content': 0.012665815651416779, 'timestamp': '2025-09-30 22:10:44.960314', 'step': 1267, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:44.996538', 'step': 1267, 'epoch': 2} {'type': 'loss', 'content': 0.024495363235473633, 'timestamp': '2025-09-30 22:10:45.021516', 'step': 1268, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.060223', 'step': 1268, 'epoch': 2} {'type': 'loss', 'content': 0.012718215584754944, 'timestamp': '2025-09-30 22:10:45.067857', 'step': 1269, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.102912', 'step': 1269, 'epoch': 2} {'type': 'loss', 'content': 0.021051540970802307, 'timestamp': '2025-09-30 22:10:45.106336', 'step': 1270, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:45.140191', 'step': 1270, 'epoch': 2} {'type': 'loss', 'content': 0.007944755256175995, 'timestamp': '2025-09-30 22:10:45.148093', 'step': 1271, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.182766', 'step': 1271, 'epoch': 2} {'type': 'loss', 'content': 0.016895083710551262, 'timestamp': '2025-09-30 22:10:45.206638', 'step': 1272, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.244320', 'step': 1272, 'epoch': 2} {'type': 'loss', 'content': 0.008962833322584629, 'timestamp': '2025-09-30 22:10:45.249275', 'step': 1273, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:45.281906', 'step': 1273, 'epoch': 2} {'type': 'loss', 'content': 0.019458699971437454, 'timestamp': '2025-09-30 22:10:45.284724', 'step': 1274, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.317936', 'step': 1274, 'epoch': 2} {'type': 'loss', 'content': 0.00804336927831173, 'timestamp': '2025-09-30 22:10:45.320649', 'step': 1275, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.358060', 'step': 1275, 'epoch': 2} {'type': 'loss', 'content': 0.005566149018704891, 'timestamp': '2025-09-30 22:10:45.386145', 'step': 1276, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:45.423307', 'step': 1276, 'epoch': 2} {'type': 'loss', 'content': 0.017086612060666084, 'timestamp': '2025-09-30 22:10:45.426371', 'step': 1277, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:45.459093', 'step': 1277, 'epoch': 2} {'type': 'loss', 'content': 0.012655085884034634, 'timestamp': '2025-09-30 22:10:45.461910', 'step': 1278, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.497560', 'step': 1278, 'epoch': 2} {'type': 'loss', 'content': 0.03651884198188782, 'timestamp': '2025-09-30 22:10:45.500756', 'step': 1279, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:45.533443', 'step': 1279, 'epoch': 2} {'type': 'loss', 'content': 0.025326358154416084, 'timestamp': '2025-09-30 22:10:45.557557', 'step': 1280, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.590639', 'step': 1280, 'epoch': 2} {'type': 'loss', 'content': 0.011267936788499355, 'timestamp': '2025-09-30 22:10:45.597551', 'step': 1281, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.639706', 'step': 1281, 'epoch': 2} {'type': 'loss', 'content': 0.013515127822756767, 'timestamp': '2025-09-30 22:10:45.642723', 'step': 1282, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:45.675781', 'step': 1282, 'epoch': 2} {'type': 'loss', 'content': 0.009024390950798988, 'timestamp': '2025-09-30 22:10:45.677851', 'step': 1283, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:10:45.710782', 'step': 1283, 'epoch': 2} {'type': 'loss', 'content': 0.007083322387188673, 'timestamp': '2025-09-30 22:10:45.737351', 'step': 1284, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.769479', 'step': 1284, 'epoch': 2} {'type': 'loss', 'content': 0.023868506774306297, 'timestamp': '2025-09-30 22:10:45.772424', 'step': 1285, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.822934', 'step': 1285, 'epoch': 2} {'type': 'loss', 'content': 0.004573192447423935, 'timestamp': '2025-09-30 22:10:45.831560', 'step': 1286, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:45.867270', 'step': 1286, 'epoch': 2} {'type': 'loss', 'content': 0.011483915150165558, 'timestamp': '2025-09-30 22:10:45.874612', 'step': 1287, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.913935', 'step': 1287, 'epoch': 2} {'type': 'loss', 'content': 0.0037527200765907764, 'timestamp': '2025-09-30 22:10:45.938270', 'step': 1288, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:45.974913', 'step': 1288, 'epoch': 2} {'type': 'loss', 'content': 0.030221473425626755, 'timestamp': '2025-09-30 22:10:45.980152', 'step': 1289, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:46.016618', 'step': 1289, 'epoch': 2} {'type': 'loss', 'content': 0.010256985202431679, 'timestamp': '2025-09-30 22:10:46.021314', 'step': 1290, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:46.077989', 'step': 1290, 'epoch': 2} {'type': 'loss', 'content': 0.015660548582673073, 'timestamp': '2025-09-30 22:10:46.084527', 'step': 1291, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:46.128720', 'step': 1291, 'epoch': 2} {'type': 'loss', 'content': 0.015375054441392422, 'timestamp': '2025-09-30 22:10:46.158159', 'step': 1292, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:46.203007', 'step': 1292, 'epoch': 2} {'type': 'loss', 'content': 0.003718825289979577, 'timestamp': '2025-09-30 22:10:46.213348', 'step': 1293, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:46.253664', 'step': 1293, 'epoch': 2} {'type': 'loss', 'content': 0.0035421750508248806, 'timestamp': '2025-09-30 22:10:46.258505', 'step': 1294, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:46.294634', 'step': 1294, 'epoch': 2} {'type': 'loss', 'content': 0.006106370594352484, 'timestamp': '2025-09-30 22:10:46.310314', 'step': 1295, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:46.368561', 'step': 1295, 'epoch': 2} {'type': 'loss', 'content': 0.02283380925655365, 'timestamp': '2025-09-30 22:10:46.398307', 'step': 1296, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:46.440175', 'step': 1296, 'epoch': 2} {'type': 'loss', 'content': 0.014118282124400139, 'timestamp': '2025-09-30 22:10:46.442889', 'step': 1297, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:46.475786', 'step': 1297, 'epoch': 2} {'type': 'loss', 'content': 0.02216426469385624, 'timestamp': '2025-09-30 22:10:46.485250', 'step': 1298, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:46.535659', 'step': 1298, 'epoch': 2} {'type': 'loss', 'content': 0.018284201622009277, 'timestamp': '2025-09-30 22:10:46.539908', 'step': 1299, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:46.596624', 'step': 1299, 'epoch': 2} {'type': 'loss', 'content': 0.012446001172065735, 'timestamp': '2025-09-30 22:10:46.622156', 'step': 1300, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:46.660980', 'step': 1300, 'epoch': 2} {'type': 'loss', 'content': 0.009262526407837868, 'timestamp': '2025-09-30 22:10:46.672054', 'step': 1301, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:46.710138', 'step': 1301, 'epoch': 2} {'type': 'loss', 'content': 0.006927129812538624, 'timestamp': '2025-09-30 22:10:46.713504', 'step': 1302, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:46.749162', 'step': 1302, 'epoch': 2} {'type': 'loss', 'content': 0.012467254884541035, 'timestamp': '2025-09-30 22:10:46.759740', 'step': 1303, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:46.802645', 'step': 1303, 'epoch': 2} {'type': 'loss', 'content': 0.0042889113537967205, 'timestamp': '2025-09-30 22:10:46.829066', 'step': 1304, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:46.865477', 'step': 1304, 'epoch': 2} {'type': 'loss', 'content': 0.009332495741546154, 'timestamp': '2025-09-30 22:10:46.868527', 'step': 1305, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:46.913940', 'step': 1305, 'epoch': 2} {'type': 'loss', 'content': 0.0059282719157636166, 'timestamp': '2025-09-30 22:10:46.919164', 'step': 1306, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:46.956720', 'step': 1306, 'epoch': 2} {'type': 'loss', 'content': 0.014585415832698345, 'timestamp': '2025-09-30 22:10:46.959773', 'step': 1307, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:47.003962', 'step': 1307, 'epoch': 2} {'type': 'loss', 'content': 0.013048151507973671, 'timestamp': '2025-09-30 22:10:47.027621', 'step': 1308, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:47.065208', 'step': 1308, 'epoch': 2} {'type': 'loss', 'content': 0.005672777537256479, 'timestamp': '2025-09-30 22:10:47.074406', 'step': 1309, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:47.122705', 'step': 1309, 'epoch': 2} {'type': 'loss', 'content': 0.017160657793283463, 'timestamp': '2025-09-30 22:10:47.125949', 'step': 1310, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:10:47.167950', 'step': 1310, 'epoch': 2} {'type': 'loss', 'content': 0.023994240909814835, 'timestamp': '2025-09-30 22:10:47.173891', 'step': 1311, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:48.202623', 'step': 1311, 'epoch': 2} {'type': 'pplx', 'content': 66824242.172381386, 'timestamp': '2025-09-30 22:10:48.207345', 'step': 1311, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:48.242280', 'step': 1311, 'epoch': 2} {'type': 'loss', 'content': 0.0038081335369497538, 'timestamp': '2025-09-30 22:10:48.267180', 'step': 1312, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:48.302871', 'step': 1312, 'epoch': 2} {'type': 'loss', 'content': 0.004983414430171251, 'timestamp': '2025-09-30 22:10:48.305432', 'step': 1313, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:48.343167', 'step': 1313, 'epoch': 2} {'type': 'loss', 'content': 0.0009271339513361454, 'timestamp': '2025-09-30 22:10:48.346308', 'step': 1314, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:48.382985', 'step': 1314, 'epoch': 2} {'type': 'loss', 'content': 0.014029703103005886, 'timestamp': '2025-09-30 22:10:48.386537', 'step': 1315, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:48.422652', 'step': 1315, 'epoch': 2} {'type': 'loss', 'content': 0.005625714082270861, 'timestamp': '2025-09-30 22:10:48.455947', 'step': 1316, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:48.491080', 'step': 1316, 'epoch': 2} {'type': 'loss', 'content': 0.010734512470662594, 'timestamp': '2025-09-30 22:10:48.494651', 'step': 1317, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:48.536411', 'step': 1317, 'epoch': 2} {'type': 'loss', 'content': 0.0022723597940057516, 'timestamp': '2025-09-30 22:10:48.540395', 'step': 1318, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:48.575420', 'step': 1318, 'epoch': 2} {'type': 'loss', 'content': 0.018299564719200134, 'timestamp': '2025-09-30 22:10:48.583003', 'step': 1319, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:48.620587', 'step': 1319, 'epoch': 2} {'type': 'loss', 'content': 0.005241929553449154, 'timestamp': '2025-09-30 22:10:48.646522', 'step': 1320, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:48.679720', 'step': 1320, 'epoch': 2} {'type': 'loss', 'content': 0.04034224525094032, 'timestamp': '2025-09-30 22:10:48.689463', 'step': 1321, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:48.737257', 'step': 1321, 'epoch': 2} {'type': 'loss', 'content': 0.02214857004582882, 'timestamp': '2025-09-30 22:10:48.740982', 'step': 1322, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:48.778418', 'step': 1322, 'epoch': 2} {'type': 'loss', 'content': 0.008105490356683731, 'timestamp': '2025-09-30 22:10:48.781765', 'step': 1323, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:48.822059', 'step': 1323, 'epoch': 2} {'type': 'loss', 'content': 0.005707655567675829, 'timestamp': '2025-09-30 22:10:48.848712', 'step': 1324, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:48.886114', 'step': 1324, 'epoch': 2} {'type': 'loss', 'content': 0.0042085289023816586, 'timestamp': '2025-09-30 22:10:48.891192', 'step': 1325, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:48.929306', 'step': 1325, 'epoch': 2} {'type': 'loss', 'content': 0.0041105556301772594, 'timestamp': '2025-09-30 22:10:48.933573', 'step': 1326, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:48.968358', 'step': 1326, 'epoch': 2} {'type': 'loss', 'content': 0.003951088059693575, 'timestamp': '2025-09-30 22:10:48.971861', 'step': 1327, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:49.012993', 'step': 1327, 'epoch': 2} {'type': 'loss', 'content': 0.009346050210297108, 'timestamp': '2025-09-30 22:10:49.036961', 'step': 1328, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:49.072001', 'step': 1328, 'epoch': 2} {'type': 'loss', 'content': 0.01684584841132164, 'timestamp': '2025-09-30 22:10:49.081270', 'step': 1329, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:49.123029', 'step': 1329, 'epoch': 2} {'type': 'loss', 'content': 0.004063504748046398, 'timestamp': '2025-09-30 22:10:49.127669', 'step': 1330, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:49.172245', 'step': 1330, 'epoch': 2} {'type': 'loss', 'content': 0.04242516681551933, 'timestamp': '2025-09-30 22:10:49.176195', 'step': 1331, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:49.217937', 'step': 1331, 'epoch': 2} {'type': 'loss', 'content': 0.012350103817880154, 'timestamp': '2025-09-30 22:10:49.242947', 'step': 1332, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:49.283401', 'step': 1332, 'epoch': 2} {'type': 'loss', 'content': 0.012545420788228512, 'timestamp': '2025-09-30 22:10:49.286125', 'step': 1333, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:49.334088', 'step': 1333, 'epoch': 2} {'type': 'loss', 'content': 0.01438872516155243, 'timestamp': '2025-09-30 22:10:49.338025', 'step': 1334, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:49.384735', 'step': 1334, 'epoch': 2} {'type': 'loss', 'content': 0.028075823560357094, 'timestamp': '2025-09-30 22:10:49.393902', 'step': 1335, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:49.431481', 'step': 1335, 'epoch': 2} {'type': 'loss', 'content': 0.003635299624875188, 'timestamp': '2025-09-30 22:10:49.456582', 'step': 1336, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:49.490976', 'step': 1336, 'epoch': 2} {'type': 'loss', 'content': 0.008153866045176983, 'timestamp': '2025-09-30 22:10:49.494167', 'step': 1337, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:49.532889', 'step': 1337, 'epoch': 2} {'type': 'loss', 'content': 0.015067125670611858, 'timestamp': '2025-09-30 22:10:49.536980', 'step': 1338, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:49.570021', 'step': 1338, 'epoch': 2} {'type': 'loss', 'content': 0.0033039043191820383, 'timestamp': '2025-09-30 22:10:49.573742', 'step': 1339, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:49.608200', 'step': 1339, 'epoch': 2} {'type': 'loss', 'content': 0.012563040480017662, 'timestamp': '2025-09-30 22:10:49.633472', 'step': 1340, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:49.675718', 'step': 1340, 'epoch': 2} {'type': 'loss', 'content': 0.01576489582657814, 'timestamp': '2025-09-30 22:10:49.678356', 'step': 1341, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:49.733087', 'step': 1341, 'epoch': 2} {'type': 'loss', 'content': 0.009753124788403511, 'timestamp': '2025-09-30 22:10:49.740931', 'step': 1342, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:49.774846', 'step': 1342, 'epoch': 2} {'type': 'loss', 'content': 0.007622593082487583, 'timestamp': '2025-09-30 22:10:49.777507', 'step': 1343, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:49.832110', 'step': 1343, 'epoch': 2} {'type': 'loss', 'content': 0.01353259664028883, 'timestamp': '2025-09-30 22:10:49.856790', 'step': 1344, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:49.899581', 'step': 1344, 'epoch': 2} {'type': 'loss', 'content': 0.0013351899106055498, 'timestamp': '2025-09-30 22:10:49.901858', 'step': 1345, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:49.938395', 'step': 1345, 'epoch': 2} {'type': 'loss', 'content': 0.01693071238696575, 'timestamp': '2025-09-30 22:10:49.946484', 'step': 1346, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:49.979673', 'step': 1346, 'epoch': 2} {'type': 'loss', 'content': 0.013451658189296722, 'timestamp': '2025-09-30 22:10:49.986298', 'step': 1347, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.024003', 'step': 1347, 'epoch': 2} {'type': 'loss', 'content': 0.001634464249946177, 'timestamp': '2025-09-30 22:10:50.048966', 'step': 1348, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:50.084320', 'step': 1348, 'epoch': 2} {'type': 'loss', 'content': 0.0013204816495999694, 'timestamp': '2025-09-30 22:10:50.087187', 'step': 1349, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.123799', 'step': 1349, 'epoch': 2} {'type': 'loss', 'content': 0.00696495920419693, 'timestamp': '2025-09-30 22:10:50.131005', 'step': 1350, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:10:50.167568', 'step': 1350, 'epoch': 2} {'type': 'loss', 'content': 0.007256153970956802, 'timestamp': '2025-09-30 22:10:50.176238', 'step': 1351, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.210832', 'step': 1351, 'epoch': 2} {'type': 'loss', 'content': 0.0019330900395289063, 'timestamp': '2025-09-30 22:10:50.242473', 'step': 1352, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.279094', 'step': 1352, 'epoch': 2} {'type': 'loss', 'content': 0.004168027546256781, 'timestamp': '2025-09-30 22:10:50.281649', 'step': 1353, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.328346', 'step': 1353, 'epoch': 2} {'type': 'loss', 'content': 0.02218509279191494, 'timestamp': '2025-09-30 22:10:50.330895', 'step': 1354, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.371467', 'step': 1354, 'epoch': 2} {'type': 'loss', 'content': 0.01591980643570423, 'timestamp': '2025-09-30 22:10:50.374148', 'step': 1355, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.416321', 'step': 1355, 'epoch': 2} {'type': 'loss', 'content': 0.03547831252217293, 'timestamp': '2025-09-30 22:10:50.440145', 'step': 1356, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.490194', 'step': 1356, 'epoch': 2} {'type': 'loss', 'content': 0.009803591296076775, 'timestamp': '2025-09-30 22:10:50.493087', 'step': 1357, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.537096', 'step': 1357, 'epoch': 2} {'type': 'loss', 'content': 0.005764176603406668, 'timestamp': '2025-09-30 22:10:50.547221', 'step': 1358, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.592621', 'step': 1358, 'epoch': 2} {'type': 'loss', 'content': 0.02682340517640114, 'timestamp': '2025-09-30 22:10:50.597519', 'step': 1359, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:50.634731', 'step': 1359, 'epoch': 2} {'type': 'loss', 'content': 0.002818641485646367, 'timestamp': '2025-09-30 22:10:50.659155', 'step': 1360, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:50.694661', 'step': 1360, 'epoch': 2} {'type': 'loss', 'content': 0.036229345947504044, 'timestamp': '2025-09-30 22:10:50.697825', 'step': 1361, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.732111', 'step': 1361, 'epoch': 2} {'type': 'loss', 'content': 0.014132755808532238, 'timestamp': '2025-09-30 22:10:50.734400', 'step': 1362, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.768505', 'step': 1362, 'epoch': 2} {'type': 'loss', 'content': 0.003775624791160226, 'timestamp': '2025-09-30 22:10:50.771889', 'step': 1363, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:50.808951', 'step': 1363, 'epoch': 2} {'type': 'loss', 'content': 0.002429934684187174, 'timestamp': '2025-09-30 22:10:50.835118', 'step': 1364, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.868781', 'step': 1364, 'epoch': 2} {'type': 'loss', 'content': 0.016024207696318626, 'timestamp': '2025-09-30 22:10:50.871955', 'step': 1365, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:50.918122', 'step': 1365, 'epoch': 2} {'type': 'loss', 'content': 0.005808067973703146, 'timestamp': '2025-09-30 22:10:50.921867', 'step': 1366, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.954084', 'step': 1366, 'epoch': 2} {'type': 'loss', 'content': 0.009768741205334663, 'timestamp': '2025-09-30 22:10:50.956200', 'step': 1367, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:50.988868', 'step': 1367, 'epoch': 2} {'type': 'loss', 'content': 0.00779004255309701, 'timestamp': '2025-09-30 22:10:51.015256', 'step': 1368, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:51.950880', 'step': 1368, 'epoch': 2} {'type': 'pplx', 'content': 68275654.32620457, 'timestamp': '2025-09-30 22:10:51.956112', 'step': 1368, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:51.987256', 'step': 1368, 'epoch': 2} {'type': 'loss', 'content': 0.0018717555794864893, 'timestamp': '2025-09-30 22:10:51.991273', 'step': 1369, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.027133', 'step': 1369, 'epoch': 2} {'type': 'loss', 'content': 0.0035747247748076916, 'timestamp': '2025-09-30 22:10:52.031292', 'step': 1370, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:10:52.083305', 'step': 1370, 'epoch': 2} {'type': 'loss', 'content': 0.0063920291140675545, 'timestamp': '2025-09-30 22:10:52.086659', 'step': 1371, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:52.120613', 'step': 1371, 'epoch': 2} {'type': 'loss', 'content': 0.005606912542134523, 'timestamp': '2025-09-30 22:10:52.146591', 'step': 1372, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.179528', 'step': 1372, 'epoch': 2} {'type': 'loss', 'content': 0.008563164621591568, 'timestamp': '2025-09-30 22:10:52.187159', 'step': 1373, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.222249', 'step': 1373, 'epoch': 2} {'type': 'loss', 'content': 0.01809045672416687, 'timestamp': '2025-09-30 22:10:52.224578', 'step': 1374, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:52.272221', 'step': 1374, 'epoch': 2} {'type': 'loss', 'content': 0.02882465533912182, 'timestamp': '2025-09-30 22:10:52.280493', 'step': 1375, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.314708', 'step': 1375, 'epoch': 2} {'type': 'loss', 'content': 0.006099649704992771, 'timestamp': '2025-09-30 22:10:52.340026', 'step': 1376, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:52.377763', 'step': 1376, 'epoch': 2} {'type': 'loss', 'content': 0.020366067066788673, 'timestamp': '2025-09-30 22:10:52.381414', 'step': 1377, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.420571', 'step': 1377, 'epoch': 2} {'type': 'loss', 'content': 0.011057457886636257, 'timestamp': '2025-09-30 22:10:52.423638', 'step': 1378, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.462322', 'step': 1378, 'epoch': 2} {'type': 'loss', 'content': 0.020656252279877663, 'timestamp': '2025-09-30 22:10:52.464958', 'step': 1379, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.499500', 'step': 1379, 'epoch': 2} {'type': 'loss', 'content': 0.002947145840153098, 'timestamp': '2025-09-30 22:10:52.524874', 'step': 1380, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.563038', 'step': 1380, 'epoch': 2} {'type': 'loss', 'content': 0.027359262108802795, 'timestamp': '2025-09-30 22:10:52.565331', 'step': 1381, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.599782', 'step': 1381, 'epoch': 2} {'type': 'loss', 'content': 0.014876100234687328, 'timestamp': '2025-09-30 22:10:52.603080', 'step': 1382, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.635654', 'step': 1382, 'epoch': 2} {'type': 'loss', 'content': 0.008522089570760727, 'timestamp': '2025-09-30 22:10:52.638261', 'step': 1383, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.672741', 'step': 1383, 'epoch': 2} {'type': 'loss', 'content': 0.0012803823919966817, 'timestamp': '2025-09-30 22:10:52.697085', 'step': 1384, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.732716', 'step': 1384, 'epoch': 2} {'type': 'loss', 'content': 0.01757989265024662, 'timestamp': '2025-09-30 22:10:52.736475', 'step': 1385, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.774988', 'step': 1385, 'epoch': 2} {'type': 'loss', 'content': 0.025001874193549156, 'timestamp': '2025-09-30 22:10:52.783637', 'step': 1386, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.816538', 'step': 1386, 'epoch': 2} {'type': 'loss', 'content': 0.031573910266160965, 'timestamp': '2025-09-30 22:10:52.820073', 'step': 1387, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.853933', 'step': 1387, 'epoch': 2} {'type': 'loss', 'content': 0.0021813209168612957, 'timestamp': '2025-09-30 22:10:52.877779', 'step': 1388, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:52.920130', 'step': 1388, 'epoch': 2} {'type': 'loss', 'content': 0.01097036711871624, 'timestamp': '2025-09-30 22:10:52.926908', 'step': 1389, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:52.966004', 'step': 1389, 'epoch': 2} {'type': 'loss', 'content': 0.00787901971489191, 'timestamp': '2025-09-30 22:10:52.968824', 'step': 1390, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:53.012265', 'step': 1390, 'epoch': 2} {'type': 'loss', 'content': 0.004948236979544163, 'timestamp': '2025-09-30 22:10:53.015520', 'step': 1391, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:53.048414', 'step': 1391, 'epoch': 2} {'type': 'loss', 'content': 0.003085443750023842, 'timestamp': '2025-09-30 22:10:53.073497', 'step': 1392, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.109575', 'step': 1392, 'epoch': 2} {'type': 'loss', 'content': 0.00811847299337387, 'timestamp': '2025-09-30 22:10:53.117885', 'step': 1393, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.158134', 'step': 1393, 'epoch': 2} {'type': 'loss', 'content': 0.016569631174206734, 'timestamp': '2025-09-30 22:10:53.162009', 'step': 1394, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.204349', 'step': 1394, 'epoch': 2} {'type': 'loss', 'content': 0.004678026307374239, 'timestamp': '2025-09-30 22:10:53.213866', 'step': 1395, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.260702', 'step': 1395, 'epoch': 2} {'type': 'loss', 'content': 0.007153411395847797, 'timestamp': '2025-09-30 22:10:53.289648', 'step': 1396, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.324476', 'step': 1396, 'epoch': 2} {'type': 'loss', 'content': 0.012572051957249641, 'timestamp': '2025-09-30 22:10:53.327164', 'step': 1397, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.361500', 'step': 1397, 'epoch': 2} {'type': 'loss', 'content': 0.05144510790705681, 'timestamp': '2025-09-30 22:10:53.364437', 'step': 1398, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.402787', 'step': 1398, 'epoch': 2} {'type': 'loss', 'content': 0.006638978607952595, 'timestamp': '2025-09-30 22:10:53.405669', 'step': 1399, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:53.441726', 'step': 1399, 'epoch': 2} {'type': 'loss', 'content': 0.00618391390889883, 'timestamp': '2025-09-30 22:10:53.465970', 'step': 1400, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.497937', 'step': 1400, 'epoch': 2} {'type': 'loss', 'content': 0.012263186275959015, 'timestamp': '2025-09-30 22:10:53.501353', 'step': 1401, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:53.543767', 'step': 1401, 'epoch': 2} {'type': 'loss', 'content': 0.011834419332444668, 'timestamp': '2025-09-30 22:10:53.546308', 'step': 1402, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.580114', 'step': 1402, 'epoch': 2} {'type': 'loss', 'content': 0.008952699601650238, 'timestamp': '2025-09-30 22:10:53.582584', 'step': 1403, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:53.621201', 'step': 1403, 'epoch': 2} {'type': 'loss', 'content': 0.017776764929294586, 'timestamp': '2025-09-30 22:10:53.650322', 'step': 1404, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.696311', 'step': 1404, 'epoch': 2} {'type': 'loss', 'content': 0.01339375227689743, 'timestamp': '2025-09-30 22:10:53.702887', 'step': 1405, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.738280', 'step': 1405, 'epoch': 2} {'type': 'loss', 'content': 0.02056540735065937, 'timestamp': '2025-09-30 22:10:53.742047', 'step': 1406, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:53.777952', 'step': 1406, 'epoch': 2} {'type': 'loss', 'content': 0.0029556031804531813, 'timestamp': '2025-09-30 22:10:53.788392', 'step': 1407, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.823234', 'step': 1407, 'epoch': 2} {'type': 'loss', 'content': 0.022973816841840744, 'timestamp': '2025-09-30 22:10:53.848310', 'step': 1408, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:53.892283', 'step': 1408, 'epoch': 2} {'type': 'loss', 'content': 0.018975449725985527, 'timestamp': '2025-09-30 22:10:53.895142', 'step': 1409, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.930629', 'step': 1409, 'epoch': 2} {'type': 'loss', 'content': 0.0014696232974529266, 'timestamp': '2025-09-30 22:10:53.942499', 'step': 1410, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:53.982401', 'step': 1410, 'epoch': 2} {'type': 'loss', 'content': 0.013965344987809658, 'timestamp': '2025-09-30 22:10:53.985202', 'step': 1411, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:54.018772', 'step': 1411, 'epoch': 2} {'type': 'loss', 'content': 0.003091490129008889, 'timestamp': '2025-09-30 22:10:54.043665', 'step': 1412, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:54.077265', 'step': 1412, 'epoch': 2} {'type': 'loss', 'content': 0.003465210786089301, 'timestamp': '2025-09-30 22:10:54.080642', 'step': 1413, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:54.114294', 'step': 1413, 'epoch': 2} {'type': 'loss', 'content': 0.0012734970077872276, 'timestamp': '2025-09-30 22:10:54.117913', 'step': 1414, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:54.155608', 'step': 1414, 'epoch': 2} {'type': 'loss', 'content': 0.014250697568058968, 'timestamp': '2025-09-30 22:10:54.158252', 'step': 1415, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:54.191240', 'step': 1415, 'epoch': 2} {'type': 'loss', 'content': 0.007287542801350355, 'timestamp': '2025-09-30 22:10:54.215670', 'step': 1416, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:54.250943', 'step': 1416, 'epoch': 2} {'type': 'loss', 'content': 0.0006267238059081137, 'timestamp': '2025-09-30 22:10:54.255574', 'step': 1417, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:54.306179', 'step': 1417, 'epoch': 2} {'type': 'loss', 'content': 0.0023221937008202076, 'timestamp': '2025-09-30 22:10:54.314373', 'step': 1418, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:54.351413', 'step': 1418, 'epoch': 2} {'type': 'loss', 'content': 0.003638636786490679, 'timestamp': '2025-09-30 22:10:54.355848', 'step': 1419, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:54.393505', 'step': 1419, 'epoch': 2} {'type': 'loss', 'content': 0.006300566252321005, 'timestamp': '2025-09-30 22:10:54.431529', 'step': 1420, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:54.471257', 'step': 1420, 'epoch': 2} {'type': 'loss', 'content': 0.008367580361664295, 'timestamp': '2025-09-30 22:10:54.477559', 'step': 1421, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:54.525967', 'step': 1421, 'epoch': 2} {'type': 'loss', 'content': 0.0011981696588918567, 'timestamp': '2025-09-30 22:10:54.532343', 'step': 1422, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:54.570463', 'step': 1422, 'epoch': 2} {'type': 'loss', 'content': 0.04026754945516586, 'timestamp': '2025-09-30 22:10:54.574047', 'step': 1423, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:54.625052', 'step': 1423, 'epoch': 2} {'type': 'loss', 'content': 0.019819127395749092, 'timestamp': '2025-09-30 22:10:54.651277', 'step': 1424, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:54.695388', 'step': 1424, 'epoch': 2} {'type': 'loss', 'content': 0.02602306380867958, 'timestamp': '2025-09-30 22:10:54.703042', 'step': 1425, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:55.675171', 'step': 1425, 'epoch': 2} {'type': 'pplx', 'content': 74128275.07486075, 'timestamp': '2025-09-30 22:10:55.679938', 'step': 1425, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:55.713635', 'step': 1425, 'epoch': 2} {'type': 'loss', 'content': 0.0006774549256078899, 'timestamp': '2025-09-30 22:10:55.718167', 'step': 1426, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:55.757076', 'step': 1426, 'epoch': 2} {'type': 'loss', 'content': 0.00207727262750268, 'timestamp': '2025-09-30 22:10:55.759095', 'step': 1427, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:55.793027', 'step': 1427, 'epoch': 2} {'type': 'loss', 'content': 0.03250009939074516, 'timestamp': '2025-09-30 22:10:55.817911', 'step': 1428, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:55.856172', 'step': 1428, 'epoch': 2} {'type': 'loss', 'content': 0.0018771073082461953, 'timestamp': '2025-09-30 22:10:55.860216', 'step': 1429, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:55.902186', 'step': 1429, 'epoch': 2} {'type': 'loss', 'content': 0.010951467789709568, 'timestamp': '2025-09-30 22:10:55.915378', 'step': 1430, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:55.951685', 'step': 1430, 'epoch': 2} {'type': 'loss', 'content': 0.007400455418974161, 'timestamp': '2025-09-30 22:10:55.955913', 'step': 1431, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:55.990628', 'step': 1431, 'epoch': 2} {'type': 'loss', 'content': 0.004835831932723522, 'timestamp': '2025-09-30 22:10:56.015274', 'step': 1432, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:56.050286', 'step': 1432, 'epoch': 2} {'type': 'loss', 'content': 0.03025592677295208, 'timestamp': '2025-09-30 22:10:56.054517', 'step': 1433, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:56.095446', 'step': 1433, 'epoch': 2} {'type': 'loss', 'content': 0.01778782345354557, 'timestamp': '2025-09-30 22:10:56.099054', 'step': 1434, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:56.133860', 'step': 1434, 'epoch': 2} {'type': 'loss', 'content': 0.007224774919450283, 'timestamp': '2025-09-30 22:10:56.138054', 'step': 1435, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:56.181078', 'step': 1435, 'epoch': 2} {'type': 'loss', 'content': 0.0025212056934833527, 'timestamp': '2025-09-30 22:10:56.207729', 'step': 1436, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:56.241100', 'step': 1436, 'epoch': 2} {'type': 'loss', 'content': 0.016180910170078278, 'timestamp': '2025-09-30 22:10:56.243431', 'step': 1437, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:56.277385', 'step': 1437, 'epoch': 2} {'type': 'loss', 'content': 0.0036104407627135515, 'timestamp': '2025-09-30 22:10:56.280171', 'step': 1438, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:56.317387', 'step': 1438, 'epoch': 2} {'type': 'loss', 'content': 0.0056266081519424915, 'timestamp': '2025-09-30 22:10:56.320965', 'step': 1439, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:56.363989', 'step': 1439, 'epoch': 2} {'type': 'loss', 'content': 0.001078564440831542, 'timestamp': '2025-09-30 22:10:56.389753', 'step': 1440, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:56.430915', 'step': 1440, 'epoch': 2} {'type': 'loss', 'content': 0.002247942378744483, 'timestamp': '2025-09-30 22:10:56.434260', 'step': 1441, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:56.468405', 'step': 1441, 'epoch': 2} {'type': 'loss', 'content': 0.0018794239731505513, 'timestamp': '2025-09-30 22:10:56.471789', 'step': 1442, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:56.509237', 'step': 1442, 'epoch': 2} {'type': 'loss', 'content': 0.011743737384676933, 'timestamp': '2025-09-30 22:10:56.512522', 'step': 1443, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:56.567355', 'step': 1443, 'epoch': 2} {'type': 'loss', 'content': 0.001445922302082181, 'timestamp': '2025-09-30 22:10:56.591852', 'step': 1444, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:56.636602', 'step': 1444, 'epoch': 2} {'type': 'loss', 'content': 0.0063680075109004974, 'timestamp': '2025-09-30 22:10:56.640191', 'step': 1445, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:56.673782', 'step': 1445, 'epoch': 2} {'type': 'loss', 'content': 0.005665498320013285, 'timestamp': '2025-09-30 22:10:56.676828', 'step': 1446, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:56.717952', 'step': 1446, 'epoch': 2} {'type': 'loss', 'content': 0.005664634983986616, 'timestamp': '2025-09-30 22:10:56.721479', 'step': 1447, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:56.762062', 'step': 1447, 'epoch': 2} {'type': 'loss', 'content': 0.006698861718177795, 'timestamp': '2025-09-30 22:10:56.800454', 'step': 1448, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:56.838238', 'step': 1448, 'epoch': 2} {'type': 'loss', 'content': 0.018983641639351845, 'timestamp': '2025-09-30 22:10:56.843558', 'step': 1449, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:56.896188', 'step': 1449, 'epoch': 2} {'type': 'loss', 'content': 0.0048040165565907955, 'timestamp': '2025-09-30 22:10:56.899361', 'step': 1450, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:56.949274', 'step': 1450, 'epoch': 2} {'type': 'loss', 'content': 0.012130377814173698, 'timestamp': '2025-09-30 22:10:56.952827', 'step': 1451, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:57.004081', 'step': 1451, 'epoch': 2} {'type': 'loss', 'content': 0.0032708682119846344, 'timestamp': '2025-09-30 22:10:57.030876', 'step': 1452, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:57.074324', 'step': 1452, 'epoch': 2} {'type': 'loss', 'content': 0.0012729617301374674, 'timestamp': '2025-09-30 22:10:57.079911', 'step': 1453, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:57.116884', 'step': 1453, 'epoch': 2} {'type': 'loss', 'content': 0.010357841849327087, 'timestamp': '2025-09-30 22:10:57.121268', 'step': 1454, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:57.158876', 'step': 1454, 'epoch': 2} {'type': 'loss', 'content': 0.017564745619893074, 'timestamp': '2025-09-30 22:10:57.166280', 'step': 1455, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:57.203792', 'step': 1455, 'epoch': 2} {'type': 'loss', 'content': 0.018127692863345146, 'timestamp': '2025-09-30 22:10:57.229463', 'step': 1456, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:57.274613', 'step': 1456, 'epoch': 2} {'type': 'loss', 'content': 0.004254224244505167, 'timestamp': '2025-09-30 22:10:57.277492', 'step': 1457, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:57.325013', 'step': 1457, 'epoch': 2} {'type': 'loss', 'content': 0.011064024642109871, 'timestamp': '2025-09-30 22:10:57.327995', 'step': 1458, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:57.364112', 'step': 1458, 'epoch': 2} {'type': 'loss', 'content': 0.016593145206570625, 'timestamp': '2025-09-30 22:10:57.374390', 'step': 1459, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:57.425112', 'step': 1459, 'epoch': 2} {'type': 'loss', 'content': 0.0010914544109255075, 'timestamp': '2025-09-30 22:10:57.458658', 'step': 1460, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:57.502887', 'step': 1460, 'epoch': 2} {'type': 'loss', 'content': 0.049089811742305756, 'timestamp': '2025-09-30 22:10:57.507490', 'step': 1461, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:57.548571', 'step': 1461, 'epoch': 2} {'type': 'loss', 'content': 0.014249107800424099, 'timestamp': '2025-09-30 22:10:57.552909', 'step': 1462, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:57.589106', 'step': 1462, 'epoch': 2} {'type': 'loss', 'content': 0.018402233719825745, 'timestamp': '2025-09-30 22:10:57.593388', 'step': 1463, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:57.627742', 'step': 1463, 'epoch': 2} {'type': 'loss', 'content': 0.023724442347884178, 'timestamp': '2025-09-30 22:10:57.653649', 'step': 1464, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:57.688946', 'step': 1464, 'epoch': 2} {'type': 'loss', 'content': 0.011793600395321846, 'timestamp': '2025-09-30 22:10:57.694266', 'step': 1465, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:57.736332', 'step': 1465, 'epoch': 2} {'type': 'loss', 'content': 0.025359628722071648, 'timestamp': '2025-09-30 22:10:57.740096', 'step': 1466, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:57.775963', 'step': 1466, 'epoch': 2} {'type': 'loss', 'content': 0.036889225244522095, 'timestamp': '2025-09-30 22:10:57.780574', 'step': 1467, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:57.818671', 'step': 1467, 'epoch': 2} {'type': 'loss', 'content': 0.012519911862909794, 'timestamp': '2025-09-30 22:10:57.843725', 'step': 1468, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:57.877286', 'step': 1468, 'epoch': 2} {'type': 'loss', 'content': 0.009341800585389137, 'timestamp': '2025-09-30 22:10:57.885916', 'step': 1469, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:57.922661', 'step': 1469, 'epoch': 2} {'type': 'loss', 'content': 0.030676299706101418, 'timestamp': '2025-09-30 22:10:57.925898', 'step': 1470, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:57.960961', 'step': 1470, 'epoch': 2} {'type': 'loss', 'content': 0.02009420283138752, 'timestamp': '2025-09-30 22:10:57.966077', 'step': 1471, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:58.001792', 'step': 1471, 'epoch': 2} {'type': 'loss', 'content': 0.010448752902448177, 'timestamp': '2025-09-30 22:10:58.027635', 'step': 1472, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:58.069987', 'step': 1472, 'epoch': 2} {'type': 'loss', 'content': 0.0042487201280891895, 'timestamp': '2025-09-30 22:10:58.075419', 'step': 1473, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:58.109681', 'step': 1473, 'epoch': 2} {'type': 'loss', 'content': 0.006884237285703421, 'timestamp': '2025-09-30 22:10:58.113516', 'step': 1474, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:58.155343', 'step': 1474, 'epoch': 2} {'type': 'loss', 'content': 0.016830969601869583, 'timestamp': '2025-09-30 22:10:58.158537', 'step': 1475, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:58.194099', 'step': 1475, 'epoch': 2} {'type': 'loss', 'content': 0.013651719316840172, 'timestamp': '2025-09-30 22:10:58.230067', 'step': 1476, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:10:58.276300', 'step': 1476, 'epoch': 2} {'type': 'loss', 'content': 0.0026586789172142744, 'timestamp': '2025-09-30 22:10:58.284168', 'step': 1477, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:58.319931', 'step': 1477, 'epoch': 2} {'type': 'loss', 'content': 0.010279414243996143, 'timestamp': '2025-09-30 22:10:58.322821', 'step': 1478, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:58.356679', 'step': 1478, 'epoch': 2} {'type': 'loss', 'content': 0.020368138328194618, 'timestamp': '2025-09-30 22:10:58.360343', 'step': 1479, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:58.397661', 'step': 1479, 'epoch': 2} {'type': 'loss', 'content': 0.008841685950756073, 'timestamp': '2025-09-30 22:10:58.423122', 'step': 1480, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:58.456754', 'step': 1480, 'epoch': 2} {'type': 'loss', 'content': 0.01653965562582016, 'timestamp': '2025-09-30 22:10:58.460684', 'step': 1481, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:10:58.493653', 'step': 1481, 'epoch': 2} {'type': 'loss', 'content': 0.013136975467205048, 'timestamp': '2025-09-30 22:10:58.496941', 'step': 1482, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:10:59.623423', 'step': 1482, 'epoch': 2} {'type': 'pplx', 'content': 73202307.14203984, 'timestamp': '2025-09-30 22:10:59.628171', 'step': 1482, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:59.667112', 'step': 1482, 'epoch': 2} {'type': 'loss', 'content': 0.02235250174999237, 'timestamp': '2025-09-30 22:10:59.669992', 'step': 1483, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:59.704190', 'step': 1483, 'epoch': 2} {'type': 'loss', 'content': 0.004239015281200409, 'timestamp': '2025-09-30 22:10:59.730320', 'step': 1484, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:10:59.766635', 'step': 1484, 'epoch': 2} {'type': 'loss', 'content': 0.027366241440176964, 'timestamp': '2025-09-30 22:10:59.773771', 'step': 1485, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:59.811244', 'step': 1485, 'epoch': 2} {'type': 'loss', 'content': 0.010933603160083294, 'timestamp': '2025-09-30 22:10:59.822439', 'step': 1486, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:59.856790', 'step': 1486, 'epoch': 2} {'type': 'loss', 'content': 0.003180326195433736, 'timestamp': '2025-09-30 22:10:59.859920', 'step': 1487, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:59.894794', 'step': 1487, 'epoch': 2} {'type': 'loss', 'content': 0.00700030755251646, 'timestamp': '2025-09-30 22:10:59.928974', 'step': 1488, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:10:59.972007', 'step': 1488, 'epoch': 2} {'type': 'loss', 'content': 0.010965103283524513, 'timestamp': '2025-09-30 22:10:59.976528', 'step': 1489, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:00.019635', 'step': 1489, 'epoch': 2} {'type': 'loss', 'content': 0.022053999826312065, 'timestamp': '2025-09-30 22:11:00.023580', 'step': 1490, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:00.065304', 'step': 1490, 'epoch': 2} {'type': 'loss', 'content': 0.019200291484594345, 'timestamp': '2025-09-30 22:11:00.067944', 'step': 1491, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:00.102150', 'step': 1491, 'epoch': 2} {'type': 'loss', 'content': 0.009958348236978054, 'timestamp': '2025-09-30 22:11:00.127080', 'step': 1492, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:00.159252', 'step': 1492, 'epoch': 2} {'type': 'loss', 'content': 0.006062771193683147, 'timestamp': '2025-09-30 22:11:00.162722', 'step': 1493, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:00.205627', 'step': 1493, 'epoch': 2} {'type': 'loss', 'content': 0.013420102186501026, 'timestamp': '2025-09-30 22:11:00.209482', 'step': 1494, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:11:00.251997', 'step': 1494, 'epoch': 2} {'type': 'loss', 'content': 0.03325999528169632, 'timestamp': '2025-09-30 22:11:00.254501', 'step': 1495, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:00.289027', 'step': 1495, 'epoch': 2} {'type': 'loss', 'content': 0.015703823417425156, 'timestamp': '2025-09-30 22:11:00.314585', 'step': 1496, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:00.356126', 'step': 1496, 'epoch': 2} {'type': 'loss', 'content': 0.008002055808901787, 'timestamp': '2025-09-30 22:11:00.365368', 'step': 1497, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:00.408875', 'step': 1497, 'epoch': 2} {'type': 'loss', 'content': 0.007439378648996353, 'timestamp': '2025-09-30 22:11:00.412191', 'step': 1498, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:00.445336', 'step': 1498, 'epoch': 2} {'type': 'loss', 'content': 0.014068402349948883, 'timestamp': '2025-09-30 22:11:00.449638', 'step': 1499, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:00.483663', 'step': 1499, 'epoch': 2} {'type': 'loss', 'content': 0.017416151240468025, 'timestamp': '2025-09-30 22:11:00.509566', 'step': 1500, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1500', 'timestamp': '2025-09-30 22:11:07.564602', 'step': 1500, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:07.602746', 'step': 1500, 'epoch': 2} {'type': 'loss', 'content': 0.009199053980410099, 'timestamp': '2025-09-30 22:11:07.606406', 'step': 1501, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:07.649605', 'step': 1501, 'epoch': 2} {'type': 'loss', 'content': 0.007240265142172575, 'timestamp': '2025-09-30 22:11:07.659750', 'step': 1502, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:07.698385', 'step': 1502, 'epoch': 2} {'type': 'loss', 'content': 0.007249581627547741, 'timestamp': '2025-09-30 22:11:07.703753', 'step': 1503, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:07.739018', 'step': 1503, 'epoch': 2} {'type': 'loss', 'content': 0.009428039193153381, 'timestamp': '2025-09-30 22:11:07.765062', 'step': 1504, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:07.802485', 'step': 1504, 'epoch': 2} {'type': 'loss', 'content': 0.026344623416662216, 'timestamp': '2025-09-30 22:11:07.806883', 'step': 1505, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:07.841395', 'step': 1505, 'epoch': 2} {'type': 'loss', 'content': 0.02475654147565365, 'timestamp': '2025-09-30 22:11:07.844454', 'step': 1506, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:07.879920', 'step': 1506, 'epoch': 2} {'type': 'loss', 'content': 0.0358511246740818, 'timestamp': '2025-09-30 22:11:07.884344', 'step': 1507, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:11:07.924027', 'step': 1507, 'epoch': 2} {'type': 'loss', 'content': 0.014702022075653076, 'timestamp': '2025-09-30 22:11:07.953109', 'step': 1508, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:07.989471', 'step': 1508, 'epoch': 2} {'type': 'loss', 'content': 0.006700622383505106, 'timestamp': '2025-09-30 22:11:07.992665', 'step': 1509, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.030146', 'step': 1509, 'epoch': 2} {'type': 'loss', 'content': 0.023523863404989243, 'timestamp': '2025-09-30 22:11:08.034068', 'step': 1510, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:08.068480', 'step': 1510, 'epoch': 2} {'type': 'loss', 'content': 0.0009568893583491445, 'timestamp': '2025-09-30 22:11:08.071191', 'step': 1511, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.106603', 'step': 1511, 'epoch': 2} {'type': 'loss', 'content': 0.014375868253409863, 'timestamp': '2025-09-30 22:11:08.135573', 'step': 1512, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.180939', 'step': 1512, 'epoch': 2} {'type': 'loss', 'content': 0.018592536449432373, 'timestamp': '2025-09-30 22:11:08.189260', 'step': 1513, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:08.226936', 'step': 1513, 'epoch': 2} {'type': 'loss', 'content': 0.02132156677544117, 'timestamp': '2025-09-30 22:11:08.230631', 'step': 1514, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.267211', 'step': 1514, 'epoch': 2} {'type': 'loss', 'content': 0.008908395655453205, 'timestamp': '2025-09-30 22:11:08.271990', 'step': 1515, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:08.304478', 'step': 1515, 'epoch': 2} {'type': 'loss', 'content': 0.06693150103092194, 'timestamp': '2025-09-30 22:11:08.329790', 'step': 1516, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.366876', 'step': 1516, 'epoch': 2} {'type': 'loss', 'content': 0.005975159350782633, 'timestamp': '2025-09-30 22:11:08.376887', 'step': 1517, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.415450', 'step': 1517, 'epoch': 2} {'type': 'loss', 'content': 0.02259109355509281, 'timestamp': '2025-09-30 22:11:08.419032', 'step': 1518, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.458249', 'step': 1518, 'epoch': 2} {'type': 'loss', 'content': 0.011762427166104317, 'timestamp': '2025-09-30 22:11:08.461973', 'step': 1519, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.496108', 'step': 1519, 'epoch': 2} {'type': 'loss', 'content': 0.0023320745676755905, 'timestamp': '2025-09-30 22:11:08.520023', 'step': 1520, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.553310', 'step': 1520, 'epoch': 2} {'type': 'loss', 'content': 0.011356505565345287, 'timestamp': '2025-09-30 22:11:08.556226', 'step': 1521, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.599418', 'step': 1521, 'epoch': 2} {'type': 'loss', 'content': 0.007964258082211018, 'timestamp': '2025-09-30 22:11:08.603768', 'step': 1522, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:08.639326', 'step': 1522, 'epoch': 2} {'type': 'loss', 'content': 0.009889090433716774, 'timestamp': '2025-09-30 22:11:08.646238', 'step': 1523, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.677783', 'step': 1523, 'epoch': 2} {'type': 'loss', 'content': 0.012808294966816902, 'timestamp': '2025-09-30 22:11:08.703500', 'step': 1524, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:08.738573', 'step': 1524, 'epoch': 2} {'type': 'loss', 'content': 0.018992161378264427, 'timestamp': '2025-09-30 22:11:08.741983', 'step': 1525, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.775922', 'step': 1525, 'epoch': 2} {'type': 'loss', 'content': 0.014097621664404869, 'timestamp': '2025-09-30 22:11:08.779284', 'step': 1526, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:08.821142', 'step': 1526, 'epoch': 2} {'type': 'loss', 'content': 0.014756182208657265, 'timestamp': '2025-09-30 22:11:08.825175', 'step': 1527, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.859641', 'step': 1527, 'epoch': 2} {'type': 'loss', 'content': 0.023549159988760948, 'timestamp': '2025-09-30 22:11:08.888787', 'step': 1528, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.927774', 'step': 1528, 'epoch': 2} {'type': 'loss', 'content': 0.030239734798669815, 'timestamp': '2025-09-30 22:11:08.930176', 'step': 1529, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:08.963505', 'step': 1529, 'epoch': 2} {'type': 'loss', 'content': 0.03541674092411995, 'timestamp': '2025-09-30 22:11:08.966184', 'step': 1530, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:09.000340', 'step': 1530, 'epoch': 2} {'type': 'loss', 'content': 0.009105728007853031, 'timestamp': '2025-09-30 22:11:09.011528', 'step': 1531, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:09.055899', 'step': 1531, 'epoch': 2} {'type': 'loss', 'content': 0.02031215839087963, 'timestamp': '2025-09-30 22:11:09.080691', 'step': 1532, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:09.115100', 'step': 1532, 'epoch': 2} {'type': 'loss', 'content': 0.01983821578323841, 'timestamp': '2025-09-30 22:11:09.117831', 'step': 1533, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:09.150292', 'step': 1533, 'epoch': 2} {'type': 'loss', 'content': 0.0378604419529438, 'timestamp': '2025-09-30 22:11:09.152582', 'step': 1534, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:09.192938', 'step': 1534, 'epoch': 2} {'type': 'loss', 'content': 0.02909739501774311, 'timestamp': '2025-09-30 22:11:09.196176', 'step': 1535, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:09.229038', 'step': 1535, 'epoch': 2} {'type': 'loss', 'content': 0.014408699236810207, 'timestamp': '2025-09-30 22:11:09.252833', 'step': 1536, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:09.284511', 'step': 1536, 'epoch': 2} {'type': 'loss', 'content': 0.0065101939253509045, 'timestamp': '2025-09-30 22:11:09.286804', 'step': 1537, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:09.317880', 'step': 1537, 'epoch': 2} {'type': 'loss', 'content': 0.03613752871751785, 'timestamp': '2025-09-30 22:11:09.320205', 'step': 1538, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:09.352637', 'step': 1538, 'epoch': 2} {'type': 'loss', 'content': 0.0036878264509141445, 'timestamp': '2025-09-30 22:11:09.355902', 'step': 1539, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:10.286447', 'step': 1539, 'epoch': 2} {'type': 'pplx', 'content': 62936644.35703027, 'timestamp': '2025-09-30 22:11:10.289514', 'step': 1539, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:10.319946', 'step': 1539, 'epoch': 2} {'type': 'loss', 'content': 0.011266936548054218, 'timestamp': '2025-09-30 22:11:10.343940', 'step': 1540, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:10.379171', 'step': 1540, 'epoch': 2} {'type': 'loss', 'content': 0.026096221059560776, 'timestamp': '2025-09-30 22:11:10.382644', 'step': 1541, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:10.417284', 'step': 1541, 'epoch': 2} {'type': 'loss', 'content': 0.01162233017385006, 'timestamp': '2025-09-30 22:11:10.421008', 'step': 1542, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:10.453427', 'step': 1542, 'epoch': 2} {'type': 'loss', 'content': 0.001982102869078517, 'timestamp': '2025-09-30 22:11:10.459918', 'step': 1543, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:10.493221', 'step': 1543, 'epoch': 2} {'type': 'loss', 'content': 0.0030516337137669325, 'timestamp': '2025-09-30 22:11:10.517291', 'step': 1544, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:10.552001', 'step': 1544, 'epoch': 2} {'type': 'loss', 'content': 0.003602204378694296, 'timestamp': '2025-09-30 22:11:10.555139', 'step': 1545, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:10.597185', 'step': 1545, 'epoch': 2} {'type': 'loss', 'content': 0.05414646863937378, 'timestamp': '2025-09-30 22:11:10.601437', 'step': 1546, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:10.642372', 'step': 1546, 'epoch': 2} {'type': 'loss', 'content': 0.04593129828572273, 'timestamp': '2025-09-30 22:11:10.644436', 'step': 1547, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:10.678240', 'step': 1547, 'epoch': 2} {'type': 'loss', 'content': 0.005031104665249586, 'timestamp': '2025-09-30 22:11:10.703129', 'step': 1548, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:10.739465', 'step': 1548, 'epoch': 2} {'type': 'loss', 'content': 0.001814266317524016, 'timestamp': '2025-09-30 22:11:10.742847', 'step': 1549, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:10.781075', 'step': 1549, 'epoch': 2} {'type': 'loss', 'content': 0.0005496439407579601, 'timestamp': '2025-09-30 22:11:10.784690', 'step': 1550, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:10.838220', 'step': 1550, 'epoch': 2} {'type': 'loss', 'content': 0.009547079913318157, 'timestamp': '2025-09-30 22:11:10.840886', 'step': 1551, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:10.876943', 'step': 1551, 'epoch': 2} {'type': 'loss', 'content': 0.031712211668491364, 'timestamp': '2025-09-30 22:11:10.902334', 'step': 1552, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:10.951313', 'step': 1552, 'epoch': 2} {'type': 'loss', 'content': 0.00807731319218874, 'timestamp': '2025-09-30 22:11:10.954474', 'step': 1553, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:10.992470', 'step': 1553, 'epoch': 2} {'type': 'loss', 'content': 0.00021573407866526395, 'timestamp': '2025-09-30 22:11:10.995339', 'step': 1554, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.027708', 'step': 1554, 'epoch': 2} {'type': 'loss', 'content': 0.043561168015003204, 'timestamp': '2025-09-30 22:11:11.030173', 'step': 1555, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.063479', 'step': 1555, 'epoch': 2} {'type': 'loss', 'content': 0.03427037596702576, 'timestamp': '2025-09-30 22:11:11.092796', 'step': 1556, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.125340', 'step': 1556, 'epoch': 2} {'type': 'loss', 'content': 0.0011238664155825973, 'timestamp': '2025-09-30 22:11:11.133735', 'step': 1557, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.167435', 'step': 1557, 'epoch': 2} {'type': 'loss', 'content': 0.005172597710043192, 'timestamp': '2025-09-30 22:11:11.170400', 'step': 1558, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:11.208142', 'step': 1558, 'epoch': 2} {'type': 'loss', 'content': 0.007295799907296896, 'timestamp': '2025-09-30 22:11:11.210944', 'step': 1559, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.248023', 'step': 1559, 'epoch': 2} {'type': 'loss', 'content': 0.0017937627853825688, 'timestamp': '2025-09-30 22:11:11.273349', 'step': 1560, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:11.307673', 'step': 1560, 'epoch': 2} {'type': 'loss', 'content': 0.026663122698664665, 'timestamp': '2025-09-30 22:11:11.310766', 'step': 1561, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.343619', 'step': 1561, 'epoch': 2} {'type': 'loss', 'content': 0.00295668700709939, 'timestamp': '2025-09-30 22:11:11.346685', 'step': 1562, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.379192', 'step': 1562, 'epoch': 2} {'type': 'loss', 'content': 0.01714220829308033, 'timestamp': '2025-09-30 22:11:11.385761', 'step': 1563, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:11.423772', 'step': 1563, 'epoch': 2} {'type': 'loss', 'content': 0.009915829636156559, 'timestamp': '2025-09-30 22:11:11.447542', 'step': 1564, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:11.479176', 'step': 1564, 'epoch': 2} {'type': 'loss', 'content': 0.00681846309453249, 'timestamp': '2025-09-30 22:11:11.481540', 'step': 1565, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:11.512410', 'step': 1565, 'epoch': 2} {'type': 'loss', 'content': 0.006417275872081518, 'timestamp': '2025-09-30 22:11:11.514536', 'step': 1566, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:11.558353', 'step': 1566, 'epoch': 2} {'type': 'loss', 'content': 0.0019149180734530091, 'timestamp': '2025-09-30 22:11:11.560817', 'step': 1567, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.598740', 'step': 1567, 'epoch': 2} {'type': 'loss', 'content': 0.01795736886560917, 'timestamp': '2025-09-30 22:11:11.635532', 'step': 1568, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.676499', 'step': 1568, 'epoch': 2} {'type': 'loss', 'content': 0.006271344609558582, 'timestamp': '2025-09-30 22:11:11.679602', 'step': 1569, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.718807', 'step': 1569, 'epoch': 2} {'type': 'loss', 'content': 0.011765302158892155, 'timestamp': '2025-09-30 22:11:11.722056', 'step': 1570, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.758770', 'step': 1570, 'epoch': 2} {'type': 'loss', 'content': 0.01601121947169304, 'timestamp': '2025-09-30 22:11:11.762378', 'step': 1571, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:11.799696', 'step': 1571, 'epoch': 2} {'type': 'loss', 'content': 0.005938879679888487, 'timestamp': '2025-09-30 22:11:11.833897', 'step': 1572, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.881204', 'step': 1572, 'epoch': 2} {'type': 'loss', 'content': 0.0033138389699161053, 'timestamp': '2025-09-30 22:11:11.884875', 'step': 1573, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:11.922584', 'step': 1573, 'epoch': 2} {'type': 'loss', 'content': 0.0007845753571018577, 'timestamp': '2025-09-30 22:11:11.925965', 'step': 1574, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:11.962022', 'step': 1574, 'epoch': 2} {'type': 'loss', 'content': 0.012685197405517101, 'timestamp': '2025-09-30 22:11:11.965371', 'step': 1575, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:11.998559', 'step': 1575, 'epoch': 2} {'type': 'loss', 'content': 0.002849193289875984, 'timestamp': '2025-09-30 22:11:12.023634', 'step': 1576, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.058480', 'step': 1576, 'epoch': 2} {'type': 'loss', 'content': 0.008535566739737988, 'timestamp': '2025-09-30 22:11:12.067109', 'step': 1577, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.113632', 'step': 1577, 'epoch': 2} {'type': 'loss', 'content': 0.019363412633538246, 'timestamp': '2025-09-30 22:11:12.116527', 'step': 1578, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.156439', 'step': 1578, 'epoch': 2} {'type': 'loss', 'content': 0.0015924114268273115, 'timestamp': '2025-09-30 22:11:12.159202', 'step': 1579, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.199275', 'step': 1579, 'epoch': 2} {'type': 'loss', 'content': 0.00825516413897276, 'timestamp': '2025-09-30 22:11:12.224151', 'step': 1580, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:12.258110', 'step': 1580, 'epoch': 2} {'type': 'loss', 'content': 0.01482932548969984, 'timestamp': '2025-09-30 22:11:12.261429', 'step': 1581, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.301157', 'step': 1581, 'epoch': 2} {'type': 'loss', 'content': 0.004535818938165903, 'timestamp': '2025-09-30 22:11:12.303799', 'step': 1582, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.335762', 'step': 1582, 'epoch': 2} {'type': 'loss', 'content': 0.007813365198671818, 'timestamp': '2025-09-30 22:11:12.339349', 'step': 1583, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.372781', 'step': 1583, 'epoch': 2} {'type': 'loss', 'content': 0.011519812047481537, 'timestamp': '2025-09-30 22:11:12.397241', 'step': 1584, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.437885', 'step': 1584, 'epoch': 2} {'type': 'loss', 'content': 0.011441444046795368, 'timestamp': '2025-09-30 22:11:12.446962', 'step': 1585, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.484675', 'step': 1585, 'epoch': 2} {'type': 'loss', 'content': 0.02742001973092556, 'timestamp': '2025-09-30 22:11:12.488882', 'step': 1586, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:12.535036', 'step': 1586, 'epoch': 2} {'type': 'loss', 'content': 0.04134129732847214, 'timestamp': '2025-09-30 22:11:12.538544', 'step': 1587, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.586707', 'step': 1587, 'epoch': 2} {'type': 'loss', 'content': 0.0030712636653333902, 'timestamp': '2025-09-30 22:11:12.611174', 'step': 1588, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.655183', 'step': 1588, 'epoch': 2} {'type': 'loss', 'content': 0.009427117183804512, 'timestamp': '2025-09-30 22:11:12.669240', 'step': 1589, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.711615', 'step': 1589, 'epoch': 2} {'type': 'loss', 'content': 0.020526956766843796, 'timestamp': '2025-09-30 22:11:12.714242', 'step': 1590, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.751598', 'step': 1590, 'epoch': 2} {'type': 'loss', 'content': 0.03806446120142937, 'timestamp': '2025-09-30 22:11:12.755662', 'step': 1591, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.793941', 'step': 1591, 'epoch': 2} {'type': 'loss', 'content': 0.060422275215387344, 'timestamp': '2025-09-30 22:11:12.817908', 'step': 1592, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.861960', 'step': 1592, 'epoch': 2} {'type': 'loss', 'content': 0.008056329563260078, 'timestamp': '2025-09-30 22:11:12.865140', 'step': 1593, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:12.912359', 'step': 1593, 'epoch': 2} {'type': 'loss', 'content': 0.007798563688993454, 'timestamp': '2025-09-30 22:11:12.916058', 'step': 1594, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:12.960644', 'step': 1594, 'epoch': 2} {'type': 'loss', 'content': 0.012549254111945629, 'timestamp': '2025-09-30 22:11:12.963167', 'step': 1595, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:13.006227', 'step': 1595, 'epoch': 2} {'type': 'loss', 'content': 0.019497841596603394, 'timestamp': '2025-09-30 22:11:13.030477', 'step': 1596, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:14.051495', 'step': 1596, 'epoch': 2} {'type': 'pplx', 'content': 47136353.19913118, 'timestamp': '2025-09-30 22:11:14.055979', 'step': 1596, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:14.085866', 'step': 1596, 'epoch': 2} {'type': 'loss', 'content': 0.006792944855988026, 'timestamp': '2025-09-30 22:11:14.094562', 'step': 1597, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:14.127910', 'step': 1597, 'epoch': 2} {'type': 'loss', 'content': 0.007558739744126797, 'timestamp': '2025-09-30 22:11:14.131625', 'step': 1598, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:14.168968', 'step': 1598, 'epoch': 2} {'type': 'loss', 'content': 0.01721787638962269, 'timestamp': '2025-09-30 22:11:14.172328', 'step': 1599, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:14.225494', 'step': 1599, 'epoch': 2} {'type': 'loss', 'content': 0.009689075872302055, 'timestamp': '2025-09-30 22:11:14.251163', 'step': 1600, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:14.285090', 'step': 1600, 'epoch': 2} {'type': 'loss', 'content': 0.0035173031501471996, 'timestamp': '2025-09-30 22:11:14.292633', 'step': 1601, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:14.327312', 'step': 1601, 'epoch': 2} {'type': 'loss', 'content': 0.026356985792517662, 'timestamp': '2025-09-30 22:11:14.330429', 'step': 1602, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:14.368250', 'step': 1602, 'epoch': 2} {'type': 'loss', 'content': 0.018740003928542137, 'timestamp': '2025-09-30 22:11:14.371954', 'step': 1603, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:14.408822', 'step': 1603, 'epoch': 2} {'type': 'loss', 'content': 0.001994580263271928, 'timestamp': '2025-09-30 22:11:14.445054', 'step': 1604, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:14.480880', 'step': 1604, 'epoch': 2} {'type': 'loss', 'content': 0.005098961293697357, 'timestamp': '2025-09-30 22:11:14.483469', 'step': 1605, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:14.525716', 'step': 1605, 'epoch': 2} {'type': 'loss', 'content': 0.032878194004297256, 'timestamp': '2025-09-30 22:11:14.528116', 'step': 1606, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:14.562398', 'step': 1606, 'epoch': 2} {'type': 'loss', 'content': 0.007509057410061359, 'timestamp': '2025-09-30 22:11:14.565653', 'step': 1607, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:14.598373', 'step': 1607, 'epoch': 2} {'type': 'loss', 'content': 0.006099659949541092, 'timestamp': '2025-09-30 22:11:14.622445', 'step': 1608, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:14.657197', 'step': 1608, 'epoch': 2} {'type': 'loss', 'content': 0.007134649902582169, 'timestamp': '2025-09-30 22:11:14.659747', 'step': 1609, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:14.701744', 'step': 1609, 'epoch': 2} {'type': 'loss', 'content': 0.02740248665213585, 'timestamp': '2025-09-30 22:11:14.705235', 'step': 1610, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:14.742201', 'step': 1610, 'epoch': 2} {'type': 'loss', 'content': 0.037114981561899185, 'timestamp': '2025-09-30 22:11:14.745149', 'step': 1611, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:14.795904', 'step': 1611, 'epoch': 2} {'type': 'loss', 'content': 0.004330487456172705, 'timestamp': '2025-09-30 22:11:14.826921', 'step': 1612, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:14.860475', 'step': 1612, 'epoch': 2} {'type': 'loss', 'content': 0.014859907329082489, 'timestamp': '2025-09-30 22:11:14.863817', 'step': 1613, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:14.897312', 'step': 1613, 'epoch': 2} {'type': 'loss', 'content': 0.006890242453664541, 'timestamp': '2025-09-30 22:11:14.900374', 'step': 1614, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:14.934109', 'step': 1614, 'epoch': 2} {'type': 'loss', 'content': 0.004319236613810062, 'timestamp': '2025-09-30 22:11:14.936652', 'step': 1615, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:14.973253', 'step': 1615, 'epoch': 2} {'type': 'loss', 'content': 0.020764382556080818, 'timestamp': '2025-09-30 22:11:14.999435', 'step': 1616, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:15.032944', 'step': 1616, 'epoch': 2} {'type': 'loss', 'content': 0.017804834991693497, 'timestamp': '2025-09-30 22:11:15.035453', 'step': 1617, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:15.069228', 'step': 1617, 'epoch': 2} {'type': 'loss', 'content': 0.003709944197908044, 'timestamp': '2025-09-30 22:11:15.076923', 'step': 1618, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.116441', 'step': 1618, 'epoch': 2} {'type': 'loss', 'content': 0.010521622374653816, 'timestamp': '2025-09-30 22:11:15.122817', 'step': 1619, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.157877', 'step': 1619, 'epoch': 2} {'type': 'loss', 'content': 0.01603495515882969, 'timestamp': '2025-09-30 22:11:15.182859', 'step': 1620, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:15.232127', 'step': 1620, 'epoch': 2} {'type': 'loss', 'content': 0.0070645250380039215, 'timestamp': '2025-09-30 22:11:15.236730', 'step': 1621, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:15.271333', 'step': 1621, 'epoch': 2} {'type': 'loss', 'content': 0.009963012300431728, 'timestamp': '2025-09-30 22:11:15.274460', 'step': 1622, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:15.308615', 'step': 1622, 'epoch': 2} {'type': 'loss', 'content': 0.001885375240817666, 'timestamp': '2025-09-30 22:11:15.312665', 'step': 1623, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.358125', 'step': 1623, 'epoch': 2} {'type': 'loss', 'content': 0.0027446283493191004, 'timestamp': '2025-09-30 22:11:15.382140', 'step': 1624, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.426866', 'step': 1624, 'epoch': 2} {'type': 'loss', 'content': 0.010737799108028412, 'timestamp': '2025-09-30 22:11:15.433567', 'step': 1625, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.465717', 'step': 1625, 'epoch': 2} {'type': 'loss', 'content': 0.0061384267173707485, 'timestamp': '2025-09-30 22:11:15.468549', 'step': 1626, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.506781', 'step': 1626, 'epoch': 2} {'type': 'loss', 'content': 0.04606388881802559, 'timestamp': '2025-09-30 22:11:15.511864', 'step': 1627, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:11:15.547568', 'step': 1627, 'epoch': 2} {'type': 'loss', 'content': 0.007767940405756235, 'timestamp': '2025-09-30 22:11:15.583697', 'step': 1628, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.617768', 'step': 1628, 'epoch': 2} {'type': 'loss', 'content': 0.017403727397322655, 'timestamp': '2025-09-30 22:11:15.622463', 'step': 1629, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.658000', 'step': 1629, 'epoch': 2} {'type': 'loss', 'content': 0.008258694782853127, 'timestamp': '2025-09-30 22:11:15.662312', 'step': 1630, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.697829', 'step': 1630, 'epoch': 2} {'type': 'loss', 'content': 0.008111465722322464, 'timestamp': '2025-09-30 22:11:15.701658', 'step': 1631, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:15.741527', 'step': 1631, 'epoch': 2} {'type': 'loss', 'content': 0.03149692341685295, 'timestamp': '2025-09-30 22:11:15.765403', 'step': 1632, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.799399', 'step': 1632, 'epoch': 2} {'type': 'loss', 'content': 0.01508941687643528, 'timestamp': '2025-09-30 22:11:15.802435', 'step': 1633, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.843164', 'step': 1633, 'epoch': 2} {'type': 'loss', 'content': 0.007195001933723688, 'timestamp': '2025-09-30 22:11:15.852408', 'step': 1634, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.887132', 'step': 1634, 'epoch': 2} {'type': 'loss', 'content': 0.009918026626110077, 'timestamp': '2025-09-30 22:11:15.890909', 'step': 1635, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:15.925086', 'step': 1635, 'epoch': 2} {'type': 'loss', 'content': 0.00782895926386118, 'timestamp': '2025-09-30 22:11:15.949569', 'step': 1636, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:15.994565', 'step': 1636, 'epoch': 2} {'type': 'loss', 'content': 0.027636080980300903, 'timestamp': '2025-09-30 22:11:16.001397', 'step': 1637, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:16.036006', 'step': 1637, 'epoch': 2} {'type': 'loss', 'content': 0.0011494786012917757, 'timestamp': '2025-09-30 22:11:16.045307', 'step': 1638, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:16.083671', 'step': 1638, 'epoch': 2} {'type': 'loss', 'content': 0.0017236818093806505, 'timestamp': '2025-09-30 22:11:16.090881', 'step': 1639, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:16.122671', 'step': 1639, 'epoch': 2} {'type': 'loss', 'content': 0.0035358848981559277, 'timestamp': '2025-09-30 22:11:16.150610', 'step': 1640, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:16.201286', 'step': 1640, 'epoch': 2} {'type': 'loss', 'content': 0.023775506764650345, 'timestamp': '2025-09-30 22:11:16.205223', 'step': 1641, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:16.244562', 'step': 1641, 'epoch': 2} {'type': 'loss', 'content': 0.025196192786097527, 'timestamp': '2025-09-30 22:11:16.253204', 'step': 1642, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:16.288390', 'step': 1642, 'epoch': 2} {'type': 'loss', 'content': 0.006762804929167032, 'timestamp': '2025-09-30 22:11:16.291240', 'step': 1643, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:16.324656', 'step': 1643, 'epoch': 2} {'type': 'loss', 'content': 0.011221730150282383, 'timestamp': '2025-09-30 22:11:16.348986', 'step': 1644, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:16.386936', 'step': 1644, 'epoch': 2} {'type': 'loss', 'content': 0.004417662974447012, 'timestamp': '2025-09-30 22:11:16.389510', 'step': 1645, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:16.429199', 'step': 1645, 'epoch': 2} {'type': 'loss', 'content': 0.005136736668646336, 'timestamp': '2025-09-30 22:11:16.433166', 'step': 1646, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:16.475557', 'step': 1646, 'epoch': 2} {'type': 'loss', 'content': 0.010737181641161442, 'timestamp': '2025-09-30 22:11:16.478448', 'step': 1647, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:16.529688', 'step': 1647, 'epoch': 2} {'type': 'loss', 'content': 0.007168728858232498, 'timestamp': '2025-09-30 22:11:16.554022', 'step': 1648, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:16.588547', 'step': 1648, 'epoch': 2} {'type': 'loss', 'content': 0.02043146826326847, 'timestamp': '2025-09-30 22:11:16.595036', 'step': 1649, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:16.629907', 'step': 1649, 'epoch': 2} {'type': 'loss', 'content': 0.009257582947611809, 'timestamp': '2025-09-30 22:11:16.632516', 'step': 1650, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:16.669616', 'step': 1650, 'epoch': 2} {'type': 'loss', 'content': 0.004939699079841375, 'timestamp': '2025-09-30 22:11:16.675479', 'step': 1651, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:16.708352', 'step': 1651, 'epoch': 2} {'type': 'loss', 'content': 0.0030526297632604837, 'timestamp': '2025-09-30 22:11:16.732305', 'step': 1652, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:16.768422', 'step': 1652, 'epoch': 2} {'type': 'loss', 'content': 0.01626562885940075, 'timestamp': '2025-09-30 22:11:16.779295', 'step': 1653, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:17.777878', 'step': 1653, 'epoch': 2} {'type': 'pplx', 'content': 52579143.438625045, 'timestamp': '2025-09-30 22:11:17.783346', 'step': 1653, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:17.815788', 'step': 1653, 'epoch': 2} {'type': 'loss', 'content': 0.011779659427702427, 'timestamp': '2025-09-30 22:11:17.820263', 'step': 1654, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:17.853790', 'step': 1654, 'epoch': 2} {'type': 'loss', 'content': 0.005387573968619108, 'timestamp': '2025-09-30 22:11:17.858850', 'step': 1655, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:17.898917', 'step': 1655, 'epoch': 2} {'type': 'loss', 'content': 0.005358588416129351, 'timestamp': '2025-09-30 22:11:17.923976', 'step': 1656, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:17.966689', 'step': 1656, 'epoch': 2} {'type': 'loss', 'content': 0.025809431448578835, 'timestamp': '2025-09-30 22:11:17.971246', 'step': 1657, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:18.007610', 'step': 1657, 'epoch': 2} {'type': 'loss', 'content': 0.02251555025577545, 'timestamp': '2025-09-30 22:11:18.011159', 'step': 1658, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:18.054050', 'step': 1658, 'epoch': 2} {'type': 'loss', 'content': 0.0064919935539364815, 'timestamp': '2025-09-30 22:11:18.064814', 'step': 1659, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:18.104791', 'step': 1659, 'epoch': 2} {'type': 'loss', 'content': 0.004880402237176895, 'timestamp': '2025-09-30 22:11:18.129394', 'step': 1660, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:18.173429', 'step': 1660, 'epoch': 2} {'type': 'loss', 'content': 0.03059115633368492, 'timestamp': '2025-09-30 22:11:18.175741', 'step': 1661, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:18.209942', 'step': 1661, 'epoch': 2} {'type': 'loss', 'content': 0.01182292215526104, 'timestamp': '2025-09-30 22:11:18.213649', 'step': 1662, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:18.259001', 'step': 1662, 'epoch': 2} {'type': 'loss', 'content': 0.007961846888065338, 'timestamp': '2025-09-30 22:11:18.266025', 'step': 1663, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:18.305631', 'step': 1663, 'epoch': 2} {'type': 'loss', 'content': 0.0042456877417862415, 'timestamp': '2025-09-30 22:11:18.330757', 'step': 1664, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:18.363205', 'step': 1664, 'epoch': 2} {'type': 'loss', 'content': 0.011246146634221077, 'timestamp': '2025-09-30 22:11:18.365745', 'step': 1665, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:18.410565', 'step': 1665, 'epoch': 2} {'type': 'loss', 'content': 0.010035556741058826, 'timestamp': '2025-09-30 22:11:18.413063', 'step': 1666, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:18.446348', 'step': 1666, 'epoch': 2} {'type': 'loss', 'content': 0.015372427180409431, 'timestamp': '2025-09-30 22:11:18.449732', 'step': 1667, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:18.490277', 'step': 1667, 'epoch': 2} {'type': 'loss', 'content': 0.011080428957939148, 'timestamp': '2025-09-30 22:11:18.514457', 'step': 1668, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:18.555951', 'step': 1668, 'epoch': 2} {'type': 'loss', 'content': 0.03798171877861023, 'timestamp': '2025-09-30 22:11:18.560229', 'step': 1669, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:18.592950', 'step': 1669, 'epoch': 2} {'type': 'loss', 'content': 0.008265224285423756, 'timestamp': '2025-09-30 22:11:18.598976', 'step': 1670, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:18.637168', 'step': 1670, 'epoch': 2} {'type': 'loss', 'content': 0.011475497856736183, 'timestamp': '2025-09-30 22:11:18.639645', 'step': 1671, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:18.671316', 'step': 1671, 'epoch': 2} {'type': 'loss', 'content': 0.009563402272760868, 'timestamp': '2025-09-30 22:11:18.698770', 'step': 1672, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:18.745958', 'step': 1672, 'epoch': 2} {'type': 'loss', 'content': 0.008106688968837261, 'timestamp': '2025-09-30 22:11:18.748927', 'step': 1673, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:18.787684', 'step': 1673, 'epoch': 2} {'type': 'loss', 'content': 0.004956881981343031, 'timestamp': '2025-09-30 22:11:18.790426', 'step': 1674, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:18.833437', 'step': 1674, 'epoch': 2} {'type': 'loss', 'content': 0.002341997344046831, 'timestamp': '2025-09-30 22:11:18.840321', 'step': 1675, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:18.880124', 'step': 1675, 'epoch': 2} {'type': 'loss', 'content': 0.0017724635545164347, 'timestamp': '2025-09-30 22:11:18.910265', 'step': 1676, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:18.947876', 'step': 1676, 'epoch': 2} {'type': 'loss', 'content': 0.008215676061809063, 'timestamp': '2025-09-30 22:11:18.951674', 'step': 1677, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:18.990383', 'step': 1677, 'epoch': 2} {'type': 'loss', 'content': 0.006653092801570892, 'timestamp': '2025-09-30 22:11:18.993539', 'step': 1678, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:11:19.029880', 'step': 1678, 'epoch': 2} {'type': 'loss', 'content': 0.016842633485794067, 'timestamp': '2025-09-30 22:11:19.035171', 'step': 1679, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.079920', 'step': 1679, 'epoch': 2} {'type': 'loss', 'content': 0.009505665861070156, 'timestamp': '2025-09-30 22:11:19.107996', 'step': 1680, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.147255', 'step': 1680, 'epoch': 2} {'type': 'loss', 'content': 0.0035089331213384867, 'timestamp': '2025-09-30 22:11:19.161010', 'step': 1681, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.206903', 'step': 1681, 'epoch': 2} {'type': 'loss', 'content': 0.005087549332529306, 'timestamp': '2025-09-30 22:11:19.211265', 'step': 1682, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.261773', 'step': 1682, 'epoch': 2} {'type': 'loss', 'content': 0.0037894020788371563, 'timestamp': '2025-09-30 22:11:19.266639', 'step': 1683, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.304258', 'step': 1683, 'epoch': 2} {'type': 'loss', 'content': 0.0036766440607607365, 'timestamp': '2025-09-30 22:11:19.328797', 'step': 1684, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.371354', 'step': 1684, 'epoch': 2} {'type': 'loss', 'content': 0.013706843368709087, 'timestamp': '2025-09-30 22:11:19.374073', 'step': 1685, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.411114', 'step': 1685, 'epoch': 2} {'type': 'loss', 'content': 0.007448071148246527, 'timestamp': '2025-09-30 22:11:19.413110', 'step': 1686, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.444973', 'step': 1686, 'epoch': 2} {'type': 'loss', 'content': 0.03570406511425972, 'timestamp': '2025-09-30 22:11:19.447758', 'step': 1687, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:19.490999', 'step': 1687, 'epoch': 2} {'type': 'loss', 'content': 0.045662470161914825, 'timestamp': '2025-09-30 22:11:19.514896', 'step': 1688, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:19.553825', 'step': 1688, 'epoch': 2} {'type': 'loss', 'content': 0.0003720544627867639, 'timestamp': '2025-09-30 22:11:19.556171', 'step': 1689, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.595078', 'step': 1689, 'epoch': 2} {'type': 'loss', 'content': 0.050418563187122345, 'timestamp': '2025-09-30 22:11:19.598579', 'step': 1690, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.634837', 'step': 1690, 'epoch': 2} {'type': 'loss', 'content': 0.000317491969326511, 'timestamp': '2025-09-30 22:11:19.638883', 'step': 1691, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:19.675596', 'step': 1691, 'epoch': 2} {'type': 'loss', 'content': 0.029729658737778664, 'timestamp': '2025-09-30 22:11:19.699876', 'step': 1692, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:19.746572', 'step': 1692, 'epoch': 2} {'type': 'loss', 'content': 0.012829835526645184, 'timestamp': '2025-09-30 22:11:19.750386', 'step': 1693, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:19.794800', 'step': 1693, 'epoch': 2} {'type': 'loss', 'content': 0.003971923608332872, 'timestamp': '2025-09-30 22:11:19.799952', 'step': 1694, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.839851', 'step': 1694, 'epoch': 2} {'type': 'loss', 'content': 0.0006053160759620368, 'timestamp': '2025-09-30 22:11:19.842887', 'step': 1695, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:19.875733', 'step': 1695, 'epoch': 2} {'type': 'loss', 'content': 0.05187639221549034, 'timestamp': '2025-09-30 22:11:19.899626', 'step': 1696, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.931281', 'step': 1696, 'epoch': 2} {'type': 'loss', 'content': 0.0017442479729652405, 'timestamp': '2025-09-30 22:11:19.933872', 'step': 1697, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:19.968020', 'step': 1697, 'epoch': 2} {'type': 'loss', 'content': 0.017671983689069748, 'timestamp': '2025-09-30 22:11:19.971499', 'step': 1698, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:20.006265', 'step': 1698, 'epoch': 2} {'type': 'loss', 'content': 0.002051006769761443, 'timestamp': '2025-09-30 22:11:20.010473', 'step': 1699, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:20.049175', 'step': 1699, 'epoch': 2} {'type': 'loss', 'content': 0.012735229916870594, 'timestamp': '2025-09-30 22:11:20.076512', 'step': 1700, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:20.114131', 'step': 1700, 'epoch': 2} {'type': 'loss', 'content': 0.00960957258939743, 'timestamp': '2025-09-30 22:11:20.116573', 'step': 1701, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:20.149478', 'step': 1701, 'epoch': 2} {'type': 'loss', 'content': 0.0022175023332238197, 'timestamp': '2025-09-30 22:11:20.152937', 'step': 1702, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:20.186794', 'step': 1702, 'epoch': 2} {'type': 'loss', 'content': 0.0005305535742081702, 'timestamp': '2025-09-30 22:11:20.191085', 'step': 1703, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:20.241516', 'step': 1703, 'epoch': 2} {'type': 'loss', 'content': 0.0003927931247744709, 'timestamp': '2025-09-30 22:11:20.265874', 'step': 1704, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:20.302418', 'step': 1704, 'epoch': 2} {'type': 'loss', 'content': 0.006027755327522755, 'timestamp': '2025-09-30 22:11:20.307492', 'step': 1705, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:20.339271', 'step': 1705, 'epoch': 2} {'type': 'loss', 'content': 0.017157012596726418, 'timestamp': '2025-09-30 22:11:20.341891', 'step': 1706, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:20.372977', 'step': 1706, 'epoch': 2} {'type': 'loss', 'content': 0.0005596071714535356, 'timestamp': '2025-09-30 22:11:20.386134', 'step': 1707, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:20.418184', 'step': 1707, 'epoch': 2} {'type': 'loss', 'content': 0.00957457721233368, 'timestamp': '2025-09-30 22:11:20.442210', 'step': 1708, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:20.481850', 'step': 1708, 'epoch': 2} {'type': 'loss', 'content': 0.00753258541226387, 'timestamp': '2025-09-30 22:11:20.486305', 'step': 1709, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:20.542885', 'step': 1709, 'epoch': 2} {'type': 'loss', 'content': 0.0016120340442284942, 'timestamp': '2025-09-30 22:11:20.546828', 'step': 1710, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:22.077267', 'step': 1710, 'epoch': 2} {'type': 'pplx', 'content': 60174026.64577574, 'timestamp': '2025-09-30 22:11:22.080733', 'step': 1710, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.152726', 'step': 1710, 'epoch': 2} {'type': 'loss', 'content': 0.010765468701720238, 'timestamp': '2025-09-30 22:11:22.175983', 'step': 1711, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.223024', 'step': 1711, 'epoch': 2} {'type': 'loss', 'content': 0.010604661889374256, 'timestamp': '2025-09-30 22:11:22.252521', 'step': 1712, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.287267', 'step': 1712, 'epoch': 2} {'type': 'loss', 'content': 0.004304408561438322, 'timestamp': '2025-09-30 22:11:22.290026', 'step': 1713, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.327356', 'step': 1713, 'epoch': 2} {'type': 'loss', 'content': 0.0032625349704176188, 'timestamp': '2025-09-30 22:11:22.334879', 'step': 1714, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:22.369771', 'step': 1714, 'epoch': 2} {'type': 'loss', 'content': 0.002788375597447157, 'timestamp': '2025-09-30 22:11:22.373507', 'step': 1715, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.421356', 'step': 1715, 'epoch': 2} {'type': 'loss', 'content': 0.004468221217393875, 'timestamp': '2025-09-30 22:11:22.445723', 'step': 1716, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.480791', 'step': 1716, 'epoch': 2} {'type': 'loss', 'content': 0.034291721880435944, 'timestamp': '2025-09-30 22:11:22.490757', 'step': 1717, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.533177', 'step': 1717, 'epoch': 2} {'type': 'loss', 'content': 0.00639685895293951, 'timestamp': '2025-09-30 22:11:22.537647', 'step': 1718, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:22.586726', 'step': 1718, 'epoch': 2} {'type': 'loss', 'content': 0.014624513685703278, 'timestamp': '2025-09-30 22:11:22.589658', 'step': 1719, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.623660', 'step': 1719, 'epoch': 2} {'type': 'loss', 'content': 0.019972490146756172, 'timestamp': '2025-09-30 22:11:22.648502', 'step': 1720, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.683821', 'step': 1720, 'epoch': 2} {'type': 'loss', 'content': 0.0026740862522274256, 'timestamp': '2025-09-30 22:11:22.686918', 'step': 1721, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.718315', 'step': 1721, 'epoch': 2} {'type': 'loss', 'content': 0.004609130322933197, 'timestamp': '2025-09-30 22:11:22.722070', 'step': 1722, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.757462', 'step': 1722, 'epoch': 2} {'type': 'loss', 'content': 0.0044154576025903225, 'timestamp': '2025-09-30 22:11:22.761134', 'step': 1723, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.798446', 'step': 1723, 'epoch': 2} {'type': 'loss', 'content': 0.004700851161032915, 'timestamp': '2025-09-30 22:11:22.828064', 'step': 1724, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:22.861723', 'step': 1724, 'epoch': 2} {'type': 'loss', 'content': 0.0020207969937473536, 'timestamp': '2025-09-30 22:11:22.864905', 'step': 1725, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:22.902422', 'step': 1725, 'epoch': 2} {'type': 'loss', 'content': 0.008287390694022179, 'timestamp': '2025-09-30 22:11:22.905044', 'step': 1726, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:22.938181', 'step': 1726, 'epoch': 2} {'type': 'loss', 'content': 0.011976302601397038, 'timestamp': '2025-09-30 22:11:22.941404', 'step': 1727, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:22.975675', 'step': 1727, 'epoch': 2} {'type': 'loss', 'content': 0.0026996787637472153, 'timestamp': '2025-09-30 22:11:23.000534', 'step': 1728, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:23.035217', 'step': 1728, 'epoch': 2} {'type': 'loss', 'content': 0.007718226406723261, 'timestamp': '2025-09-30 22:11:23.046962', 'step': 1729, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:23.083209', 'step': 1729, 'epoch': 2} {'type': 'loss', 'content': 0.007085180841386318, 'timestamp': '2025-09-30 22:11:23.086526', 'step': 1730, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:23.119688', 'step': 1730, 'epoch': 2} {'type': 'loss', 'content': 0.009826818481087685, 'timestamp': '2025-09-30 22:11:23.122632', 'step': 1731, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:23.164476', 'step': 1731, 'epoch': 2} {'type': 'loss', 'content': 0.0005284692742861807, 'timestamp': '2025-09-30 22:11:23.190815', 'step': 1732, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:23.225736', 'step': 1732, 'epoch': 2} {'type': 'loss', 'content': 0.0018098134314641356, 'timestamp': '2025-09-30 22:11:23.229164', 'step': 1733, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:23.263673', 'step': 1733, 'epoch': 2} {'type': 'loss', 'content': 0.0055390470661222935, 'timestamp': '2025-09-30 22:11:23.267756', 'step': 1734, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:23.302555', 'step': 1734, 'epoch': 2} {'type': 'loss', 'content': 0.003144989488646388, 'timestamp': '2025-09-30 22:11:23.310271', 'step': 1735, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:23.344838', 'step': 1735, 'epoch': 2} {'type': 'loss', 'content': 0.005034166853874922, 'timestamp': '2025-09-30 22:11:23.369640', 'step': 1736, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:23.402031', 'step': 1736, 'epoch': 2} {'type': 'loss', 'content': 0.010449747554957867, 'timestamp': '2025-09-30 22:11:23.404838', 'step': 1737, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:23.440348', 'step': 1737, 'epoch': 2} {'type': 'loss', 'content': 0.017017576843500137, 'timestamp': '2025-09-30 22:11:23.448499', 'step': 1738, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:23.483875', 'step': 1738, 'epoch': 2} {'type': 'loss', 'content': 0.00233058026060462, 'timestamp': '2025-09-30 22:11:23.487180', 'step': 1739, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:23.519793', 'step': 1739, 'epoch': 2} {'type': 'loss', 'content': 0.002276728395372629, 'timestamp': '2025-09-30 22:11:23.548173', 'step': 1740, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:23.580676', 'step': 1740, 'epoch': 2} {'type': 'loss', 'content': 0.0223015658557415, 'timestamp': '2025-09-30 22:11:23.593857', 'step': 1741, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:23.627828', 'step': 1741, 'epoch': 2} {'type': 'loss', 'content': 0.004606612492352724, 'timestamp': '2025-09-30 22:11:23.630529', 'step': 1742, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:23.666316', 'step': 1742, 'epoch': 2} {'type': 'loss', 'content': 0.009004906751215458, 'timestamp': '2025-09-30 22:11:23.669336', 'step': 1743, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:23.710375', 'step': 1743, 'epoch': 2} {'type': 'loss', 'content': 0.018749961629509926, 'timestamp': '2025-09-30 22:11:23.734755', 'step': 1744, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:23.767285', 'step': 1744, 'epoch': 2} {'type': 'loss', 'content': 0.0017843234818428755, 'timestamp': '2025-09-30 22:11:23.769754', 'step': 1745, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:23.807368', 'step': 1745, 'epoch': 2} {'type': 'loss', 'content': 0.0022551752626895905, 'timestamp': '2025-09-30 22:11:23.814763', 'step': 1746, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:23.847784', 'step': 1746, 'epoch': 2} {'type': 'loss', 'content': 0.0012390002375468612, 'timestamp': '2025-09-30 22:11:23.851902', 'step': 1747, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:23.891863', 'step': 1747, 'epoch': 2} {'type': 'loss', 'content': 0.002626475179567933, 'timestamp': '2025-09-30 22:11:23.916279', 'step': 1748, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:23.954270', 'step': 1748, 'epoch': 2} {'type': 'loss', 'content': 0.0008998786797747016, 'timestamp': '2025-09-30 22:11:23.965730', 'step': 1749, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:23.999385', 'step': 1749, 'epoch': 2} {'type': 'loss', 'content': 0.002064060652628541, 'timestamp': '2025-09-30 22:11:24.006826', 'step': 1750, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:24.038989', 'step': 1750, 'epoch': 2} {'type': 'loss', 'content': 0.014346698299050331, 'timestamp': '2025-09-30 22:11:24.045224', 'step': 1751, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:24.093531', 'step': 1751, 'epoch': 2} {'type': 'loss', 'content': 0.0009058486321009696, 'timestamp': '2025-09-30 22:11:24.123954', 'step': 1752, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:24.157834', 'step': 1752, 'epoch': 2} {'type': 'loss', 'content': 0.011217797175049782, 'timestamp': '2025-09-30 22:11:24.161244', 'step': 1753, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:24.200558', 'step': 1753, 'epoch': 2} {'type': 'loss', 'content': 0.038053300231695175, 'timestamp': '2025-09-30 22:11:24.204331', 'step': 1754, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:24.236421', 'step': 1754, 'epoch': 2} {'type': 'loss', 'content': 0.0037916922010481358, 'timestamp': '2025-09-30 22:11:24.240045', 'step': 1755, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:24.272620', 'step': 1755, 'epoch': 2} {'type': 'loss', 'content': 0.005685042589902878, 'timestamp': '2025-09-30 22:11:24.297228', 'step': 1756, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:24.328506', 'step': 1756, 'epoch': 2} {'type': 'loss', 'content': 0.0005464585847221315, 'timestamp': '2025-09-30 22:11:24.331344', 'step': 1757, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:24.366826', 'step': 1757, 'epoch': 2} {'type': 'loss', 'content': 0.0022288798354566097, 'timestamp': '2025-09-30 22:11:24.369591', 'step': 1758, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:24.408028', 'step': 1758, 'epoch': 2} {'type': 'loss', 'content': 0.0014701449545100331, 'timestamp': '2025-09-30 22:11:24.411765', 'step': 1759, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:24.456430', 'step': 1759, 'epoch': 2} {'type': 'loss', 'content': 0.018580257892608643, 'timestamp': '2025-09-30 22:11:24.480004', 'step': 1760, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:24.514320', 'step': 1760, 'epoch': 2} {'type': 'loss', 'content': 0.005124218761920929, 'timestamp': '2025-09-30 22:11:24.521142', 'step': 1761, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:24.563696', 'step': 1761, 'epoch': 2} {'type': 'loss', 'content': 0.0004061157815158367, 'timestamp': '2025-09-30 22:11:24.569458', 'step': 1762, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:24.606484', 'step': 1762, 'epoch': 2} {'type': 'loss', 'content': 0.00017968073370866477, 'timestamp': '2025-09-30 22:11:24.609874', 'step': 1763, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:24.647850', 'step': 1763, 'epoch': 2} {'type': 'loss', 'content': 0.008494771085679531, 'timestamp': '2025-09-30 22:11:24.675944', 'step': 1764, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:24.713922', 'step': 1764, 'epoch': 2} {'type': 'loss', 'content': 0.005634578876197338, 'timestamp': '2025-09-30 22:11:24.716879', 'step': 1765, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:24.749668', 'step': 1765, 'epoch': 2} {'type': 'loss', 'content': 0.0004148540028836578, 'timestamp': '2025-09-30 22:11:24.754076', 'step': 1766, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:24.795159', 'step': 1766, 'epoch': 2} {'type': 'loss', 'content': 0.000306047557387501, 'timestamp': '2025-09-30 22:11:24.799871', 'step': 1767, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:25.740622', 'step': 1767, 'epoch': 2} {'type': 'pplx', 'content': 62558342.156648576, 'timestamp': '2025-09-30 22:11:25.744118', 'step': 1767, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:25.774830', 'step': 1767, 'epoch': 2} {'type': 'loss', 'content': 0.00027061282889917493, 'timestamp': '2025-09-30 22:11:25.799665', 'step': 1768, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:25.838783', 'step': 1768, 'epoch': 2} {'type': 'loss', 'content': 0.004808730445802212, 'timestamp': '2025-09-30 22:11:25.849752', 'step': 1769, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:25.882948', 'step': 1769, 'epoch': 2} {'type': 'loss', 'content': 0.002109024440869689, 'timestamp': '2025-09-30 22:11:25.885906', 'step': 1770, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:25.925785', 'step': 1770, 'epoch': 2} {'type': 'loss', 'content': 0.0014152558287605643, 'timestamp': '2025-09-30 22:11:25.935621', 'step': 1771, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:25.984313', 'step': 1771, 'epoch': 2} {'type': 'loss', 'content': 0.00017617108824197203, 'timestamp': '2025-09-30 22:11:26.013188', 'step': 1772, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.048944', 'step': 1772, 'epoch': 2} {'type': 'loss', 'content': 0.0008881940157152712, 'timestamp': '2025-09-30 22:11:26.055619', 'step': 1773, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.098786', 'step': 1773, 'epoch': 2} {'type': 'loss', 'content': 0.005856442730873823, 'timestamp': '2025-09-30 22:11:26.106763', 'step': 1774, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:26.142555', 'step': 1774, 'epoch': 2} {'type': 'loss', 'content': 0.02501927874982357, 'timestamp': '2025-09-30 22:11:26.145541', 'step': 1775, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.180813', 'step': 1775, 'epoch': 2} {'type': 'loss', 'content': 0.0009421083959750831, 'timestamp': '2025-09-30 22:11:26.210943', 'step': 1776, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.248770', 'step': 1776, 'epoch': 2} {'type': 'loss', 'content': 0.0019056095043197274, 'timestamp': '2025-09-30 22:11:26.256469', 'step': 1777, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.287290', 'step': 1777, 'epoch': 2} {'type': 'loss', 'content': 0.0005593043169938028, 'timestamp': '2025-09-30 22:11:26.315432', 'step': 1778, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:26.349521', 'step': 1778, 'epoch': 2} {'type': 'loss', 'content': 0.0033685010857880116, 'timestamp': '2025-09-30 22:11:26.356792', 'step': 1779, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.400591', 'step': 1779, 'epoch': 2} {'type': 'loss', 'content': 0.002042163861915469, 'timestamp': '2025-09-30 22:11:26.424522', 'step': 1780, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.460158', 'step': 1780, 'epoch': 2} {'type': 'loss', 'content': 0.00037928626989014447, 'timestamp': '2025-09-30 22:11:26.462324', 'step': 1781, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.494213', 'step': 1781, 'epoch': 2} {'type': 'loss', 'content': 0.0018793040653690696, 'timestamp': '2025-09-30 22:11:26.500551', 'step': 1782, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:26.554386', 'step': 1782, 'epoch': 2} {'type': 'loss', 'content': 0.0003356645174790174, 'timestamp': '2025-09-30 22:11:26.566541', 'step': 1783, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.608962', 'step': 1783, 'epoch': 2} {'type': 'loss', 'content': 0.0002973336377181113, 'timestamp': '2025-09-30 22:11:26.634295', 'step': 1784, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.672928', 'step': 1784, 'epoch': 2} {'type': 'loss', 'content': 0.00013657430827151984, 'timestamp': '2025-09-30 22:11:26.681281', 'step': 1785, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:26.722574', 'step': 1785, 'epoch': 2} {'type': 'loss', 'content': 0.0005276420270092785, 'timestamp': '2025-09-30 22:11:26.725767', 'step': 1786, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.756868', 'step': 1786, 'epoch': 2} {'type': 'loss', 'content': 0.009822599589824677, 'timestamp': '2025-09-30 22:11:26.759795', 'step': 1787, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.791563', 'step': 1787, 'epoch': 2} {'type': 'loss', 'content': 0.008391637355089188, 'timestamp': '2025-09-30 22:11:26.816052', 'step': 1788, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.853075', 'step': 1788, 'epoch': 2} {'type': 'loss', 'content': 0.00027605355717241764, 'timestamp': '2025-09-30 22:11:26.862087', 'step': 1789, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.894932', 'step': 1789, 'epoch': 2} {'type': 'loss', 'content': 0.00037099263863638043, 'timestamp': '2025-09-30 22:11:26.898216', 'step': 1790, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.931943', 'step': 1790, 'epoch': 2} {'type': 'loss', 'content': 0.0022850236855447292, 'timestamp': '2025-09-30 22:11:26.944325', 'step': 1791, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:26.979724', 'step': 1791, 'epoch': 2} {'type': 'loss', 'content': 0.0019840167369693518, 'timestamp': '2025-09-30 22:11:27.004499', 'step': 1792, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.050239', 'step': 1792, 'epoch': 2} {'type': 'loss', 'content': 0.0010736408876255155, 'timestamp': '2025-09-30 22:11:27.054191', 'step': 1793, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:27.106593', 'step': 1793, 'epoch': 2} {'type': 'loss', 'content': 0.0003287459840066731, 'timestamp': '2025-09-30 22:11:27.116116', 'step': 1794, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.156132', 'step': 1794, 'epoch': 2} {'type': 'loss', 'content': 0.001558436080813408, 'timestamp': '2025-09-30 22:11:27.159397', 'step': 1795, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.202415', 'step': 1795, 'epoch': 2} {'type': 'loss', 'content': 0.0010486978571861982, 'timestamp': '2025-09-30 22:11:27.226969', 'step': 1796, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:27.260541', 'step': 1796, 'epoch': 2} {'type': 'loss', 'content': 0.015961362048983574, 'timestamp': '2025-09-30 22:11:27.263397', 'step': 1797, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.297177', 'step': 1797, 'epoch': 2} {'type': 'loss', 'content': 0.043664541095495224, 'timestamp': '2025-09-30 22:11:27.307270', 'step': 1798, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.344094', 'step': 1798, 'epoch': 2} {'type': 'loss', 'content': 0.008131652139127254, 'timestamp': '2025-09-30 22:11:27.347524', 'step': 1799, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.385153', 'step': 1799, 'epoch': 2} {'type': 'loss', 'content': 0.02529001235961914, 'timestamp': '2025-09-30 22:11:27.410430', 'step': 1800, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:27.446667', 'step': 1800, 'epoch': 2} {'type': 'loss', 'content': 0.00021200468472670764, 'timestamp': '2025-09-30 22:11:27.459388', 'step': 1801, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.495432', 'step': 1801, 'epoch': 2} {'type': 'loss', 'content': 0.004534170497208834, 'timestamp': '2025-09-30 22:11:27.498163', 'step': 1802, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.542871', 'step': 1802, 'epoch': 2} {'type': 'loss', 'content': 0.003273914335295558, 'timestamp': '2025-09-30 22:11:27.545274', 'step': 1803, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:27.581655', 'step': 1803, 'epoch': 2} {'type': 'loss', 'content': 0.00039792084135115147, 'timestamp': '2025-09-30 22:11:27.608954', 'step': 1804, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.642253', 'step': 1804, 'epoch': 2} {'type': 'loss', 'content': 0.006401257123798132, 'timestamp': '2025-09-30 22:11:27.644862', 'step': 1805, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.698040', 'step': 1805, 'epoch': 2} {'type': 'loss', 'content': 0.0014091616030782461, 'timestamp': '2025-09-30 22:11:27.700758', 'step': 1806, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.740365', 'step': 1806, 'epoch': 2} {'type': 'loss', 'content': 0.004255100153386593, 'timestamp': '2025-09-30 22:11:27.743865', 'step': 1807, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.785686', 'step': 1807, 'epoch': 2} {'type': 'loss', 'content': 0.005804498679935932, 'timestamp': '2025-09-30 22:11:27.811782', 'step': 1808, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:27.873326', 'step': 1808, 'epoch': 2} {'type': 'loss', 'content': 0.000618809019215405, 'timestamp': '2025-09-30 22:11:27.883352', 'step': 1809, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:27.925631', 'step': 1809, 'epoch': 2} {'type': 'loss', 'content': 0.0020599744748324156, 'timestamp': '2025-09-30 22:11:27.929742', 'step': 1810, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:27.971440', 'step': 1810, 'epoch': 2} {'type': 'loss', 'content': 0.0024426288437098265, 'timestamp': '2025-09-30 22:11:27.975627', 'step': 1811, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:28.024342', 'step': 1811, 'epoch': 2} {'type': 'loss', 'content': 0.0016171614406630397, 'timestamp': '2025-09-30 22:11:28.050896', 'step': 1812, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:28.084221', 'step': 1812, 'epoch': 2} {'type': 'loss', 'content': 0.0019683155696839094, 'timestamp': '2025-09-30 22:11:28.088533', 'step': 1813, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:28.126055', 'step': 1813, 'epoch': 2} {'type': 'loss', 'content': 0.0032566902227699757, 'timestamp': '2025-09-30 22:11:28.128291', 'step': 1814, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:28.163275', 'step': 1814, 'epoch': 2} {'type': 'loss', 'content': 0.010501679964363575, 'timestamp': '2025-09-30 22:11:28.165758', 'step': 1815, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:28.204593', 'step': 1815, 'epoch': 2} {'type': 'loss', 'content': 0.0007786615751683712, 'timestamp': '2025-09-30 22:11:28.234642', 'step': 1816, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:28.266876', 'step': 1816, 'epoch': 2} {'type': 'loss', 'content': 0.0010455233277752995, 'timestamp': '2025-09-30 22:11:28.269027', 'step': 1817, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:28.315671', 'step': 1817, 'epoch': 2} {'type': 'loss', 'content': 0.045351140201091766, 'timestamp': '2025-09-30 22:11:28.319163', 'step': 1818, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:28.351392', 'step': 1818, 'epoch': 2} {'type': 'loss', 'content': 0.05572187155485153, 'timestamp': '2025-09-30 22:11:28.354615', 'step': 1819, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:28.387521', 'step': 1819, 'epoch': 2} {'type': 'loss', 'content': 0.0026829210110008717, 'timestamp': '2025-09-30 22:11:28.412413', 'step': 1820, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:28.452683', 'step': 1820, 'epoch': 2} {'type': 'loss', 'content': 0.0022102470975369215, 'timestamp': '2025-09-30 22:11:28.457055', 'step': 1821, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:28.488524', 'step': 1821, 'epoch': 2} {'type': 'loss', 'content': 0.010895676910877228, 'timestamp': '2025-09-30 22:11:28.491925', 'step': 1822, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:28.528782', 'step': 1822, 'epoch': 2} {'type': 'loss', 'content': 0.00846769381314516, 'timestamp': '2025-09-30 22:11:28.531937', 'step': 1823, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:28.569305', 'step': 1823, 'epoch': 2} {'type': 'loss', 'content': 0.013364391401410103, 'timestamp': '2025-09-30 22:11:28.593307', 'step': 1824, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:29.466059', 'step': 1824, 'epoch': 2} {'type': 'pplx', 'content': 57843973.11111791, 'timestamp': '2025-09-30 22:11:29.469482', 'step': 1824, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:29.500883', 'step': 1824, 'epoch': 2} {'type': 'loss', 'content': 0.006056804675608873, 'timestamp': '2025-09-30 22:11:29.504191', 'step': 1825, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:29.543140', 'step': 1825, 'epoch': 2} {'type': 'loss', 'content': 0.0024564911145716906, 'timestamp': '2025-09-30 22:11:29.545870', 'step': 1826, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:29.587508', 'step': 1826, 'epoch': 2} {'type': 'loss', 'content': 0.0025678572710603476, 'timestamp': '2025-09-30 22:11:29.590641', 'step': 1827, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:29.623857', 'step': 1827, 'epoch': 2} {'type': 'loss', 'content': 0.011328652501106262, 'timestamp': '2025-09-30 22:11:29.649252', 'step': 1828, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:29.684405', 'step': 1828, 'epoch': 2} {'type': 'loss', 'content': 0.0005400671507231891, 'timestamp': '2025-09-30 22:11:29.687595', 'step': 1829, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:29.726180', 'step': 1829, 'epoch': 2} {'type': 'loss', 'content': 0.040070440620183945, 'timestamp': '2025-09-30 22:11:29.737205', 'step': 1830, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:29.779941', 'step': 1830, 'epoch': 2} {'type': 'loss', 'content': 0.0002562327135819942, 'timestamp': '2025-09-30 22:11:29.782245', 'step': 1831, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:29.817311', 'step': 1831, 'epoch': 2} {'type': 'loss', 'content': 0.004279036540538073, 'timestamp': '2025-09-30 22:11:29.841039', 'step': 1832, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:29.877457', 'step': 1832, 'epoch': 2} {'type': 'loss', 'content': 0.0022578116040676832, 'timestamp': '2025-09-30 22:11:29.881248', 'step': 1833, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:29.914510', 'step': 1833, 'epoch': 2} {'type': 'loss', 'content': 0.003075752407312393, 'timestamp': '2025-09-30 22:11:29.917617', 'step': 1834, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:29.968967', 'step': 1834, 'epoch': 3} {'type': 'loss', 'content': 0.03581417351961136, 'timestamp': '2025-09-30 22:11:29.971908', 'step': 1835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.005282', 'step': 1835, 'epoch': 3} {'type': 'loss', 'content': 0.019369926303625107, 'timestamp': '2025-09-30 22:11:30.032284', 'step': 1836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.070963', 'step': 1836, 'epoch': 3} {'type': 'loss', 'content': 0.007873108610510826, 'timestamp': '2025-09-30 22:11:30.073418', 'step': 1837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.104032', 'step': 1837, 'epoch': 3} {'type': 'loss', 'content': 0.007872181944549084, 'timestamp': '2025-09-30 22:11:30.106911', 'step': 1838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.143721', 'step': 1838, 'epoch': 3} {'type': 'loss', 'content': 0.07405262440443039, 'timestamp': '2025-09-30 22:11:30.147078', 'step': 1839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.184634', 'step': 1839, 'epoch': 3} {'type': 'loss', 'content': 0.022155776619911194, 'timestamp': '2025-09-30 22:11:30.208670', 'step': 1840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:30.240560', 'step': 1840, 'epoch': 3} {'type': 'loss', 'content': 0.012954522855579853, 'timestamp': '2025-09-30 22:11:30.242675', 'step': 1841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.276199', 'step': 1841, 'epoch': 3} {'type': 'loss', 'content': 0.0008165457402355969, 'timestamp': '2025-09-30 22:11:30.290028', 'step': 1842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.324121', 'step': 1842, 'epoch': 3} {'type': 'loss', 'content': 0.00022883024939801544, 'timestamp': '2025-09-30 22:11:30.333024', 'step': 1843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.366347', 'step': 1843, 'epoch': 3} {'type': 'loss', 'content': 0.046065591275691986, 'timestamp': '2025-09-30 22:11:30.389844', 'step': 1844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.425144', 'step': 1844, 'epoch': 3} {'type': 'loss', 'content': 0.01905796490609646, 'timestamp': '2025-09-30 22:11:30.428573', 'step': 1845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.462015', 'step': 1845, 'epoch': 3} {'type': 'loss', 'content': 0.0009818606777116656, 'timestamp': '2025-09-30 22:11:30.465512', 'step': 1846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.497038', 'step': 1846, 'epoch': 3} {'type': 'loss', 'content': 0.01879790425300598, 'timestamp': '2025-09-30 22:11:30.498499', 'step': 1847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.527899', 'step': 1847, 'epoch': 3} {'type': 'loss', 'content': 0.0275017861276865, 'timestamp': '2025-09-30 22:11:30.552441', 'step': 1848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.585621', 'step': 1848, 'epoch': 3} {'type': 'loss', 'content': 0.013021481223404408, 'timestamp': '2025-09-30 22:11:30.591829', 'step': 1849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.631803', 'step': 1849, 'epoch': 3} {'type': 'loss', 'content': 0.014080002903938293, 'timestamp': '2025-09-30 22:11:30.634092', 'step': 1850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.671207', 'step': 1850, 'epoch': 3} {'type': 'loss', 'content': 0.006925381254404783, 'timestamp': '2025-09-30 22:11:30.674380', 'step': 1851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.708206', 'step': 1851, 'epoch': 3} {'type': 'loss', 'content': 0.0025633336044847965, 'timestamp': '2025-09-30 22:11:30.732466', 'step': 1852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:30.764571', 'step': 1852, 'epoch': 3} {'type': 'loss', 'content': 0.012529529631137848, 'timestamp': '2025-09-30 22:11:30.779687', 'step': 1853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.821928', 'step': 1853, 'epoch': 3} {'type': 'loss', 'content': 0.007848630659282207, 'timestamp': '2025-09-30 22:11:30.824559', 'step': 1854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:30.856888', 'step': 1854, 'epoch': 3} {'type': 'loss', 'content': 0.03343925625085831, 'timestamp': '2025-09-30 22:11:30.861037', 'step': 1855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.895355', 'step': 1855, 'epoch': 3} {'type': 'loss', 'content': 0.04466833919286728, 'timestamp': '2025-09-30 22:11:30.933462', 'step': 1856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:30.976785', 'step': 1856, 'epoch': 3} {'type': 'loss', 'content': 0.008691991679370403, 'timestamp': '2025-09-30 22:11:30.980724', 'step': 1857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.024793', 'step': 1857, 'epoch': 3} {'type': 'loss', 'content': 0.014497383497655392, 'timestamp': '2025-09-30 22:11:31.044058', 'step': 1858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.076221', 'step': 1858, 'epoch': 3} {'type': 'loss', 'content': 0.013723026029765606, 'timestamp': '2025-09-30 22:11:31.079468', 'step': 1859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:31.113297', 'step': 1859, 'epoch': 3} {'type': 'loss', 'content': 0.008932454511523247, 'timestamp': '2025-09-30 22:11:31.141385', 'step': 1860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:31.175058', 'step': 1860, 'epoch': 3} {'type': 'loss', 'content': 0.007725898642092943, 'timestamp': '2025-09-30 22:11:31.177865', 'step': 1861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.211411', 'step': 1861, 'epoch': 3} {'type': 'loss', 'content': 0.01678680069744587, 'timestamp': '2025-09-30 22:11:31.218039', 'step': 1862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.250127', 'step': 1862, 'epoch': 3} {'type': 'loss', 'content': 0.0050750188529491425, 'timestamp': '2025-09-30 22:11:31.252823', 'step': 1863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.283374', 'step': 1863, 'epoch': 3} {'type': 'loss', 'content': 0.012287462130188942, 'timestamp': '2025-09-30 22:11:31.310546', 'step': 1864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.345395', 'step': 1864, 'epoch': 3} {'type': 'loss', 'content': 0.014692439697682858, 'timestamp': '2025-09-30 22:11:31.348943', 'step': 1865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.382906', 'step': 1865, 'epoch': 3} {'type': 'loss', 'content': 0.016319891437888145, 'timestamp': '2025-09-30 22:11:31.385310', 'step': 1866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.419345', 'step': 1866, 'epoch': 3} {'type': 'loss', 'content': 0.007593729067593813, 'timestamp': '2025-09-30 22:11:31.421938', 'step': 1867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.453051', 'step': 1867, 'epoch': 3} {'type': 'loss', 'content': 0.015587197616696358, 'timestamp': '2025-09-30 22:11:31.476657', 'step': 1868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.517082', 'step': 1868, 'epoch': 3} {'type': 'loss', 'content': 0.010518831200897694, 'timestamp': '2025-09-30 22:11:31.519589', 'step': 1869, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.555541', 'step': 1869, 'epoch': 3} {'type': 'loss', 'content': 0.023400625213980675, 'timestamp': '2025-09-30 22:11:31.565628', 'step': 1870, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.625427', 'step': 1870, 'epoch': 3} {'type': 'loss', 'content': 0.013268728740513325, 'timestamp': '2025-09-30 22:11:31.631806', 'step': 1871, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.683722', 'step': 1871, 'epoch': 3} {'type': 'loss', 'content': 0.00882081501185894, 'timestamp': '2025-09-30 22:11:31.712258', 'step': 1872, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:31.743884', 'step': 1872, 'epoch': 3} {'type': 'loss', 'content': 0.011401886120438576, 'timestamp': '2025-09-30 22:11:31.746451', 'step': 1873, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.777996', 'step': 1873, 'epoch': 3} {'type': 'loss', 'content': 0.01912127062678337, 'timestamp': '2025-09-30 22:11:31.784925', 'step': 1874, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.832238', 'step': 1874, 'epoch': 3} {'type': 'loss', 'content': 0.009558220393955708, 'timestamp': '2025-09-30 22:11:31.844336', 'step': 1875, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:31.882936', 'step': 1875, 'epoch': 3} {'type': 'loss', 'content': 0.004013984929770231, 'timestamp': '2025-09-30 22:11:31.906337', 'step': 1876, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:31.937528', 'step': 1876, 'epoch': 3} {'type': 'loss', 'content': 0.008159502409398556, 'timestamp': '2025-09-30 22:11:31.940317', 'step': 1877, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:31.980065', 'step': 1877, 'epoch': 3} {'type': 'loss', 'content': 0.000791891769040376, 'timestamp': '2025-09-30 22:11:31.987844', 'step': 1878, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:32.023641', 'step': 1878, 'epoch': 3} {'type': 'loss', 'content': 0.004595204256474972, 'timestamp': '2025-09-30 22:11:32.028113', 'step': 1879, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:32.061945', 'step': 1879, 'epoch': 3} {'type': 'loss', 'content': 0.0016687085153535008, 'timestamp': '2025-09-30 22:11:32.087917', 'step': 1880, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:32.124629', 'step': 1880, 'epoch': 3} {'type': 'loss', 'content': 0.004651383031159639, 'timestamp': '2025-09-30 22:11:32.127319', 'step': 1881, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:33.074469', 'step': 1881, 'epoch': 3} {'type': 'pplx', 'content': 58300001.57745322, 'timestamp': '2025-09-30 22:11:33.086962', 'step': 1881, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.123593', 'step': 1881, 'epoch': 3} {'type': 'loss', 'content': 0.0006952568655833602, 'timestamp': '2025-09-30 22:11:33.132758', 'step': 1882, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.164032', 'step': 1882, 'epoch': 3} {'type': 'loss', 'content': 0.000541742134373635, 'timestamp': '2025-09-30 22:11:33.166746', 'step': 1883, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:33.209638', 'step': 1883, 'epoch': 3} {'type': 'loss', 'content': 0.01002055685967207, 'timestamp': '2025-09-30 22:11:33.234969', 'step': 1884, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.265653', 'step': 1884, 'epoch': 3} {'type': 'loss', 'content': 0.0002052335039479658, 'timestamp': '2025-09-30 22:11:33.268371', 'step': 1885, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.299637', 'step': 1885, 'epoch': 3} {'type': 'loss', 'content': 0.047041479498147964, 'timestamp': '2025-09-30 22:11:33.302040', 'step': 1886, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.332696', 'step': 1886, 'epoch': 3} {'type': 'loss', 'content': 0.016662245616316795, 'timestamp': '2025-09-30 22:11:33.335426', 'step': 1887, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:33.365138', 'step': 1887, 'epoch': 3} {'type': 'loss', 'content': 0.02230525016784668, 'timestamp': '2025-09-30 22:11:33.391292', 'step': 1888, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:33.421717', 'step': 1888, 'epoch': 3} {'type': 'loss', 'content': 0.009830297902226448, 'timestamp': '2025-09-30 22:11:33.423857', 'step': 1889, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:33.453551', 'step': 1889, 'epoch': 3} {'type': 'loss', 'content': 0.010316459462046623, 'timestamp': '2025-09-30 22:11:33.456019', 'step': 1890, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.485785', 'step': 1890, 'epoch': 3} {'type': 'loss', 'content': 0.006007314659655094, 'timestamp': '2025-09-30 22:11:33.487984', 'step': 1891, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.518765', 'step': 1891, 'epoch': 3} {'type': 'loss', 'content': 0.0068843550980091095, 'timestamp': '2025-09-30 22:11:33.543397', 'step': 1892, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.588940', 'step': 1892, 'epoch': 3} {'type': 'loss', 'content': 0.006098180077970028, 'timestamp': '2025-09-30 22:11:33.598198', 'step': 1893, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.636589', 'step': 1893, 'epoch': 3} {'type': 'loss', 'content': 0.04380368813872337, 'timestamp': '2025-09-30 22:11:33.639790', 'step': 1894, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.677421', 'step': 1894, 'epoch': 3} {'type': 'loss', 'content': 0.0040383776649832726, 'timestamp': '2025-09-30 22:11:33.684386', 'step': 1895, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:33.721214', 'step': 1895, 'epoch': 3} {'type': 'loss', 'content': 0.021266529336571693, 'timestamp': '2025-09-30 22:11:33.745076', 'step': 1896, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:33.786419', 'step': 1896, 'epoch': 3} {'type': 'loss', 'content': 0.00014776589523535222, 'timestamp': '2025-09-30 22:11:33.791990', 'step': 1897, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.823137', 'step': 1897, 'epoch': 3} {'type': 'loss', 'content': 0.0008200978627428412, 'timestamp': '2025-09-30 22:11:33.828972', 'step': 1898, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.861415', 'step': 1898, 'epoch': 3} {'type': 'loss', 'content': 0.0001828258391469717, 'timestamp': '2025-09-30 22:11:33.865791', 'step': 1899, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:33.896717', 'step': 1899, 'epoch': 3} {'type': 'loss', 'content': 0.05202025920152664, 'timestamp': '2025-09-30 22:11:33.921215', 'step': 1900, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.952903', 'step': 1900, 'epoch': 3} {'type': 'loss', 'content': 0.0005612968816421926, 'timestamp': '2025-09-30 22:11:33.957467', 'step': 1901, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:33.995578', 'step': 1901, 'epoch': 3} {'type': 'loss', 'content': 0.0028064267244189978, 'timestamp': '2025-09-30 22:11:33.998196', 'step': 1902, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:34.041465', 'step': 1902, 'epoch': 3} {'type': 'loss', 'content': 0.01332185510545969, 'timestamp': '2025-09-30 22:11:34.055049', 'step': 1903, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.093277', 'step': 1903, 'epoch': 3} {'type': 'loss', 'content': 0.03658471629023552, 'timestamp': '2025-09-30 22:11:34.116742', 'step': 1904, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.147804', 'step': 1904, 'epoch': 3} {'type': 'loss', 'content': 0.00020756521553266793, 'timestamp': '2025-09-30 22:11:34.149961', 'step': 1905, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.184187', 'step': 1905, 'epoch': 3} {'type': 'loss', 'content': 0.04848742485046387, 'timestamp': '2025-09-30 22:11:34.186798', 'step': 1906, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:34.217337', 'step': 1906, 'epoch': 3} {'type': 'loss', 'content': 0.0005019632517360151, 'timestamp': '2025-09-30 22:11:34.219518', 'step': 1907, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.255246', 'step': 1907, 'epoch': 3} {'type': 'loss', 'content': 0.03832434490323067, 'timestamp': '2025-09-30 22:11:34.278878', 'step': 1908, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.308405', 'step': 1908, 'epoch': 3} {'type': 'loss', 'content': 0.007056743372231722, 'timestamp': '2025-09-30 22:11:34.311095', 'step': 1909, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:34.341428', 'step': 1909, 'epoch': 3} {'type': 'loss', 'content': 0.005560364108532667, 'timestamp': '2025-09-30 22:11:34.343587', 'step': 1910, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.374874', 'step': 1910, 'epoch': 3} {'type': 'loss', 'content': 0.007207165006548166, 'timestamp': '2025-09-30 22:11:34.377181', 'step': 1911, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.408075', 'step': 1911, 'epoch': 3} {'type': 'loss', 'content': 0.002402246231213212, 'timestamp': '2025-09-30 22:11:34.431295', 'step': 1912, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.487213', 'step': 1912, 'epoch': 3} {'type': 'loss', 'content': 0.0030392964836210012, 'timestamp': '2025-09-30 22:11:34.489529', 'step': 1913, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:34.521423', 'step': 1913, 'epoch': 3} {'type': 'loss', 'content': 0.014510233886539936, 'timestamp': '2025-09-30 22:11:34.523588', 'step': 1914, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.555635', 'step': 1914, 'epoch': 3} {'type': 'loss', 'content': 0.02432306483387947, 'timestamp': '2025-09-30 22:11:34.558991', 'step': 1915, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.593283', 'step': 1915, 'epoch': 3} {'type': 'loss', 'content': 0.009666199795901775, 'timestamp': '2025-09-30 22:11:34.617758', 'step': 1916, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.650002', 'step': 1916, 'epoch': 3} {'type': 'loss', 'content': 0.001959824236109853, 'timestamp': '2025-09-30 22:11:34.654015', 'step': 1917, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.692460', 'step': 1917, 'epoch': 3} {'type': 'loss', 'content': 0.00908661913126707, 'timestamp': '2025-09-30 22:11:34.696389', 'step': 1918, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.730218', 'step': 1918, 'epoch': 3} {'type': 'loss', 'content': 0.002566373208537698, 'timestamp': '2025-09-30 22:11:34.735628', 'step': 1919, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.768795', 'step': 1919, 'epoch': 3} {'type': 'loss', 'content': 0.03425890579819679, 'timestamp': '2025-09-30 22:11:34.794343', 'step': 1920, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.833089', 'step': 1920, 'epoch': 3} {'type': 'loss', 'content': 0.0045762574300169945, 'timestamp': '2025-09-30 22:11:34.836814', 'step': 1921, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.877627', 'step': 1921, 'epoch': 3} {'type': 'loss', 'content': 0.003459982108324766, 'timestamp': '2025-09-30 22:11:34.880080', 'step': 1922, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:34.915849', 'step': 1922, 'epoch': 3} {'type': 'loss', 'content': 0.032456815242767334, 'timestamp': '2025-09-30 22:11:34.917558', 'step': 1923, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:34.951722', 'step': 1923, 'epoch': 3} {'type': 'loss', 'content': 0.017405377700924873, 'timestamp': '2025-09-30 22:11:34.975704', 'step': 1924, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.006934', 'step': 1924, 'epoch': 3} {'type': 'loss', 'content': 0.02231362648308277, 'timestamp': '2025-09-30 22:11:35.016028', 'step': 1925, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:35.052642', 'step': 1925, 'epoch': 3} {'type': 'loss', 'content': 0.02806389331817627, 'timestamp': '2025-09-30 22:11:35.055126', 'step': 1926, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.087151', 'step': 1926, 'epoch': 3} {'type': 'loss', 'content': 0.0009463157621212304, 'timestamp': '2025-09-30 22:11:35.090545', 'step': 1927, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.123466', 'step': 1927, 'epoch': 3} {'type': 'loss', 'content': 0.006710561458021402, 'timestamp': '2025-09-30 22:11:35.153766', 'step': 1928, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.197962', 'step': 1928, 'epoch': 3} {'type': 'loss', 'content': 0.02229662612080574, 'timestamp': '2025-09-30 22:11:35.204638', 'step': 1929, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.239393', 'step': 1929, 'epoch': 3} {'type': 'loss', 'content': 0.048850979655981064, 'timestamp': '2025-09-30 22:11:35.242809', 'step': 1930, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.274787', 'step': 1930, 'epoch': 3} {'type': 'loss', 'content': 0.00973884854465723, 'timestamp': '2025-09-30 22:11:35.280935', 'step': 1931, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.314391', 'step': 1931, 'epoch': 3} {'type': 'loss', 'content': 0.002838547807186842, 'timestamp': '2025-09-30 22:11:35.338628', 'step': 1932, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.370152', 'step': 1932, 'epoch': 3} {'type': 'loss', 'content': 0.0015181986382231116, 'timestamp': '2025-09-30 22:11:35.375264', 'step': 1933, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.408177', 'step': 1933, 'epoch': 3} {'type': 'loss', 'content': 0.007999510504305363, 'timestamp': '2025-09-30 22:11:35.409933', 'step': 1934, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.440019', 'step': 1934, 'epoch': 3} {'type': 'loss', 'content': 0.0033200494945049286, 'timestamp': '2025-09-30 22:11:35.442597', 'step': 1935, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.475465', 'step': 1935, 'epoch': 3} {'type': 'loss', 'content': 0.005467348266392946, 'timestamp': '2025-09-30 22:11:35.499736', 'step': 1936, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.534777', 'step': 1936, 'epoch': 3} {'type': 'loss', 'content': 0.006198174320161343, 'timestamp': '2025-09-30 22:11:35.540626', 'step': 1937, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:35.577303', 'step': 1937, 'epoch': 3} {'type': 'loss', 'content': 0.005689568817615509, 'timestamp': '2025-09-30 22:11:35.579727', 'step': 1938, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:36.596669', 'step': 1938, 'epoch': 3} {'type': 'pplx', 'content': 43627380.275860295, 'timestamp': '2025-09-30 22:11:36.599167', 'step': 1938, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:36.631668', 'step': 1938, 'epoch': 3} {'type': 'loss', 'content': 0.041265103965997696, 'timestamp': '2025-09-30 22:11:36.634148', 'step': 1939, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:36.664786', 'step': 1939, 'epoch': 3} {'type': 'loss', 'content': 0.0021771376486867666, 'timestamp': '2025-09-30 22:11:36.690810', 'step': 1940, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:36.728941', 'step': 1940, 'epoch': 3} {'type': 'loss', 'content': 0.009394007734954357, 'timestamp': '2025-09-30 22:11:36.733213', 'step': 1941, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:36.764817', 'step': 1941, 'epoch': 3} {'type': 'loss', 'content': 0.00999508984386921, 'timestamp': '2025-09-30 22:11:36.770123', 'step': 1942, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:36.807388', 'step': 1942, 'epoch': 3} {'type': 'loss', 'content': 0.03857743740081787, 'timestamp': '2025-09-30 22:11:36.812467', 'step': 1943, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:11:36.845704', 'step': 1943, 'epoch': 3} {'type': 'loss', 'content': 0.041628073900938034, 'timestamp': '2025-09-30 22:11:36.869338', 'step': 1944, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:36.914712', 'step': 1944, 'epoch': 3} {'type': 'loss', 'content': 0.0016767786582931876, 'timestamp': '2025-09-30 22:11:36.917931', 'step': 1945, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:36.948917', 'step': 1945, 'epoch': 3} {'type': 'loss', 'content': 0.012962265871465206, 'timestamp': '2025-09-30 22:11:36.954441', 'step': 1946, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:36.993420', 'step': 1946, 'epoch': 3} {'type': 'loss', 'content': 0.003387217177078128, 'timestamp': '2025-09-30 22:11:36.996098', 'step': 1947, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:37.047895', 'step': 1947, 'epoch': 3} {'type': 'loss', 'content': 0.0020589407067745924, 'timestamp': '2025-09-30 22:11:37.074894', 'step': 1948, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:37.109800', 'step': 1948, 'epoch': 3} {'type': 'loss', 'content': 0.0035861472133547068, 'timestamp': '2025-09-30 22:11:37.121958', 'step': 1949, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:37.155186', 'step': 1949, 'epoch': 3} {'type': 'loss', 'content': 0.011754592880606651, 'timestamp': '2025-09-30 22:11:37.159976', 'step': 1950, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:37.198710', 'step': 1950, 'epoch': 3} {'type': 'loss', 'content': 0.012257657945156097, 'timestamp': '2025-09-30 22:11:37.204813', 'step': 1951, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:37.246821', 'step': 1951, 'epoch': 3} {'type': 'loss', 'content': 0.028148228302598, 'timestamp': '2025-09-30 22:11:37.273645', 'step': 1952, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:37.310597', 'step': 1952, 'epoch': 3} {'type': 'loss', 'content': 0.001430458389222622, 'timestamp': '2025-09-30 22:11:37.313023', 'step': 1953, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:37.347053', 'step': 1953, 'epoch': 3} {'type': 'loss', 'content': 0.003840786637738347, 'timestamp': '2025-09-30 22:11:37.349462', 'step': 1954, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:37.381022', 'step': 1954, 'epoch': 3} {'type': 'loss', 'content': 0.04572409391403198, 'timestamp': '2025-09-30 22:11:37.383572', 'step': 1955, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:37.427216', 'step': 1955, 'epoch': 3} {'type': 'loss', 'content': 0.015760038048028946, 'timestamp': '2025-09-30 22:11:37.450976', 'step': 1956, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:37.482989', 'step': 1956, 'epoch': 3} {'type': 'loss', 'content': 0.009832469746470451, 'timestamp': '2025-09-30 22:11:37.486785', 'step': 1957, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:37.519959', 'step': 1957, 'epoch': 3} {'type': 'loss', 'content': 0.0026130271144211292, 'timestamp': '2025-09-30 22:11:37.523027', 'step': 1958, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:37.559954', 'step': 1958, 'epoch': 3} {'type': 'loss', 'content': 0.004075376782566309, 'timestamp': '2025-09-30 22:11:37.562879', 'step': 1959, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:37.600938', 'step': 1959, 'epoch': 3} {'type': 'loss', 'content': 0.04126408323645592, 'timestamp': '2025-09-30 22:11:37.632136', 'step': 1960, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:37.673315', 'step': 1960, 'epoch': 3} {'type': 'loss', 'content': 0.0035435904283076525, 'timestamp': '2025-09-30 22:11:37.675789', 'step': 1961, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:37.708499', 'step': 1961, 'epoch': 3} {'type': 'loss', 'content': 0.004043902270495892, 'timestamp': '2025-09-30 22:11:37.710923', 'step': 1962, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:37.744656', 'step': 1962, 'epoch': 3} {'type': 'loss', 'content': 0.015519635751843452, 'timestamp': '2025-09-30 22:11:37.748859', 'step': 1963, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:37.786265', 'step': 1963, 'epoch': 3} {'type': 'loss', 'content': 0.030089065432548523, 'timestamp': '2025-09-30 22:11:37.810117', 'step': 1964, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:37.843879', 'step': 1964, 'epoch': 3} {'type': 'loss', 'content': 0.007775893900543451, 'timestamp': '2025-09-30 22:11:37.847497', 'step': 1965, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:37.885079', 'step': 1965, 'epoch': 3} {'type': 'loss', 'content': 0.003808345878496766, 'timestamp': '2025-09-30 22:11:37.890297', 'step': 1966, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:37.925134', 'step': 1966, 'epoch': 3} {'type': 'loss', 'content': 0.004614766221493483, 'timestamp': '2025-09-30 22:11:37.928706', 'step': 1967, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:37.963176', 'step': 1967, 'epoch': 3} {'type': 'loss', 'content': 0.017028817906975746, 'timestamp': '2025-09-30 22:11:37.987256', 'step': 1968, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:38.022126', 'step': 1968, 'epoch': 3} {'type': 'loss', 'content': 0.0027420276310294867, 'timestamp': '2025-09-30 22:11:38.024815', 'step': 1969, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.058904', 'step': 1969, 'epoch': 3} {'type': 'loss', 'content': 0.0091890012845397, 'timestamp': '2025-09-30 22:11:38.064282', 'step': 1970, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:38.095938', 'step': 1970, 'epoch': 3} {'type': 'loss', 'content': 0.006831838749349117, 'timestamp': '2025-09-30 22:11:38.098149', 'step': 1971, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:38.137643', 'step': 1971, 'epoch': 3} {'type': 'loss', 'content': 0.021977385506033897, 'timestamp': '2025-09-30 22:11:38.161325', 'step': 1972, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.201039', 'step': 1972, 'epoch': 3} {'type': 'loss', 'content': 0.008382284082472324, 'timestamp': '2025-09-30 22:11:38.204053', 'step': 1973, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:38.240818', 'step': 1973, 'epoch': 3} {'type': 'loss', 'content': 0.0059802415780723095, 'timestamp': '2025-09-30 22:11:38.243117', 'step': 1974, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:38.275063', 'step': 1974, 'epoch': 3} {'type': 'loss', 'content': 0.019793765619397163, 'timestamp': '2025-09-30 22:11:38.277255', 'step': 1975, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.308258', 'step': 1975, 'epoch': 3} {'type': 'loss', 'content': 0.01100704912096262, 'timestamp': '2025-09-30 22:11:38.332794', 'step': 1976, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.364649', 'step': 1976, 'epoch': 3} {'type': 'loss', 'content': 0.007676057517528534, 'timestamp': '2025-09-30 22:11:38.367204', 'step': 1977, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:38.402216', 'step': 1977, 'epoch': 3} {'type': 'loss', 'content': 0.0077926525846123695, 'timestamp': '2025-09-30 22:11:38.405555', 'step': 1978, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:38.451705', 'step': 1978, 'epoch': 3} {'type': 'loss', 'content': 0.021432368084788322, 'timestamp': '2025-09-30 22:11:38.455731', 'step': 1979, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:38.491779', 'step': 1979, 'epoch': 3} {'type': 'loss', 'content': 0.003779368242248893, 'timestamp': '2025-09-30 22:11:38.516552', 'step': 1980, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.550531', 'step': 1980, 'epoch': 3} {'type': 'loss', 'content': 0.006811430212110281, 'timestamp': '2025-09-30 22:11:38.554924', 'step': 1981, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.587888', 'step': 1981, 'epoch': 3} {'type': 'loss', 'content': 0.008039762265980244, 'timestamp': '2025-09-30 22:11:38.589999', 'step': 1982, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:38.621379', 'step': 1982, 'epoch': 3} {'type': 'loss', 'content': 0.010187317617237568, 'timestamp': '2025-09-30 22:11:38.623660', 'step': 1983, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.653657', 'step': 1983, 'epoch': 3} {'type': 'loss', 'content': 0.00863056443631649, 'timestamp': '2025-09-30 22:11:38.677326', 'step': 1984, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.713997', 'step': 1984, 'epoch': 3} {'type': 'loss', 'content': 0.004583257250487804, 'timestamp': '2025-09-30 22:11:38.716903', 'step': 1985, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.748750', 'step': 1985, 'epoch': 3} {'type': 'loss', 'content': 0.00606236606836319, 'timestamp': '2025-09-30 22:11:38.751793', 'step': 1986, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:38.781892', 'step': 1986, 'epoch': 3} {'type': 'loss', 'content': 0.027738112956285477, 'timestamp': '2025-09-30 22:11:38.785210', 'step': 1987, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:38.817077', 'step': 1987, 'epoch': 3} {'type': 'loss', 'content': 0.01934841088950634, 'timestamp': '2025-09-30 22:11:38.840780', 'step': 1988, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.871270', 'step': 1988, 'epoch': 3} {'type': 'loss', 'content': 0.0069843316450715065, 'timestamp': '2025-09-30 22:11:38.874007', 'step': 1989, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.906349', 'step': 1989, 'epoch': 3} {'type': 'loss', 'content': 0.003509870497509837, 'timestamp': '2025-09-30 22:11:38.908453', 'step': 1990, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.941136', 'step': 1990, 'epoch': 3} {'type': 'loss', 'content': 0.031461603939533234, 'timestamp': '2025-09-30 22:11:38.943272', 'step': 1991, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:38.977005', 'step': 1991, 'epoch': 3} {'type': 'loss', 'content': 0.02272350713610649, 'timestamp': '2025-09-30 22:11:39.000736', 'step': 1992, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:39.044288', 'step': 1992, 'epoch': 3} {'type': 'loss', 'content': 0.04014478996396065, 'timestamp': '2025-09-30 22:11:39.046856', 'step': 1993, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:39.077258', 'step': 1993, 'epoch': 3} {'type': 'loss', 'content': 0.0069357166066765785, 'timestamp': '2025-09-30 22:11:39.079878', 'step': 1994, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:39.119734', 'step': 1994, 'epoch': 3} {'type': 'loss', 'content': 0.00451026763767004, 'timestamp': '2025-09-30 22:11:39.124010', 'step': 1995, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:39.890481', 'step': 1995, 'epoch': 3} {'type': 'pplx', 'content': 45151114.894568056, 'timestamp': '2025-09-30 22:11:39.892281', 'step': 1995, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:39.921447', 'step': 1995, 'epoch': 3} {'type': 'loss', 'content': 0.0034852561075240374, 'timestamp': '2025-09-30 22:11:39.945905', 'step': 1996, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:39.980609', 'step': 1996, 'epoch': 3} {'type': 'loss', 'content': 0.0041796243749558926, 'timestamp': '2025-09-30 22:11:39.983239', 'step': 1997, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:40.017253', 'step': 1997, 'epoch': 3} {'type': 'loss', 'content': 0.01105498243123293, 'timestamp': '2025-09-30 22:11:40.019276', 'step': 1998, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:40.049220', 'step': 1998, 'epoch': 3} {'type': 'loss', 'content': 0.02129027433693409, 'timestamp': '2025-09-30 22:11:40.051677', 'step': 1999, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:40.086422', 'step': 1999, 'epoch': 3} {'type': 'loss', 'content': 0.012991896830499172, 'timestamp': '2025-09-30 22:11:40.110596', 'step': 2000, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2000', 'timestamp': '2025-09-30 22:11:47.120027', 'step': 2000, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.171133', 'step': 2000, 'epoch': 3} {'type': 'loss', 'content': 0.011763842776417732, 'timestamp': '2025-09-30 22:11:47.173258', 'step': 2001, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.205359', 'step': 2001, 'epoch': 3} {'type': 'loss', 'content': 0.001990505028516054, 'timestamp': '2025-09-30 22:11:47.207565', 'step': 2002, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.237348', 'step': 2002, 'epoch': 3} {'type': 'loss', 'content': 0.0028346367180347443, 'timestamp': '2025-09-30 22:11:47.239951', 'step': 2003, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.271096', 'step': 2003, 'epoch': 3} {'type': 'loss', 'content': 0.040164608508348465, 'timestamp': '2025-09-30 22:11:47.294777', 'step': 2004, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:47.324666', 'step': 2004, 'epoch': 3} {'type': 'loss', 'content': 0.0015259806532412767, 'timestamp': '2025-09-30 22:11:47.326542', 'step': 2005, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:47.356996', 'step': 2005, 'epoch': 3} {'type': 'loss', 'content': 0.023955607786774635, 'timestamp': '2025-09-30 22:11:47.359226', 'step': 2006, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.389113', 'step': 2006, 'epoch': 3} {'type': 'loss', 'content': 0.010714337229728699, 'timestamp': '2025-09-30 22:11:47.391113', 'step': 2007, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:47.420113', 'step': 2007, 'epoch': 3} {'type': 'loss', 'content': 0.019530069082975388, 'timestamp': '2025-09-30 22:11:47.444120', 'step': 2008, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.478806', 'step': 2008, 'epoch': 3} {'type': 'loss', 'content': 0.005085115786641836, 'timestamp': '2025-09-30 22:11:47.480636', 'step': 2009, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:47.510149', 'step': 2009, 'epoch': 3} {'type': 'loss', 'content': 0.0203965175896883, 'timestamp': '2025-09-30 22:11:47.512273', 'step': 2010, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:47.543661', 'step': 2010, 'epoch': 3} {'type': 'loss', 'content': 0.004833654034882784, 'timestamp': '2025-09-30 22:11:47.546287', 'step': 2011, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:47.576880', 'step': 2011, 'epoch': 3} {'type': 'loss', 'content': 0.0030174553394317627, 'timestamp': '2025-09-30 22:11:47.600315', 'step': 2012, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.632537', 'step': 2012, 'epoch': 3} {'type': 'loss', 'content': 0.0027652904391288757, 'timestamp': '2025-09-30 22:11:47.634574', 'step': 2013, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:47.665703', 'step': 2013, 'epoch': 3} {'type': 'loss', 'content': 0.0048994021490216255, 'timestamp': '2025-09-30 22:11:47.668123', 'step': 2014, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.698929', 'step': 2014, 'epoch': 3} {'type': 'loss', 'content': 0.024076785892248154, 'timestamp': '2025-09-30 22:11:47.701111', 'step': 2015, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.733693', 'step': 2015, 'epoch': 3} {'type': 'loss', 'content': 0.009501011110842228, 'timestamp': '2025-09-30 22:11:47.757465', 'step': 2016, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.787088', 'step': 2016, 'epoch': 3} {'type': 'loss', 'content': 0.009324478916823864, 'timestamp': '2025-09-30 22:11:47.789314', 'step': 2017, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:47.821263', 'step': 2017, 'epoch': 3} {'type': 'loss', 'content': 0.01339875441044569, 'timestamp': '2025-09-30 22:11:47.823502', 'step': 2018, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:47.853658', 'step': 2018, 'epoch': 3} {'type': 'loss', 'content': 0.002396277617663145, 'timestamp': '2025-09-30 22:11:47.855895', 'step': 2019, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.886382', 'step': 2019, 'epoch': 3} {'type': 'loss', 'content': 0.012006105855107307, 'timestamp': '2025-09-30 22:11:47.909764', 'step': 2020, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.941224', 'step': 2020, 'epoch': 3} {'type': 'loss', 'content': 0.0036790850572288036, 'timestamp': '2025-09-30 22:11:47.955276', 'step': 2021, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:47.986337', 'step': 2021, 'epoch': 3} {'type': 'loss', 'content': 0.006259873043745756, 'timestamp': '2025-09-30 22:11:47.989571', 'step': 2022, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:48.019824', 'step': 2022, 'epoch': 3} {'type': 'loss', 'content': 0.0020233269315212965, 'timestamp': '2025-09-30 22:11:48.022334', 'step': 2023, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.055789', 'step': 2023, 'epoch': 3} {'type': 'loss', 'content': 0.008467396721243858, 'timestamp': '2025-09-30 22:11:48.079904', 'step': 2024, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.113280', 'step': 2024, 'epoch': 3} {'type': 'loss', 'content': 0.013029148802161217, 'timestamp': '2025-09-30 22:11:48.116011', 'step': 2025, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:48.146751', 'step': 2025, 'epoch': 3} {'type': 'loss', 'content': 0.0013387370854616165, 'timestamp': '2025-09-30 22:11:48.148866', 'step': 2026, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:48.180806', 'step': 2026, 'epoch': 3} {'type': 'loss', 'content': 0.002847356954589486, 'timestamp': '2025-09-30 22:11:48.183220', 'step': 2027, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:48.214279', 'step': 2027, 'epoch': 3} {'type': 'loss', 'content': 0.003806897671893239, 'timestamp': '2025-09-30 22:11:48.238058', 'step': 2028, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:48.269355', 'step': 2028, 'epoch': 3} {'type': 'loss', 'content': 0.020344218239188194, 'timestamp': '2025-09-30 22:11:48.271870', 'step': 2029, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.302858', 'step': 2029, 'epoch': 3} {'type': 'loss', 'content': 0.011094714514911175, 'timestamp': '2025-09-30 22:11:48.305095', 'step': 2030, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:48.336226', 'step': 2030, 'epoch': 3} {'type': 'loss', 'content': 0.003669965546578169, 'timestamp': '2025-09-30 22:11:48.338542', 'step': 2031, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:48.369177', 'step': 2031, 'epoch': 3} {'type': 'loss', 'content': 0.00963125191628933, 'timestamp': '2025-09-30 22:11:48.393090', 'step': 2032, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:48.423617', 'step': 2032, 'epoch': 3} {'type': 'loss', 'content': 0.00689741550013423, 'timestamp': '2025-09-30 22:11:48.425619', 'step': 2033, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.455255', 'step': 2033, 'epoch': 3} {'type': 'loss', 'content': 0.010891190730035305, 'timestamp': '2025-09-30 22:11:48.457542', 'step': 2034, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:48.488398', 'step': 2034, 'epoch': 3} {'type': 'loss', 'content': 0.003279446391388774, 'timestamp': '2025-09-30 22:11:48.494058', 'step': 2035, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.524167', 'step': 2035, 'epoch': 3} {'type': 'loss', 'content': 0.01791677437722683, 'timestamp': '2025-09-30 22:11:48.547641', 'step': 2036, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.578445', 'step': 2036, 'epoch': 3} {'type': 'loss', 'content': 0.008618688210844994, 'timestamp': '2025-09-30 22:11:48.580755', 'step': 2037, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:48.614180', 'step': 2037, 'epoch': 3} {'type': 'loss', 'content': 0.023733243346214294, 'timestamp': '2025-09-30 22:11:48.616326', 'step': 2038, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:48.648518', 'step': 2038, 'epoch': 3} {'type': 'loss', 'content': 0.0052038319408893585, 'timestamp': '2025-09-30 22:11:48.651005', 'step': 2039, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.680617', 'step': 2039, 'epoch': 3} {'type': 'loss', 'content': 0.005849936511367559, 'timestamp': '2025-09-30 22:11:48.704292', 'step': 2040, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.734190', 'step': 2040, 'epoch': 3} {'type': 'loss', 'content': 0.009815479628741741, 'timestamp': '2025-09-30 22:11:48.736277', 'step': 2041, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.766915', 'step': 2041, 'epoch': 3} {'type': 'loss', 'content': 0.0007055861060507596, 'timestamp': '2025-09-30 22:11:48.769193', 'step': 2042, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.799254', 'step': 2042, 'epoch': 3} {'type': 'loss', 'content': 0.012282871641218662, 'timestamp': '2025-09-30 22:11:48.802790', 'step': 2043, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.832188', 'step': 2043, 'epoch': 3} {'type': 'loss', 'content': 0.015396817587316036, 'timestamp': '2025-09-30 22:11:48.856754', 'step': 2044, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.889786', 'step': 2044, 'epoch': 3} {'type': 'loss', 'content': 0.0020254473201930523, 'timestamp': '2025-09-30 22:11:48.893077', 'step': 2045, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.924965', 'step': 2045, 'epoch': 3} {'type': 'loss', 'content': 0.03287850692868233, 'timestamp': '2025-09-30 22:11:48.927323', 'step': 2046, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.959676', 'step': 2046, 'epoch': 3} {'type': 'loss', 'content': 0.013674319721758366, 'timestamp': '2025-09-30 22:11:48.962453', 'step': 2047, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:48.992228', 'step': 2047, 'epoch': 3} {'type': 'loss', 'content': 0.005486636888235807, 'timestamp': '2025-09-30 22:11:49.015942', 'step': 2048, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:49.046842', 'step': 2048, 'epoch': 3} {'type': 'loss', 'content': 0.0018138373270630836, 'timestamp': '2025-09-30 22:11:49.051151', 'step': 2049, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:49.085344', 'step': 2049, 'epoch': 3} {'type': 'loss', 'content': 0.007017158903181553, 'timestamp': '2025-09-30 22:11:49.087734', 'step': 2050, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:49.120421', 'step': 2050, 'epoch': 3} {'type': 'loss', 'content': 0.006239436566829681, 'timestamp': '2025-09-30 22:11:49.123116', 'step': 2051, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:49.162657', 'step': 2051, 'epoch': 3} {'type': 'loss', 'content': 0.0296341422945261, 'timestamp': '2025-09-30 22:11:49.186855', 'step': 2052, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:49.938037', 'step': 2052, 'epoch': 3} {'type': 'pplx', 'content': 46841069.929396845, 'timestamp': '2025-09-30 22:11:49.940737', 'step': 2052, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:49.968858', 'step': 2052, 'epoch': 3} {'type': 'loss', 'content': 0.002824206370860338, 'timestamp': '2025-09-30 22:11:49.971342', 'step': 2053, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.001368', 'step': 2053, 'epoch': 3} {'type': 'loss', 'content': 0.0025209374725818634, 'timestamp': '2025-09-30 22:11:50.004275', 'step': 2054, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.035925', 'step': 2054, 'epoch': 3} {'type': 'loss', 'content': 0.005634330213069916, 'timestamp': '2025-09-30 22:11:50.038218', 'step': 2055, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.068792', 'step': 2055, 'epoch': 3} {'type': 'loss', 'content': 0.001257882104255259, 'timestamp': '2025-09-30 22:11:50.093493', 'step': 2056, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.123631', 'step': 2056, 'epoch': 3} {'type': 'loss', 'content': 0.0063269915990531445, 'timestamp': '2025-09-30 22:11:50.126394', 'step': 2057, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.157085', 'step': 2057, 'epoch': 3} {'type': 'loss', 'content': 0.006260029971599579, 'timestamp': '2025-09-30 22:11:50.159285', 'step': 2058, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.192581', 'step': 2058, 'epoch': 3} {'type': 'loss', 'content': 0.011972022242844105, 'timestamp': '2025-09-30 22:11:50.194639', 'step': 2059, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.232178', 'step': 2059, 'epoch': 3} {'type': 'loss', 'content': 0.0028113038279116154, 'timestamp': '2025-09-30 22:11:50.256226', 'step': 2060, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.286662', 'step': 2060, 'epoch': 3} {'type': 'loss', 'content': 0.00559657160192728, 'timestamp': '2025-09-30 22:11:50.288730', 'step': 2061, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:50.318881', 'step': 2061, 'epoch': 3} {'type': 'loss', 'content': 0.0006818880210630596, 'timestamp': '2025-09-30 22:11:50.324812', 'step': 2062, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:50.360667', 'step': 2062, 'epoch': 3} {'type': 'loss', 'content': 0.022419020533561707, 'timestamp': '2025-09-30 22:11:50.363341', 'step': 2063, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.394960', 'step': 2063, 'epoch': 3} {'type': 'loss', 'content': 0.003586748382076621, 'timestamp': '2025-09-30 22:11:50.418215', 'step': 2064, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.448556', 'step': 2064, 'epoch': 3} {'type': 'loss', 'content': 0.0011212294921278954, 'timestamp': '2025-09-30 22:11:50.455815', 'step': 2065, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.501180', 'step': 2065, 'epoch': 3} {'type': 'loss', 'content': 0.0036978931166231632, 'timestamp': '2025-09-30 22:11:50.503528', 'step': 2066, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:50.534271', 'step': 2066, 'epoch': 3} {'type': 'loss', 'content': 0.0025766324251890182, 'timestamp': '2025-09-30 22:11:50.536403', 'step': 2067, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.566915', 'step': 2067, 'epoch': 3} {'type': 'loss', 'content': 0.024868767708539963, 'timestamp': '2025-09-30 22:11:50.590573', 'step': 2068, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.623035', 'step': 2068, 'epoch': 3} {'type': 'loss', 'content': 0.0010927347466349602, 'timestamp': '2025-09-30 22:11:50.625739', 'step': 2069, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.656140', 'step': 2069, 'epoch': 3} {'type': 'loss', 'content': 0.009025784209370613, 'timestamp': '2025-09-30 22:11:50.658880', 'step': 2070, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:50.688974', 'step': 2070, 'epoch': 3} {'type': 'loss', 'content': 0.0004706038744188845, 'timestamp': '2025-09-30 22:11:50.690971', 'step': 2071, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.720799', 'step': 2071, 'epoch': 3} {'type': 'loss', 'content': 0.023899797350168228, 'timestamp': '2025-09-30 22:11:50.744231', 'step': 2072, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.781701', 'step': 2072, 'epoch': 3} {'type': 'loss', 'content': 0.016089284792542458, 'timestamp': '2025-09-30 22:11:50.784228', 'step': 2073, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.815685', 'step': 2073, 'epoch': 3} {'type': 'loss', 'content': 0.0031490155961364508, 'timestamp': '2025-09-30 22:11:50.817925', 'step': 2074, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:50.847379', 'step': 2074, 'epoch': 3} {'type': 'loss', 'content': 0.0023220006842166185, 'timestamp': '2025-09-30 22:11:50.849849', 'step': 2075, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.879700', 'step': 2075, 'epoch': 3} {'type': 'loss', 'content': 0.005324964411556721, 'timestamp': '2025-09-30 22:11:50.904241', 'step': 2076, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:50.935012', 'step': 2076, 'epoch': 3} {'type': 'loss', 'content': 0.005782104562968016, 'timestamp': '2025-09-30 22:11:50.937191', 'step': 2077, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:50.969077', 'step': 2077, 'epoch': 3} {'type': 'loss', 'content': 0.02153591811656952, 'timestamp': '2025-09-30 22:11:50.971267', 'step': 2078, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.003235', 'step': 2078, 'epoch': 3} {'type': 'loss', 'content': 0.029172439128160477, 'timestamp': '2025-09-30 22:11:51.006114', 'step': 2079, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.052262', 'step': 2079, 'epoch': 3} {'type': 'loss', 'content': 0.0013484329683706164, 'timestamp': '2025-09-30 22:11:51.075692', 'step': 2080, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:51.110716', 'step': 2080, 'epoch': 3} {'type': 'loss', 'content': 0.005239028949290514, 'timestamp': '2025-09-30 22:11:51.112549', 'step': 2081, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.142907', 'step': 2081, 'epoch': 3} {'type': 'loss', 'content': 0.0015419954434037209, 'timestamp': '2025-09-30 22:11:51.144985', 'step': 2082, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.174915', 'step': 2082, 'epoch': 3} {'type': 'loss', 'content': 0.0034008428920060396, 'timestamp': '2025-09-30 22:11:51.177065', 'step': 2083, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:51.208306', 'step': 2083, 'epoch': 3} {'type': 'loss', 'content': 0.001755467732436955, 'timestamp': '2025-09-30 22:11:51.232161', 'step': 2084, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.262286', 'step': 2084, 'epoch': 3} {'type': 'loss', 'content': 0.03544512018561363, 'timestamp': '2025-09-30 22:11:51.264468', 'step': 2085, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.295461', 'step': 2085, 'epoch': 3} {'type': 'loss', 'content': 0.017085498198866844, 'timestamp': '2025-09-30 22:11:51.298046', 'step': 2086, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:51.328296', 'step': 2086, 'epoch': 3} {'type': 'loss', 'content': 0.009373554959893227, 'timestamp': '2025-09-30 22:11:51.330684', 'step': 2087, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:51.362616', 'step': 2087, 'epoch': 3} {'type': 'loss', 'content': 0.0034663775004446507, 'timestamp': '2025-09-30 22:11:51.396071', 'step': 2088, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:51.427474', 'step': 2088, 'epoch': 3} {'type': 'loss', 'content': 0.0010567883728072047, 'timestamp': '2025-09-30 22:11:51.429400', 'step': 2089, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:11:51.460199', 'step': 2089, 'epoch': 3} {'type': 'loss', 'content': 0.001030977233313024, 'timestamp': '2025-09-30 22:11:51.464817', 'step': 2090, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.494928', 'step': 2090, 'epoch': 3} {'type': 'loss', 'content': 0.0021973750554025173, 'timestamp': '2025-09-30 22:11:51.497848', 'step': 2091, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.528582', 'step': 2091, 'epoch': 3} {'type': 'loss', 'content': 0.01016116701066494, 'timestamp': '2025-09-30 22:11:51.552300', 'step': 2092, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.583649', 'step': 2092, 'epoch': 3} {'type': 'loss', 'content': 0.002128815045580268, 'timestamp': '2025-09-30 22:11:51.585550', 'step': 2093, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:51.619516', 'step': 2093, 'epoch': 3} {'type': 'loss', 'content': 0.0021064337342977524, 'timestamp': '2025-09-30 22:11:51.622117', 'step': 2094, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.658479', 'step': 2094, 'epoch': 3} {'type': 'loss', 'content': 0.0030677791219204664, 'timestamp': '2025-09-30 22:11:51.660548', 'step': 2095, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.693415', 'step': 2095, 'epoch': 3} {'type': 'loss', 'content': 0.04606208577752113, 'timestamp': '2025-09-30 22:11:51.717523', 'step': 2096, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:51.748039', 'step': 2096, 'epoch': 3} {'type': 'loss', 'content': 0.00207854644395411, 'timestamp': '2025-09-30 22:11:51.750250', 'step': 2097, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:51.781538', 'step': 2097, 'epoch': 3} {'type': 'loss', 'content': 0.025525161996483803, 'timestamp': '2025-09-30 22:11:51.783799', 'step': 2098, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.813989', 'step': 2098, 'epoch': 3} {'type': 'loss', 'content': 0.0032558145467191935, 'timestamp': '2025-09-30 22:11:51.816499', 'step': 2099, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:51.847061', 'step': 2099, 'epoch': 3} {'type': 'loss', 'content': 0.005950461141765118, 'timestamp': '2025-09-30 22:11:51.872829', 'step': 2100, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:51.904319', 'step': 2100, 'epoch': 3} {'type': 'loss', 'content': 0.0006777332164347172, 'timestamp': '2025-09-30 22:11:51.906638', 'step': 2101, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:51.936459', 'step': 2101, 'epoch': 3} {'type': 'loss', 'content': 0.0009718859218992293, 'timestamp': '2025-09-30 22:11:51.938368', 'step': 2102, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:51.969245', 'step': 2102, 'epoch': 3} {'type': 'loss', 'content': 0.042966216802597046, 'timestamp': '2025-09-30 22:11:51.971916', 'step': 2103, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:52.002329', 'step': 2103, 'epoch': 3} {'type': 'loss', 'content': 0.0009729270823299885, 'timestamp': '2025-09-30 22:11:52.025887', 'step': 2104, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:52.056745', 'step': 2104, 'epoch': 3} {'type': 'loss', 'content': 0.0013779571745544672, 'timestamp': '2025-09-30 22:11:52.058970', 'step': 2105, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:52.088875', 'step': 2105, 'epoch': 3} {'type': 'loss', 'content': 0.0018243792001158, 'timestamp': '2025-09-30 22:11:52.091108', 'step': 2106, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:52.122468', 'step': 2106, 'epoch': 3} {'type': 'loss', 'content': 0.002409034175798297, 'timestamp': '2025-09-30 22:11:52.125767', 'step': 2107, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:52.157168', 'step': 2107, 'epoch': 3} {'type': 'loss', 'content': 0.011469879187643528, 'timestamp': '2025-09-30 22:11:52.180846', 'step': 2108, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:52.211732', 'step': 2108, 'epoch': 3} {'type': 'loss', 'content': 0.014377386309206486, 'timestamp': '2025-09-30 22:11:52.213983', 'step': 2109, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:53.069474', 'step': 2109, 'epoch': 3} {'type': 'pplx', 'content': 43296303.09889206, 'timestamp': '2025-09-30 22:11:53.079360', 'step': 2109, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:53.109952', 'step': 2109, 'epoch': 3} {'type': 'loss', 'content': 0.0023443028330802917, 'timestamp': '2025-09-30 22:11:53.114172', 'step': 2110, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.145304', 'step': 2110, 'epoch': 3} {'type': 'loss', 'content': 0.0002606787602417171, 'timestamp': '2025-09-30 22:11:53.148125', 'step': 2111, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.179002', 'step': 2111, 'epoch': 3} {'type': 'loss', 'content': 0.004330683033913374, 'timestamp': '2025-09-30 22:11:53.206509', 'step': 2112, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:53.243614', 'step': 2112, 'epoch': 3} {'type': 'loss', 'content': 0.003312209853902459, 'timestamp': '2025-09-30 22:11:53.256965', 'step': 2113, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:53.294571', 'step': 2113, 'epoch': 3} {'type': 'loss', 'content': 0.022603752091526985, 'timestamp': '2025-09-30 22:11:53.297170', 'step': 2114, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.330089', 'step': 2114, 'epoch': 3} {'type': 'loss', 'content': 0.000934332434553653, 'timestamp': '2025-09-30 22:11:53.332258', 'step': 2115, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.366930', 'step': 2115, 'epoch': 3} {'type': 'loss', 'content': 0.006673852913081646, 'timestamp': '2025-09-30 22:11:53.391771', 'step': 2116, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.422202', 'step': 2116, 'epoch': 3} {'type': 'loss', 'content': 0.0022406296338886023, 'timestamp': '2025-09-30 22:11:53.424234', 'step': 2117, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:53.455438', 'step': 2117, 'epoch': 3} {'type': 'loss', 'content': 0.037457626312971115, 'timestamp': '2025-09-30 22:11:53.457674', 'step': 2118, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.488008', 'step': 2118, 'epoch': 3} {'type': 'loss', 'content': 0.0011722417548298836, 'timestamp': '2025-09-30 22:11:53.490242', 'step': 2119, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.520783', 'step': 2119, 'epoch': 3} {'type': 'loss', 'content': 0.0004422049969434738, 'timestamp': '2025-09-30 22:11:53.544261', 'step': 2120, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:11:53.575777', 'step': 2120, 'epoch': 3} {'type': 'loss', 'content': 0.0010047383839264512, 'timestamp': '2025-09-30 22:11:53.577754', 'step': 2121, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:53.608143', 'step': 2121, 'epoch': 3} {'type': 'loss', 'content': 0.012830023653805256, 'timestamp': '2025-09-30 22:11:53.610547', 'step': 2122, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.641410', 'step': 2122, 'epoch': 3} {'type': 'loss', 'content': 0.023233426734805107, 'timestamp': '2025-09-30 22:11:53.643478', 'step': 2123, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.675493', 'step': 2123, 'epoch': 3} {'type': 'loss', 'content': 0.0013475000159814954, 'timestamp': '2025-09-30 22:11:53.699331', 'step': 2124, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.730528', 'step': 2124, 'epoch': 3} {'type': 'loss', 'content': 0.005780854728072882, 'timestamp': '2025-09-30 22:11:53.732761', 'step': 2125, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:53.763686', 'step': 2125, 'epoch': 3} {'type': 'loss', 'content': 0.002301039407029748, 'timestamp': '2025-09-30 22:11:53.765767', 'step': 2126, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.796135', 'step': 2126, 'epoch': 3} {'type': 'loss', 'content': 0.011333170346915722, 'timestamp': '2025-09-30 22:11:53.798286', 'step': 2127, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.831111', 'step': 2127, 'epoch': 3} {'type': 'loss', 'content': 0.013510367833077908, 'timestamp': '2025-09-30 22:11:53.854556', 'step': 2128, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.885787', 'step': 2128, 'epoch': 3} {'type': 'loss', 'content': 0.003286774503067136, 'timestamp': '2025-09-30 22:11:53.888009', 'step': 2129, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:53.922905', 'step': 2129, 'epoch': 3} {'type': 'loss', 'content': 0.002777442801743746, 'timestamp': '2025-09-30 22:11:53.925253', 'step': 2130, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.956869', 'step': 2130, 'epoch': 3} {'type': 'loss', 'content': 0.0014969798503443599, 'timestamp': '2025-09-30 22:11:53.958915', 'step': 2131, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:53.989365', 'step': 2131, 'epoch': 3} {'type': 'loss', 'content': 0.006633365992456675, 'timestamp': '2025-09-30 22:11:54.012958', 'step': 2132, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:54.043183', 'step': 2132, 'epoch': 3} {'type': 'loss', 'content': 0.009707632474601269, 'timestamp': '2025-09-30 22:11:54.045969', 'step': 2133, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:54.078377', 'step': 2133, 'epoch': 3} {'type': 'loss', 'content': 0.0007816283032298088, 'timestamp': '2025-09-30 22:11:54.084173', 'step': 2134, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.115541', 'step': 2134, 'epoch': 3} {'type': 'loss', 'content': 0.0020825075916945934, 'timestamp': '2025-09-30 22:11:54.117655', 'step': 2135, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:54.160331', 'step': 2135, 'epoch': 3} {'type': 'loss', 'content': 0.0006500629824586213, 'timestamp': '2025-09-30 22:11:54.185025', 'step': 2136, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.215733', 'step': 2136, 'epoch': 3} {'type': 'loss', 'content': 0.004673543851822615, 'timestamp': '2025-09-30 22:11:54.217989', 'step': 2137, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.249423', 'step': 2137, 'epoch': 3} {'type': 'loss', 'content': 0.005406899843364954, 'timestamp': '2025-09-30 22:11:54.252433', 'step': 2138, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.287182', 'step': 2138, 'epoch': 3} {'type': 'loss', 'content': 0.000536845182068646, 'timestamp': '2025-09-30 22:11:54.289777', 'step': 2139, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.326367', 'step': 2139, 'epoch': 3} {'type': 'loss', 'content': 0.0016699606785550714, 'timestamp': '2025-09-30 22:11:54.351702', 'step': 2140, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.383402', 'step': 2140, 'epoch': 3} {'type': 'loss', 'content': 0.0018563418416306376, 'timestamp': '2025-09-30 22:11:54.386335', 'step': 2141, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.418835', 'step': 2141, 'epoch': 3} {'type': 'loss', 'content': 0.002186473226174712, 'timestamp': '2025-09-30 22:11:54.422064', 'step': 2142, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.457194', 'step': 2142, 'epoch': 3} {'type': 'loss', 'content': 0.0011395520996302366, 'timestamp': '2025-09-30 22:11:54.460279', 'step': 2143, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.491473', 'step': 2143, 'epoch': 3} {'type': 'loss', 'content': 0.005414300598204136, 'timestamp': '2025-09-30 22:11:54.518972', 'step': 2144, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.556454', 'step': 2144, 'epoch': 3} {'type': 'loss', 'content': 0.001469918410293758, 'timestamp': '2025-09-30 22:11:54.558496', 'step': 2145, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.590782', 'step': 2145, 'epoch': 3} {'type': 'loss', 'content': 0.0007216363446787, 'timestamp': '2025-09-30 22:11:54.593067', 'step': 2146, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.627549', 'step': 2146, 'epoch': 3} {'type': 'loss', 'content': 0.003747421083971858, 'timestamp': '2025-09-30 22:11:54.629824', 'step': 2147, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.660401', 'step': 2147, 'epoch': 3} {'type': 'loss', 'content': 0.001542619545944035, 'timestamp': '2025-09-30 22:11:54.684354', 'step': 2148, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.717346', 'step': 2148, 'epoch': 3} {'type': 'loss', 'content': 0.006204643286764622, 'timestamp': '2025-09-30 22:11:54.719616', 'step': 2149, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.751906', 'step': 2149, 'epoch': 3} {'type': 'loss', 'content': 0.0003929475205950439, 'timestamp': '2025-09-30 22:11:54.757081', 'step': 2150, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.790172', 'step': 2150, 'epoch': 3} {'type': 'loss', 'content': 0.01017941627651453, 'timestamp': '2025-09-30 22:11:54.792528', 'step': 2151, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:54.828313', 'step': 2151, 'epoch': 3} {'type': 'loss', 'content': 0.005522797349840403, 'timestamp': '2025-09-30 22:11:54.852416', 'step': 2152, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.884029', 'step': 2152, 'epoch': 3} {'type': 'loss', 'content': 0.0026290721725672483, 'timestamp': '2025-09-30 22:11:54.886810', 'step': 2153, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:54.917613', 'step': 2153, 'epoch': 3} {'type': 'loss', 'content': 0.0038917474448680878, 'timestamp': '2025-09-30 22:11:54.919671', 'step': 2154, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:54.950099', 'step': 2154, 'epoch': 3} {'type': 'loss', 'content': 0.003153451019898057, 'timestamp': '2025-09-30 22:11:54.952576', 'step': 2155, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:54.985718', 'step': 2155, 'epoch': 3} {'type': 'loss', 'content': 0.0026568372268229723, 'timestamp': '2025-09-30 22:11:55.009286', 'step': 2156, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:55.042409', 'step': 2156, 'epoch': 3} {'type': 'loss', 'content': 0.003504603635519743, 'timestamp': '2025-09-30 22:11:55.046968', 'step': 2157, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:55.077820', 'step': 2157, 'epoch': 3} {'type': 'loss', 'content': 0.0010103067616000772, 'timestamp': '2025-09-30 22:11:55.084037', 'step': 2158, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:55.121773', 'step': 2158, 'epoch': 3} {'type': 'loss', 'content': 0.008074522949755192, 'timestamp': '2025-09-30 22:11:55.124579', 'step': 2159, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:55.167900', 'step': 2159, 'epoch': 3} {'type': 'loss', 'content': 0.002990599488839507, 'timestamp': '2025-09-30 22:11:55.191805', 'step': 2160, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:55.224558', 'step': 2160, 'epoch': 3} {'type': 'loss', 'content': 0.014099623076617718, 'timestamp': '2025-09-30 22:11:55.226571', 'step': 2161, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:11:55.257948', 'step': 2161, 'epoch': 3} {'type': 'loss', 'content': 0.0026668712962418795, 'timestamp': '2025-09-30 22:11:55.260610', 'step': 2162, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:55.291956', 'step': 2162, 'epoch': 3} {'type': 'loss', 'content': 0.0051649161614477634, 'timestamp': '2025-09-30 22:11:55.295302', 'step': 2163, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:55.330018', 'step': 2163, 'epoch': 3} {'type': 'loss', 'content': 0.003950497601181269, 'timestamp': '2025-09-30 22:11:55.353385', 'step': 2164, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:55.386345', 'step': 2164, 'epoch': 3} {'type': 'loss', 'content': 0.0007944428361952305, 'timestamp': '2025-09-30 22:11:55.389019', 'step': 2165, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:55.420687', 'step': 2165, 'epoch': 3} {'type': 'loss', 'content': 0.0038365547079592943, 'timestamp': '2025-09-30 22:11:55.423338', 'step': 2166, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:56.276475', 'step': 2166, 'epoch': 3} {'type': 'pplx', 'content': 41345378.60273332, 'timestamp': '2025-09-30 22:11:56.278302', 'step': 2166, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:56.306541', 'step': 2166, 'epoch': 3} {'type': 'loss', 'content': 0.031493984162807465, 'timestamp': '2025-09-30 22:11:56.308282', 'step': 2167, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:56.338618', 'step': 2167, 'epoch': 3} {'type': 'loss', 'content': 0.006299360655248165, 'timestamp': '2025-09-30 22:11:56.362412', 'step': 2168, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:56.397194', 'step': 2168, 'epoch': 3} {'type': 'loss', 'content': 0.0030568100046366453, 'timestamp': '2025-09-30 22:11:56.399331', 'step': 2169, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:11:56.433632', 'step': 2169, 'epoch': 3} {'type': 'loss', 'content': 0.00035096920328214765, 'timestamp': '2025-09-30 22:11:56.436242', 'step': 2170, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:56.468904', 'step': 2170, 'epoch': 3} {'type': 'loss', 'content': 0.001514105941168964, 'timestamp': '2025-09-30 22:11:56.470954', 'step': 2171, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:56.502330', 'step': 2171, 'epoch': 3} {'type': 'loss', 'content': 0.000387070031138137, 'timestamp': '2025-09-30 22:11:56.526142', 'step': 2172, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:56.556135', 'step': 2172, 'epoch': 3} {'type': 'loss', 'content': 0.0033165724016726017, 'timestamp': '2025-09-30 22:11:56.558296', 'step': 2173, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:56.588419', 'step': 2173, 'epoch': 3} {'type': 'loss', 'content': 0.005177160259336233, 'timestamp': '2025-09-30 22:11:56.591005', 'step': 2174, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:56.621126', 'step': 2174, 'epoch': 3} {'type': 'loss', 'content': 0.002708001295104623, 'timestamp': '2025-09-30 22:11:56.623111', 'step': 2175, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:56.654412', 'step': 2175, 'epoch': 3} {'type': 'loss', 'content': 0.0013978255447000265, 'timestamp': '2025-09-30 22:11:56.678894', 'step': 2176, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:56.709023', 'step': 2176, 'epoch': 3} {'type': 'loss', 'content': 0.0025910213589668274, 'timestamp': '2025-09-30 22:11:56.710820', 'step': 2177, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:56.743305', 'step': 2177, 'epoch': 3} {'type': 'loss', 'content': 0.06920283287763596, 'timestamp': '2025-09-30 22:11:56.745056', 'step': 2178, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:56.775267', 'step': 2178, 'epoch': 3} {'type': 'loss', 'content': 0.0009130876278504729, 'timestamp': '2025-09-30 22:11:56.777823', 'step': 2179, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:56.809419', 'step': 2179, 'epoch': 3} {'type': 'loss', 'content': 0.0014621271984651685, 'timestamp': '2025-09-30 22:11:56.833406', 'step': 2180, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:56.863839', 'step': 2180, 'epoch': 3} {'type': 'loss', 'content': 0.0005752498982474208, 'timestamp': '2025-09-30 22:11:56.865798', 'step': 2181, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:56.895858', 'step': 2181, 'epoch': 3} {'type': 'loss', 'content': 0.0007989993318915367, 'timestamp': '2025-09-30 22:11:56.898158', 'step': 2182, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:56.928574', 'step': 2182, 'epoch': 3} {'type': 'loss', 'content': 0.0009071322274394333, 'timestamp': '2025-09-30 22:11:56.930839', 'step': 2183, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:56.962076', 'step': 2183, 'epoch': 3} {'type': 'loss', 'content': 0.0018987265648320317, 'timestamp': '2025-09-30 22:11:56.985837', 'step': 2184, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.015617', 'step': 2184, 'epoch': 3} {'type': 'loss', 'content': 0.029847219586372375, 'timestamp': '2025-09-30 22:11:57.025141', 'step': 2185, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.055917', 'step': 2185, 'epoch': 3} {'type': 'loss', 'content': 0.001003224402666092, 'timestamp': '2025-09-30 22:11:57.058942', 'step': 2186, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.090904', 'step': 2186, 'epoch': 3} {'type': 'loss', 'content': 0.023648729547858238, 'timestamp': '2025-09-30 22:11:57.094613', 'step': 2187, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:57.126710', 'step': 2187, 'epoch': 3} {'type': 'loss', 'content': 0.003437680657953024, 'timestamp': '2025-09-30 22:11:57.149958', 'step': 2188, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.179942', 'step': 2188, 'epoch': 3} {'type': 'loss', 'content': 0.003817408112809062, 'timestamp': '2025-09-30 22:11:57.182016', 'step': 2189, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.212247', 'step': 2189, 'epoch': 3} {'type': 'loss', 'content': 0.0008727815002202988, 'timestamp': '2025-09-30 22:11:57.215645', 'step': 2190, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:57.252181', 'step': 2190, 'epoch': 3} {'type': 'loss', 'content': 0.01654052920639515, 'timestamp': '2025-09-30 22:11:57.255333', 'step': 2191, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.287622', 'step': 2191, 'epoch': 3} {'type': 'loss', 'content': 0.00038399777258746326, 'timestamp': '2025-09-30 22:11:57.311839', 'step': 2192, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.346721', 'step': 2192, 'epoch': 3} {'type': 'loss', 'content': 0.00027529371436685324, 'timestamp': '2025-09-30 22:11:57.348871', 'step': 2193, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:57.386221', 'step': 2193, 'epoch': 3} {'type': 'loss', 'content': 0.024949418380856514, 'timestamp': '2025-09-30 22:11:57.392977', 'step': 2194, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:57.432676', 'step': 2194, 'epoch': 3} {'type': 'loss', 'content': 0.0010464598890393972, 'timestamp': '2025-09-30 22:11:57.435229', 'step': 2195, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.482869', 'step': 2195, 'epoch': 3} {'type': 'loss', 'content': 0.040087323635816574, 'timestamp': '2025-09-30 22:11:57.509506', 'step': 2196, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:57.544617', 'step': 2196, 'epoch': 3} {'type': 'loss', 'content': 0.01321853045374155, 'timestamp': '2025-09-30 22:11:57.548530', 'step': 2197, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.581886', 'step': 2197, 'epoch': 3} {'type': 'loss', 'content': 0.00213692057877779, 'timestamp': '2025-09-30 22:11:57.585034', 'step': 2198, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.618036', 'step': 2198, 'epoch': 3} {'type': 'loss', 'content': 0.0006283451803028584, 'timestamp': '2025-09-30 22:11:57.620431', 'step': 2199, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:57.653624', 'step': 2199, 'epoch': 3} {'type': 'loss', 'content': 0.0003651838924270123, 'timestamp': '2025-09-30 22:11:57.678718', 'step': 2200, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:11:57.718869', 'step': 2200, 'epoch': 3} {'type': 'loss', 'content': 0.0010239933617413044, 'timestamp': '2025-09-30 22:11:57.720993', 'step': 2201, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.754953', 'step': 2201, 'epoch': 3} {'type': 'loss', 'content': 0.028298767283558846, 'timestamp': '2025-09-30 22:11:57.757035', 'step': 2202, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.791476', 'step': 2202, 'epoch': 3} {'type': 'loss', 'content': 0.0006448552594520152, 'timestamp': '2025-09-30 22:11:57.795278', 'step': 2203, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:57.831152', 'step': 2203, 'epoch': 3} {'type': 'loss', 'content': 0.0008334387093782425, 'timestamp': '2025-09-30 22:11:57.860596', 'step': 2204, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.895575', 'step': 2204, 'epoch': 3} {'type': 'loss', 'content': 0.0005287216627039015, 'timestamp': '2025-09-30 22:11:57.898566', 'step': 2205, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:57.935665', 'step': 2205, 'epoch': 3} {'type': 'loss', 'content': 0.0017126341117545962, 'timestamp': '2025-09-30 22:11:57.939812', 'step': 2206, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:57.974133', 'step': 2206, 'epoch': 3} {'type': 'loss', 'content': 0.0010332076344639063, 'timestamp': '2025-09-30 22:11:57.976496', 'step': 2207, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:58.022134', 'step': 2207, 'epoch': 3} {'type': 'loss', 'content': 0.01596010848879814, 'timestamp': '2025-09-30 22:11:58.046395', 'step': 2208, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:58.087991', 'step': 2208, 'epoch': 3} {'type': 'loss', 'content': 0.0012377096572890878, 'timestamp': '2025-09-30 22:11:58.094121', 'step': 2209, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:58.127307', 'step': 2209, 'epoch': 3} {'type': 'loss', 'content': 0.0016445504734292626, 'timestamp': '2025-09-30 22:11:58.133321', 'step': 2210, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:58.169585', 'step': 2210, 'epoch': 3} {'type': 'loss', 'content': 0.00024720263900235295, 'timestamp': '2025-09-30 22:11:58.171920', 'step': 2211, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:58.208594', 'step': 2211, 'epoch': 3} {'type': 'loss', 'content': 0.0014899687375873327, 'timestamp': '2025-09-30 22:11:58.233825', 'step': 2212, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:58.265323', 'step': 2212, 'epoch': 3} {'type': 'loss', 'content': 0.01058391947299242, 'timestamp': '2025-09-30 22:11:58.267681', 'step': 2213, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:58.301220', 'step': 2213, 'epoch': 3} {'type': 'loss', 'content': 0.0021572033874690533, 'timestamp': '2025-09-30 22:11:58.304807', 'step': 2214, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:58.341173', 'step': 2214, 'epoch': 3} {'type': 'loss', 'content': 0.0013435721630230546, 'timestamp': '2025-09-30 22:11:58.344084', 'step': 2215, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:11:58.379137', 'step': 2215, 'epoch': 3} {'type': 'loss', 'content': 0.014729948714375496, 'timestamp': '2025-09-30 22:11:58.403177', 'step': 2216, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:58.435477', 'step': 2216, 'epoch': 3} {'type': 'loss', 'content': 0.006870845798403025, 'timestamp': '2025-09-30 22:11:58.442814', 'step': 2217, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:58.478839', 'step': 2217, 'epoch': 3} {'type': 'loss', 'content': 0.025339728221297264, 'timestamp': '2025-09-30 22:11:58.482003', 'step': 2218, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:58.520112', 'step': 2218, 'epoch': 3} {'type': 'loss', 'content': 0.0009052485111169517, 'timestamp': '2025-09-30 22:11:58.522814', 'step': 2219, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:58.554393', 'step': 2219, 'epoch': 3} {'type': 'loss', 'content': 0.0033087660558521748, 'timestamp': '2025-09-30 22:11:58.580417', 'step': 2220, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:58.621971', 'step': 2220, 'epoch': 3} {'type': 'loss', 'content': 0.0004655886150430888, 'timestamp': '2025-09-30 22:11:58.624104', 'step': 2221, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:58.659507', 'step': 2221, 'epoch': 3} {'type': 'loss', 'content': 0.001055079628713429, 'timestamp': '2025-09-30 22:11:58.662247', 'step': 2222, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:11:58.701919', 'step': 2222, 'epoch': 3} {'type': 'loss', 'content': 0.0007419497705996037, 'timestamp': '2025-09-30 22:11:58.706965', 'step': 2223, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:11:59.612031', 'step': 2223, 'epoch': 3} {'type': 'pplx', 'content': 41080650.73787688, 'timestamp': '2025-09-30 22:11:59.614983', 'step': 2223, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:59.651495', 'step': 2223, 'epoch': 3} {'type': 'loss', 'content': 0.0021477299742400646, 'timestamp': '2025-09-30 22:11:59.676629', 'step': 2224, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:59.709738', 'step': 2224, 'epoch': 3} {'type': 'loss', 'content': 0.002651253715157509, 'timestamp': '2025-09-30 22:11:59.724011', 'step': 2225, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:59.759451', 'step': 2225, 'epoch': 3} {'type': 'loss', 'content': 0.001111521851271391, 'timestamp': '2025-09-30 22:11:59.767172', 'step': 2226, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:59.802065', 'step': 2226, 'epoch': 3} {'type': 'loss', 'content': 0.011532140895724297, 'timestamp': '2025-09-30 22:11:59.805099', 'step': 2227, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:11:59.852920', 'step': 2227, 'epoch': 3} {'type': 'loss', 'content': 0.012329302728176117, 'timestamp': '2025-09-30 22:11:59.880360', 'step': 2228, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:59.917214', 'step': 2228, 'epoch': 3} {'type': 'loss', 'content': 0.00019277750106994063, 'timestamp': '2025-09-30 22:11:59.920528', 'step': 2229, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:59.953450', 'step': 2229, 'epoch': 3} {'type': 'loss', 'content': 0.0016648133751004934, 'timestamp': '2025-09-30 22:11:59.956007', 'step': 2230, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:11:59.987874', 'step': 2230, 'epoch': 3} {'type': 'loss', 'content': 0.00010171485337195918, 'timestamp': '2025-09-30 22:11:59.990671', 'step': 2231, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:00.023025', 'step': 2231, 'epoch': 3} {'type': 'loss', 'content': 0.003272840054705739, 'timestamp': '2025-09-30 22:12:00.051308', 'step': 2232, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.098015', 'step': 2232, 'epoch': 3} {'type': 'loss', 'content': 0.0023247734643518925, 'timestamp': '2025-09-30 22:12:00.100228', 'step': 2233, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.144520', 'step': 2233, 'epoch': 3} {'type': 'loss', 'content': 0.0013171957107260823, 'timestamp': '2025-09-30 22:12:00.147086', 'step': 2234, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.190259', 'step': 2234, 'epoch': 3} {'type': 'loss', 'content': 0.0005580909783020616, 'timestamp': '2025-09-30 22:12:00.194040', 'step': 2235, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:00.236684', 'step': 2235, 'epoch': 3} {'type': 'loss', 'content': 0.0006598988547921181, 'timestamp': '2025-09-30 22:12:00.267458', 'step': 2236, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:00.305777', 'step': 2236, 'epoch': 3} {'type': 'loss', 'content': 0.002186252735555172, 'timestamp': '2025-09-30 22:12:00.312410', 'step': 2237, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.357546', 'step': 2237, 'epoch': 3} {'type': 'loss', 'content': 0.006100684404373169, 'timestamp': '2025-09-30 22:12:00.360014', 'step': 2238, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:00.400839', 'step': 2238, 'epoch': 3} {'type': 'loss', 'content': 0.03646007925271988, 'timestamp': '2025-09-30 22:12:00.404742', 'step': 2239, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:00.437361', 'step': 2239, 'epoch': 3} {'type': 'loss', 'content': 0.003415354760363698, 'timestamp': '2025-09-30 22:12:00.461510', 'step': 2240, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.494876', 'step': 2240, 'epoch': 3} {'type': 'loss', 'content': 0.0009296032367274165, 'timestamp': '2025-09-30 22:12:00.498079', 'step': 2241, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:00.544104', 'step': 2241, 'epoch': 3} {'type': 'loss', 'content': 0.00020019887597300112, 'timestamp': '2025-09-30 22:12:00.553285', 'step': 2242, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.584744', 'step': 2242, 'epoch': 3} {'type': 'loss', 'content': 0.000863843597471714, 'timestamp': '2025-09-30 22:12:00.588385', 'step': 2243, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.623776', 'step': 2243, 'epoch': 3} {'type': 'loss', 'content': 0.0016993492608889937, 'timestamp': '2025-09-30 22:12:00.653484', 'step': 2244, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:00.702084', 'step': 2244, 'epoch': 3} {'type': 'loss', 'content': 0.0019676017109304667, 'timestamp': '2025-09-30 22:12:00.705865', 'step': 2245, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:00.742784', 'step': 2245, 'epoch': 3} {'type': 'loss', 'content': 0.00311626517213881, 'timestamp': '2025-09-30 22:12:00.747544', 'step': 2246, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.784398', 'step': 2246, 'epoch': 3} {'type': 'loss', 'content': 0.002856964012607932, 'timestamp': '2025-09-30 22:12:00.791186', 'step': 2247, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.823880', 'step': 2247, 'epoch': 3} {'type': 'loss', 'content': 0.004770600702613592, 'timestamp': '2025-09-30 22:12:00.848076', 'step': 2248, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.880530', 'step': 2248, 'epoch': 3} {'type': 'loss', 'content': 0.00405394472181797, 'timestamp': '2025-09-30 22:12:00.886406', 'step': 2249, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.917905', 'step': 2249, 'epoch': 3} {'type': 'loss', 'content': 0.00012620205234270543, 'timestamp': '2025-09-30 22:12:00.920939', 'step': 2250, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.956680', 'step': 2250, 'epoch': 3} {'type': 'loss', 'content': 0.0037796092219650745, 'timestamp': '2025-09-30 22:12:00.959116', 'step': 2251, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:00.998630', 'step': 2251, 'epoch': 3} {'type': 'loss', 'content': 0.0041126045398414135, 'timestamp': '2025-09-30 22:12:01.023514', 'step': 2252, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:01.063613', 'step': 2252, 'epoch': 3} {'type': 'loss', 'content': 0.0031900478061288595, 'timestamp': '2025-09-30 22:12:01.066754', 'step': 2253, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:01.102425', 'step': 2253, 'epoch': 3} {'type': 'loss', 'content': 0.0007147821015678346, 'timestamp': '2025-09-30 22:12:01.108462', 'step': 2254, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:01.156598', 'step': 2254, 'epoch': 3} {'type': 'loss', 'content': 0.0002387921849731356, 'timestamp': '2025-09-30 22:12:01.160749', 'step': 2255, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:01.206689', 'step': 2255, 'epoch': 3} {'type': 'loss', 'content': 8.921477274270728e-05, 'timestamp': '2025-09-30 22:12:01.231930', 'step': 2256, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:01.289999', 'step': 2256, 'epoch': 3} {'type': 'loss', 'content': 0.00011117455142084509, 'timestamp': '2025-09-30 22:12:01.293131', 'step': 2257, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:01.327003', 'step': 2257, 'epoch': 3} {'type': 'loss', 'content': 0.002855940954759717, 'timestamp': '2025-09-30 22:12:01.329937', 'step': 2258, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:01.364426', 'step': 2258, 'epoch': 3} {'type': 'loss', 'content': 0.0015121333999559283, 'timestamp': '2025-09-30 22:12:01.368663', 'step': 2259, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:01.400739', 'step': 2259, 'epoch': 3} {'type': 'loss', 'content': 0.004547151271253824, 'timestamp': '2025-09-30 22:12:01.425741', 'step': 2260, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:01.472112', 'step': 2260, 'epoch': 3} {'type': 'loss', 'content': 0.0014173558447510004, 'timestamp': '2025-09-30 22:12:01.475511', 'step': 2261, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:01.509433', 'step': 2261, 'epoch': 3} {'type': 'loss', 'content': 0.0007967141573317349, 'timestamp': '2025-09-30 22:12:01.513037', 'step': 2262, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:01.551003', 'step': 2262, 'epoch': 3} {'type': 'loss', 'content': 0.004071411211043596, 'timestamp': '2025-09-30 22:12:01.554383', 'step': 2263, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:01.601596', 'step': 2263, 'epoch': 3} {'type': 'loss', 'content': 0.001577091054059565, 'timestamp': '2025-09-30 22:12:01.630556', 'step': 2264, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:01.685148', 'step': 2264, 'epoch': 3} {'type': 'loss', 'content': 7.375147106358781e-05, 'timestamp': '2025-09-30 22:12:01.687706', 'step': 2265, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:01.728395', 'step': 2265, 'epoch': 3} {'type': 'loss', 'content': 0.0009554016287438571, 'timestamp': '2025-09-30 22:12:01.736485', 'step': 2266, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:01.770317', 'step': 2266, 'epoch': 3} {'type': 'loss', 'content': 0.003659574780613184, 'timestamp': '2025-09-30 22:12:01.773461', 'step': 2267, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:12:01.806720', 'step': 2267, 'epoch': 3} {'type': 'loss', 'content': 0.005300324410200119, 'timestamp': '2025-09-30 22:12:01.830722', 'step': 2268, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:01.872522', 'step': 2268, 'epoch': 3} {'type': 'loss', 'content': 0.0002838976215571165, 'timestamp': '2025-09-30 22:12:01.876519', 'step': 2269, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:01.908720', 'step': 2269, 'epoch': 3} {'type': 'loss', 'content': 0.00027007676544599235, 'timestamp': '2025-09-30 22:12:01.911009', 'step': 2270, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:01.957192', 'step': 2270, 'epoch': 3} {'type': 'loss', 'content': 0.0017464784905314445, 'timestamp': '2025-09-30 22:12:01.963999', 'step': 2271, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:02.001305', 'step': 2271, 'epoch': 3} {'type': 'loss', 'content': 0.0007175366627052426, 'timestamp': '2025-09-30 22:12:02.029407', 'step': 2272, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:02.092776', 'step': 2272, 'epoch': 3} {'type': 'loss', 'content': 0.004835275001823902, 'timestamp': '2025-09-30 22:12:02.095027', 'step': 2273, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:02.130028', 'step': 2273, 'epoch': 3} {'type': 'loss', 'content': 0.0029070251621305943, 'timestamp': '2025-09-30 22:12:02.137218', 'step': 2274, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:02.169591', 'step': 2274, 'epoch': 3} {'type': 'loss', 'content': 0.012067276053130627, 'timestamp': '2025-09-30 22:12:02.175245', 'step': 2275, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:02.220400', 'step': 2275, 'epoch': 3} {'type': 'loss', 'content': 0.009939714334905148, 'timestamp': '2025-09-30 22:12:02.244711', 'step': 2276, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:02.290047', 'step': 2276, 'epoch': 3} {'type': 'loss', 'content': 0.0007312466041184962, 'timestamp': '2025-09-30 22:12:02.292294', 'step': 2277, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:02.331166', 'step': 2277, 'epoch': 3} {'type': 'loss', 'content': 0.0020863066893070936, 'timestamp': '2025-09-30 22:12:02.335425', 'step': 2278, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:02.370381', 'step': 2278, 'epoch': 3} {'type': 'loss', 'content': 0.0002355527103645727, 'timestamp': '2025-09-30 22:12:02.373473', 'step': 2279, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:02.405483', 'step': 2279, 'epoch': 3} {'type': 'loss', 'content': 0.007479370106011629, 'timestamp': '2025-09-30 22:12:02.430228', 'step': 2280, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:12:03.399586', 'step': 2280, 'epoch': 3} {'type': 'pplx', 'content': 46849847.313949354, 'timestamp': '2025-09-30 22:12:03.407812', 'step': 2280, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:03.436963', 'step': 2280, 'epoch': 3} {'type': 'loss', 'content': 0.001072447281330824, 'timestamp': '2025-09-30 22:12:03.440189', 'step': 2281, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:03.473962', 'step': 2281, 'epoch': 3} {'type': 'loss', 'content': 0.0004119630320928991, 'timestamp': '2025-09-30 22:12:03.482672', 'step': 2282, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:03.519456', 'step': 2282, 'epoch': 3} {'type': 'loss', 'content': 0.00015061446174513549, 'timestamp': '2025-09-30 22:12:03.523228', 'step': 2283, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:03.556675', 'step': 2283, 'epoch': 3} {'type': 'loss', 'content': 0.000557638006284833, 'timestamp': '2025-09-30 22:12:03.585615', 'step': 2284, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:03.631870', 'step': 2284, 'epoch': 3} {'type': 'loss', 'content': 0.0005773733137175441, 'timestamp': '2025-09-30 22:12:03.634814', 'step': 2285, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:03.675837', 'step': 2285, 'epoch': 3} {'type': 'loss', 'content': 0.00030607712687924504, 'timestamp': '2025-09-30 22:12:03.683777', 'step': 2286, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:03.717388', 'step': 2286, 'epoch': 3} {'type': 'loss', 'content': 0.0001611269690329209, 'timestamp': '2025-09-30 22:12:03.719972', 'step': 2287, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:12:03.756677', 'step': 2287, 'epoch': 3} {'type': 'loss', 'content': 0.0007993626059032977, 'timestamp': '2025-09-30 22:12:03.781334', 'step': 2288, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:03.821981', 'step': 2288, 'epoch': 3} {'type': 'loss', 'content': 9.523901098873466e-05, 'timestamp': '2025-09-30 22:12:03.825103', 'step': 2289, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:03.858730', 'step': 2289, 'epoch': 3} {'type': 'loss', 'content': 0.000503116229083389, 'timestamp': '2025-09-30 22:12:03.862442', 'step': 2290, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:03.897690', 'step': 2290, 'epoch': 3} {'type': 'loss', 'content': 0.0010981824016198516, 'timestamp': '2025-09-30 22:12:03.901476', 'step': 2291, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:03.933846', 'step': 2291, 'epoch': 3} {'type': 'loss', 'content': 0.003907725214958191, 'timestamp': '2025-09-30 22:12:03.958803', 'step': 2292, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:03.997275', 'step': 2292, 'epoch': 3} {'type': 'loss', 'content': 0.0007813084521330893, 'timestamp': '2025-09-30 22:12:04.006086', 'step': 2293, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:04.044795', 'step': 2293, 'epoch': 3} {'type': 'loss', 'content': 0.0015995989087969065, 'timestamp': '2025-09-30 22:12:04.052998', 'step': 2294, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.106455', 'step': 2294, 'epoch': 3} {'type': 'loss', 'content': 0.008173911832273006, 'timestamp': '2025-09-30 22:12:04.112277', 'step': 2295, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.157702', 'step': 2295, 'epoch': 3} {'type': 'loss', 'content': 0.006400450598448515, 'timestamp': '2025-09-30 22:12:04.182227', 'step': 2296, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.218839', 'step': 2296, 'epoch': 3} {'type': 'loss', 'content': 0.001864130492322147, 'timestamp': '2025-09-30 22:12:04.222678', 'step': 2297, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.256426', 'step': 2297, 'epoch': 3} {'type': 'loss', 'content': 0.002440927317366004, 'timestamp': '2025-09-30 22:12:04.259903', 'step': 2298, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.299811', 'step': 2298, 'epoch': 3} {'type': 'loss', 'content': 0.0001420110056642443, 'timestamp': '2025-09-30 22:12:04.303080', 'step': 2299, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.349103', 'step': 2299, 'epoch': 3} {'type': 'loss', 'content': 0.00029452916351146996, 'timestamp': '2025-09-30 22:12:04.384052', 'step': 2300, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.415818', 'step': 2300, 'epoch': 3} {'type': 'loss', 'content': 0.00028071500128135085, 'timestamp': '2025-09-30 22:12:04.418464', 'step': 2301, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.467075', 'step': 2301, 'epoch': 3} {'type': 'loss', 'content': 0.00010159516386920586, 'timestamp': '2025-09-30 22:12:04.470372', 'step': 2302, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.508587', 'step': 2302, 'epoch': 3} {'type': 'loss', 'content': 0.024396846070885658, 'timestamp': '2025-09-30 22:12:04.516211', 'step': 2303, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.556466', 'step': 2303, 'epoch': 3} {'type': 'loss', 'content': 0.03193879500031471, 'timestamp': '2025-09-30 22:12:04.580809', 'step': 2304, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.627001', 'step': 2304, 'epoch': 3} {'type': 'loss', 'content': 0.00011379749776097015, 'timestamp': '2025-09-30 22:12:04.635055', 'step': 2305, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.672852', 'step': 2305, 'epoch': 3} {'type': 'loss', 'content': 0.0007863629725761712, 'timestamp': '2025-09-30 22:12:04.680034', 'step': 2306, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:04.714575', 'step': 2306, 'epoch': 3} {'type': 'loss', 'content': 0.019120637327432632, 'timestamp': '2025-09-30 22:12:04.717585', 'step': 2307, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:04.750404', 'step': 2307, 'epoch': 3} {'type': 'loss', 'content': 0.013492346741259098, 'timestamp': '2025-09-30 22:12:04.778357', 'step': 2308, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:04.810855', 'step': 2308, 'epoch': 3} {'type': 'loss', 'content': 0.0003213980817236006, 'timestamp': '2025-09-30 22:12:04.818468', 'step': 2309, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.867683', 'step': 2309, 'epoch': 3} {'type': 'loss', 'content': 0.00035105025744996965, 'timestamp': '2025-09-30 22:12:04.874844', 'step': 2310, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.928073', 'step': 2310, 'epoch': 3} {'type': 'loss', 'content': 0.00015687046106904745, 'timestamp': '2025-09-30 22:12:04.931276', 'step': 2311, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:04.974813', 'step': 2311, 'epoch': 3} {'type': 'loss', 'content': 0.0003475369594525546, 'timestamp': '2025-09-30 22:12:05.003685', 'step': 2312, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.046387', 'step': 2312, 'epoch': 3} {'type': 'loss', 'content': 0.0002608960203360766, 'timestamp': '2025-09-30 22:12:05.059813', 'step': 2313, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.098699', 'step': 2313, 'epoch': 3} {'type': 'loss', 'content': 0.00015405855083372444, 'timestamp': '2025-09-30 22:12:05.105923', 'step': 2314, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.150191', 'step': 2314, 'epoch': 3} {'type': 'loss', 'content': 0.007762204390019178, 'timestamp': '2025-09-30 22:12:05.153871', 'step': 2315, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.187810', 'step': 2315, 'epoch': 3} {'type': 'loss', 'content': 0.000645505147986114, 'timestamp': '2025-09-30 22:12:05.212043', 'step': 2316, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:05.254396', 'step': 2316, 'epoch': 3} {'type': 'loss', 'content': 0.01529739797115326, 'timestamp': '2025-09-30 22:12:05.258026', 'step': 2317, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.298298', 'step': 2317, 'epoch': 3} {'type': 'loss', 'content': 0.0003246499109081924, 'timestamp': '2025-09-30 22:12:05.308497', 'step': 2318, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:05.343271', 'step': 2318, 'epoch': 3} {'type': 'loss', 'content': 0.002341361017897725, 'timestamp': '2025-09-30 22:12:05.346182', 'step': 2319, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.400375', 'step': 2319, 'epoch': 3} {'type': 'loss', 'content': 0.01718306355178356, 'timestamp': '2025-09-30 22:12:05.434471', 'step': 2320, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:05.481672', 'step': 2320, 'epoch': 3} {'type': 'loss', 'content': 0.0016600488452240825, 'timestamp': '2025-09-30 22:12:05.485877', 'step': 2321, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.518932', 'step': 2321, 'epoch': 3} {'type': 'loss', 'content': 0.0003560645563993603, 'timestamp': '2025-09-30 22:12:05.522143', 'step': 2322, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.569894', 'step': 2322, 'epoch': 3} {'type': 'loss', 'content': 0.0010048066033050418, 'timestamp': '2025-09-30 22:12:05.573432', 'step': 2323, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:05.624869', 'step': 2323, 'epoch': 3} {'type': 'loss', 'content': 0.010342958383262157, 'timestamp': '2025-09-30 22:12:05.661338', 'step': 2324, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.703923', 'step': 2324, 'epoch': 3} {'type': 'loss', 'content': 0.0004138563817832619, 'timestamp': '2025-09-30 22:12:05.707259', 'step': 2325, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:05.747256', 'step': 2325, 'epoch': 3} {'type': 'loss', 'content': 0.009076876565814018, 'timestamp': '2025-09-30 22:12:05.758697', 'step': 2326, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.791111', 'step': 2326, 'epoch': 3} {'type': 'loss', 'content': 0.004802730865776539, 'timestamp': '2025-09-30 22:12:05.794003', 'step': 2327, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.835394', 'step': 2327, 'epoch': 3} {'type': 'loss', 'content': 0.0006566385854966938, 'timestamp': '2025-09-30 22:12:05.859710', 'step': 2328, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.897532', 'step': 2328, 'epoch': 3} {'type': 'loss', 'content': 0.00047095856280066073, 'timestamp': '2025-09-30 22:12:05.900682', 'step': 2329, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:05.944301', 'step': 2329, 'epoch': 3} {'type': 'loss', 'content': 0.00011278774763923138, 'timestamp': '2025-09-30 22:12:05.951644', 'step': 2330, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:05.985479', 'step': 2330, 'epoch': 3} {'type': 'loss', 'content': 0.0002812811580952257, 'timestamp': '2025-09-30 22:12:05.993798', 'step': 2331, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:06.040965', 'step': 2331, 'epoch': 3} {'type': 'loss', 'content': 0.001078323693946004, 'timestamp': '2025-09-30 22:12:06.065658', 'step': 2332, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:06.104599', 'step': 2332, 'epoch': 3} {'type': 'loss', 'content': 0.014645090326666832, 'timestamp': '2025-09-30 22:12:06.108302', 'step': 2333, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:06.146369', 'step': 2333, 'epoch': 3} {'type': 'loss', 'content': 0.0006407131440937519, 'timestamp': '2025-09-30 22:12:06.153793', 'step': 2334, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:06.196517', 'step': 2334, 'epoch': 3} {'type': 'loss', 'content': 0.005221139173954725, 'timestamp': '2025-09-30 22:12:06.205904', 'step': 2335, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:06.245185', 'step': 2335, 'epoch': 3} {'type': 'loss', 'content': 0.013521954417228699, 'timestamp': '2025-09-30 22:12:06.281742', 'step': 2336, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:06.313675', 'step': 2336, 'epoch': 3} {'type': 'loss', 'content': 0.005685387644916773, 'timestamp': '2025-09-30 22:12:06.316262', 'step': 2337, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:12:07.330929', 'step': 2337, 'epoch': 3} {'type': 'pplx', 'content': 46391372.17725871, 'timestamp': '2025-09-30 22:12:07.336034', 'step': 2337, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:07.373369', 'step': 2337, 'epoch': 3} {'type': 'loss', 'content': 0.0001280421856790781, 'timestamp': '2025-09-30 22:12:07.375905', 'step': 2338, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:07.413942', 'step': 2338, 'epoch': 3} {'type': 'loss', 'content': 0.0010907641844823956, 'timestamp': '2025-09-30 22:12:07.421205', 'step': 2339, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:07.460438', 'step': 2339, 'epoch': 3} {'type': 'loss', 'content': 0.001054750056937337, 'timestamp': '2025-09-30 22:12:07.489134', 'step': 2340, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:07.529870', 'step': 2340, 'epoch': 3} {'type': 'loss', 'content': 0.0010710905771702528, 'timestamp': '2025-09-30 22:12:07.536172', 'step': 2341, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:07.586081', 'step': 2341, 'epoch': 3} {'type': 'loss', 'content': 0.0010237701935693622, 'timestamp': '2025-09-30 22:12:07.589822', 'step': 2342, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:07.627960', 'step': 2342, 'epoch': 3} {'type': 'loss', 'content': 0.008467769250273705, 'timestamp': '2025-09-30 22:12:07.631790', 'step': 2343, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:07.667145', 'step': 2343, 'epoch': 3} {'type': 'loss', 'content': 0.002808736404404044, 'timestamp': '2025-09-30 22:12:07.691732', 'step': 2344, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:07.725161', 'step': 2344, 'epoch': 3} {'type': 'loss', 'content': 0.029646631330251694, 'timestamp': '2025-09-30 22:12:07.729297', 'step': 2345, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:07.761797', 'step': 2345, 'epoch': 3} {'type': 'loss', 'content': 0.007430060766637325, 'timestamp': '2025-09-30 22:12:07.765228', 'step': 2346, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:07.799199', 'step': 2346, 'epoch': 3} {'type': 'loss', 'content': 0.0001956993219209835, 'timestamp': '2025-09-30 22:12:07.803655', 'step': 2347, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:07.836540', 'step': 2347, 'epoch': 3} {'type': 'loss', 'content': 0.0011663747718557715, 'timestamp': '2025-09-30 22:12:07.861082', 'step': 2348, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:07.893631', 'step': 2348, 'epoch': 3} {'type': 'loss', 'content': 0.0004969866713508964, 'timestamp': '2025-09-30 22:12:07.896893', 'step': 2349, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:07.943325', 'step': 2349, 'epoch': 3} {'type': 'loss', 'content': 0.0021070470102131367, 'timestamp': '2025-09-30 22:12:07.945967', 'step': 2350, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:07.989762', 'step': 2350, 'epoch': 3} {'type': 'loss', 'content': 0.0004443576035555452, 'timestamp': '2025-09-30 22:12:07.992864', 'step': 2351, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.032050', 'step': 2351, 'epoch': 3} {'type': 'loss', 'content': 0.0007625381113030016, 'timestamp': '2025-09-30 22:12:08.056555', 'step': 2352, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.112245', 'step': 2352, 'epoch': 3} {'type': 'loss', 'content': 9.592527931090444e-05, 'timestamp': '2025-09-30 22:12:08.119195', 'step': 2353, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:08.159404', 'step': 2353, 'epoch': 3} {'type': 'loss', 'content': 0.004538069479167461, 'timestamp': '2025-09-30 22:12:08.162209', 'step': 2354, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:08.204106', 'step': 2354, 'epoch': 3} {'type': 'loss', 'content': 0.0006490609957836568, 'timestamp': '2025-09-30 22:12:08.208849', 'step': 2355, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.254052', 'step': 2355, 'epoch': 3} {'type': 'loss', 'content': 0.002888438990339637, 'timestamp': '2025-09-30 22:12:08.285560', 'step': 2356, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.330447', 'step': 2356, 'epoch': 3} {'type': 'loss', 'content': 0.00023478925868403167, 'timestamp': '2025-09-30 22:12:08.342245', 'step': 2357, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:08.376392', 'step': 2357, 'epoch': 3} {'type': 'loss', 'content': 0.00019593130855355412, 'timestamp': '2025-09-30 22:12:08.383505', 'step': 2358, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.416777', 'step': 2358, 'epoch': 3} {'type': 'loss', 'content': 0.00017743787611834705, 'timestamp': '2025-09-30 22:12:08.421424', 'step': 2359, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.456841', 'step': 2359, 'epoch': 3} {'type': 'loss', 'content': 0.002296753926202655, 'timestamp': '2025-09-30 22:12:08.492356', 'step': 2360, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:08.527590', 'step': 2360, 'epoch': 3} {'type': 'loss', 'content': 6.212801235960796e-05, 'timestamp': '2025-09-30 22:12:08.532029', 'step': 2361, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.571372', 'step': 2361, 'epoch': 3} {'type': 'loss', 'content': 0.002575967228040099, 'timestamp': '2025-09-30 22:12:08.574168', 'step': 2362, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.607047', 'step': 2362, 'epoch': 3} {'type': 'loss', 'content': 0.00018172072304878384, 'timestamp': '2025-09-30 22:12:08.618408', 'step': 2363, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.654014', 'step': 2363, 'epoch': 3} {'type': 'loss', 'content': 0.0006860285648144782, 'timestamp': '2025-09-30 22:12:08.688882', 'step': 2364, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.732977', 'step': 2364, 'epoch': 3} {'type': 'loss', 'content': 0.02065959945321083, 'timestamp': '2025-09-30 22:12:08.736267', 'step': 2365, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.778800', 'step': 2365, 'epoch': 3} {'type': 'loss', 'content': 0.03529736027121544, 'timestamp': '2025-09-30 22:12:08.782845', 'step': 2366, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.823676', 'step': 2366, 'epoch': 3} {'type': 'loss', 'content': 0.00047834464930929244, 'timestamp': '2025-09-30 22:12:08.829550', 'step': 2367, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:08.865440', 'step': 2367, 'epoch': 3} {'type': 'loss', 'content': 0.031571704894304276, 'timestamp': '2025-09-30 22:12:08.889965', 'step': 2368, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:08.925521', 'step': 2368, 'epoch': 3} {'type': 'loss', 'content': 8.10366400401108e-05, 'timestamp': '2025-09-30 22:12:08.929028', 'step': 2369, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:08.973742', 'step': 2369, 'epoch': 3} {'type': 'loss', 'content': 0.00013628315355163068, 'timestamp': '2025-09-30 22:12:08.977054', 'step': 2370, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:09.032106', 'step': 2370, 'epoch': 3} {'type': 'loss', 'content': 0.036459606140851974, 'timestamp': '2025-09-30 22:12:09.036007', 'step': 2371, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:09.070983', 'step': 2371, 'epoch': 3} {'type': 'loss', 'content': 0.018056688830256462, 'timestamp': '2025-09-30 22:12:09.106327', 'step': 2372, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:09.140407', 'step': 2372, 'epoch': 3} {'type': 'loss', 'content': 0.035005517303943634, 'timestamp': '2025-09-30 22:12:09.145133', 'step': 2373, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:09.180499', 'step': 2373, 'epoch': 3} {'type': 'loss', 'content': 0.0010858295718207955, 'timestamp': '2025-09-30 22:12:09.194325', 'step': 2374, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:09.230899', 'step': 2374, 'epoch': 3} {'type': 'loss', 'content': 0.0038253210950642824, 'timestamp': '2025-09-30 22:12:09.234483', 'step': 2375, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:09.272120', 'step': 2375, 'epoch': 3} {'type': 'loss', 'content': 0.0023231101222336292, 'timestamp': '2025-09-30 22:12:09.297811', 'step': 2376, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:09.332286', 'step': 2376, 'epoch': 3} {'type': 'loss', 'content': 0.0017414873000234365, 'timestamp': '2025-09-30 22:12:09.334687', 'step': 2377, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:09.373795', 'step': 2377, 'epoch': 3} {'type': 'loss', 'content': 0.030804920941591263, 'timestamp': '2025-09-30 22:12:09.387863', 'step': 2378, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:09.423964', 'step': 2378, 'epoch': 3} {'type': 'loss', 'content': 0.0013301247963681817, 'timestamp': '2025-09-30 22:12:09.427559', 'step': 2379, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:09.461909', 'step': 2379, 'epoch': 3} {'type': 'loss', 'content': 0.0005260419566184282, 'timestamp': '2025-09-30 22:12:09.485744', 'step': 2380, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:09.521679', 'step': 2380, 'epoch': 3} {'type': 'loss', 'content': 0.0029137025121599436, 'timestamp': '2025-09-30 22:12:09.524875', 'step': 2381, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:09.562647', 'step': 2381, 'epoch': 3} {'type': 'loss', 'content': 0.0006266444106586277, 'timestamp': '2025-09-30 22:12:09.568225', 'step': 2382, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:09.604472', 'step': 2382, 'epoch': 3} {'type': 'loss', 'content': 0.02861565351486206, 'timestamp': '2025-09-30 22:12:09.608029', 'step': 2383, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:09.642994', 'step': 2383, 'epoch': 3} {'type': 'loss', 'content': 0.019098343327641487, 'timestamp': '2025-09-30 22:12:09.668409', 'step': 2384, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:09.709435', 'step': 2384, 'epoch': 3} {'type': 'loss', 'content': 0.00022823250037617981, 'timestamp': '2025-09-30 22:12:09.713273', 'step': 2385, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:09.749008', 'step': 2385, 'epoch': 3} {'type': 'loss', 'content': 0.02544480934739113, 'timestamp': '2025-09-30 22:12:09.755914', 'step': 2386, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:09.802396', 'step': 2386, 'epoch': 3} {'type': 'loss', 'content': 0.001819111406803131, 'timestamp': '2025-09-30 22:12:09.805901', 'step': 2387, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:09.840618', 'step': 2387, 'epoch': 3} {'type': 'loss', 'content': 0.016899127513170242, 'timestamp': '2025-09-30 22:12:09.865412', 'step': 2388, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:09.905943', 'step': 2388, 'epoch': 3} {'type': 'loss', 'content': 0.0014039212837815285, 'timestamp': '2025-09-30 22:12:09.909281', 'step': 2389, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:09.975526', 'step': 2389, 'epoch': 3} {'type': 'loss', 'content': 0.00031415908597409725, 'timestamp': '2025-09-30 22:12:09.979228', 'step': 2390, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:10.015441', 'step': 2390, 'epoch': 3} {'type': 'loss', 'content': 0.0005526886088773608, 'timestamp': '2025-09-30 22:12:10.018176', 'step': 2391, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:10.054781', 'step': 2391, 'epoch': 3} {'type': 'loss', 'content': 0.017070923000574112, 'timestamp': '2025-09-30 22:12:10.079783', 'step': 2392, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:10.116338', 'step': 2392, 'epoch': 3} {'type': 'loss', 'content': 0.0017836365150287747, 'timestamp': '2025-09-30 22:12:10.120300', 'step': 2393, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:10.160381', 'step': 2393, 'epoch': 3} {'type': 'loss', 'content': 0.013684429228305817, 'timestamp': '2025-09-30 22:12:10.163909', 'step': 2394, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:12:11.274444', 'step': 2394, 'epoch': 3} {'type': 'pplx', 'content': 65413116.89364355, 'timestamp': '2025-09-30 22:12:11.287171', 'step': 2394, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:11.326687', 'step': 2394, 'epoch': 3} {'type': 'loss', 'content': 0.014189009554684162, 'timestamp': '2025-09-30 22:12:11.331378', 'step': 2395, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:11.390147', 'step': 2395, 'epoch': 3} {'type': 'loss', 'content': 0.00021919552818872035, 'timestamp': '2025-09-30 22:12:11.422375', 'step': 2396, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:11.470444', 'step': 2396, 'epoch': 3} {'type': 'loss', 'content': 0.0008827035198919475, 'timestamp': '2025-09-30 22:12:11.473912', 'step': 2397, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:11.519495', 'step': 2397, 'epoch': 3} {'type': 'loss', 'content': 0.030242696404457092, 'timestamp': '2025-09-30 22:12:11.524313', 'step': 2398, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:11.557997', 'step': 2398, 'epoch': 3} {'type': 'loss', 'content': 0.013079182244837284, 'timestamp': '2025-09-30 22:12:11.562352', 'step': 2399, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:11.602505', 'step': 2399, 'epoch': 3} {'type': 'loss', 'content': 0.00754732359200716, 'timestamp': '2025-09-30 22:12:11.633917', 'step': 2400, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:11.667168', 'step': 2400, 'epoch': 3} {'type': 'loss', 'content': 0.0012320360401645303, 'timestamp': '2025-09-30 22:12:11.670956', 'step': 2401, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:12:11.704264', 'step': 2401, 'epoch': 3} {'type': 'loss', 'content': 0.026908395811915398, 'timestamp': '2025-09-30 22:12:11.710053', 'step': 2402, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:11.742404', 'step': 2402, 'epoch': 3} {'type': 'loss', 'content': 0.0063280873000621796, 'timestamp': '2025-09-30 22:12:11.744904', 'step': 2403, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:11.781457', 'step': 2403, 'epoch': 3} {'type': 'loss', 'content': 0.0008967557223513722, 'timestamp': '2025-09-30 22:12:11.805821', 'step': 2404, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:11.842617', 'step': 2404, 'epoch': 3} {'type': 'loss', 'content': 0.0037160839419811964, 'timestamp': '2025-09-30 22:12:11.848816', 'step': 2405, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:11.881560', 'step': 2405, 'epoch': 3} {'type': 'loss', 'content': 0.0011413343017920852, 'timestamp': '2025-09-30 22:12:11.892778', 'step': 2406, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:11.926028', 'step': 2406, 'epoch': 3} {'type': 'loss', 'content': 0.024273769930005074, 'timestamp': '2025-09-30 22:12:11.928620', 'step': 2407, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:11.983045', 'step': 2407, 'epoch': 3} {'type': 'loss', 'content': 0.0033409486059099436, 'timestamp': '2025-09-30 22:12:12.008628', 'step': 2408, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:12.043425', 'step': 2408, 'epoch': 3} {'type': 'loss', 'content': 0.0031633952166885138, 'timestamp': '2025-09-30 22:12:12.045444', 'step': 2409, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:12.082323', 'step': 2409, 'epoch': 3} {'type': 'loss', 'content': 0.006252944469451904, 'timestamp': '2025-09-30 22:12:12.085624', 'step': 2410, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:12.119050', 'step': 2410, 'epoch': 3} {'type': 'loss', 'content': 0.008213681168854237, 'timestamp': '2025-09-30 22:12:12.122300', 'step': 2411, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:12:12.162129', 'step': 2411, 'epoch': 3} {'type': 'loss', 'content': 0.010081358253955841, 'timestamp': '2025-09-30 22:12:12.186003', 'step': 2412, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:12.222342', 'step': 2412, 'epoch': 3} {'type': 'loss', 'content': 0.016692589968442917, 'timestamp': '2025-09-30 22:12:12.232519', 'step': 2413, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:12.270128', 'step': 2413, 'epoch': 3} {'type': 'loss', 'content': 0.007243197411298752, 'timestamp': '2025-09-30 22:12:12.273893', 'step': 2414, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:12.333563', 'step': 2414, 'epoch': 3} {'type': 'loss', 'content': 0.002702240599319339, 'timestamp': '2025-09-30 22:12:12.348304', 'step': 2415, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:12.384547', 'step': 2415, 'epoch': 3} {'type': 'loss', 'content': 0.0019787922501564026, 'timestamp': '2025-09-30 22:12:12.409192', 'step': 2416, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:12.445124', 'step': 2416, 'epoch': 3} {'type': 'loss', 'content': 0.006027872674167156, 'timestamp': '2025-09-30 22:12:12.449558', 'step': 2417, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:12.490956', 'step': 2417, 'epoch': 3} {'type': 'loss', 'content': 0.008715354837477207, 'timestamp': '2025-09-30 22:12:12.494453', 'step': 2418, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:12.527345', 'step': 2418, 'epoch': 3} {'type': 'loss', 'content': 0.009561752900481224, 'timestamp': '2025-09-30 22:12:12.530277', 'step': 2419, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:12.580919', 'step': 2419, 'epoch': 3} {'type': 'loss', 'content': 0.0023871201556175947, 'timestamp': '2025-09-30 22:12:12.605637', 'step': 2420, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:12.637007', 'step': 2420, 'epoch': 3} {'type': 'loss', 'content': 0.00500152911990881, 'timestamp': '2025-09-30 22:12:12.639971', 'step': 2421, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:12.681120', 'step': 2421, 'epoch': 3} {'type': 'loss', 'content': 0.012087523005902767, 'timestamp': '2025-09-30 22:12:12.684477', 'step': 2422, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:12.727338', 'step': 2422, 'epoch': 3} {'type': 'loss', 'content': 0.0017976844683289528, 'timestamp': '2025-09-30 22:12:12.731067', 'step': 2423, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:12.780306', 'step': 2423, 'epoch': 3} {'type': 'loss', 'content': 0.013470481149852276, 'timestamp': '2025-09-30 22:12:12.804491', 'step': 2424, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:12:12.840920', 'step': 2424, 'epoch': 3} {'type': 'loss', 'content': 0.0020187811460345984, 'timestamp': '2025-09-30 22:12:12.843628', 'step': 2425, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:12.873849', 'step': 2425, 'epoch': 3} {'type': 'loss', 'content': 0.004044502507895231, 'timestamp': '2025-09-30 22:12:12.876983', 'step': 2426, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:12.909004', 'step': 2426, 'epoch': 3} {'type': 'loss', 'content': 0.019353091716766357, 'timestamp': '2025-09-30 22:12:12.915834', 'step': 2427, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:12.958545', 'step': 2427, 'epoch': 3} {'type': 'loss', 'content': 0.0004785063210874796, 'timestamp': '2025-09-30 22:12:12.990788', 'step': 2428, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.044761', 'step': 2428, 'epoch': 3} {'type': 'loss', 'content': 0.006889610085636377, 'timestamp': '2025-09-30 22:12:13.058763', 'step': 2429, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.118812', 'step': 2429, 'epoch': 3} {'type': 'loss', 'content': 0.004911174066364765, 'timestamp': '2025-09-30 22:12:13.123717', 'step': 2430, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:13.171498', 'step': 2430, 'epoch': 3} {'type': 'loss', 'content': 0.0064652832224965096, 'timestamp': '2025-09-30 22:12:13.178647', 'step': 2431, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.223671', 'step': 2431, 'epoch': 3} {'type': 'loss', 'content': 0.003408350283280015, 'timestamp': '2025-09-30 22:12:13.249700', 'step': 2432, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:13.286262', 'step': 2432, 'epoch': 3} {'type': 'loss', 'content': 0.025650396943092346, 'timestamp': '2025-09-30 22:12:13.288935', 'step': 2433, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.323316', 'step': 2433, 'epoch': 3} {'type': 'loss', 'content': 0.0033151302486658096, 'timestamp': '2025-09-30 22:12:13.326983', 'step': 2434, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.366555', 'step': 2434, 'epoch': 3} {'type': 'loss', 'content': 0.010545426048338413, 'timestamp': '2025-09-30 22:12:13.370016', 'step': 2435, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.401499', 'step': 2435, 'epoch': 3} {'type': 'loss', 'content': 0.011878552846610546, 'timestamp': '2025-09-30 22:12:13.425534', 'step': 2436, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.464542', 'step': 2436, 'epoch': 3} {'type': 'loss', 'content': 0.0001749217917677015, 'timestamp': '2025-09-30 22:12:13.467762', 'step': 2437, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.509444', 'step': 2437, 'epoch': 3} {'type': 'loss', 'content': 0.04364948347210884, 'timestamp': '2025-09-30 22:12:13.512321', 'step': 2438, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.554957', 'step': 2438, 'epoch': 3} {'type': 'loss', 'content': 0.003104813862591982, 'timestamp': '2025-09-30 22:12:13.557897', 'step': 2439, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:13.595618', 'step': 2439, 'epoch': 3} {'type': 'loss', 'content': 0.0048273103311657906, 'timestamp': '2025-09-30 22:12:13.623372', 'step': 2440, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.675134', 'step': 2440, 'epoch': 3} {'type': 'loss', 'content': 0.005417822860181332, 'timestamp': '2025-09-30 22:12:13.682437', 'step': 2441, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:13.717391', 'step': 2441, 'epoch': 3} {'type': 'loss', 'content': 0.011303612031042576, 'timestamp': '2025-09-30 22:12:13.719797', 'step': 2442, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.754682', 'step': 2442, 'epoch': 3} {'type': 'loss', 'content': 0.0019748841878026724, 'timestamp': '2025-09-30 22:12:13.756935', 'step': 2443, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:13.797829', 'step': 2443, 'epoch': 3} {'type': 'loss', 'content': 0.0004670954658649862, 'timestamp': '2025-09-30 22:12:13.822910', 'step': 2444, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.867396', 'step': 2444, 'epoch': 3} {'type': 'loss', 'content': 0.0493459478020668, 'timestamp': '2025-09-30 22:12:13.871818', 'step': 2445, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.915025', 'step': 2445, 'epoch': 3} {'type': 'loss', 'content': 0.0002415820927126333, 'timestamp': '2025-09-30 22:12:13.917755', 'step': 2446, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:13.971422', 'step': 2446, 'epoch': 3} {'type': 'loss', 'content': 0.0018418595427647233, 'timestamp': '2025-09-30 22:12:13.988199', 'step': 2447, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:14.023598', 'step': 2447, 'epoch': 3} {'type': 'loss', 'content': 0.018292101100087166, 'timestamp': '2025-09-30 22:12:14.049047', 'step': 2448, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:14.121049', 'step': 2448, 'epoch': 3} {'type': 'loss', 'content': 0.01522571686655283, 'timestamp': '2025-09-30 22:12:14.124100', 'step': 2449, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:14.157646', 'step': 2449, 'epoch': 3} {'type': 'loss', 'content': 0.0033422100823372602, 'timestamp': '2025-09-30 22:12:14.161885', 'step': 2450, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:14.195614', 'step': 2450, 'epoch': 3} {'type': 'loss', 'content': 0.002138448180630803, 'timestamp': '2025-09-30 22:12:14.210631', 'step': 2451, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:12:15.179053', 'step': 2451, 'epoch': 3} {'type': 'pplx', 'content': 62585234.83673584, 'timestamp': '2025-09-30 22:12:15.183192', 'step': 2451, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.222614', 'step': 2451, 'epoch': 3} {'type': 'loss', 'content': 0.036872122436761856, 'timestamp': '2025-09-30 22:12:15.247415', 'step': 2452, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.281654', 'step': 2452, 'epoch': 3} {'type': 'loss', 'content': 0.0013359521981328726, 'timestamp': '2025-09-30 22:12:15.286305', 'step': 2453, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.320155', 'step': 2453, 'epoch': 3} {'type': 'loss', 'content': 0.0018873109947890043, 'timestamp': '2025-09-30 22:12:15.322950', 'step': 2454, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:15.366238', 'step': 2454, 'epoch': 3} {'type': 'loss', 'content': 0.028096193447709084, 'timestamp': '2025-09-30 22:12:15.374875', 'step': 2455, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:15.419954', 'step': 2455, 'epoch': 3} {'type': 'loss', 'content': 0.00184065627399832, 'timestamp': '2025-09-30 22:12:15.444890', 'step': 2456, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.478246', 'step': 2456, 'epoch': 3} {'type': 'loss', 'content': 0.0008506966987624764, 'timestamp': '2025-09-30 22:12:15.483383', 'step': 2457, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.521614', 'step': 2457, 'epoch': 3} {'type': 'loss', 'content': 0.004837132524698973, 'timestamp': '2025-09-30 22:12:15.524898', 'step': 2458, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.559790', 'step': 2458, 'epoch': 3} {'type': 'loss', 'content': 0.011527560651302338, 'timestamp': '2025-09-30 22:12:15.563171', 'step': 2459, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.602170', 'step': 2459, 'epoch': 3} {'type': 'loss', 'content': 0.002231266815215349, 'timestamp': '2025-09-30 22:12:15.626342', 'step': 2460, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.660092', 'step': 2460, 'epoch': 3} {'type': 'loss', 'content': 0.009949088096618652, 'timestamp': '2025-09-30 22:12:15.664775', 'step': 2461, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.702591', 'step': 2461, 'epoch': 3} {'type': 'loss', 'content': 0.016047203913331032, 'timestamp': '2025-09-30 22:12:15.705987', 'step': 2462, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.761061', 'step': 2462, 'epoch': 3} {'type': 'loss', 'content': 0.03306948393583298, 'timestamp': '2025-09-30 22:12:15.764529', 'step': 2463, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.796586', 'step': 2463, 'epoch': 3} {'type': 'loss', 'content': 0.011734615080058575, 'timestamp': '2025-09-30 22:12:15.821966', 'step': 2464, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.856787', 'step': 2464, 'epoch': 3} {'type': 'loss', 'content': 0.004299917258322239, 'timestamp': '2025-09-30 22:12:15.859498', 'step': 2465, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:15.904285', 'step': 2465, 'epoch': 3} {'type': 'loss', 'content': 0.0013004597276449203, 'timestamp': '2025-09-30 22:12:15.913300', 'step': 2466, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.957918', 'step': 2466, 'epoch': 3} {'type': 'loss', 'content': 0.007806502282619476, 'timestamp': '2025-09-30 22:12:15.960512', 'step': 2467, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:15.993666', 'step': 2467, 'epoch': 3} {'type': 'loss', 'content': 0.05851675942540169, 'timestamp': '2025-09-30 22:12:16.018333', 'step': 2468, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:16.064242', 'step': 2468, 'epoch': 3} {'type': 'loss', 'content': 0.0025632530450820923, 'timestamp': '2025-09-30 22:12:16.067653', 'step': 2469, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:16.107226', 'step': 2469, 'epoch': 3} {'type': 'loss', 'content': 0.02141547203063965, 'timestamp': '2025-09-30 22:12:16.110708', 'step': 2470, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:16.151475', 'step': 2470, 'epoch': 3} {'type': 'loss', 'content': 0.0010793081019073725, 'timestamp': '2025-09-30 22:12:16.155860', 'step': 2471, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:16.190306', 'step': 2471, 'epoch': 3} {'type': 'loss', 'content': 0.0025140002835541964, 'timestamp': '2025-09-30 22:12:16.214536', 'step': 2472, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:16.265471', 'step': 2472, 'epoch': 3} {'type': 'loss', 'content': 0.0012594551080837846, 'timestamp': '2025-09-30 22:12:16.268933', 'step': 2473, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:16.308928', 'step': 2473, 'epoch': 3} {'type': 'loss', 'content': 0.015506746247410774, 'timestamp': '2025-09-30 22:12:16.311797', 'step': 2474, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:16.346452', 'step': 2474, 'epoch': 3} {'type': 'loss', 'content': 0.00189836451318115, 'timestamp': '2025-09-30 22:12:16.350378', 'step': 2475, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:16.391136', 'step': 2475, 'epoch': 3} {'type': 'loss', 'content': 0.0021572846453636885, 'timestamp': '2025-09-30 22:12:16.422709', 'step': 2476, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:16.463272', 'step': 2476, 'epoch': 3} {'type': 'loss', 'content': 0.0027588761877268553, 'timestamp': '2025-09-30 22:12:16.466118', 'step': 2477, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:16.503079', 'step': 2477, 'epoch': 3} {'type': 'loss', 'content': 0.008273358456790447, 'timestamp': '2025-09-30 22:12:16.506913', 'step': 2478, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:16.541526', 'step': 2478, 'epoch': 3} {'type': 'loss', 'content': 0.003073727013543248, 'timestamp': '2025-09-30 22:12:16.552618', 'step': 2479, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:16.586838', 'step': 2479, 'epoch': 3} {'type': 'loss', 'content': 0.00348069379106164, 'timestamp': '2025-09-30 22:12:16.610849', 'step': 2480, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:16.657545', 'step': 2480, 'epoch': 3} {'type': 'loss', 'content': 0.011410152539610863, 'timestamp': '2025-09-30 22:12:16.660386', 'step': 2481, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:16.727292', 'step': 2481, 'epoch': 3} {'type': 'loss', 'content': 0.0004028068215120584, 'timestamp': '2025-09-30 22:12:16.731599', 'step': 2482, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:16.791057', 'step': 2482, 'epoch': 3} {'type': 'loss', 'content': 0.0011346134124323726, 'timestamp': '2025-09-30 22:12:16.793551', 'step': 2483, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:16.827410', 'step': 2483, 'epoch': 3} {'type': 'loss', 'content': 0.0011217163410037756, 'timestamp': '2025-09-30 22:12:16.852392', 'step': 2484, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:16.888043', 'step': 2484, 'epoch': 3} {'type': 'loss', 'content': 0.016155900433659554, 'timestamp': '2025-09-30 22:12:16.891312', 'step': 2485, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:16.939331', 'step': 2485, 'epoch': 3} {'type': 'loss', 'content': 0.008874503895640373, 'timestamp': '2025-09-30 22:12:16.943070', 'step': 2486, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:16.985134', 'step': 2486, 'epoch': 3} {'type': 'loss', 'content': 0.020130667835474014, 'timestamp': '2025-09-30 22:12:16.988103', 'step': 2487, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:17.022044', 'step': 2487, 'epoch': 3} {'type': 'loss', 'content': 0.008245137520134449, 'timestamp': '2025-09-30 22:12:17.053088', 'step': 2488, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:17.093042', 'step': 2488, 'epoch': 3} {'type': 'loss', 'content': 0.00684535875916481, 'timestamp': '2025-09-30 22:12:17.097028', 'step': 2489, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:17.128858', 'step': 2489, 'epoch': 3} {'type': 'loss', 'content': 7.354576518991962e-05, 'timestamp': '2025-09-30 22:12:17.131648', 'step': 2490, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:17.164087', 'step': 2490, 'epoch': 3} {'type': 'loss', 'content': 4.672312206821516e-05, 'timestamp': '2025-09-30 22:12:17.168337', 'step': 2491, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:17.208240', 'step': 2491, 'epoch': 3} {'type': 'loss', 'content': 0.04931880906224251, 'timestamp': '2025-09-30 22:12:17.232431', 'step': 2492, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:17.265929', 'step': 2492, 'epoch': 3} {'type': 'loss', 'content': 8.33717203931883e-05, 'timestamp': '2025-09-30 22:12:17.269271', 'step': 2493, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:17.317146', 'step': 2493, 'epoch': 3} {'type': 'loss', 'content': 0.002100432524457574, 'timestamp': '2025-09-30 22:12:17.319594', 'step': 2494, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:17.351844', 'step': 2494, 'epoch': 3} {'type': 'loss', 'content': 0.033802784979343414, 'timestamp': '2025-09-30 22:12:17.355014', 'step': 2495, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:17.392473', 'step': 2495, 'epoch': 3} {'type': 'loss', 'content': 7.228619506349787e-05, 'timestamp': '2025-09-30 22:12:17.416363', 'step': 2496, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:17.448412', 'step': 2496, 'epoch': 3} {'type': 'loss', 'content': 0.024196816608309746, 'timestamp': '2025-09-30 22:12:17.451206', 'step': 2497, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:17.483674', 'step': 2497, 'epoch': 3} {'type': 'loss', 'content': 0.004435116890817881, 'timestamp': '2025-09-30 22:12:17.494907', 'step': 2498, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:17.539375', 'step': 2498, 'epoch': 3} {'type': 'loss', 'content': 0.0020751405972987413, 'timestamp': '2025-09-30 22:12:17.542827', 'step': 2499, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:17.580444', 'step': 2499, 'epoch': 3} {'type': 'loss', 'content': 0.00010092502634506673, 'timestamp': '2025-09-30 22:12:17.607680', 'step': 2500, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2500', 'timestamp': '2025-09-30 22:12:24.441468', 'step': 2500, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:24.476136', 'step': 2500, 'epoch': 3} {'type': 'loss', 'content': 0.00021722108067478985, 'timestamp': '2025-09-30 22:12:24.479174', 'step': 2501, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:24.512768', 'step': 2501, 'epoch': 3} {'type': 'loss', 'content': 0.00021702356752939522, 'timestamp': '2025-09-30 22:12:24.515547', 'step': 2502, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:24.552425', 'step': 2502, 'epoch': 3} {'type': 'loss', 'content': 0.03270643576979637, 'timestamp': '2025-09-30 22:12:24.555294', 'step': 2503, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:24.588294', 'step': 2503, 'epoch': 3} {'type': 'loss', 'content': 0.033587705343961716, 'timestamp': '2025-09-30 22:12:24.621200', 'step': 2504, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:24.657910', 'step': 2504, 'epoch': 3} {'type': 'loss', 'content': 0.0011009655427187681, 'timestamp': '2025-09-30 22:12:24.661900', 'step': 2505, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:24.711655', 'step': 2505, 'epoch': 3} {'type': 'loss', 'content': 0.025243079289793968, 'timestamp': '2025-09-30 22:12:24.714798', 'step': 2506, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:24.757544', 'step': 2506, 'epoch': 3} {'type': 'loss', 'content': 0.01099986769258976, 'timestamp': '2025-09-30 22:12:24.768866', 'step': 2507, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:24.809829', 'step': 2507, 'epoch': 3} {'type': 'loss', 'content': 0.04144655540585518, 'timestamp': '2025-09-30 22:12:24.836337', 'step': 2508, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:12:25.877102', 'step': 2508, 'epoch': 3} {'type': 'pplx', 'content': 51440041.32039302, 'timestamp': '2025-09-30 22:12:25.879619', 'step': 2508, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:25.913015', 'step': 2508, 'epoch': 3} {'type': 'loss', 'content': 0.05126844719052315, 'timestamp': '2025-09-30 22:12:25.915528', 'step': 2509, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:25.954174', 'step': 2509, 'epoch': 3} {'type': 'loss', 'content': 0.0024910145439207554, 'timestamp': '2025-09-30 22:12:25.964119', 'step': 2510, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:26.003353', 'step': 2510, 'epoch': 3} {'type': 'loss', 'content': 0.002371475100517273, 'timestamp': '2025-09-30 22:12:26.007123', 'step': 2511, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:26.044932', 'step': 2511, 'epoch': 3} {'type': 'loss', 'content': 0.008505993522703648, 'timestamp': '2025-09-30 22:12:26.068871', 'step': 2512, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:26.100655', 'step': 2512, 'epoch': 3} {'type': 'loss', 'content': 0.008775151334702969, 'timestamp': '2025-09-30 22:12:26.103596', 'step': 2513, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:26.133666', 'step': 2513, 'epoch': 3} {'type': 'loss', 'content': 0.005994870793074369, 'timestamp': '2025-09-30 22:12:26.135971', 'step': 2514, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:26.174131', 'step': 2514, 'epoch': 3} {'type': 'loss', 'content': 0.004314147401601076, 'timestamp': '2025-09-30 22:12:26.177619', 'step': 2515, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:26.212159', 'step': 2515, 'epoch': 3} {'type': 'loss', 'content': 0.010322661139070988, 'timestamp': '2025-09-30 22:12:26.237417', 'step': 2516, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:26.268834', 'step': 2516, 'epoch': 3} {'type': 'loss', 'content': 0.009751622565090656, 'timestamp': '2025-09-30 22:12:26.271108', 'step': 2517, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:26.310414', 'step': 2517, 'epoch': 3} {'type': 'loss', 'content': 0.006905748508870602, 'timestamp': '2025-09-30 22:12:26.316132', 'step': 2518, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:26.358662', 'step': 2518, 'epoch': 3} {'type': 'loss', 'content': 0.041554901748895645, 'timestamp': '2025-09-30 22:12:26.370479', 'step': 2519, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:26.407451', 'step': 2519, 'epoch': 3} {'type': 'loss', 'content': 0.0019552961457520723, 'timestamp': '2025-09-30 22:12:26.432159', 'step': 2520, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:26.477273', 'step': 2520, 'epoch': 3} {'type': 'loss', 'content': 0.00302482838742435, 'timestamp': '2025-09-30 22:12:26.481855', 'step': 2521, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:26.519834', 'step': 2521, 'epoch': 3} {'type': 'loss', 'content': 0.0036011997144669294, 'timestamp': '2025-09-30 22:12:26.522302', 'step': 2522, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:26.560096', 'step': 2522, 'epoch': 3} {'type': 'loss', 'content': 0.04953055456280708, 'timestamp': '2025-09-30 22:12:26.562550', 'step': 2523, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:26.604060', 'step': 2523, 'epoch': 3} {'type': 'loss', 'content': 0.024661192670464516, 'timestamp': '2025-09-30 22:12:26.632779', 'step': 2524, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:26.668836', 'step': 2524, 'epoch': 3} {'type': 'loss', 'content': 0.007575987372547388, 'timestamp': '2025-09-30 22:12:26.673588', 'step': 2525, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:26.712052', 'step': 2525, 'epoch': 3} {'type': 'loss', 'content': 0.01171951089054346, 'timestamp': '2025-09-30 22:12:26.714809', 'step': 2526, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:26.746519', 'step': 2526, 'epoch': 3} {'type': 'loss', 'content': 0.007814085111021996, 'timestamp': '2025-09-30 22:12:26.749250', 'step': 2527, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:26.790855', 'step': 2527, 'epoch': 3} {'type': 'loss', 'content': 0.03643188998103142, 'timestamp': '2025-09-30 22:12:26.814516', 'step': 2528, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:26.847222', 'step': 2528, 'epoch': 3} {'type': 'loss', 'content': 0.005677658133208752, 'timestamp': '2025-09-30 22:12:26.850913', 'step': 2529, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:26.885157', 'step': 2529, 'epoch': 3} {'type': 'loss', 'content': 0.02060539647936821, 'timestamp': '2025-09-30 22:12:26.890913', 'step': 2530, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:26.929792', 'step': 2530, 'epoch': 3} {'type': 'loss', 'content': 0.019244346767663956, 'timestamp': '2025-09-30 22:12:26.935318', 'step': 2531, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:26.974780', 'step': 2531, 'epoch': 3} {'type': 'loss', 'content': 0.005939108785241842, 'timestamp': '2025-09-30 22:12:27.001770', 'step': 2532, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:27.033678', 'step': 2532, 'epoch': 3} {'type': 'loss', 'content': 0.025780802592635155, 'timestamp': '2025-09-30 22:12:27.037527', 'step': 2533, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:27.073435', 'step': 2533, 'epoch': 3} {'type': 'loss', 'content': 0.0057016233913600445, 'timestamp': '2025-09-30 22:12:27.078239', 'step': 2534, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:27.111678', 'step': 2534, 'epoch': 3} {'type': 'loss', 'content': 0.005294333212077618, 'timestamp': '2025-09-30 22:12:27.114140', 'step': 2535, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.144822', 'step': 2535, 'epoch': 3} {'type': 'loss', 'content': 0.018375219777226448, 'timestamp': '2025-09-30 22:12:27.169702', 'step': 2536, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.200582', 'step': 2536, 'epoch': 3} {'type': 'loss', 'content': 0.014313088729977608, 'timestamp': '2025-09-30 22:12:27.202832', 'step': 2537, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:27.243292', 'step': 2537, 'epoch': 3} {'type': 'loss', 'content': 0.0019038409227505326, 'timestamp': '2025-09-30 22:12:27.252377', 'step': 2538, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:27.284146', 'step': 2538, 'epoch': 3} {'type': 'loss', 'content': 0.0079488605260849, 'timestamp': '2025-09-30 22:12:27.289216', 'step': 2539, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:27.321077', 'step': 2539, 'epoch': 3} {'type': 'loss', 'content': 0.0012947311624884605, 'timestamp': '2025-09-30 22:12:27.345027', 'step': 2540, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.379042', 'step': 2540, 'epoch': 3} {'type': 'loss', 'content': 0.00036997883580625057, 'timestamp': '2025-09-30 22:12:27.381824', 'step': 2541, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.413535', 'step': 2541, 'epoch': 3} {'type': 'loss', 'content': 0.0030447926837950945, 'timestamp': '2025-09-30 22:12:27.416146', 'step': 2542, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.449899', 'step': 2542, 'epoch': 3} {'type': 'loss', 'content': 0.0076310886070132256, 'timestamp': '2025-09-30 22:12:27.452189', 'step': 2543, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.483123', 'step': 2543, 'epoch': 3} {'type': 'loss', 'content': 0.016252310946583748, 'timestamp': '2025-09-30 22:12:27.509141', 'step': 2544, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:12:27.541918', 'step': 2544, 'epoch': 3} {'type': 'loss', 'content': 0.014082059264183044, 'timestamp': '2025-09-30 22:12:27.545842', 'step': 2545, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.582115', 'step': 2545, 'epoch': 3} {'type': 'loss', 'content': 0.016975652426481247, 'timestamp': '2025-09-30 22:12:27.585641', 'step': 2546, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.629840', 'step': 2546, 'epoch': 3} {'type': 'loss', 'content': 0.004181718919426203, 'timestamp': '2025-09-30 22:12:27.632924', 'step': 2547, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.664508', 'step': 2547, 'epoch': 3} {'type': 'loss', 'content': 0.001995036145672202, 'timestamp': '2025-09-30 22:12:27.688987', 'step': 2548, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:27.727040', 'step': 2548, 'epoch': 3} {'type': 'loss', 'content': 0.020780479535460472, 'timestamp': '2025-09-30 22:12:27.730356', 'step': 2549, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.765703', 'step': 2549, 'epoch': 3} {'type': 'loss', 'content': 0.013351112604141235, 'timestamp': '2025-09-30 22:12:27.769747', 'step': 2550, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.803467', 'step': 2550, 'epoch': 3} {'type': 'loss', 'content': 0.008309331722557545, 'timestamp': '2025-09-30 22:12:27.809594', 'step': 2551, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.842365', 'step': 2551, 'epoch': 3} {'type': 'loss', 'content': 0.007991375401616096, 'timestamp': '2025-09-30 22:12:27.866581', 'step': 2552, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.901678', 'step': 2552, 'epoch': 3} {'type': 'loss', 'content': 0.0011701161274686456, 'timestamp': '2025-09-30 22:12:27.903831', 'step': 2553, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:27.935098', 'step': 2553, 'epoch': 3} {'type': 'loss', 'content': 0.03201867640018463, 'timestamp': '2025-09-30 22:12:27.937765', 'step': 2554, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:27.972125', 'step': 2554, 'epoch': 3} {'type': 'loss', 'content': 0.0001384599308948964, 'timestamp': '2025-09-30 22:12:27.975342', 'step': 2555, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:28.007132', 'step': 2555, 'epoch': 3} {'type': 'loss', 'content': 0.00021487221238203347, 'timestamp': '2025-09-30 22:12:28.030624', 'step': 2556, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:28.061356', 'step': 2556, 'epoch': 3} {'type': 'loss', 'content': 0.00542490417137742, 'timestamp': '2025-09-30 22:12:28.063865', 'step': 2557, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:28.094687', 'step': 2557, 'epoch': 3} {'type': 'loss', 'content': 0.01566317304968834, 'timestamp': '2025-09-30 22:12:28.096878', 'step': 2558, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:28.133186', 'step': 2558, 'epoch': 3} {'type': 'loss', 'content': 0.03404511138796806, 'timestamp': '2025-09-30 22:12:28.136052', 'step': 2559, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:28.168448', 'step': 2559, 'epoch': 3} {'type': 'loss', 'content': 0.0022068375255912542, 'timestamp': '2025-09-30 22:12:28.192768', 'step': 2560, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:28.226671', 'step': 2560, 'epoch': 3} {'type': 'loss', 'content': 0.011201408691704273, 'timestamp': '2025-09-30 22:12:28.232748', 'step': 2561, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:28.269849', 'step': 2561, 'epoch': 3} {'type': 'loss', 'content': 0.001101975911296904, 'timestamp': '2025-09-30 22:12:28.273520', 'step': 2562, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:28.305675', 'step': 2562, 'epoch': 3} {'type': 'loss', 'content': 0.013607998378574848, 'timestamp': '2025-09-30 22:12:28.309824', 'step': 2563, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:28.342934', 'step': 2563, 'epoch': 3} {'type': 'loss', 'content': 0.03017262928187847, 'timestamp': '2025-09-30 22:12:28.368890', 'step': 2564, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:28.402515', 'step': 2564, 'epoch': 3} {'type': 'loss', 'content': 0.0012094740523025393, 'timestamp': '2025-09-30 22:12:28.408818', 'step': 2565, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:12:29.366942', 'step': 2565, 'epoch': 3} {'type': 'pplx', 'content': 39231890.14571114, 'timestamp': '2025-09-30 22:12:29.376737', 'step': 2565, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:29.409321', 'step': 2565, 'epoch': 3} {'type': 'loss', 'content': 0.014957462437450886, 'timestamp': '2025-09-30 22:12:29.411859', 'step': 2566, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:29.448095', 'step': 2566, 'epoch': 3} {'type': 'loss', 'content': 0.004640209022909403, 'timestamp': '2025-09-30 22:12:29.454261', 'step': 2567, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:29.496575', 'step': 2567, 'epoch': 3} {'type': 'loss', 'content': 0.004045676905661821, 'timestamp': '2025-09-30 22:12:29.521337', 'step': 2568, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:29.558966', 'step': 2568, 'epoch': 3} {'type': 'loss', 'content': 0.002006492344662547, 'timestamp': '2025-09-30 22:12:29.564906', 'step': 2569, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:29.610682', 'step': 2569, 'epoch': 3} {'type': 'loss', 'content': 0.014200160279870033, 'timestamp': '2025-09-30 22:12:29.613197', 'step': 2570, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:29.645926', 'step': 2570, 'epoch': 3} {'type': 'loss', 'content': 0.003748588962480426, 'timestamp': '2025-09-30 22:12:29.652741', 'step': 2571, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:29.686228', 'step': 2571, 'epoch': 3} {'type': 'loss', 'content': 0.011747321113944054, 'timestamp': '2025-09-30 22:12:29.710794', 'step': 2572, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:29.755916', 'step': 2572, 'epoch': 3} {'type': 'loss', 'content': 0.0015971793327480555, 'timestamp': '2025-09-30 22:12:29.758470', 'step': 2573, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:29.792048', 'step': 2573, 'epoch': 3} {'type': 'loss', 'content': 0.010424541309475899, 'timestamp': '2025-09-30 22:12:29.795357', 'step': 2574, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:29.834368', 'step': 2574, 'epoch': 3} {'type': 'loss', 'content': 0.0025860874447971582, 'timestamp': '2025-09-30 22:12:29.837329', 'step': 2575, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:29.872233', 'step': 2575, 'epoch': 3} {'type': 'loss', 'content': 0.0033157693687826395, 'timestamp': '2025-09-30 22:12:29.897788', 'step': 2576, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:29.930678', 'step': 2576, 'epoch': 3} {'type': 'loss', 'content': 0.002955112373456359, 'timestamp': '2025-09-30 22:12:29.934228', 'step': 2577, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:29.982088', 'step': 2577, 'epoch': 3} {'type': 'loss', 'content': 0.0027001777198165655, 'timestamp': '2025-09-30 22:12:29.990385', 'step': 2578, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:30.033243', 'step': 2578, 'epoch': 3} {'type': 'loss', 'content': 0.004555149935185909, 'timestamp': '2025-09-30 22:12:30.036195', 'step': 2579, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:30.078336', 'step': 2579, 'epoch': 3} {'type': 'loss', 'content': 0.00765460729598999, 'timestamp': '2025-09-30 22:12:30.104552', 'step': 2580, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:30.136627', 'step': 2580, 'epoch': 3} {'type': 'loss', 'content': 0.001375475199893117, 'timestamp': '2025-09-30 22:12:30.141326', 'step': 2581, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:30.186922', 'step': 2581, 'epoch': 3} {'type': 'loss', 'content': 0.004311113618314266, 'timestamp': '2025-09-30 22:12:30.189913', 'step': 2582, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:30.230673', 'step': 2582, 'epoch': 3} {'type': 'loss', 'content': 0.001586766797117889, 'timestamp': '2025-09-30 22:12:30.233034', 'step': 2583, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:30.264135', 'step': 2583, 'epoch': 3} {'type': 'loss', 'content': 0.0055139511823654175, 'timestamp': '2025-09-30 22:12:30.289063', 'step': 2584, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:30.325416', 'step': 2584, 'epoch': 3} {'type': 'loss', 'content': 0.0027511361986398697, 'timestamp': '2025-09-30 22:12:30.329234', 'step': 2585, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:30.360416', 'step': 2585, 'epoch': 3} {'type': 'loss', 'content': 0.03730406612157822, 'timestamp': '2025-09-30 22:12:30.362790', 'step': 2586, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:30.395216', 'step': 2586, 'epoch': 3} {'type': 'loss', 'content': 0.007691751234233379, 'timestamp': '2025-09-30 22:12:30.402560', 'step': 2587, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:30.438793', 'step': 2587, 'epoch': 3} {'type': 'loss', 'content': 0.009390302933752537, 'timestamp': '2025-09-30 22:12:30.462611', 'step': 2588, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:30.500825', 'step': 2588, 'epoch': 3} {'type': 'loss', 'content': 0.00674975011497736, 'timestamp': '2025-09-30 22:12:30.503073', 'step': 2589, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:30.540795', 'step': 2589, 'epoch': 3} {'type': 'loss', 'content': 0.007475547958165407, 'timestamp': '2025-09-30 22:12:30.544910', 'step': 2590, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:30.587056', 'step': 2590, 'epoch': 3} {'type': 'loss', 'content': 0.012665103189647198, 'timestamp': '2025-09-30 22:12:30.591335', 'step': 2591, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:30.627082', 'step': 2591, 'epoch': 3} {'type': 'loss', 'content': 0.0015189863042905927, 'timestamp': '2025-09-30 22:12:30.652721', 'step': 2592, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:30.691876', 'step': 2592, 'epoch': 3} {'type': 'loss', 'content': 0.0010610331082716584, 'timestamp': '2025-09-30 22:12:30.701017', 'step': 2593, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:30.736087', 'step': 2593, 'epoch': 3} {'type': 'loss', 'content': 0.0031566577963531017, 'timestamp': '2025-09-30 22:12:30.739780', 'step': 2594, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:30.779880', 'step': 2594, 'epoch': 3} {'type': 'loss', 'content': 0.008387443609535694, 'timestamp': '2025-09-30 22:12:30.782575', 'step': 2595, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:12:30.830165', 'step': 2595, 'epoch': 3} {'type': 'loss', 'content': 0.022871583700180054, 'timestamp': '2025-09-30 22:12:30.858927', 'step': 2596, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:30.893996', 'step': 2596, 'epoch': 3} {'type': 'loss', 'content': 0.004506740719079971, 'timestamp': '2025-09-30 22:12:30.896576', 'step': 2597, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:30.949635', 'step': 2597, 'epoch': 3} {'type': 'loss', 'content': 0.0055312542244791985, 'timestamp': '2025-09-30 22:12:30.954854', 'step': 2598, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.003512', 'step': 2598, 'epoch': 3} {'type': 'loss', 'content': 0.009669343009591103, 'timestamp': '2025-09-30 22:12:31.007371', 'step': 2599, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.040506', 'step': 2599, 'epoch': 3} {'type': 'loss', 'content': 0.00065530592110008, 'timestamp': '2025-09-30 22:12:31.065660', 'step': 2600, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.101798', 'step': 2600, 'epoch': 3} {'type': 'loss', 'content': 0.0021030474454164505, 'timestamp': '2025-09-30 22:12:31.104441', 'step': 2601, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.138107', 'step': 2601, 'epoch': 3} {'type': 'loss', 'content': 0.0021827437449246645, 'timestamp': '2025-09-30 22:12:31.140443', 'step': 2602, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.178379', 'step': 2602, 'epoch': 3} {'type': 'loss', 'content': 0.0010505650425329804, 'timestamp': '2025-09-30 22:12:31.180654', 'step': 2603, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.215997', 'step': 2603, 'epoch': 3} {'type': 'loss', 'content': 0.015516921877861023, 'timestamp': '2025-09-30 22:12:31.239625', 'step': 2604, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:31.276673', 'step': 2604, 'epoch': 3} {'type': 'loss', 'content': 0.006490841507911682, 'timestamp': '2025-09-30 22:12:31.279848', 'step': 2605, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:31.315663', 'step': 2605, 'epoch': 3} {'type': 'loss', 'content': 0.0006698822253383696, 'timestamp': '2025-09-30 22:12:31.318399', 'step': 2606, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.353069', 'step': 2606, 'epoch': 3} {'type': 'loss', 'content': 0.02557486668229103, 'timestamp': '2025-09-30 22:12:31.355880', 'step': 2607, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.394065', 'step': 2607, 'epoch': 3} {'type': 'loss', 'content': 0.003963562194257975, 'timestamp': '2025-09-30 22:12:31.421702', 'step': 2608, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:31.455888', 'step': 2608, 'epoch': 3} {'type': 'loss', 'content': 0.00249542691744864, 'timestamp': '2025-09-30 22:12:31.460311', 'step': 2609, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:31.493897', 'step': 2609, 'epoch': 3} {'type': 'loss', 'content': 0.002343180123716593, 'timestamp': '2025-09-30 22:12:31.496673', 'step': 2610, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:31.542619', 'step': 2610, 'epoch': 3} {'type': 'loss', 'content': 0.002632777439430356, 'timestamp': '2025-09-30 22:12:31.545298', 'step': 2611, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.579660', 'step': 2611, 'epoch': 3} {'type': 'loss', 'content': 0.00020765874069184065, 'timestamp': '2025-09-30 22:12:31.603546', 'step': 2612, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:31.636516', 'step': 2612, 'epoch': 3} {'type': 'loss', 'content': 0.0328451506793499, 'timestamp': '2025-09-30 22:12:31.639116', 'step': 2613, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.675980', 'step': 2613, 'epoch': 3} {'type': 'loss', 'content': 0.0003175671736244112, 'timestamp': '2025-09-30 22:12:31.678731', 'step': 2614, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.721871', 'step': 2614, 'epoch': 3} {'type': 'loss', 'content': 0.0012592887505888939, 'timestamp': '2025-09-30 22:12:31.727277', 'step': 2615, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:31.769963', 'step': 2615, 'epoch': 3} {'type': 'loss', 'content': 0.0013843054184690118, 'timestamp': '2025-09-30 22:12:31.796074', 'step': 2616, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.829063', 'step': 2616, 'epoch': 3} {'type': 'loss', 'content': 0.0089047159999609, 'timestamp': '2025-09-30 22:12:31.832200', 'step': 2617, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:31.871034', 'step': 2617, 'epoch': 3} {'type': 'loss', 'content': 0.0008639580337330699, 'timestamp': '2025-09-30 22:12:31.874295', 'step': 2618, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:31.908469', 'step': 2618, 'epoch': 3} {'type': 'loss', 'content': 0.00013822069740854204, 'timestamp': '2025-09-30 22:12:31.911191', 'step': 2619, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:31.951390', 'step': 2619, 'epoch': 3} {'type': 'loss', 'content': 6.882520392537117e-05, 'timestamp': '2025-09-30 22:12:31.976319', 'step': 2620, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:32.013452', 'step': 2620, 'epoch': 3} {'type': 'loss', 'content': 4.944354441249743e-05, 'timestamp': '2025-09-30 22:12:32.016223', 'step': 2621, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:32.051172', 'step': 2621, 'epoch': 3} {'type': 'loss', 'content': 0.008317261002957821, 'timestamp': '2025-09-30 22:12:32.061533', 'step': 2622, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:12:33.118249', 'step': 2622, 'epoch': 3} {'type': 'pplx', 'content': 46235497.62527117, 'timestamp': '2025-09-30 22:12:33.120941', 'step': 2622, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.156231', 'step': 2622, 'epoch': 3} {'type': 'loss', 'content': 0.003576470073312521, 'timestamp': '2025-09-30 22:12:33.159119', 'step': 2623, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:33.207386', 'step': 2623, 'epoch': 3} {'type': 'loss', 'content': 6.79981749271974e-05, 'timestamp': '2025-09-30 22:12:33.231660', 'step': 2624, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.273982', 'step': 2624, 'epoch': 3} {'type': 'loss', 'content': 0.027295293286442757, 'timestamp': '2025-09-30 22:12:33.282084', 'step': 2625, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.314005', 'step': 2625, 'epoch': 3} {'type': 'loss', 'content': 0.0028682462871074677, 'timestamp': '2025-09-30 22:12:33.322254', 'step': 2626, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.365089', 'step': 2626, 'epoch': 3} {'type': 'loss', 'content': 0.0009013204253278673, 'timestamp': '2025-09-30 22:12:33.368126', 'step': 2627, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.411800', 'step': 2627, 'epoch': 3} {'type': 'loss', 'content': 0.026948420330882072, 'timestamp': '2025-09-30 22:12:33.438808', 'step': 2628, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.478169', 'step': 2628, 'epoch': 3} {'type': 'loss', 'content': 0.00045880835386924446, 'timestamp': '2025-09-30 22:12:33.485131', 'step': 2629, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.519423', 'step': 2629, 'epoch': 3} {'type': 'loss', 'content': 0.000256069382885471, 'timestamp': '2025-09-30 22:12:33.522257', 'step': 2630, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.554349', 'step': 2630, 'epoch': 3} {'type': 'loss', 'content': 0.0002860224631149322, 'timestamp': '2025-09-30 22:12:33.557109', 'step': 2631, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:33.593198', 'step': 2631, 'epoch': 3} {'type': 'loss', 'content': 0.023285450413823128, 'timestamp': '2025-09-30 22:12:33.617735', 'step': 2632, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.652399', 'step': 2632, 'epoch': 3} {'type': 'loss', 'content': 0.0002503176510799676, 'timestamp': '2025-09-30 22:12:33.656684', 'step': 2633, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.696388', 'step': 2633, 'epoch': 3} {'type': 'loss', 'content': 0.0053505441173911095, 'timestamp': '2025-09-30 22:12:33.705582', 'step': 2634, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.741232', 'step': 2634, 'epoch': 3} {'type': 'loss', 'content': 0.0028885682113468647, 'timestamp': '2025-09-30 22:12:33.752920', 'step': 2635, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:33.796115', 'step': 2635, 'epoch': 3} {'type': 'loss', 'content': 0.035982340574264526, 'timestamp': '2025-09-30 22:12:33.822390', 'step': 2636, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.858001', 'step': 2636, 'epoch': 3} {'type': 'loss', 'content': 0.009500211104750633, 'timestamp': '2025-09-30 22:12:33.870511', 'step': 2637, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.905799', 'step': 2637, 'epoch': 3} {'type': 'loss', 'content': 0.0003542073245625943, 'timestamp': '2025-09-30 22:12:33.916748', 'step': 2638, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:33.963423', 'step': 2638, 'epoch': 3} {'type': 'loss', 'content': 0.03855966776609421, 'timestamp': '2025-09-30 22:12:33.967001', 'step': 2639, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:34.003623', 'step': 2639, 'epoch': 3} {'type': 'loss', 'content': 0.01165574137121439, 'timestamp': '2025-09-30 22:12:34.032569', 'step': 2640, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:34.070451', 'step': 2640, 'epoch': 3} {'type': 'loss', 'content': 0.0001282426674151793, 'timestamp': '2025-09-30 22:12:34.082109', 'step': 2641, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:34.130320', 'step': 2641, 'epoch': 3} {'type': 'loss', 'content': 0.0003186077228747308, 'timestamp': '2025-09-30 22:12:34.133394', 'step': 2642, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:34.173089', 'step': 2642, 'epoch': 3} {'type': 'loss', 'content': 0.0013501204084604979, 'timestamp': '2025-09-30 22:12:34.181624', 'step': 2643, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:34.217404', 'step': 2643, 'epoch': 3} {'type': 'loss', 'content': 0.0008143013110384345, 'timestamp': '2025-09-30 22:12:34.242369', 'step': 2644, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:34.287009', 'step': 2644, 'epoch': 3} {'type': 'loss', 'content': 0.0003296361246611923, 'timestamp': '2025-09-30 22:12:34.289451', 'step': 2645, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:34.337306', 'step': 2645, 'epoch': 3} {'type': 'loss', 'content': 0.0003478021826595068, 'timestamp': '2025-09-30 22:12:34.339980', 'step': 2646, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:34.373991', 'step': 2646, 'epoch': 3} {'type': 'loss', 'content': 0.02839597873389721, 'timestamp': '2025-09-30 22:12:34.376996', 'step': 2647, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:34.415319', 'step': 2647, 'epoch': 3} {'type': 'loss', 'content': 0.0025328085757791996, 'timestamp': '2025-09-30 22:12:34.439917', 'step': 2648, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:34.487858', 'step': 2648, 'epoch': 3} {'type': 'loss', 'content': 0.0014561775606125593, 'timestamp': '2025-09-30 22:12:34.500447', 'step': 2649, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:34.539997', 'step': 2649, 'epoch': 3} {'type': 'loss', 'content': 0.00031854381086304784, 'timestamp': '2025-09-30 22:12:34.545509', 'step': 2650, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:34.595679', 'step': 2650, 'epoch': 3} {'type': 'loss', 'content': 0.00034756778040900826, 'timestamp': '2025-09-30 22:12:34.604222', 'step': 2651, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:34.641346', 'step': 2651, 'epoch': 3} {'type': 'loss', 'content': 0.00027000525733456016, 'timestamp': '2025-09-30 22:12:34.667001', 'step': 2652, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:34.707961', 'step': 2652, 'epoch': 3} {'type': 'loss', 'content': 0.00014926944277249277, 'timestamp': '2025-09-30 22:12:34.714366', 'step': 2653, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:34.752646', 'step': 2653, 'epoch': 3} {'type': 'loss', 'content': 0.00026244850596413016, 'timestamp': '2025-09-30 22:12:34.754881', 'step': 2654, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:34.793236', 'step': 2654, 'epoch': 3} {'type': 'loss', 'content': 0.001408580457791686, 'timestamp': '2025-09-30 22:12:34.795807', 'step': 2655, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:34.827955', 'step': 2655, 'epoch': 3} {'type': 'loss', 'content': 0.0016296766698360443, 'timestamp': '2025-09-30 22:12:34.852459', 'step': 2656, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:34.887341', 'step': 2656, 'epoch': 3} {'type': 'loss', 'content': 0.00039507076144218445, 'timestamp': '2025-09-30 22:12:34.890633', 'step': 2657, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:34.929812', 'step': 2657, 'epoch': 3} {'type': 'loss', 'content': 0.0014382427325472236, 'timestamp': '2025-09-30 22:12:34.932184', 'step': 2658, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:34.971565', 'step': 2658, 'epoch': 3} {'type': 'loss', 'content': 0.000742629577871412, 'timestamp': '2025-09-30 22:12:34.974106', 'step': 2659, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.010196', 'step': 2659, 'epoch': 3} {'type': 'loss', 'content': 0.0011206923518329859, 'timestamp': '2025-09-30 22:12:35.041088', 'step': 2660, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:35.076357', 'step': 2660, 'epoch': 3} {'type': 'loss', 'content': 0.0002413646870991215, 'timestamp': '2025-09-30 22:12:35.079435', 'step': 2661, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:35.112612', 'step': 2661, 'epoch': 3} {'type': 'loss', 'content': 0.0026289846282452345, 'timestamp': '2025-09-30 22:12:35.116867', 'step': 2662, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.158701', 'step': 2662, 'epoch': 3} {'type': 'loss', 'content': 0.004568056203424931, 'timestamp': '2025-09-30 22:12:35.162892', 'step': 2663, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.196591', 'step': 2663, 'epoch': 3} {'type': 'loss', 'content': 0.011687947437167168, 'timestamp': '2025-09-30 22:12:35.221743', 'step': 2664, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.269486', 'step': 2664, 'epoch': 3} {'type': 'loss', 'content': 0.013851391151547432, 'timestamp': '2025-09-30 22:12:35.281776', 'step': 2665, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.317418', 'step': 2665, 'epoch': 3} {'type': 'loss', 'content': 0.000755517918150872, 'timestamp': '2025-09-30 22:12:35.324859', 'step': 2666, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.367629', 'step': 2666, 'epoch': 3} {'type': 'loss', 'content': 0.0006584663642570376, 'timestamp': '2025-09-30 22:12:35.370139', 'step': 2667, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:35.406241', 'step': 2667, 'epoch': 3} {'type': 'loss', 'content': 0.004880126100033522, 'timestamp': '2025-09-30 22:12:35.434564', 'step': 2668, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:35.479084', 'step': 2668, 'epoch': 3} {'type': 'loss', 'content': 0.011352315545082092, 'timestamp': '2025-09-30 22:12:35.481569', 'step': 2669, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.514422', 'step': 2669, 'epoch': 3} {'type': 'loss', 'content': 0.00172287633176893, 'timestamp': '2025-09-30 22:12:35.525366', 'step': 2670, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:35.558477', 'step': 2670, 'epoch': 3} {'type': 'loss', 'content': 0.0005279943579807878, 'timestamp': '2025-09-30 22:12:35.561361', 'step': 2671, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.594845', 'step': 2671, 'epoch': 3} {'type': 'loss', 'content': 0.0005277348100207746, 'timestamp': '2025-09-30 22:12:35.619232', 'step': 2672, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.652163', 'step': 2672, 'epoch': 3} {'type': 'loss', 'content': 0.0016348527278751135, 'timestamp': '2025-09-30 22:12:35.656803', 'step': 2673, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:35.688510', 'step': 2673, 'epoch': 3} {'type': 'loss', 'content': 0.0004556818457785994, 'timestamp': '2025-09-30 22:12:35.691923', 'step': 2674, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.725413', 'step': 2674, 'epoch': 3} {'type': 'loss', 'content': 0.00033512728987261653, 'timestamp': '2025-09-30 22:12:35.731295', 'step': 2675, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.764672', 'step': 2675, 'epoch': 3} {'type': 'loss', 'content': 0.0003525232896208763, 'timestamp': '2025-09-30 22:12:35.789130', 'step': 2676, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.840443', 'step': 2676, 'epoch': 3} {'type': 'loss', 'content': 0.0004273319209460169, 'timestamp': '2025-09-30 22:12:35.846633', 'step': 2677, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:35.883671', 'step': 2677, 'epoch': 3} {'type': 'loss', 'content': 0.00047996934154070914, 'timestamp': '2025-09-30 22:12:35.886690', 'step': 2678, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:35.921593', 'step': 2678, 'epoch': 3} {'type': 'loss', 'content': 0.00013553262397181243, 'timestamp': '2025-09-30 22:12:35.925247', 'step': 2679, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:12:36.913562', 'step': 2679, 'epoch': 3} {'type': 'pplx', 'content': 42617958.04798346, 'timestamp': '2025-09-30 22:12:36.917864', 'step': 2679, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:36.949128', 'step': 2679, 'epoch': 3} {'type': 'loss', 'content': 0.00010548696445766836, 'timestamp': '2025-09-30 22:12:36.973270', 'step': 2680, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.007122', 'step': 2680, 'epoch': 3} {'type': 'loss', 'content': 0.0002033339551417157, 'timestamp': '2025-09-30 22:12:37.011913', 'step': 2681, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:37.051670', 'step': 2681, 'epoch': 3} {'type': 'loss', 'content': 0.0023126639425754547, 'timestamp': '2025-09-30 22:12:37.054087', 'step': 2682, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:37.088537', 'step': 2682, 'epoch': 3} {'type': 'loss', 'content': 0.00015263084787875414, 'timestamp': '2025-09-30 22:12:37.091494', 'step': 2683, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.123744', 'step': 2683, 'epoch': 3} {'type': 'loss', 'content': 0.00012648347183130682, 'timestamp': '2025-09-30 22:12:37.148056', 'step': 2684, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.190144', 'step': 2684, 'epoch': 3} {'type': 'loss', 'content': 0.00020340543414931744, 'timestamp': '2025-09-30 22:12:37.201276', 'step': 2685, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:37.235095', 'step': 2685, 'epoch': 3} {'type': 'loss', 'content': 0.00026868985150940716, 'timestamp': '2025-09-30 22:12:37.237911', 'step': 2686, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.270419', 'step': 2686, 'epoch': 3} {'type': 'loss', 'content': 0.0008745346567593515, 'timestamp': '2025-09-30 22:12:37.276740', 'step': 2687, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:37.312828', 'step': 2687, 'epoch': 3} {'type': 'loss', 'content': 0.0003780484548769891, 'timestamp': '2025-09-30 22:12:37.337304', 'step': 2688, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.369820', 'step': 2688, 'epoch': 3} {'type': 'loss', 'content': 0.00034286329173482955, 'timestamp': '2025-09-30 22:12:37.380102', 'step': 2689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.422461', 'step': 2689, 'epoch': 3} {'type': 'loss', 'content': 0.000469776161480695, 'timestamp': '2025-09-30 22:12:37.427254', 'step': 2690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.474975', 'step': 2690, 'epoch': 3} {'type': 'loss', 'content': 0.0003858681011479348, 'timestamp': '2025-09-30 22:12:37.478664', 'step': 2691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:37.512143', 'step': 2691, 'epoch': 3} {'type': 'loss', 'content': 0.00017460745584685355, 'timestamp': '2025-09-30 22:12:37.536265', 'step': 2692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.568058', 'step': 2692, 'epoch': 3} {'type': 'loss', 'content': 0.00015085657651070505, 'timestamp': '2025-09-30 22:12:37.572465', 'step': 2693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.605977', 'step': 2693, 'epoch': 3} {'type': 'loss', 'content': 0.00038411474088206887, 'timestamp': '2025-09-30 22:12:37.609455', 'step': 2694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.652678', 'step': 2694, 'epoch': 3} {'type': 'loss', 'content': 0.00023197698465082794, 'timestamp': '2025-09-30 22:12:37.662867', 'step': 2695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:37.704594', 'step': 2695, 'epoch': 3} {'type': 'loss', 'content': 0.00028276038938201964, 'timestamp': '2025-09-30 22:12:37.729499', 'step': 2696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.762768', 'step': 2696, 'epoch': 3} {'type': 'loss', 'content': 0.00018464835011400282, 'timestamp': '2025-09-30 22:12:37.765646', 'step': 2697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.803089', 'step': 2697, 'epoch': 3} {'type': 'loss', 'content': 0.0001767083740560338, 'timestamp': '2025-09-30 22:12:37.806072', 'step': 2698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.858964', 'step': 2698, 'epoch': 3} {'type': 'loss', 'content': 0.003367830766364932, 'timestamp': '2025-09-30 22:12:37.863758', 'step': 2699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:37.897678', 'step': 2699, 'epoch': 3} {'type': 'loss', 'content': 0.0005176683189347386, 'timestamp': '2025-09-30 22:12:37.922003', 'step': 2700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.956779', 'step': 2700, 'epoch': 3} {'type': 'loss', 'content': 0.00020833851885981858, 'timestamp': '2025-09-30 22:12:37.959761', 'step': 2701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:37.993816', 'step': 2701, 'epoch': 3} {'type': 'loss', 'content': 0.00023042989778332412, 'timestamp': '2025-09-30 22:12:37.996791', 'step': 2702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:38.029406', 'step': 2702, 'epoch': 3} {'type': 'loss', 'content': 0.00023371285351458937, 'timestamp': '2025-09-30 22:12:38.032643', 'step': 2703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.065029', 'step': 2703, 'epoch': 3} {'type': 'loss', 'content': 0.00014216203999239951, 'timestamp': '2025-09-30 22:12:38.091723', 'step': 2704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.124290', 'step': 2704, 'epoch': 3} {'type': 'loss', 'content': 0.00016069450066424906, 'timestamp': '2025-09-30 22:12:38.127909', 'step': 2705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.167316', 'step': 2705, 'epoch': 3} {'type': 'loss', 'content': 0.00022106101096142083, 'timestamp': '2025-09-30 22:12:38.170411', 'step': 2706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.205407', 'step': 2706, 'epoch': 3} {'type': 'loss', 'content': 0.00013788194337394089, 'timestamp': '2025-09-30 22:12:38.208092', 'step': 2707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.241971', 'step': 2707, 'epoch': 3} {'type': 'loss', 'content': 0.0027592540718615055, 'timestamp': '2025-09-30 22:12:38.269289', 'step': 2708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.306088', 'step': 2708, 'epoch': 3} {'type': 'loss', 'content': 0.00023793955915607512, 'timestamp': '2025-09-30 22:12:38.308756', 'step': 2709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.344310', 'step': 2709, 'epoch': 3} {'type': 'loss', 'content': 0.00043959185131825507, 'timestamp': '2025-09-30 22:12:38.347239', 'step': 2710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:38.385744', 'step': 2710, 'epoch': 3} {'type': 'loss', 'content': 0.00016678121755830944, 'timestamp': '2025-09-30 22:12:38.388621', 'step': 2711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.420558', 'step': 2711, 'epoch': 3} {'type': 'loss', 'content': 0.0001507179404143244, 'timestamp': '2025-09-30 22:12:38.445568', 'step': 2712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.480184', 'step': 2712, 'epoch': 3} {'type': 'loss', 'content': 0.00047512754099443555, 'timestamp': '2025-09-30 22:12:38.483799', 'step': 2713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:38.530889', 'step': 2713, 'epoch': 3} {'type': 'loss', 'content': 0.05275354161858559, 'timestamp': '2025-09-30 22:12:38.541284', 'step': 2714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.576767', 'step': 2714, 'epoch': 3} {'type': 'loss', 'content': 0.03382421284914017, 'timestamp': '2025-09-30 22:12:38.580081', 'step': 2715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.618144', 'step': 2715, 'epoch': 3} {'type': 'loss', 'content': 0.00014904925774317235, 'timestamp': '2025-09-30 22:12:38.642280', 'step': 2716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.682395', 'step': 2716, 'epoch': 3} {'type': 'loss', 'content': 0.002304724184796214, 'timestamp': '2025-09-30 22:12:38.685149', 'step': 2717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:38.724172', 'step': 2717, 'epoch': 3} {'type': 'loss', 'content': 7.812363764969632e-05, 'timestamp': '2025-09-30 22:12:38.726961', 'step': 2718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.759301', 'step': 2718, 'epoch': 3} {'type': 'loss', 'content': 0.010375364683568478, 'timestamp': '2025-09-30 22:12:38.770156', 'step': 2719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.802579', 'step': 2719, 'epoch': 3} {'type': 'loss', 'content': 0.0035843809600919485, 'timestamp': '2025-09-30 22:12:38.827050', 'step': 2720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:38.860385', 'step': 2720, 'epoch': 3} {'type': 'loss', 'content': 0.00012178834003861994, 'timestamp': '2025-09-30 22:12:38.863867', 'step': 2721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.901226', 'step': 2721, 'epoch': 3} {'type': 'loss', 'content': 0.00018442458531353623, 'timestamp': '2025-09-30 22:12:38.904128', 'step': 2722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.938270', 'step': 2722, 'epoch': 3} {'type': 'loss', 'content': 0.0412592850625515, 'timestamp': '2025-09-30 22:12:38.940794', 'step': 2723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:38.972591', 'step': 2723, 'epoch': 3} {'type': 'loss', 'content': 0.0004322502645663917, 'timestamp': '2025-09-30 22:12:38.996875', 'step': 2724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:39.039280', 'step': 2724, 'epoch': 3} {'type': 'loss', 'content': 0.00895337201654911, 'timestamp': '2025-09-30 22:12:39.041770', 'step': 2725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:39.073608', 'step': 2725, 'epoch': 3} {'type': 'loss', 'content': 0.0011367687257006764, 'timestamp': '2025-09-30 22:12:39.081069', 'step': 2726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:39.116189', 'step': 2726, 'epoch': 3} {'type': 'loss', 'content': 0.029959097504615784, 'timestamp': '2025-09-30 22:12:39.121750', 'step': 2727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:39.158945', 'step': 2727, 'epoch': 3} {'type': 'loss', 'content': 0.0005665087956003845, 'timestamp': '2025-09-30 22:12:39.182699', 'step': 2728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:39.217942', 'step': 2728, 'epoch': 3} {'type': 'loss', 'content': 0.021225055679678917, 'timestamp': '2025-09-30 22:12:39.229996', 'step': 2729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:39.273964', 'step': 2729, 'epoch': 3} {'type': 'loss', 'content': 0.004430113825947046, 'timestamp': '2025-09-30 22:12:39.277006', 'step': 2730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:39.314672', 'step': 2730, 'epoch': 3} {'type': 'loss', 'content': 0.0031106334645301104, 'timestamp': '2025-09-30 22:12:39.317959', 'step': 2731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:39.351094', 'step': 2731, 'epoch': 3} {'type': 'loss', 'content': 0.0057603525929152966, 'timestamp': '2025-09-30 22:12:39.375669', 'step': 2732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:39.410439', 'step': 2732, 'epoch': 3} {'type': 'loss', 'content': 0.005964639596641064, 'timestamp': '2025-09-30 22:12:39.413100', 'step': 2733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:39.449550', 'step': 2733, 'epoch': 3} {'type': 'loss', 'content': 0.0006206769612617791, 'timestamp': '2025-09-30 22:12:39.452539', 'step': 2734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:39.489464', 'step': 2734, 'epoch': 3} {'type': 'loss', 'content': 0.0002034508652286604, 'timestamp': '2025-09-30 22:12:39.492379', 'step': 2735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:39.531365', 'step': 2735, 'epoch': 3} {'type': 'loss', 'content': 0.0005070780753158033, 'timestamp': '2025-09-30 22:12:39.557330', 'step': 2736, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:12:40.496739', 'step': 2736, 'epoch': 3} {'type': 'pplx', 'content': 38833778.565379806, 'timestamp': '2025-09-30 22:12:40.501353', 'step': 2736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:40.537420', 'step': 2736, 'epoch': 3} {'type': 'loss', 'content': 0.00042862427653744817, 'timestamp': '2025-09-30 22:12:40.543890', 'step': 2737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:40.587420', 'step': 2737, 'epoch': 3} {'type': 'loss', 'content': 0.0012384362053126097, 'timestamp': '2025-09-30 22:12:40.595845', 'step': 2738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:40.644008', 'step': 2738, 'epoch': 3} {'type': 'loss', 'content': 0.0006894408725202084, 'timestamp': '2025-09-30 22:12:40.651361', 'step': 2739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:40.686317', 'step': 2739, 'epoch': 3} {'type': 'loss', 'content': 0.0016596141504123807, 'timestamp': '2025-09-30 22:12:40.717699', 'step': 2740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:40.752169', 'step': 2740, 'epoch': 3} {'type': 'loss', 'content': 0.0027973626274615526, 'timestamp': '2025-09-30 22:12:40.756868', 'step': 2741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:40.793972', 'step': 2741, 'epoch': 3} {'type': 'loss', 'content': 0.0004706961044576019, 'timestamp': '2025-09-30 22:12:40.802139', 'step': 2742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:40.838051', 'step': 2742, 'epoch': 3} {'type': 'loss', 'content': 0.014765026047825813, 'timestamp': '2025-09-30 22:12:40.840754', 'step': 2743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:40.876213', 'step': 2743, 'epoch': 3} {'type': 'loss', 'content': 0.0023852067533880472, 'timestamp': '2025-09-30 22:12:40.905606', 'step': 2744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:40.957718', 'step': 2744, 'epoch': 3} {'type': 'loss', 'content': 0.001915394444949925, 'timestamp': '2025-09-30 22:12:40.963871', 'step': 2745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:41.000241', 'step': 2745, 'epoch': 3} {'type': 'loss', 'content': 0.048781026154756546, 'timestamp': '2025-09-30 22:12:41.005196', 'step': 2746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:41.036291', 'step': 2746, 'epoch': 3} {'type': 'loss', 'content': 0.00042291748104617, 'timestamp': '2025-09-30 22:12:41.039170', 'step': 2747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:12:41.070797', 'step': 2747, 'epoch': 3} {'type': 'loss', 'content': 0.0029773118440061808, 'timestamp': '2025-09-30 22:12:41.094545', 'step': 2748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:41.135632', 'step': 2748, 'epoch': 3} {'type': 'loss', 'content': 0.010216223075985909, 'timestamp': '2025-09-30 22:12:41.142428', 'step': 2749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:12:41.173671', 'step': 2749, 'epoch': 3} {'type': 'loss', 'content': 0.06497301906347275, 'timestamp': '2025-09-30 22:12:41.176382', 'step': 2750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:12:41.212088', 'step': 2750, 'epoch': 3} {'type': 'loss', 'content': 0.0005495469667948782, 'timestamp': '2025-09-30 22:12:41.215170', 'step': 2751, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-30 22:12:42.223207', 'step': 2751, 'epoch': 3} {'type': 'pplx', 'content': 36628973.8921973, 'timestamp': '2025-09-30 22:12:42.226115', 'step': 2751, 'epoch': 3} {'type': 'best_pplx', 'content': 36628973.8921973, 'timestamp': '2025-09-30 22:12:42.228676', 'step': 2751, 'epoch': 3} {'type': 'best_step', 'content': 2751, 'timestamp': '2025-09-30 22:12:42.230586', 'step': 2751, 'epoch': 3} {'type': 'total_pplx_flops', 'content': 5014951860256000, 'timestamp': '2025-09-30 22:12:42.232343', 'step': 2751, 'epoch': 3} {'type': 'total_train_flops', 'content': 10640863719936576, 'timestamp': '2025-09-30 22:12:42.237411', 'step': 2751, 'epoch': 3}