{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:06.153312', 'step': 0, 'epoch': 0} {'type': 'pplx', 'content': 68335452.534585, 'timestamp': '2025-09-30 22:06:06.159801', 'step': 0, 'epoch': 0} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:06.239586', 'step': 0, 'epoch': 1} {'type': 'loss', 'content': 0.3234884440898895, 'timestamp': '2025-09-30 22:06:06.244678', 'step': 1, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:06.292388', 'step': 1, 'epoch': 1} {'type': 'loss', 'content': 0.35814759135246277, 'timestamp': '2025-09-30 22:06:06.295193', 'step': 2, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:06.346912', 'step': 2, 'epoch': 1} {'type': 'loss', 'content': 0.31901952624320984, 'timestamp': '2025-09-30 22:06:06.351399', 'step': 3, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:06.388643', 'step': 3, 'epoch': 1} {'type': 'loss', 'content': 0.3721208870410919, 'timestamp': '2025-09-30 22:06:06.455471', 'step': 4, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:06.506862', 'step': 4, 'epoch': 1} {'type': 'loss', 'content': 0.10016343742609024, 'timestamp': '2025-09-30 22:06:06.509505', 'step': 5, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:06.548654', 'step': 5, 'epoch': 1} {'type': 'loss', 'content': 0.12158586829900742, 'timestamp': '2025-09-30 22:06:06.562010', 'step': 6, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:06.622986', 'step': 6, 'epoch': 1} {'type': 'loss', 'content': 0.030598539859056473, 'timestamp': '2025-09-30 22:06:06.627321', 'step': 7, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:06.662699', 'step': 7, 'epoch': 1} {'type': 'loss', 'content': 0.004455908201634884, 'timestamp': '2025-09-30 22:06:06.698078', 'step': 8, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:06.743279', 'step': 8, 'epoch': 1} {'type': 'loss', 'content': 0.01093387696892023, 'timestamp': '2025-09-30 22:06:06.748884', 'step': 9, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:06.781613', 'step': 9, 'epoch': 1} {'type': 'loss', 'content': 0.04314374923706055, 'timestamp': '2025-09-30 22:06:06.784665', 'step': 10, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:06.820810', 'step': 10, 'epoch': 1} {'type': 'loss', 'content': 0.02420825883746147, 'timestamp': '2025-09-30 22:06:06.828663', 'step': 11, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:06.862456', 'step': 11, 'epoch': 1} {'type': 'loss', 'content': 0.044014859944581985, 'timestamp': '2025-09-30 22:06:06.887934', 'step': 12, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:06.922528', 'step': 12, 'epoch': 1} {'type': 'loss', 'content': 0.0571591854095459, 'timestamp': '2025-09-30 22:06:06.925638', 'step': 13, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:06.966043', 'step': 13, 'epoch': 1} {'type': 'loss', 'content': 0.006987304426729679, 'timestamp': '2025-09-30 22:06:06.969392', 'step': 14, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:07.008983', 'step': 14, 'epoch': 1} {'type': 'loss', 'content': 0.039628542959690094, 'timestamp': '2025-09-30 22:06:07.013366', 'step': 15, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:07.048795', 'step': 15, 'epoch': 1} {'type': 'loss', 'content': 0.023766744881868362, 'timestamp': '2025-09-30 22:06:07.078880', 'step': 16, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:07.124202', 'step': 16, 'epoch': 1} {'type': 'loss', 'content': 0.020519467070698738, 'timestamp': '2025-09-30 22:06:07.132232', 'step': 17, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:07.197808', 'step': 17, 'epoch': 1} {'type': 'loss', 'content': 0.041614633053541183, 'timestamp': '2025-09-30 22:06:07.201921', 'step': 18, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:07.244302', 'step': 18, 'epoch': 1} {'type': 'loss', 'content': 0.04117533192038536, 'timestamp': '2025-09-30 22:06:07.252496', 'step': 19, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:07.293362', 'step': 19, 'epoch': 1} {'type': 'loss', 'content': 0.045252926647663116, 'timestamp': '2025-09-30 22:06:07.321180', 'step': 20, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:07.353793', 'step': 20, 'epoch': 1} {'type': 'loss', 'content': 0.027239639312028885, 'timestamp': '2025-09-30 22:06:07.362465', 'step': 21, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:07.396025', 'step': 21, 'epoch': 1} {'type': 'loss', 'content': 0.023690115660429, 'timestamp': '2025-09-30 22:06:07.407312', 'step': 22, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:07.448922', 'step': 22, 'epoch': 1} {'type': 'loss', 'content': 0.02684900537133217, 'timestamp': '2025-09-30 22:06:07.460668', 'step': 23, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:07.498815', 'step': 23, 'epoch': 1} {'type': 'loss', 'content': 0.026328429579734802, 'timestamp': '2025-09-30 22:06:07.531164', 'step': 24, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:07.569855', 'step': 24, 'epoch': 1} {'type': 'loss', 'content': 0.02680337429046631, 'timestamp': '2025-09-30 22:06:07.577922', 'step': 25, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:07.619007', 'step': 25, 'epoch': 1} {'type': 'loss', 'content': 0.036179907619953156, 'timestamp': '2025-09-30 22:06:07.628077', 'step': 26, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:07.675201', 'step': 26, 'epoch': 1} {'type': 'loss', 'content': 0.04021405801177025, 'timestamp': '2025-09-30 22:06:07.686454', 'step': 27, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:07.738628', 'step': 27, 'epoch': 1} {'type': 'loss', 'content': 0.03643062338232994, 'timestamp': '2025-09-30 22:06:07.764616', 'step': 28, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:07.808584', 'step': 28, 'epoch': 1} {'type': 'loss', 'content': 0.02700633928179741, 'timestamp': '2025-09-30 22:06:07.813260', 'step': 29, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:07.858983', 'step': 29, 'epoch': 1} {'type': 'loss', 'content': 0.028900552541017532, 'timestamp': '2025-09-30 22:06:07.861506', 'step': 30, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:07.899420', 'step': 30, 'epoch': 1} {'type': 'loss', 'content': 0.03417787328362465, 'timestamp': '2025-09-30 22:06:07.906483', 'step': 31, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:07.945767', 'step': 31, 'epoch': 1} {'type': 'loss', 'content': 0.029415259137749672, 'timestamp': '2025-09-30 22:06:07.970091', 'step': 32, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:08.002736', 'step': 32, 'epoch': 1} {'type': 'loss', 'content': 0.018809909000992775, 'timestamp': '2025-09-30 22:06:08.008770', 'step': 33, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:08.040796', 'step': 33, 'epoch': 1} {'type': 'loss', 'content': 0.02093845047056675, 'timestamp': '2025-09-30 22:06:08.047832', 'step': 34, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:08.085350', 'step': 34, 'epoch': 1} {'type': 'loss', 'content': 0.036695707589387894, 'timestamp': '2025-09-30 22:06:08.090001', 'step': 35, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:08.133354', 'step': 35, 'epoch': 1} {'type': 'loss', 'content': 0.034678272902965546, 'timestamp': '2025-09-30 22:06:08.162471', 'step': 36, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:08.195249', 'step': 36, 'epoch': 1} {'type': 'loss', 'content': 0.025288773700594902, 'timestamp': '2025-09-30 22:06:08.198670', 'step': 37, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:08.238253', 'step': 37, 'epoch': 1} {'type': 'loss', 'content': 0.030010608956217766, 'timestamp': '2025-09-30 22:06:08.248944', 'step': 38, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:08.289167', 'step': 38, 'epoch': 1} {'type': 'loss', 'content': 0.022928962484002113, 'timestamp': '2025-09-30 22:06:08.298947', 'step': 39, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:08.941089', 'step': 39, 'epoch': 1} {'type': 'pplx', 'content': 50567875.175425716, 'timestamp': '2025-09-30 22:06:08.946960', 'step': 39, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:08.984603', 'step': 39, 'epoch': 1} {'type': 'loss', 'content': 0.021819669753313065, 'timestamp': '2025-09-30 22:06:09.009552', 'step': 40, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:09.042289', 'step': 40, 'epoch': 1} {'type': 'loss', 'content': 0.02205788530409336, 'timestamp': '2025-09-30 22:06:09.049659', 'step': 41, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:09.080398', 'step': 41, 'epoch': 1} {'type': 'loss', 'content': 0.02417643368244171, 'timestamp': '2025-09-30 22:06:09.088249', 'step': 42, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:09.124471', 'step': 42, 'epoch': 1} {'type': 'loss', 'content': 0.022384127601981163, 'timestamp': '2025-09-30 22:06:09.132259', 'step': 43, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:09.169630', 'step': 43, 'epoch': 1} {'type': 'loss', 'content': 0.025165708735585213, 'timestamp': '2025-09-30 22:06:09.198593', 'step': 44, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:09.232888', 'step': 44, 'epoch': 1} {'type': 'loss', 'content': 0.020649874582886696, 'timestamp': '2025-09-30 22:06:09.242566', 'step': 45, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:09.278932', 'step': 45, 'epoch': 1} {'type': 'loss', 'content': 0.024506300687789917, 'timestamp': '2025-09-30 22:06:09.285321', 'step': 46, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:09.320739', 'step': 46, 'epoch': 1} {'type': 'loss', 'content': 0.030327823013067245, 'timestamp': '2025-09-30 22:06:09.327781', 'step': 47, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:09.363739', 'step': 47, 'epoch': 1} {'type': 'loss', 'content': 0.026908371597528458, 'timestamp': '2025-09-30 22:06:09.392728', 'step': 48, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:09.435116', 'step': 48, 'epoch': 1} {'type': 'loss', 'content': 0.023281460627913475, 'timestamp': '2025-09-30 22:06:09.441497', 'step': 49, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:09.474274', 'step': 49, 'epoch': 1} {'type': 'loss', 'content': 0.017589669674634933, 'timestamp': '2025-09-30 22:06:09.483069', 'step': 50, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:09.523298', 'step': 50, 'epoch': 1} {'type': 'loss', 'content': 0.026965469121932983, 'timestamp': '2025-09-30 22:06:09.532137', 'step': 51, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:09.571058', 'step': 51, 'epoch': 1} {'type': 'loss', 'content': 0.0181273240596056, 'timestamp': '2025-09-30 22:06:09.599752', 'step': 52, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:09.634906', 'step': 52, 'epoch': 1} {'type': 'loss', 'content': 0.03717336431145668, 'timestamp': '2025-09-30 22:06:09.642417', 'step': 53, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:09.674098', 'step': 53, 'epoch': 1} {'type': 'loss', 'content': 0.034199971705675125, 'timestamp': '2025-09-30 22:06:09.682337', 'step': 54, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:09.718202', 'step': 54, 'epoch': 1} {'type': 'loss', 'content': 0.03449447080492973, 'timestamp': '2025-09-30 22:06:09.725351', 'step': 55, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:09.762014', 'step': 55, 'epoch': 1} {'type': 'loss', 'content': 0.02672051265835762, 'timestamp': '2025-09-30 22:06:09.792245', 'step': 56, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:09.828974', 'step': 56, 'epoch': 1} {'type': 'loss', 'content': 0.02459951676428318, 'timestamp': '2025-09-30 22:06:09.832573', 'step': 57, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:09.869323', 'step': 57, 'epoch': 1} {'type': 'loss', 'content': 0.038649629801511765, 'timestamp': '2025-09-30 22:06:09.877878', 'step': 58, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:09.916637', 'step': 58, 'epoch': 1} {'type': 'loss', 'content': 0.026001831516623497, 'timestamp': '2025-09-30 22:06:09.919367', 'step': 59, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:09.950252', 'step': 59, 'epoch': 1} {'type': 'loss', 'content': 0.027564698830246925, 'timestamp': '2025-09-30 22:06:09.978868', 'step': 60, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:10.023545', 'step': 60, 'epoch': 1} {'type': 'loss', 'content': 0.026089642196893692, 'timestamp': '2025-09-30 22:06:10.026388', 'step': 61, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:10.064725', 'step': 61, 'epoch': 1} {'type': 'loss', 'content': 0.015961112454533577, 'timestamp': '2025-09-30 22:06:10.072737', 'step': 62, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:10.112461', 'step': 62, 'epoch': 1} {'type': 'loss', 'content': 0.02579752914607525, 'timestamp': '2025-09-30 22:06:10.122008', 'step': 63, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:10.156037', 'step': 63, 'epoch': 1} {'type': 'loss', 'content': 0.02118992619216442, 'timestamp': '2025-09-30 22:06:10.181430', 'step': 64, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:10.215209', 'step': 64, 'epoch': 1} {'type': 'loss', 'content': 0.020561648532748222, 'timestamp': '2025-09-30 22:06:10.225671', 'step': 65, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:10.265401', 'step': 65, 'epoch': 1} {'type': 'loss', 'content': 0.024944249540567398, 'timestamp': '2025-09-30 22:06:10.271671', 'step': 66, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:10.311390', 'step': 66, 'epoch': 1} {'type': 'loss', 'content': 0.02434808574616909, 'timestamp': '2025-09-30 22:06:10.315703', 'step': 67, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:10.349470', 'step': 67, 'epoch': 1} {'type': 'loss', 'content': 0.01686147227883339, 'timestamp': '2025-09-30 22:06:10.379368', 'step': 68, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:10.415927', 'step': 68, 'epoch': 1} {'type': 'loss', 'content': 0.02224905416369438, 'timestamp': '2025-09-30 22:06:10.423488', 'step': 69, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:10.459309', 'step': 69, 'epoch': 1} {'type': 'loss', 'content': 0.028607143089175224, 'timestamp': '2025-09-30 22:06:10.469321', 'step': 70, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:10.509787', 'step': 70, 'epoch': 1} {'type': 'loss', 'content': 0.020568443462252617, 'timestamp': '2025-09-30 22:06:10.512584', 'step': 71, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:10.554598', 'step': 71, 'epoch': 1} {'type': 'loss', 'content': 0.022160660475492477, 'timestamp': '2025-09-30 22:06:10.578901', 'step': 72, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:10.617170', 'step': 72, 'epoch': 1} {'type': 'loss', 'content': 0.02690061740577221, 'timestamp': '2025-09-30 22:06:10.620387', 'step': 73, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:10.655114', 'step': 73, 'epoch': 1} {'type': 'loss', 'content': 0.025107603520154953, 'timestamp': '2025-09-30 22:06:10.671213', 'step': 74, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:10.704822', 'step': 74, 'epoch': 1} {'type': 'loss', 'content': 0.026476547122001648, 'timestamp': '2025-09-30 22:06:10.708188', 'step': 75, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:10.741972', 'step': 75, 'epoch': 1} {'type': 'loss', 'content': 0.032140474766492844, 'timestamp': '2025-09-30 22:06:10.767598', 'step': 76, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:10.799849', 'step': 76, 'epoch': 1} {'type': 'loss', 'content': 0.021117273718118668, 'timestamp': '2025-09-30 22:06:10.804924', 'step': 77, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:10.842499', 'step': 77, 'epoch': 1} {'type': 'loss', 'content': 0.023193124681711197, 'timestamp': '2025-09-30 22:06:10.849349', 'step': 78, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:11.470137', 'step': 78, 'epoch': 1} {'type': 'pplx', 'content': 57662058.52283761, 'timestamp': '2025-09-30 22:06:11.476854', 'step': 78, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:11.512578', 'step': 78, 'epoch': 1} {'type': 'loss', 'content': 0.028916534036397934, 'timestamp': '2025-09-30 22:06:11.519955', 'step': 79, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:11.553866', 'step': 79, 'epoch': 1} {'type': 'loss', 'content': 0.023038769140839577, 'timestamp': '2025-09-30 22:06:11.584898', 'step': 80, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:11.626129', 'step': 80, 'epoch': 1} {'type': 'loss', 'content': 0.023145731538534164, 'timestamp': '2025-09-30 22:06:11.630803', 'step': 81, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:11.663277', 'step': 81, 'epoch': 1} {'type': 'loss', 'content': 0.030305249616503716, 'timestamp': '2025-09-30 22:06:11.667909', 'step': 82, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:11.702379', 'step': 82, 'epoch': 1} {'type': 'loss', 'content': 0.02923610992729664, 'timestamp': '2025-09-30 22:06:11.710410', 'step': 83, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:11.750975', 'step': 83, 'epoch': 1} {'type': 'loss', 'content': 0.02699192799627781, 'timestamp': '2025-09-30 22:06:11.775982', 'step': 84, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:11.811393', 'step': 84, 'epoch': 1} {'type': 'loss', 'content': 0.021742191165685654, 'timestamp': '2025-09-30 22:06:11.814504', 'step': 85, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:11.852633', 'step': 85, 'epoch': 1} {'type': 'loss', 'content': 0.023906385526061058, 'timestamp': '2025-09-30 22:06:11.864993', 'step': 86, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:11.908281', 'step': 86, 'epoch': 1} {'type': 'loss', 'content': 0.02628713846206665, 'timestamp': '2025-09-30 22:06:11.917104', 'step': 87, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:11.958177', 'step': 87, 'epoch': 1} {'type': 'loss', 'content': 0.02474205382168293, 'timestamp': '2025-09-30 22:06:11.986085', 'step': 88, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:12.021239', 'step': 88, 'epoch': 1} {'type': 'loss', 'content': 0.024998720735311508, 'timestamp': '2025-09-30 22:06:12.025688', 'step': 89, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:12.056963', 'step': 89, 'epoch': 1} {'type': 'loss', 'content': 0.024945732206106186, 'timestamp': '2025-09-30 22:06:12.064445', 'step': 90, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:12.102084', 'step': 90, 'epoch': 1} {'type': 'loss', 'content': 0.02451312355697155, 'timestamp': '2025-09-30 22:06:12.104799', 'step': 91, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:12.137058', 'step': 91, 'epoch': 1} {'type': 'loss', 'content': 0.024372024461627007, 'timestamp': '2025-09-30 22:06:12.169005', 'step': 92, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:12.205970', 'step': 92, 'epoch': 1} {'type': 'loss', 'content': 0.0259132981300354, 'timestamp': '2025-09-30 22:06:12.215291', 'step': 93, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:12.252919', 'step': 93, 'epoch': 1} {'type': 'loss', 'content': 0.02676175720989704, 'timestamp': '2025-09-30 22:06:12.257787', 'step': 94, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:12.291285', 'step': 94, 'epoch': 1} {'type': 'loss', 'content': 0.02507203258574009, 'timestamp': '2025-09-30 22:06:12.295781', 'step': 95, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:12.333920', 'step': 95, 'epoch': 1} {'type': 'loss', 'content': 0.024420084431767464, 'timestamp': '2025-09-30 22:06:12.362162', 'step': 96, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:12.395610', 'step': 96, 'epoch': 1} {'type': 'loss', 'content': 0.022208968177437782, 'timestamp': '2025-09-30 22:06:12.402115', 'step': 97, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:12.438826', 'step': 97, 'epoch': 1} {'type': 'loss', 'content': 0.024541867896914482, 'timestamp': '2025-09-30 22:06:12.446117', 'step': 98, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:12.484405', 'step': 98, 'epoch': 1} {'type': 'loss', 'content': 0.027195433154702187, 'timestamp': '2025-09-30 22:06:12.492014', 'step': 99, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:12.526469', 'step': 99, 'epoch': 1} {'type': 'loss', 'content': 0.02440270408987999, 'timestamp': '2025-09-30 22:06:12.554906', 'step': 100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:12.594181', 'step': 100, 'epoch': 1} {'type': 'loss', 'content': 0.025244589895009995, 'timestamp': '2025-09-30 22:06:12.598914', 'step': 101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:12.635269', 'step': 101, 'epoch': 1} {'type': 'loss', 'content': 0.02790530025959015, 'timestamp': '2025-09-30 22:06:12.644348', 'step': 102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:12.681147', 'step': 102, 'epoch': 1} {'type': 'loss', 'content': 0.028556600213050842, 'timestamp': '2025-09-30 22:06:12.683834', 'step': 103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:12.720286', 'step': 103, 'epoch': 1} {'type': 'loss', 'content': 0.02198629453778267, 'timestamp': '2025-09-30 22:06:12.745656', 'step': 104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:12.776981', 'step': 104, 'epoch': 1} {'type': 'loss', 'content': 0.029764581471681595, 'timestamp': '2025-09-30 22:06:12.783395', 'step': 105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:12.818699', 'step': 105, 'epoch': 1} {'type': 'loss', 'content': 0.026740163564682007, 'timestamp': '2025-09-30 22:06:12.824581', 'step': 106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:12.859914', 'step': 106, 'epoch': 1} {'type': 'loss', 'content': 0.02745322324335575, 'timestamp': '2025-09-30 22:06:12.864187', 'step': 107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:12.903188', 'step': 107, 'epoch': 1} {'type': 'loss', 'content': 0.025504307821393013, 'timestamp': '2025-09-30 22:06:12.929627', 'step': 108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:12.966009', 'step': 108, 'epoch': 1} {'type': 'loss', 'content': 0.018806248903274536, 'timestamp': '2025-09-30 22:06:12.968948', 'step': 109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:13.003576', 'step': 109, 'epoch': 1} {'type': 'loss', 'content': 0.03055635467171669, 'timestamp': '2025-09-30 22:06:13.007649', 'step': 110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:13.044541', 'step': 110, 'epoch': 1} {'type': 'loss', 'content': 0.026181330904364586, 'timestamp': '2025-09-30 22:06:13.049180', 'step': 111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:13.079905', 'step': 111, 'epoch': 1} {'type': 'loss', 'content': 0.02378338761627674, 'timestamp': '2025-09-30 22:06:13.109191', 'step': 112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:13.141543', 'step': 112, 'epoch': 1} {'type': 'loss', 'content': 0.025588760152459145, 'timestamp': '2025-09-30 22:06:13.149020', 'step': 113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:13.181795', 'step': 113, 'epoch': 1} {'type': 'loss', 'content': 0.0226790402084589, 'timestamp': '2025-09-30 22:06:13.189465', 'step': 114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:13.229374', 'step': 114, 'epoch': 1} {'type': 'loss', 'content': 0.02701934054493904, 'timestamp': '2025-09-30 22:06:13.233135', 'step': 115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:13.265186', 'step': 115, 'epoch': 1} {'type': 'loss', 'content': 0.025871511548757553, 'timestamp': '2025-09-30 22:06:13.292187', 'step': 116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:13.331077', 'step': 116, 'epoch': 1} {'type': 'loss', 'content': 0.01798613741993904, 'timestamp': '2025-09-30 22:06:13.337599', 'step': 117, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:13.985580', 'step': 117, 'epoch': 1} {'type': 'pplx', 'content': 58057663.45508437, 'timestamp': '2025-09-30 22:06:13.988297', 'step': 117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:14.024582', 'step': 117, 'epoch': 1} {'type': 'loss', 'content': 0.023783918470144272, 'timestamp': '2025-09-30 22:06:14.034515', 'step': 118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:14.074642', 'step': 118, 'epoch': 1} {'type': 'loss', 'content': 0.023426802828907967, 'timestamp': '2025-09-30 22:06:14.079544', 'step': 119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:06:14.127430', 'step': 119, 'epoch': 1} {'type': 'loss', 'content': 0.023924678564071655, 'timestamp': '2025-09-30 22:06:14.156372', 'step': 120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:14.191732', 'step': 120, 'epoch': 1} {'type': 'loss', 'content': 0.02505522407591343, 'timestamp': '2025-09-30 22:06:14.202667', 'step': 121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:14.243795', 'step': 121, 'epoch': 1} {'type': 'loss', 'content': 0.023241449147462845, 'timestamp': '2025-09-30 22:06:14.248981', 'step': 122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:14.293503', 'step': 122, 'epoch': 1} {'type': 'loss', 'content': 0.026728898286819458, 'timestamp': '2025-09-30 22:06:14.310532', 'step': 123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:14.349036', 'step': 123, 'epoch': 1} {'type': 'loss', 'content': 0.028911380097270012, 'timestamp': '2025-09-30 22:06:14.383960', 'step': 124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:14.430695', 'step': 124, 'epoch': 1} {'type': 'loss', 'content': 0.023794135078787804, 'timestamp': '2025-09-30 22:06:14.446784', 'step': 125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:14.496583', 'step': 125, 'epoch': 1} {'type': 'loss', 'content': 0.021708523854613304, 'timestamp': '2025-09-30 22:06:14.500936', 'step': 126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:14.538234', 'step': 126, 'epoch': 1} {'type': 'loss', 'content': 0.023822659626603127, 'timestamp': '2025-09-30 22:06:14.546642', 'step': 127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:14.582032', 'step': 127, 'epoch': 1} {'type': 'loss', 'content': 0.024587152525782585, 'timestamp': '2025-09-30 22:06:14.607227', 'step': 128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:14.642587', 'step': 128, 'epoch': 1} {'type': 'loss', 'content': 0.025723149999976158, 'timestamp': '2025-09-30 22:06:14.647641', 'step': 129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:06:14.698697', 'step': 129, 'epoch': 1} {'type': 'loss', 'content': 0.025141611695289612, 'timestamp': '2025-09-30 22:06:14.704698', 'step': 130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:14.739117', 'step': 130, 'epoch': 1} {'type': 'loss', 'content': 0.02382255718111992, 'timestamp': '2025-09-30 22:06:14.743358', 'step': 131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:14.776924', 'step': 131, 'epoch': 1} {'type': 'loss', 'content': 0.02148338034749031, 'timestamp': '2025-09-30 22:06:14.801074', 'step': 132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:14.832460', 'step': 132, 'epoch': 1} {'type': 'loss', 'content': 0.023856794461607933, 'timestamp': '2025-09-30 22:06:14.834499', 'step': 133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:14.879981', 'step': 133, 'epoch': 1} {'type': 'loss', 'content': 0.02617805078625679, 'timestamp': '2025-09-30 22:06:14.886922', 'step': 134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:14.927519', 'step': 134, 'epoch': 1} {'type': 'loss', 'content': 0.023855309933423996, 'timestamp': '2025-09-30 22:06:14.935227', 'step': 135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:14.972812', 'step': 135, 'epoch': 1} {'type': 'loss', 'content': 0.024362707510590553, 'timestamp': '2025-09-30 22:06:14.998356', 'step': 136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:15.032411', 'step': 136, 'epoch': 1} {'type': 'loss', 'content': 0.023809295147657394, 'timestamp': '2025-09-30 22:06:15.034375', 'step': 137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:15.072361', 'step': 137, 'epoch': 1} {'type': 'loss', 'content': 0.02297171764075756, 'timestamp': '2025-09-30 22:06:15.076210', 'step': 138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:15.121812', 'step': 138, 'epoch': 1} {'type': 'loss', 'content': 0.02328100986778736, 'timestamp': '2025-09-30 22:06:15.126486', 'step': 139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:15.158918', 'step': 139, 'epoch': 1} {'type': 'loss', 'content': 0.023281242698431015, 'timestamp': '2025-09-30 22:06:15.191638', 'step': 140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:15.232799', 'step': 140, 'epoch': 1} {'type': 'loss', 'content': 0.023923074826598167, 'timestamp': '2025-09-30 22:06:15.236324', 'step': 141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:15.277095', 'step': 141, 'epoch': 1} {'type': 'loss', 'content': 0.024552708491683006, 'timestamp': '2025-09-30 22:06:15.287847', 'step': 142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:15.341930', 'step': 142, 'epoch': 1} {'type': 'loss', 'content': 0.02170448563992977, 'timestamp': '2025-09-30 22:06:15.359891', 'step': 143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:15.407440', 'step': 143, 'epoch': 1} {'type': 'loss', 'content': 0.025551343336701393, 'timestamp': '2025-09-30 22:06:15.439892', 'step': 144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:15.495337', 'step': 144, 'epoch': 1} {'type': 'loss', 'content': 0.02667604200541973, 'timestamp': '2025-09-30 22:06:15.509839', 'step': 145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:15.557378', 'step': 145, 'epoch': 1} {'type': 'loss', 'content': 0.02111595869064331, 'timestamp': '2025-09-30 22:06:15.561594', 'step': 146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:15.601333', 'step': 146, 'epoch': 1} {'type': 'loss', 'content': 0.02530730329453945, 'timestamp': '2025-09-30 22:06:15.610325', 'step': 147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:15.650918', 'step': 147, 'epoch': 1} {'type': 'loss', 'content': 0.023542964830994606, 'timestamp': '2025-09-30 22:06:15.681910', 'step': 148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:15.739661', 'step': 148, 'epoch': 1} {'type': 'loss', 'content': 0.026816150173544884, 'timestamp': '2025-09-30 22:06:15.748319', 'step': 149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:15.795347', 'step': 149, 'epoch': 1} {'type': 'loss', 'content': 0.023416031152009964, 'timestamp': '2025-09-30 22:06:15.803601', 'step': 150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:15.840449', 'step': 150, 'epoch': 1} {'type': 'loss', 'content': 0.02175022102892399, 'timestamp': '2025-09-30 22:06:15.850178', 'step': 151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:15.890241', 'step': 151, 'epoch': 1} {'type': 'loss', 'content': 0.021428557112812996, 'timestamp': '2025-09-30 22:06:15.918399', 'step': 152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:15.972154', 'step': 152, 'epoch': 1} {'type': 'loss', 'content': 0.02025659568607807, 'timestamp': '2025-09-30 22:06:15.985827', 'step': 153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:16.022412', 'step': 153, 'epoch': 1} {'type': 'loss', 'content': 0.029116351157426834, 'timestamp': '2025-09-30 22:06:16.026546', 'step': 154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:16.064960', 'step': 154, 'epoch': 1} {'type': 'loss', 'content': 0.023566367104649544, 'timestamp': '2025-09-30 22:06:16.069181', 'step': 155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:16.104476', 'step': 155, 'epoch': 1} {'type': 'loss', 'content': 0.024201836436986923, 'timestamp': '2025-09-30 22:06:16.132556', 'step': 156, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:16.805291', 'step': 156, 'epoch': 1} {'type': 'pplx', 'content': 60128195.901556, 'timestamp': '2025-09-30 22:06:16.810561', 'step': 156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:16.842713', 'step': 156, 'epoch': 1} {'type': 'loss', 'content': 0.023354709148406982, 'timestamp': '2025-09-30 22:06:16.845277', 'step': 157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:16.879380', 'step': 157, 'epoch': 1} {'type': 'loss', 'content': 0.023321080952882767, 'timestamp': '2025-09-30 22:06:16.884106', 'step': 158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:16.918371', 'step': 158, 'epoch': 1} {'type': 'loss', 'content': 0.0184608343988657, 'timestamp': '2025-09-30 22:06:16.921450', 'step': 159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:16.963578', 'step': 159, 'epoch': 1} {'type': 'loss', 'content': 0.028638625517487526, 'timestamp': '2025-09-30 22:06:16.991655', 'step': 160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:17.025929', 'step': 160, 'epoch': 1} {'type': 'loss', 'content': 0.026660356670618057, 'timestamp': '2025-09-30 22:06:17.027849', 'step': 161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:17.063893', 'step': 161, 'epoch': 1} {'type': 'loss', 'content': 0.020861739292740822, 'timestamp': '2025-09-30 22:06:17.065804', 'step': 162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:17.099463', 'step': 162, 'epoch': 1} {'type': 'loss', 'content': 0.03286191076040268, 'timestamp': '2025-09-30 22:06:17.107264', 'step': 163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:17.155934', 'step': 163, 'epoch': 1} {'type': 'loss', 'content': 0.026328887790441513, 'timestamp': '2025-09-30 22:06:17.181533', 'step': 164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:17.225929', 'step': 164, 'epoch': 1} {'type': 'loss', 'content': 0.020726049318909645, 'timestamp': '2025-09-30 22:06:17.228526', 'step': 165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:17.263039', 'step': 165, 'epoch': 1} {'type': 'loss', 'content': 0.028658464550971985, 'timestamp': '2025-09-30 22:06:17.267317', 'step': 166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:17.303745', 'step': 166, 'epoch': 1} {'type': 'loss', 'content': 0.02197512611746788, 'timestamp': '2025-09-30 22:06:17.308076', 'step': 167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:17.341695', 'step': 167, 'epoch': 1} {'type': 'loss', 'content': 0.02463226579129696, 'timestamp': '2025-09-30 22:06:17.366485', 'step': 168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:17.398774', 'step': 168, 'epoch': 1} {'type': 'loss', 'content': 0.023855552077293396, 'timestamp': '2025-09-30 22:06:17.403932', 'step': 169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:17.438146', 'step': 169, 'epoch': 1} {'type': 'loss', 'content': 0.02280578203499317, 'timestamp': '2025-09-30 22:06:17.442377', 'step': 170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:17.479740', 'step': 170, 'epoch': 1} {'type': 'loss', 'content': 0.0270631592720747, 'timestamp': '2025-09-30 22:06:17.482061', 'step': 171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:17.519029', 'step': 171, 'epoch': 1} {'type': 'loss', 'content': 0.024634188041090965, 'timestamp': '2025-09-30 22:06:17.547343', 'step': 172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:17.580035', 'step': 172, 'epoch': 1} {'type': 'loss', 'content': 0.028481189161539078, 'timestamp': '2025-09-30 22:06:17.582263', 'step': 173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:17.619799', 'step': 173, 'epoch': 1} {'type': 'loss', 'content': 0.024899035692214966, 'timestamp': '2025-09-30 22:06:17.621892', 'step': 174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:17.656923', 'step': 174, 'epoch': 1} {'type': 'loss', 'content': 0.01988835260272026, 'timestamp': '2025-09-30 22:06:17.661545', 'step': 175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:17.694918', 'step': 175, 'epoch': 1} {'type': 'loss', 'content': 0.02222885750234127, 'timestamp': '2025-09-30 22:06:17.718625', 'step': 176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:17.751206', 'step': 176, 'epoch': 1} {'type': 'loss', 'content': 0.026466725394129753, 'timestamp': '2025-09-30 22:06:17.753279', 'step': 177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:17.785153', 'step': 177, 'epoch': 1} {'type': 'loss', 'content': 0.018711643293499947, 'timestamp': '2025-09-30 22:06:17.787833', 'step': 178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:17.823812', 'step': 178, 'epoch': 1} {'type': 'loss', 'content': 0.022940317168831825, 'timestamp': '2025-09-30 22:06:17.828208', 'step': 179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:17.862094', 'step': 179, 'epoch': 1} {'type': 'loss', 'content': 0.02327939309179783, 'timestamp': '2025-09-30 22:06:17.885892', 'step': 180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:17.918900', 'step': 180, 'epoch': 1} {'type': 'loss', 'content': 0.02618366852402687, 'timestamp': '2025-09-30 22:06:17.920795', 'step': 181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:17.953019', 'step': 181, 'epoch': 1} {'type': 'loss', 'content': 0.026767287403345108, 'timestamp': '2025-09-30 22:06:17.960637', 'step': 182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:17.995007', 'step': 182, 'epoch': 1} {'type': 'loss', 'content': 0.02279074303805828, 'timestamp': '2025-09-30 22:06:17.997057', 'step': 183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:18.034141', 'step': 183, 'epoch': 1} {'type': 'loss', 'content': 0.023376479744911194, 'timestamp': '2025-09-30 22:06:18.058008', 'step': 184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:18.089115', 'step': 184, 'epoch': 1} {'type': 'loss', 'content': 0.023480886593461037, 'timestamp': '2025-09-30 22:06:18.091595', 'step': 185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:18.123826', 'step': 185, 'epoch': 1} {'type': 'loss', 'content': 0.024746397510170937, 'timestamp': '2025-09-30 22:06:18.128096', 'step': 186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:18.164361', 'step': 186, 'epoch': 1} {'type': 'loss', 'content': 0.026330998167395592, 'timestamp': '2025-09-30 22:06:18.166292', 'step': 187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:18.198316', 'step': 187, 'epoch': 1} {'type': 'loss', 'content': 0.024767503142356873, 'timestamp': '2025-09-30 22:06:18.226573', 'step': 188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:18.261000', 'step': 188, 'epoch': 1} {'type': 'loss', 'content': 0.0204471405595541, 'timestamp': '2025-09-30 22:06:18.263280', 'step': 189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:18.294801', 'step': 189, 'epoch': 1} {'type': 'loss', 'content': 0.022645656019449234, 'timestamp': '2025-09-30 22:06:18.299066', 'step': 190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:18.331507', 'step': 190, 'epoch': 1} {'type': 'loss', 'content': 0.026833781972527504, 'timestamp': '2025-09-30 22:06:18.333483', 'step': 191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:18.367209', 'step': 191, 'epoch': 1} {'type': 'loss', 'content': 0.023763876408338547, 'timestamp': '2025-09-30 22:06:18.390938', 'step': 192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:18.421801', 'step': 192, 'epoch': 1} {'type': 'loss', 'content': 0.02496214583516121, 'timestamp': '2025-09-30 22:06:18.423787', 'step': 193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:18.456539', 'step': 193, 'epoch': 1} {'type': 'loss', 'content': 0.022200098261237144, 'timestamp': '2025-09-30 22:06:18.460592', 'step': 194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:18.491440', 'step': 194, 'epoch': 1} {'type': 'loss', 'content': 0.0215250663459301, 'timestamp': '2025-09-30 22:06:18.494339', 'step': 195, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:19.146650', 'step': 195, 'epoch': 1} {'type': 'pplx', 'content': 61265013.73487314, 'timestamp': '2025-09-30 22:06:19.149720', 'step': 195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:19.182883', 'step': 195, 'epoch': 1} {'type': 'loss', 'content': 0.023729149252176285, 'timestamp': '2025-09-30 22:06:19.208362', 'step': 196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:19.248428', 'step': 196, 'epoch': 1} {'type': 'loss', 'content': 0.021181587129831314, 'timestamp': '2025-09-30 22:06:19.250925', 'step': 197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:19.283508', 'step': 197, 'epoch': 1} {'type': 'loss', 'content': 0.022421227768063545, 'timestamp': '2025-09-30 22:06:19.286159', 'step': 198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:19.318784', 'step': 198, 'epoch': 1} {'type': 'loss', 'content': 0.02127877064049244, 'timestamp': '2025-09-30 22:06:19.326066', 'step': 199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:19.360335', 'step': 199, 'epoch': 1} {'type': 'loss', 'content': 0.02680543251335621, 'timestamp': '2025-09-30 22:06:19.388112', 'step': 200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:19.424365', 'step': 200, 'epoch': 1} {'type': 'loss', 'content': 0.021091178059577942, 'timestamp': '2025-09-30 22:06:19.438368', 'step': 201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:19.481803', 'step': 201, 'epoch': 1} {'type': 'loss', 'content': 0.024247558787465096, 'timestamp': '2025-09-30 22:06:19.486717', 'step': 202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:19.520237', 'step': 202, 'epoch': 1} {'type': 'loss', 'content': 0.021932261064648628, 'timestamp': '2025-09-30 22:06:19.533549', 'step': 203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:19.579316', 'step': 203, 'epoch': 1} {'type': 'loss', 'content': 0.025690482929348946, 'timestamp': '2025-09-30 22:06:19.604139', 'step': 204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:19.652713', 'step': 204, 'epoch': 1} {'type': 'loss', 'content': 0.028097007423639297, 'timestamp': '2025-09-30 22:06:19.657320', 'step': 205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:19.694961', 'step': 205, 'epoch': 1} {'type': 'loss', 'content': 0.023479444906115532, 'timestamp': '2025-09-30 22:06:19.709008', 'step': 206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:19.768686', 'step': 206, 'epoch': 1} {'type': 'loss', 'content': 0.018325267359614372, 'timestamp': '2025-09-30 22:06:19.776601', 'step': 207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:19.815261', 'step': 207, 'epoch': 1} {'type': 'loss', 'content': 0.024427268654108047, 'timestamp': '2025-09-30 22:06:19.843859', 'step': 208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:19.882618', 'step': 208, 'epoch': 1} {'type': 'loss', 'content': 0.02468344010412693, 'timestamp': '2025-09-30 22:06:19.886118', 'step': 209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:19.925200', 'step': 209, 'epoch': 1} {'type': 'loss', 'content': 0.028634795919060707, 'timestamp': '2025-09-30 22:06:19.927329', 'step': 210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:19.968803', 'step': 210, 'epoch': 1} {'type': 'loss', 'content': 0.026183156296610832, 'timestamp': '2025-09-30 22:06:19.975598', 'step': 211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:20.010821', 'step': 211, 'epoch': 1} {'type': 'loss', 'content': 0.01938895508646965, 'timestamp': '2025-09-30 22:06:20.035584', 'step': 212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:20.074552', 'step': 212, 'epoch': 1} {'type': 'loss', 'content': 0.023146267980337143, 'timestamp': '2025-09-30 22:06:20.079433', 'step': 213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:20.112483', 'step': 213, 'epoch': 1} {'type': 'loss', 'content': 0.030951758846640587, 'timestamp': '2025-09-30 22:06:20.119834', 'step': 214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:20.151526', 'step': 214, 'epoch': 1} {'type': 'loss', 'content': 0.017646988853812218, 'timestamp': '2025-09-30 22:06:20.156564', 'step': 215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:20.191678', 'step': 215, 'epoch': 1} {'type': 'loss', 'content': 0.020599817857146263, 'timestamp': '2025-09-30 22:06:20.221417', 'step': 216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:20.256856', 'step': 216, 'epoch': 1} {'type': 'loss', 'content': 0.02311067096889019, 'timestamp': '2025-09-30 22:06:20.265333', 'step': 217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:20.301971', 'step': 217, 'epoch': 1} {'type': 'loss', 'content': 0.018607063218951225, 'timestamp': '2025-09-30 22:06:20.310012', 'step': 218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:20.350535', 'step': 218, 'epoch': 1} {'type': 'loss', 'content': 0.019698362797498703, 'timestamp': '2025-09-30 22:06:20.358037', 'step': 219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:20.402403', 'step': 219, 'epoch': 1} {'type': 'loss', 'content': 0.02162352204322815, 'timestamp': '2025-09-30 22:06:20.431291', 'step': 220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:20.474981', 'step': 220, 'epoch': 1} {'type': 'loss', 'content': 0.02282586693763733, 'timestamp': '2025-09-30 22:06:20.482606', 'step': 221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:06:20.517739', 'step': 221, 'epoch': 1} {'type': 'loss', 'content': 0.02331475540995598, 'timestamp': '2025-09-30 22:06:20.528010', 'step': 222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:20.561612', 'step': 222, 'epoch': 1} {'type': 'loss', 'content': 0.022115275263786316, 'timestamp': '2025-09-30 22:06:20.569022', 'step': 223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:20.605957', 'step': 223, 'epoch': 1} {'type': 'loss', 'content': 0.023793192580342293, 'timestamp': '2025-09-30 22:06:20.634937', 'step': 224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:20.666302', 'step': 224, 'epoch': 1} {'type': 'loss', 'content': 0.025844072923064232, 'timestamp': '2025-09-30 22:06:20.673036', 'step': 225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:20.710131', 'step': 225, 'epoch': 1} {'type': 'loss', 'content': 0.03035455010831356, 'timestamp': '2025-09-30 22:06:20.718027', 'step': 226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:20.753996', 'step': 226, 'epoch': 1} {'type': 'loss', 'content': 0.023812538012862206, 'timestamp': '2025-09-30 22:06:20.759884', 'step': 227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:20.793425', 'step': 227, 'epoch': 1} {'type': 'loss', 'content': 0.023976163938641548, 'timestamp': '2025-09-30 22:06:20.819722', 'step': 228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:20.860481', 'step': 228, 'epoch': 1} {'type': 'loss', 'content': 0.015135176479816437, 'timestamp': '2025-09-30 22:06:20.875475', 'step': 229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:20.918279', 'step': 229, 'epoch': 1} {'type': 'loss', 'content': 0.020723579451441765, 'timestamp': '2025-09-30 22:06:20.921030', 'step': 230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:20.954749', 'step': 230, 'epoch': 1} {'type': 'loss', 'content': 0.020386500284075737, 'timestamp': '2025-09-30 22:06:20.962096', 'step': 231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:21.000099', 'step': 231, 'epoch': 1} {'type': 'loss', 'content': 0.01781288906931877, 'timestamp': '2025-09-30 22:06:21.029073', 'step': 232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:21.072306', 'step': 232, 'epoch': 1} {'type': 'loss', 'content': 0.01598362810909748, 'timestamp': '2025-09-30 22:06:21.083441', 'step': 233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:21.115580', 'step': 233, 'epoch': 1} {'type': 'loss', 'content': 0.027275921776890755, 'timestamp': '2025-09-30 22:06:21.122328', 'step': 234, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:21.782110', 'step': 234, 'epoch': 1} {'type': 'pplx', 'content': 64263270.81331823, 'timestamp': '2025-09-30 22:06:21.790075', 'step': 234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:21.827322', 'step': 234, 'epoch': 1} {'type': 'loss', 'content': 0.025542790070176125, 'timestamp': '2025-09-30 22:06:21.829775', 'step': 235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:21.870541', 'step': 235, 'epoch': 1} {'type': 'loss', 'content': 0.017988238483667374, 'timestamp': '2025-09-30 22:06:21.899521', 'step': 236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:21.942330', 'step': 236, 'epoch': 1} {'type': 'loss', 'content': 0.03205994889140129, 'timestamp': '2025-09-30 22:06:21.948012', 'step': 237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:21.986669', 'step': 237, 'epoch': 1} {'type': 'loss', 'content': 0.01907184161245823, 'timestamp': '2025-09-30 22:06:21.991617', 'step': 238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:22.025386', 'step': 238, 'epoch': 1} {'type': 'loss', 'content': 0.017065171152353287, 'timestamp': '2025-09-30 22:06:22.028018', 'step': 239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:22.064478', 'step': 239, 'epoch': 1} {'type': 'loss', 'content': 0.023361805826425552, 'timestamp': '2025-09-30 22:06:22.090095', 'step': 240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:22.125961', 'step': 240, 'epoch': 1} {'type': 'loss', 'content': 0.025280656293034554, 'timestamp': '2025-09-30 22:06:22.134904', 'step': 241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:22.171375', 'step': 241, 'epoch': 1} {'type': 'loss', 'content': 0.015582531690597534, 'timestamp': '2025-09-30 22:06:22.179252', 'step': 242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:22.215546', 'step': 242, 'epoch': 1} {'type': 'loss', 'content': 0.021414535120129585, 'timestamp': '2025-09-30 22:06:22.222422', 'step': 243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:22.258698', 'step': 243, 'epoch': 1} {'type': 'loss', 'content': 0.023022836074233055, 'timestamp': '2025-09-30 22:06:22.287682', 'step': 244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:22.320455', 'step': 244, 'epoch': 1} {'type': 'loss', 'content': 0.018914300948381424, 'timestamp': '2025-09-30 22:06:22.325263', 'step': 245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:22.359000', 'step': 245, 'epoch': 1} {'type': 'loss', 'content': 0.027963733300566673, 'timestamp': '2025-09-30 22:06:22.366766', 'step': 246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:22.404691', 'step': 246, 'epoch': 1} {'type': 'loss', 'content': 0.01878245547413826, 'timestamp': '2025-09-30 22:06:22.409119', 'step': 247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:22.456732', 'step': 247, 'epoch': 1} {'type': 'loss', 'content': 0.023027878254652023, 'timestamp': '2025-09-30 22:06:22.486436', 'step': 248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:22.524432', 'step': 248, 'epoch': 1} {'type': 'loss', 'content': 0.019469697028398514, 'timestamp': '2025-09-30 22:06:22.531177', 'step': 249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:22.578306', 'step': 249, 'epoch': 1} {'type': 'loss', 'content': 0.018262622877955437, 'timestamp': '2025-09-30 22:06:22.581410', 'step': 250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:22.615104', 'step': 250, 'epoch': 1} {'type': 'loss', 'content': 0.0221934225410223, 'timestamp': '2025-09-30 22:06:22.622786', 'step': 251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:22.656684', 'step': 251, 'epoch': 1} {'type': 'loss', 'content': 0.01658940128982067, 'timestamp': '2025-09-30 22:06:22.687329', 'step': 252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:22.729072', 'step': 252, 'epoch': 1} {'type': 'loss', 'content': 0.01807001605629921, 'timestamp': '2025-09-30 22:06:22.736230', 'step': 253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:22.774577', 'step': 253, 'epoch': 1} {'type': 'loss', 'content': 0.021754702553153038, 'timestamp': '2025-09-30 22:06:22.782602', 'step': 254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:22.820114', 'step': 254, 'epoch': 1} {'type': 'loss', 'content': 0.026683764532208443, 'timestamp': '2025-09-30 22:06:22.827270', 'step': 255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:22.865345', 'step': 255, 'epoch': 1} {'type': 'loss', 'content': 0.013391956686973572, 'timestamp': '2025-09-30 22:06:22.896128', 'step': 256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:22.936522', 'step': 256, 'epoch': 1} {'type': 'loss', 'content': 0.017003720626235008, 'timestamp': '2025-09-30 22:06:22.945242', 'step': 257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:22.986133', 'step': 257, 'epoch': 1} {'type': 'loss', 'content': 0.00964332651346922, 'timestamp': '2025-09-30 22:06:22.996079', 'step': 258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:23.032923', 'step': 258, 'epoch': 1} {'type': 'loss', 'content': 0.009957768023014069, 'timestamp': '2025-09-30 22:06:23.042631', 'step': 259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:23.079157', 'step': 259, 'epoch': 1} {'type': 'loss', 'content': 0.017908958718180656, 'timestamp': '2025-09-30 22:06:23.110604', 'step': 260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:23.146484', 'step': 260, 'epoch': 1} {'type': 'loss', 'content': 0.018779877573251724, 'timestamp': '2025-09-30 22:06:23.157800', 'step': 261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:23.197483', 'step': 261, 'epoch': 1} {'type': 'loss', 'content': 0.013146973215043545, 'timestamp': '2025-09-30 22:06:23.208174', 'step': 262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:23.243232', 'step': 262, 'epoch': 1} {'type': 'loss', 'content': 0.012784031219780445, 'timestamp': '2025-09-30 22:06:23.254286', 'step': 263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:23.294930', 'step': 263, 'epoch': 1} {'type': 'loss', 'content': 0.026621146127581596, 'timestamp': '2025-09-30 22:06:23.322677', 'step': 264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:23.355055', 'step': 264, 'epoch': 1} {'type': 'loss', 'content': 0.015577270649373531, 'timestamp': '2025-09-30 22:06:23.363287', 'step': 265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:23.395307', 'step': 265, 'epoch': 1} {'type': 'loss', 'content': 0.023295968770980835, 'timestamp': '2025-09-30 22:06:23.403246', 'step': 266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:23.440336', 'step': 266, 'epoch': 1} {'type': 'loss', 'content': 0.004181277472525835, 'timestamp': '2025-09-30 22:06:23.450464', 'step': 267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:23.492255', 'step': 267, 'epoch': 1} {'type': 'loss', 'content': 0.02283737063407898, 'timestamp': '2025-09-30 22:06:23.520986', 'step': 268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:23.559469', 'step': 268, 'epoch': 1} {'type': 'loss', 'content': 0.027866655960679054, 'timestamp': '2025-09-30 22:06:23.565152', 'step': 269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:23.603471', 'step': 269, 'epoch': 1} {'type': 'loss', 'content': 0.017093496397137642, 'timestamp': '2025-09-30 22:06:23.610790', 'step': 270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:23.651049', 'step': 270, 'epoch': 1} {'type': 'loss', 'content': 0.024166589602828026, 'timestamp': '2025-09-30 22:06:23.659249', 'step': 271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:23.696804', 'step': 271, 'epoch': 1} {'type': 'loss', 'content': 0.026155179366469383, 'timestamp': '2025-09-30 22:06:23.727858', 'step': 272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:23.767971', 'step': 272, 'epoch': 1} {'type': 'loss', 'content': 0.024930324405431747, 'timestamp': '2025-09-30 22:06:23.773031', 'step': 273, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:24.396835', 'step': 273, 'epoch': 1} {'type': 'pplx', 'content': 67738689.07516396, 'timestamp': '2025-09-30 22:06:24.406333', 'step': 273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:24.441918', 'step': 273, 'epoch': 1} {'type': 'loss', 'content': 0.010441062971949577, 'timestamp': '2025-09-30 22:06:24.463419', 'step': 274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:24.496375', 'step': 274, 'epoch': 1} {'type': 'loss', 'content': 0.023288477212190628, 'timestamp': '2025-09-30 22:06:24.503290', 'step': 275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:06:24.541145', 'step': 275, 'epoch': 1} {'type': 'loss', 'content': 0.01126602478325367, 'timestamp': '2025-09-30 22:06:24.573099', 'step': 276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:24.614461', 'step': 276, 'epoch': 1} {'type': 'loss', 'content': 0.019958725199103355, 'timestamp': '2025-09-30 22:06:24.617351', 'step': 277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:24.652885', 'step': 277, 'epoch': 1} {'type': 'loss', 'content': 0.013975156471133232, 'timestamp': '2025-09-30 22:06:24.665078', 'step': 278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:24.709700', 'step': 278, 'epoch': 1} {'type': 'loss', 'content': 0.030132178217172623, 'timestamp': '2025-09-30 22:06:24.718532', 'step': 279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:24.757773', 'step': 279, 'epoch': 1} {'type': 'loss', 'content': 0.012328540906310081, 'timestamp': '2025-09-30 22:06:24.782673', 'step': 280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:24.825528', 'step': 280, 'epoch': 1} {'type': 'loss', 'content': 0.027462830767035484, 'timestamp': '2025-09-30 22:06:24.835370', 'step': 281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:24.876083', 'step': 281, 'epoch': 1} {'type': 'loss', 'content': 0.01589822955429554, 'timestamp': '2025-09-30 22:06:24.885679', 'step': 282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:24.923440', 'step': 282, 'epoch': 1} {'type': 'loss', 'content': 0.030031440779566765, 'timestamp': '2025-09-30 22:06:24.931219', 'step': 283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:06:24.982462', 'step': 283, 'epoch': 1} {'type': 'loss', 'content': 0.025346266105771065, 'timestamp': '2025-09-30 22:06:25.010712', 'step': 284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:25.050615', 'step': 284, 'epoch': 1} {'type': 'loss', 'content': 0.02812729962170124, 'timestamp': '2025-09-30 22:06:25.060034', 'step': 285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:25.096299', 'step': 285, 'epoch': 1} {'type': 'loss', 'content': 0.0242477897554636, 'timestamp': '2025-09-30 22:06:25.104222', 'step': 286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:25.142717', 'step': 286, 'epoch': 1} {'type': 'loss', 'content': 0.018631068989634514, 'timestamp': '2025-09-30 22:06:25.149525', 'step': 287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:25.192273', 'step': 287, 'epoch': 1} {'type': 'loss', 'content': 0.009550183080136776, 'timestamp': '2025-09-30 22:06:25.217564', 'step': 288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:25.249899', 'step': 288, 'epoch': 1} {'type': 'loss', 'content': 0.003718113526701927, 'timestamp': '2025-09-30 22:06:25.259578', 'step': 289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:25.303569', 'step': 289, 'epoch': 1} {'type': 'loss', 'content': 0.020078999921679497, 'timestamp': '2025-09-30 22:06:25.311292', 'step': 290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:25.350162', 'step': 290, 'epoch': 1} {'type': 'loss', 'content': 0.022243574261665344, 'timestamp': '2025-09-30 22:06:25.359780', 'step': 291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:25.400489', 'step': 291, 'epoch': 1} {'type': 'loss', 'content': 0.014406189322471619, 'timestamp': '2025-09-30 22:06:25.430318', 'step': 292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:25.468538', 'step': 292, 'epoch': 1} {'type': 'loss', 'content': 0.0135045712813735, 'timestamp': '2025-09-30 22:06:25.471073', 'step': 293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:25.503612', 'step': 293, 'epoch': 1} {'type': 'loss', 'content': 0.009503084234893322, 'timestamp': '2025-09-30 22:06:25.512131', 'step': 294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:25.548039', 'step': 294, 'epoch': 1} {'type': 'loss', 'content': 0.010631188750267029, 'timestamp': '2025-09-30 22:06:25.555157', 'step': 295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:25.595185', 'step': 295, 'epoch': 1} {'type': 'loss', 'content': 0.025840530171990395, 'timestamp': '2025-09-30 22:06:25.620041', 'step': 296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:25.652551', 'step': 296, 'epoch': 1} {'type': 'loss', 'content': 0.005571034271270037, 'timestamp': '2025-09-30 22:06:25.664110', 'step': 297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:25.703265', 'step': 297, 'epoch': 1} {'type': 'loss', 'content': 0.03459536284208298, 'timestamp': '2025-09-30 22:06:25.710008', 'step': 298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:25.746808', 'step': 298, 'epoch': 1} {'type': 'loss', 'content': 0.013033518567681313, 'timestamp': '2025-09-30 22:06:25.755910', 'step': 299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:25.794091', 'step': 299, 'epoch': 1} {'type': 'loss', 'content': 0.025730684399604797, 'timestamp': '2025-09-30 22:06:25.822479', 'step': 300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:25.856938', 'step': 300, 'epoch': 1} {'type': 'loss', 'content': 0.015336094424128532, 'timestamp': '2025-09-30 22:06:25.868770', 'step': 301, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:25.909828', 'step': 301, 'epoch': 1} {'type': 'loss', 'content': 0.047021035104990005, 'timestamp': '2025-09-30 22:06:25.914186', 'step': 302, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:25.952032', 'step': 302, 'epoch': 1} {'type': 'loss', 'content': 0.004503812175244093, 'timestamp': '2025-09-30 22:06:25.956650', 'step': 303, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:25.993051', 'step': 303, 'epoch': 1} {'type': 'loss', 'content': 0.005192614626139402, 'timestamp': '2025-09-30 22:06:26.022878', 'step': 304, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:26.053400', 'step': 304, 'epoch': 1} {'type': 'loss', 'content': 0.03500426560640335, 'timestamp': '2025-09-30 22:06:26.061522', 'step': 305, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:26.105787', 'step': 305, 'epoch': 1} {'type': 'loss', 'content': 0.011063890531659126, 'timestamp': '2025-09-30 22:06:26.114338', 'step': 306, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:26.152122', 'step': 306, 'epoch': 1} {'type': 'loss', 'content': 0.0441848523914814, 'timestamp': '2025-09-30 22:06:26.159235', 'step': 307, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:26.194940', 'step': 307, 'epoch': 1} {'type': 'loss', 'content': 0.013916602358222008, 'timestamp': '2025-09-30 22:06:26.223115', 'step': 308, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:26.255058', 'step': 308, 'epoch': 1} {'type': 'loss', 'content': 0.010168136097490788, 'timestamp': '2025-09-30 22:06:26.260774', 'step': 309, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:26.294159', 'step': 309, 'epoch': 1} {'type': 'loss', 'content': 0.004245879594236612, 'timestamp': '2025-09-30 22:06:26.303522', 'step': 310, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:26.338332', 'step': 310, 'epoch': 1} {'type': 'loss', 'content': 0.007543101906776428, 'timestamp': '2025-09-30 22:06:26.345524', 'step': 311, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:26.385370', 'step': 311, 'epoch': 1} {'type': 'loss', 'content': 0.0123042743653059, 'timestamp': '2025-09-30 22:06:26.412954', 'step': 312, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:27.028571', 'step': 312, 'epoch': 1} {'type': 'pplx', 'content': 80127076.99242038, 'timestamp': '2025-09-30 22:06:27.043101', 'step': 312, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:27.087289', 'step': 312, 'epoch': 1} {'type': 'loss', 'content': 0.025003399699926376, 'timestamp': '2025-09-30 22:06:27.100523', 'step': 313, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:27.142075', 'step': 313, 'epoch': 1} {'type': 'loss', 'content': 0.03449965640902519, 'timestamp': '2025-09-30 22:06:27.150373', 'step': 314, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:27.186771', 'step': 314, 'epoch': 1} {'type': 'loss', 'content': 0.015554225072264671, 'timestamp': '2025-09-30 22:06:27.198131', 'step': 315, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:27.235703', 'step': 315, 'epoch': 1} {'type': 'loss', 'content': 0.002846226328983903, 'timestamp': '2025-09-30 22:06:27.267432', 'step': 316, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:27.303449', 'step': 316, 'epoch': 1} {'type': 'loss', 'content': 0.0057856314815580845, 'timestamp': '2025-09-30 22:06:27.315891', 'step': 317, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:27.347591', 'step': 317, 'epoch': 1} {'type': 'loss', 'content': 0.006301513407379389, 'timestamp': '2025-09-30 22:06:27.356087', 'step': 318, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:27.392323', 'step': 318, 'epoch': 1} {'type': 'loss', 'content': 0.027794986963272095, 'timestamp': '2025-09-30 22:06:27.395540', 'step': 319, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:27.427051', 'step': 319, 'epoch': 1} {'type': 'loss', 'content': 0.03337537869811058, 'timestamp': '2025-09-30 22:06:27.452922', 'step': 320, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:27.489746', 'step': 320, 'epoch': 1} {'type': 'loss', 'content': 0.0039304569363594055, 'timestamp': '2025-09-30 22:06:27.494950', 'step': 321, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:27.531296', 'step': 321, 'epoch': 1} {'type': 'loss', 'content': 0.009028865024447441, 'timestamp': '2025-09-30 22:06:27.539041', 'step': 322, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:27.572205', 'step': 322, 'epoch': 1} {'type': 'loss', 'content': 0.004246709402650595, 'timestamp': '2025-09-30 22:06:27.576856', 'step': 323, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:27.609544', 'step': 323, 'epoch': 1} {'type': 'loss', 'content': 0.00866038165986538, 'timestamp': '2025-09-30 22:06:27.634664', 'step': 324, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:27.667286', 'step': 324, 'epoch': 1} {'type': 'loss', 'content': 0.0015846890164539218, 'timestamp': '2025-09-30 22:06:27.675799', 'step': 325, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:27.713112', 'step': 325, 'epoch': 1} {'type': 'loss', 'content': 0.03486005961894989, 'timestamp': '2025-09-30 22:06:27.720983', 'step': 326, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:27.756875', 'step': 326, 'epoch': 1} {'type': 'loss', 'content': 0.0034004233311861753, 'timestamp': '2025-09-30 22:06:27.759550', 'step': 327, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:27.791582', 'step': 327, 'epoch': 1} {'type': 'loss', 'content': 0.007430666591972113, 'timestamp': '2025-09-30 22:06:27.817271', 'step': 328, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:27.856464', 'step': 328, 'epoch': 1} {'type': 'loss', 'content': 0.014357471838593483, 'timestamp': '2025-09-30 22:06:27.862786', 'step': 329, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:27.894317', 'step': 329, 'epoch': 1} {'type': 'loss', 'content': 0.02626877650618553, 'timestamp': '2025-09-30 22:06:27.905202', 'step': 330, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:27.947148', 'step': 330, 'epoch': 1} {'type': 'loss', 'content': 0.016307147219777107, 'timestamp': '2025-09-30 22:06:27.951155', 'step': 331, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:27.986537', 'step': 331, 'epoch': 1} {'type': 'loss', 'content': 0.01717582531273365, 'timestamp': '2025-09-30 22:06:28.017852', 'step': 332, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:28.049610', 'step': 332, 'epoch': 1} {'type': 'loss', 'content': 0.0503578707575798, 'timestamp': '2025-09-30 22:06:28.054474', 'step': 333, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:28.092300', 'step': 333, 'epoch': 1} {'type': 'loss', 'content': 0.02990400418639183, 'timestamp': '2025-09-30 22:06:28.096095', 'step': 334, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:28.134615', 'step': 334, 'epoch': 1} {'type': 'loss', 'content': 0.010535559616982937, 'timestamp': '2025-09-30 22:06:28.146207', 'step': 335, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:28.184003', 'step': 335, 'epoch': 1} {'type': 'loss', 'content': 0.022340644150972366, 'timestamp': '2025-09-30 22:06:28.212849', 'step': 336, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:28.248764', 'step': 336, 'epoch': 1} {'type': 'loss', 'content': 0.0386832021176815, 'timestamp': '2025-09-30 22:06:28.259287', 'step': 337, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:28.296838', 'step': 337, 'epoch': 1} {'type': 'loss', 'content': 0.019745545461773872, 'timestamp': '2025-09-30 22:06:28.309734', 'step': 338, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:28.340946', 'step': 338, 'epoch': 1} {'type': 'loss', 'content': 0.04619959369301796, 'timestamp': '2025-09-30 22:06:28.349895', 'step': 339, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:28.388222', 'step': 339, 'epoch': 1} {'type': 'loss', 'content': 0.02208585850894451, 'timestamp': '2025-09-30 22:06:28.412970', 'step': 340, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:28.451940', 'step': 340, 'epoch': 1} {'type': 'loss', 'content': 0.027847709134221077, 'timestamp': '2025-09-30 22:06:28.456897', 'step': 341, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:28.488586', 'step': 341, 'epoch': 1} {'type': 'loss', 'content': 0.038464076817035675, 'timestamp': '2025-09-30 22:06:28.495274', 'step': 342, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:28.526226', 'step': 342, 'epoch': 1} {'type': 'loss', 'content': 0.006711141671985388, 'timestamp': '2025-09-30 22:06:28.530589', 'step': 343, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:28.568572', 'step': 343, 'epoch': 1} {'type': 'loss', 'content': 0.018281709402799606, 'timestamp': '2025-09-30 22:06:28.597656', 'step': 344, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:28.636348', 'step': 344, 'epoch': 1} {'type': 'loss', 'content': 0.006692342925816774, 'timestamp': '2025-09-30 22:06:28.646848', 'step': 345, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:28.680244', 'step': 345, 'epoch': 1} {'type': 'loss', 'content': 0.010647530667483807, 'timestamp': '2025-09-30 22:06:28.684295', 'step': 346, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:28.719171', 'step': 346, 'epoch': 1} {'type': 'loss', 'content': 0.009429781697690487, 'timestamp': '2025-09-30 22:06:28.726414', 'step': 347, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:28.762319', 'step': 347, 'epoch': 1} {'type': 'loss', 'content': 0.008969319984316826, 'timestamp': '2025-09-30 22:06:28.787958', 'step': 348, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:28.821492', 'step': 348, 'epoch': 1} {'type': 'loss', 'content': 0.025091560557484627, 'timestamp': '2025-09-30 22:06:28.823676', 'step': 349, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:28.855306', 'step': 349, 'epoch': 1} {'type': 'loss', 'content': 0.018482675775885582, 'timestamp': '2025-09-30 22:06:28.862531', 'step': 350, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:28.905175', 'step': 350, 'epoch': 1} {'type': 'loss', 'content': 0.013928530737757683, 'timestamp': '2025-09-30 22:06:28.910346', 'step': 351, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:29.567369', 'step': 351, 'epoch': 1} {'type': 'pplx', 'content': 82378214.0221123, 'timestamp': '2025-09-30 22:06:29.573742', 'step': 351, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:29.606386', 'step': 351, 'epoch': 1} {'type': 'loss', 'content': 0.023360390216112137, 'timestamp': '2025-09-30 22:06:29.634118', 'step': 352, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:29.668425', 'step': 352, 'epoch': 1} {'type': 'loss', 'content': 0.04514763504266739, 'timestamp': '2025-09-30 22:06:29.672546', 'step': 353, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:29.711007', 'step': 353, 'epoch': 1} {'type': 'loss', 'content': 0.019704774022102356, 'timestamp': '2025-09-30 22:06:29.718836', 'step': 354, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:29.758574', 'step': 354, 'epoch': 1} {'type': 'loss', 'content': 0.01630558632314205, 'timestamp': '2025-09-30 22:06:29.769253', 'step': 355, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:29.812347', 'step': 355, 'epoch': 1} {'type': 'loss', 'content': 0.021960977464914322, 'timestamp': '2025-09-30 22:06:29.843635', 'step': 356, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:29.884019', 'step': 356, 'epoch': 1} {'type': 'loss', 'content': 0.022443512454628944, 'timestamp': '2025-09-30 22:06:29.888141', 'step': 357, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:29.928405', 'step': 357, 'epoch': 1} {'type': 'loss', 'content': 0.023133354261517525, 'timestamp': '2025-09-30 22:06:29.931560', 'step': 358, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:29.967827', 'step': 358, 'epoch': 1} {'type': 'loss', 'content': 0.013051935471594334, 'timestamp': '2025-09-30 22:06:29.971395', 'step': 359, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:30.014557', 'step': 359, 'epoch': 1} {'type': 'loss', 'content': 0.011638335883617401, 'timestamp': '2025-09-30 22:06:30.043593', 'step': 360, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:30.084777', 'step': 360, 'epoch': 1} {'type': 'loss', 'content': 0.01069671381264925, 'timestamp': '2025-09-30 22:06:30.093341', 'step': 361, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:30.133700', 'step': 361, 'epoch': 1} {'type': 'loss', 'content': 0.020872537046670914, 'timestamp': '2025-09-30 22:06:30.138123', 'step': 362, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:30.180226', 'step': 362, 'epoch': 1} {'type': 'loss', 'content': 0.01966075785458088, 'timestamp': '2025-09-30 22:06:30.188724', 'step': 363, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:30.227767', 'step': 363, 'epoch': 1} {'type': 'loss', 'content': 0.024045193567872047, 'timestamp': '2025-09-30 22:06:30.256400', 'step': 364, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:30.297037', 'step': 364, 'epoch': 1} {'type': 'loss', 'content': 0.011192423291504383, 'timestamp': '2025-09-30 22:06:30.300657', 'step': 365, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:30.341894', 'step': 365, 'epoch': 1} {'type': 'loss', 'content': 0.009517804719507694, 'timestamp': '2025-09-30 22:06:30.344602', 'step': 366, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:30.382839', 'step': 366, 'epoch': 1} {'type': 'loss', 'content': 0.011407798156142235, 'timestamp': '2025-09-30 22:06:30.390653', 'step': 367, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:30.424011', 'step': 367, 'epoch': 1} {'type': 'loss', 'content': 0.025280386209487915, 'timestamp': '2025-09-30 22:06:30.455338', 'step': 368, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:30.502843', 'step': 368, 'epoch': 1} {'type': 'loss', 'content': 0.011698196642100811, 'timestamp': '2025-09-30 22:06:30.510007', 'step': 369, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:30.562476', 'step': 369, 'epoch': 1} {'type': 'loss', 'content': 0.014684346504509449, 'timestamp': '2025-09-30 22:06:30.567961', 'step': 370, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:30.602348', 'step': 370, 'epoch': 1} {'type': 'loss', 'content': 0.022555148229002953, 'timestamp': '2025-09-30 22:06:30.609767', 'step': 371, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:30.647893', 'step': 371, 'epoch': 1} {'type': 'loss', 'content': 0.020193180069327354, 'timestamp': '2025-09-30 22:06:30.678853', 'step': 372, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:30.714691', 'step': 372, 'epoch': 1} {'type': 'loss', 'content': 0.014575188979506493, 'timestamp': '2025-09-30 22:06:30.717267', 'step': 373, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:30.755895', 'step': 373, 'epoch': 1} {'type': 'loss', 'content': 0.011546963825821877, 'timestamp': '2025-09-30 22:06:30.766148', 'step': 374, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:30.816703', 'step': 374, 'epoch': 1} {'type': 'loss', 'content': 0.018510419875383377, 'timestamp': '2025-09-30 22:06:30.825663', 'step': 375, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:30.859042', 'step': 375, 'epoch': 1} {'type': 'loss', 'content': 0.02678140625357628, 'timestamp': '2025-09-30 22:06:30.888898', 'step': 376, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:30.928626', 'step': 376, 'epoch': 1} {'type': 'loss', 'content': 0.014265132136642933, 'timestamp': '2025-09-30 22:06:30.932308', 'step': 377, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:30.970600', 'step': 377, 'epoch': 1} {'type': 'loss', 'content': 0.012481029145419598, 'timestamp': '2025-09-30 22:06:30.978508', 'step': 378, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:31.019119', 'step': 378, 'epoch': 1} {'type': 'loss', 'content': 0.025546716526150703, 'timestamp': '2025-09-30 22:06:31.023547', 'step': 379, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:31.057247', 'step': 379, 'epoch': 1} {'type': 'loss', 'content': 0.01196589320898056, 'timestamp': '2025-09-30 22:06:31.082411', 'step': 380, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:31.120796', 'step': 380, 'epoch': 1} {'type': 'loss', 'content': 0.011724757961928844, 'timestamp': '2025-09-30 22:06:31.129210', 'step': 381, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:31.166134', 'step': 381, 'epoch': 1} {'type': 'loss', 'content': 0.019267279654741287, 'timestamp': '2025-09-30 22:06:31.173179', 'step': 382, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:31.210994', 'step': 382, 'epoch': 1} {'type': 'loss', 'content': 0.024116748943924904, 'timestamp': '2025-09-30 22:06:31.218079', 'step': 383, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:31.249965', 'step': 383, 'epoch': 1} {'type': 'loss', 'content': 0.013756215572357178, 'timestamp': '2025-09-30 22:06:31.277885', 'step': 384, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:31.313944', 'step': 384, 'epoch': 1} {'type': 'loss', 'content': 0.011417214758694172, 'timestamp': '2025-09-30 22:06:31.320095', 'step': 385, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:31.354415', 'step': 385, 'epoch': 1} {'type': 'loss', 'content': 0.020715000107884407, 'timestamp': '2025-09-30 22:06:31.362141', 'step': 386, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:31.395711', 'step': 386, 'epoch': 1} {'type': 'loss', 'content': 0.007579857017844915, 'timestamp': '2025-09-30 22:06:31.398574', 'step': 387, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:31.429871', 'step': 387, 'epoch': 1} {'type': 'loss', 'content': 0.024890294298529625, 'timestamp': '2025-09-30 22:06:31.453667', 'step': 388, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:31.485946', 'step': 388, 'epoch': 1} {'type': 'loss', 'content': 0.013178860768675804, 'timestamp': '2025-09-30 22:06:31.488895', 'step': 389, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:31.520814', 'step': 389, 'epoch': 1} {'type': 'loss', 'content': 0.012439219281077385, 'timestamp': '2025-09-30 22:06:31.527916', 'step': 390, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:32.179927', 'step': 390, 'epoch': 1} {'type': 'pplx', 'content': 83420945.45658126, 'timestamp': '2025-09-30 22:06:32.185006', 'step': 390, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:32.227423', 'step': 390, 'epoch': 1} {'type': 'loss', 'content': 0.025691481307148933, 'timestamp': '2025-09-30 22:06:32.243948', 'step': 391, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:32.288555', 'step': 391, 'epoch': 1} {'type': 'loss', 'content': 0.021664824336767197, 'timestamp': '2025-09-30 22:06:32.314261', 'step': 392, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:32.351168', 'step': 392, 'epoch': 1} {'type': 'loss', 'content': 0.006012835539877415, 'timestamp': '2025-09-30 22:06:32.365804', 'step': 393, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:32.399460', 'step': 393, 'epoch': 1} {'type': 'loss', 'content': 0.014766584150493145, 'timestamp': '2025-09-30 22:06:32.414902', 'step': 394, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:32.465673', 'step': 394, 'epoch': 1} {'type': 'loss', 'content': 0.008303819224238396, 'timestamp': '2025-09-30 22:06:32.470427', 'step': 395, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:06:32.514446', 'step': 395, 'epoch': 1} {'type': 'loss', 'content': 0.005097079090774059, 'timestamp': '2025-09-30 22:06:32.551370', 'step': 396, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:32.585372', 'step': 396, 'epoch': 1} {'type': 'loss', 'content': 0.012645237147808075, 'timestamp': '2025-09-30 22:06:32.589263', 'step': 397, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:32.624533', 'step': 397, 'epoch': 1} {'type': 'loss', 'content': 0.032030485570430756, 'timestamp': '2025-09-30 22:06:32.641255', 'step': 398, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:32.688822', 'step': 398, 'epoch': 1} {'type': 'loss', 'content': 0.029321294277906418, 'timestamp': '2025-09-30 22:06:32.703140', 'step': 399, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:32.737028', 'step': 399, 'epoch': 1} {'type': 'loss', 'content': 0.014075069688260555, 'timestamp': '2025-09-30 22:06:32.762510', 'step': 400, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:32.801365', 'step': 400, 'epoch': 1} {'type': 'loss', 'content': 0.014327221550047398, 'timestamp': '2025-09-30 22:06:32.810199', 'step': 401, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:32.846501', 'step': 401, 'epoch': 1} {'type': 'loss', 'content': 0.04210897162556648, 'timestamp': '2025-09-30 22:06:32.854213', 'step': 402, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:32.893696', 'step': 402, 'epoch': 1} {'type': 'loss', 'content': 0.015767447650432587, 'timestamp': '2025-09-30 22:06:32.901172', 'step': 403, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:32.942741', 'step': 403, 'epoch': 1} {'type': 'loss', 'content': 0.025074811652302742, 'timestamp': '2025-09-30 22:06:32.967377', 'step': 404, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:06:33.000429', 'step': 404, 'epoch': 1} {'type': 'loss', 'content': 0.01254369132220745, 'timestamp': '2025-09-30 22:06:33.008230', 'step': 405, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:33.042356', 'step': 405, 'epoch': 1} {'type': 'loss', 'content': 0.0043205274268984795, 'timestamp': '2025-09-30 22:06:33.047248', 'step': 406, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:33.085128', 'step': 406, 'epoch': 1} {'type': 'loss', 'content': 0.019118309020996094, 'timestamp': '2025-09-30 22:06:33.092620', 'step': 407, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:33.126741', 'step': 407, 'epoch': 1} {'type': 'loss', 'content': 0.004540651571005583, 'timestamp': '2025-09-30 22:06:33.152227', 'step': 408, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:33.190413', 'step': 408, 'epoch': 1} {'type': 'loss', 'content': 0.012727133929729462, 'timestamp': '2025-09-30 22:06:33.194032', 'step': 409, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:33.230731', 'step': 409, 'epoch': 1} {'type': 'loss', 'content': 0.007929746992886066, 'timestamp': '2025-09-30 22:06:33.232861', 'step': 410, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:33.269768', 'step': 410, 'epoch': 1} {'type': 'loss', 'content': 0.01387192402034998, 'timestamp': '2025-09-30 22:06:33.273785', 'step': 411, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:33.307647', 'step': 411, 'epoch': 1} {'type': 'loss', 'content': 0.0033554253168404102, 'timestamp': '2025-09-30 22:06:33.331754', 'step': 412, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:33.369476', 'step': 412, 'epoch': 1} {'type': 'loss', 'content': 0.013880507089197636, 'timestamp': '2025-09-30 22:06:33.373824', 'step': 413, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:33.410753', 'step': 413, 'epoch': 1} {'type': 'loss', 'content': 0.009252405725419521, 'timestamp': '2025-09-30 22:06:33.415270', 'step': 414, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:33.454325', 'step': 414, 'epoch': 1} {'type': 'loss', 'content': 0.01770790107548237, 'timestamp': '2025-09-30 22:06:33.461238', 'step': 415, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:33.502321', 'step': 415, 'epoch': 1} {'type': 'loss', 'content': 0.019153054803609848, 'timestamp': '2025-09-30 22:06:33.531090', 'step': 416, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:33.572115', 'step': 416, 'epoch': 1} {'type': 'loss', 'content': 0.010953940451145172, 'timestamp': '2025-09-30 22:06:33.576740', 'step': 417, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:33.616997', 'step': 417, 'epoch': 1} {'type': 'loss', 'content': 0.014136207289993763, 'timestamp': '2025-09-30 22:06:33.623330', 'step': 418, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:06:33.676826', 'step': 418, 'epoch': 1} {'type': 'loss', 'content': 0.011414690874516964, 'timestamp': '2025-09-30 22:06:33.685449', 'step': 419, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:33.723416', 'step': 419, 'epoch': 1} {'type': 'loss', 'content': 0.02028883434832096, 'timestamp': '2025-09-30 22:06:33.753209', 'step': 420, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:33.790617', 'step': 420, 'epoch': 1} {'type': 'loss', 'content': 0.00771242706105113, 'timestamp': '2025-09-30 22:06:33.798423', 'step': 421, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:33.833347', 'step': 421, 'epoch': 1} {'type': 'loss', 'content': 0.030433373525738716, 'timestamp': '2025-09-30 22:06:33.843145', 'step': 422, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:33.884278', 'step': 422, 'epoch': 1} {'type': 'loss', 'content': 0.004011732060462236, 'timestamp': '2025-09-30 22:06:33.893197', 'step': 423, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:33.930303', 'step': 423, 'epoch': 1} {'type': 'loss', 'content': 0.014322788454592228, 'timestamp': '2025-09-30 22:06:33.954749', 'step': 424, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:33.990666', 'step': 424, 'epoch': 1} {'type': 'loss', 'content': 0.024081062525510788, 'timestamp': '2025-09-30 22:06:33.998713', 'step': 425, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:34.032908', 'step': 425, 'epoch': 1} {'type': 'loss', 'content': 0.004053601063787937, 'timestamp': '2025-09-30 22:06:34.039958', 'step': 426, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:34.078596', 'step': 426, 'epoch': 1} {'type': 'loss', 'content': 0.00806428026407957, 'timestamp': '2025-09-30 22:06:34.082886', 'step': 427, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:34.119467', 'step': 427, 'epoch': 1} {'type': 'loss', 'content': 0.013106881640851498, 'timestamp': '2025-09-30 22:06:34.143662', 'step': 428, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:34.179606', 'step': 428, 'epoch': 1} {'type': 'loss', 'content': 0.020720576867461205, 'timestamp': '2025-09-30 22:06:34.182418', 'step': 429, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:34.827798', 'step': 429, 'epoch': 1} {'type': 'pplx', 'content': 90011934.36599956, 'timestamp': '2025-09-30 22:06:34.837057', 'step': 429, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:34.875069', 'step': 429, 'epoch': 1} {'type': 'loss', 'content': 0.006069289054721594, 'timestamp': '2025-09-30 22:06:34.882360', 'step': 430, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:34.919387', 'step': 430, 'epoch': 1} {'type': 'loss', 'content': 0.011751479469239712, 'timestamp': '2025-09-30 22:06:34.927437', 'step': 431, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:34.969882', 'step': 431, 'epoch': 1} {'type': 'loss', 'content': 0.018163446336984634, 'timestamp': '2025-09-30 22:06:34.994177', 'step': 432, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:35.036152', 'step': 432, 'epoch': 1} {'type': 'loss', 'content': 0.00829459261149168, 'timestamp': '2025-09-30 22:06:35.046603', 'step': 433, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:35.081522', 'step': 433, 'epoch': 1} {'type': 'loss', 'content': 0.021889938041567802, 'timestamp': '2025-09-30 22:06:35.085410', 'step': 434, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:35.127262', 'step': 434, 'epoch': 1} {'type': 'loss', 'content': 0.017087522894144058, 'timestamp': '2025-09-30 22:06:35.133589', 'step': 435, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:35.169755', 'step': 435, 'epoch': 1} {'type': 'loss', 'content': 0.01923413760960102, 'timestamp': '2025-09-30 22:06:35.198620', 'step': 436, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:35.233508', 'step': 436, 'epoch': 1} {'type': 'loss', 'content': 0.0019212139304727316, 'timestamp': '2025-09-30 22:06:35.237869', 'step': 437, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:35.275977', 'step': 437, 'epoch': 1} {'type': 'loss', 'content': 0.012549689039587975, 'timestamp': '2025-09-30 22:06:35.283593', 'step': 438, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:35.323249', 'step': 438, 'epoch': 1} {'type': 'loss', 'content': 0.0119699751958251, 'timestamp': '2025-09-30 22:06:35.327989', 'step': 439, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:35.365424', 'step': 439, 'epoch': 1} {'type': 'loss', 'content': 0.030648184940218925, 'timestamp': '2025-09-30 22:06:35.396401', 'step': 440, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:35.435955', 'step': 440, 'epoch': 1} {'type': 'loss', 'content': 0.005409966688603163, 'timestamp': '2025-09-30 22:06:35.440599', 'step': 441, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:35.475640', 'step': 441, 'epoch': 1} {'type': 'loss', 'content': 0.03966454416513443, 'timestamp': '2025-09-30 22:06:35.483423', 'step': 442, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:35.521964', 'step': 442, 'epoch': 1} {'type': 'loss', 'content': 0.0014881438110023737, 'timestamp': '2025-09-30 22:06:35.529594', 'step': 443, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:35.567583', 'step': 443, 'epoch': 1} {'type': 'loss', 'content': 0.006455257534980774, 'timestamp': '2025-09-30 22:06:35.597721', 'step': 444, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:35.635404', 'step': 444, 'epoch': 1} {'type': 'loss', 'content': 0.0095746461302042, 'timestamp': '2025-09-30 22:06:35.643666', 'step': 445, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:35.678266', 'step': 445, 'epoch': 1} {'type': 'loss', 'content': 0.030881425365805626, 'timestamp': '2025-09-30 22:06:35.681990', 'step': 446, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:35.716077', 'step': 446, 'epoch': 1} {'type': 'loss', 'content': 0.014141733758151531, 'timestamp': '2025-09-30 22:06:35.719478', 'step': 447, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:35.751388', 'step': 447, 'epoch': 1} {'type': 'loss', 'content': 0.028649205341935158, 'timestamp': '2025-09-30 22:06:35.778834', 'step': 448, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:35.818945', 'step': 448, 'epoch': 1} {'type': 'loss', 'content': 0.010160253383219242, 'timestamp': '2025-09-30 22:06:35.826430', 'step': 449, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:35.866387', 'step': 449, 'epoch': 1} {'type': 'loss', 'content': 0.01143181324005127, 'timestamp': '2025-09-30 22:06:35.873409', 'step': 450, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:35.920147', 'step': 450, 'epoch': 1} {'type': 'loss', 'content': 0.0035703144967556, 'timestamp': '2025-09-30 22:06:35.926330', 'step': 451, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:35.968411', 'step': 451, 'epoch': 1} {'type': 'loss', 'content': 0.027362186461687088, 'timestamp': '2025-09-30 22:06:35.998323', 'step': 452, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:36.035135', 'step': 452, 'epoch': 1} {'type': 'loss', 'content': 0.003247453598305583, 'timestamp': '2025-09-30 22:06:36.039382', 'step': 453, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:36.073149', 'step': 453, 'epoch': 1} {'type': 'loss', 'content': 0.02386678196489811, 'timestamp': '2025-09-30 22:06:36.078054', 'step': 454, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:36.113306', 'step': 454, 'epoch': 1} {'type': 'loss', 'content': 0.016726380214095116, 'timestamp': '2025-09-30 22:06:36.117638', 'step': 455, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:36.152504', 'step': 455, 'epoch': 1} {'type': 'loss', 'content': 0.017773736268281937, 'timestamp': '2025-09-30 22:06:36.180545', 'step': 456, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:06:36.215074', 'step': 456, 'epoch': 1} {'type': 'loss', 'content': 0.0035495886113494635, 'timestamp': '2025-09-30 22:06:36.222851', 'step': 457, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:36.255024', 'step': 457, 'epoch': 1} {'type': 'loss', 'content': 0.011258290149271488, 'timestamp': '2025-09-30 22:06:36.262264', 'step': 458, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:36.295548', 'step': 458, 'epoch': 1} {'type': 'loss', 'content': 0.02230708673596382, 'timestamp': '2025-09-30 22:06:36.298473', 'step': 459, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:36.331412', 'step': 459, 'epoch': 1} {'type': 'loss', 'content': 0.011926496401429176, 'timestamp': '2025-09-30 22:06:36.355748', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:36.387974', 'step': 460, 'epoch': 1} {'type': 'loss', 'content': 0.028327742591500282, 'timestamp': '2025-09-30 22:06:36.391390', 'step': 461, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:36.422876', 'step': 461, 'epoch': 1} {'type': 'loss', 'content': 0.012645123526453972, 'timestamp': '2025-09-30 22:06:36.425790', 'step': 462, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:36.457933', 'step': 462, 'epoch': 1} {'type': 'loss', 'content': 0.016447247937321663, 'timestamp': '2025-09-30 22:06:36.464285', 'step': 463, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:36.496547', 'step': 463, 'epoch': 1} {'type': 'loss', 'content': 0.006226507015526295, 'timestamp': '2025-09-30 22:06:36.521221', 'step': 464, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:36.552674', 'step': 464, 'epoch': 1} {'type': 'loss', 'content': 0.011151151731610298, 'timestamp': '2025-09-30 22:06:36.555628', 'step': 465, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:36.587307', 'step': 465, 'epoch': 1} {'type': 'loss', 'content': 0.004149741493165493, 'timestamp': '2025-09-30 22:06:36.590128', 'step': 466, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:36.622240', 'step': 466, 'epoch': 1} {'type': 'loss', 'content': 0.0014940766850486398, 'timestamp': '2025-09-30 22:06:36.625252', 'step': 467, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:36.656873', 'step': 467, 'epoch': 1} {'type': 'loss', 'content': 0.014169608242809772, 'timestamp': '2025-09-30 22:06:36.682050', 'step': 468, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:37.284277', 'step': 468, 'epoch': 1} {'type': 'pplx', 'content': 95003170.83514951, 'timestamp': '2025-09-30 22:06:37.286957', 'step': 468, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:37.316111', 'step': 468, 'epoch': 1} {'type': 'loss', 'content': 0.023486703634262085, 'timestamp': '2025-09-30 22:06:37.319361', 'step': 469, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:37.350717', 'step': 469, 'epoch': 1} {'type': 'loss', 'content': 0.008476106449961662, 'timestamp': '2025-09-30 22:06:37.353756', 'step': 470, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:37.385655', 'step': 470, 'epoch': 1} {'type': 'loss', 'content': 0.0017679741140455008, 'timestamp': '2025-09-30 22:06:37.390012', 'step': 471, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:37.424819', 'step': 471, 'epoch': 1} {'type': 'loss', 'content': 0.008707636035978794, 'timestamp': '2025-09-30 22:06:37.450531', 'step': 472, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:37.482016', 'step': 472, 'epoch': 1} {'type': 'loss', 'content': 0.03412850573658943, 'timestamp': '2025-09-30 22:06:37.486702', 'step': 473, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:37.518160', 'step': 473, 'epoch': 1} {'type': 'loss', 'content': 0.025019986554980278, 'timestamp': '2025-09-30 22:06:37.520467', 'step': 474, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:37.551497', 'step': 474, 'epoch': 1} {'type': 'loss', 'content': 0.001524191233329475, 'timestamp': '2025-09-30 22:06:37.555988', 'step': 475, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:37.588908', 'step': 475, 'epoch': 1} {'type': 'loss', 'content': 0.027583520859479904, 'timestamp': '2025-09-30 22:06:37.617003', 'step': 476, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:37.647683', 'step': 476, 'epoch': 1} {'type': 'loss', 'content': 0.03453077748417854, 'timestamp': '2025-09-30 22:06:37.649888', 'step': 477, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:37.679918', 'step': 477, 'epoch': 1} {'type': 'loss', 'content': 0.024835318326950073, 'timestamp': '2025-09-30 22:06:37.683358', 'step': 478, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:37.715537', 'step': 478, 'epoch': 1} {'type': 'loss', 'content': 0.040151309221982956, 'timestamp': '2025-09-30 22:06:37.719847', 'step': 479, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:37.750700', 'step': 479, 'epoch': 1} {'type': 'loss', 'content': 0.0036200087051838636, 'timestamp': '2025-09-30 22:06:37.774770', 'step': 480, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:37.808959', 'step': 480, 'epoch': 1} {'type': 'loss', 'content': 0.006020395550876856, 'timestamp': '2025-09-30 22:06:37.814228', 'step': 481, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:37.848101', 'step': 481, 'epoch': 1} {'type': 'loss', 'content': 0.003234736854210496, 'timestamp': '2025-09-30 22:06:37.853641', 'step': 482, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:37.888226', 'step': 482, 'epoch': 1} {'type': 'loss', 'content': 0.03566857427358627, 'timestamp': '2025-09-30 22:06:37.892668', 'step': 483, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:37.925052', 'step': 483, 'epoch': 1} {'type': 'loss', 'content': 0.019549470394849777, 'timestamp': '2025-09-30 22:06:37.949502', 'step': 484, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:06:37.981186', 'step': 484, 'epoch': 1} {'type': 'loss', 'content': 0.025290412828326225, 'timestamp': '2025-09-30 22:06:37.988894', 'step': 485, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:38.019618', 'step': 485, 'epoch': 1} {'type': 'loss', 'content': 0.012105834670364857, 'timestamp': '2025-09-30 22:06:38.022315', 'step': 486, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:38.052856', 'step': 486, 'epoch': 1} {'type': 'loss', 'content': 0.013042249716818333, 'timestamp': '2025-09-30 22:06:38.055536', 'step': 487, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:38.086616', 'step': 487, 'epoch': 1} {'type': 'loss', 'content': 0.01455900352448225, 'timestamp': '2025-09-30 22:06:38.110658', 'step': 488, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:38.140873', 'step': 488, 'epoch': 1} {'type': 'loss', 'content': 0.005409404635429382, 'timestamp': '2025-09-30 22:06:38.143175', 'step': 489, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:38.173671', 'step': 489, 'epoch': 1} {'type': 'loss', 'content': 0.009504382498562336, 'timestamp': '2025-09-30 22:06:38.177664', 'step': 490, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:38.208613', 'step': 490, 'epoch': 1} {'type': 'loss', 'content': 0.015173339284956455, 'timestamp': '2025-09-30 22:06:38.211266', 'step': 491, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:38.242847', 'step': 491, 'epoch': 1} {'type': 'loss', 'content': 0.007179237436503172, 'timestamp': '2025-09-30 22:06:38.266869', 'step': 492, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:38.298667', 'step': 492, 'epoch': 1} {'type': 'loss', 'content': 0.005810712929815054, 'timestamp': '2025-09-30 22:06:38.304119', 'step': 493, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:38.335720', 'step': 493, 'epoch': 1} {'type': 'loss', 'content': 0.015788041055202484, 'timestamp': '2025-09-30 22:06:38.339771', 'step': 494, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:38.371143', 'step': 494, 'epoch': 1} {'type': 'loss', 'content': 0.011624081991612911, 'timestamp': '2025-09-30 22:06:38.373470', 'step': 495, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:38.405351', 'step': 495, 'epoch': 1} {'type': 'loss', 'content': 0.022120699286460876, 'timestamp': '2025-09-30 22:06:38.430807', 'step': 496, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:38.462152', 'step': 496, 'epoch': 1} {'type': 'loss', 'content': 0.0297573059797287, 'timestamp': '2025-09-30 22:06:38.464812', 'step': 497, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:38.495593', 'step': 497, 'epoch': 1} {'type': 'loss', 'content': 0.01783009059727192, 'timestamp': '2025-09-30 22:06:38.503357', 'step': 498, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:38.535523', 'step': 498, 'epoch': 1} {'type': 'loss', 'content': 0.019278477877378464, 'timestamp': '2025-09-30 22:06:38.538085', 'step': 499, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:38.569936', 'step': 499, 'epoch': 1} {'type': 'loss', 'content': 0.022850925102829933, 'timestamp': '2025-09-30 22:06:38.597572', 'step': 500, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 500', 'timestamp': '2025-09-30 22:06:44.217899', 'step': 500, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:44.253369', 'step': 500, 'epoch': 1} {'type': 'loss', 'content': 0.041175130754709244, 'timestamp': '2025-09-30 22:06:44.256246', 'step': 501, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:44.292319', 'step': 501, 'epoch': 1} {'type': 'loss', 'content': 0.002944961655884981, 'timestamp': '2025-09-30 22:06:44.295123', 'step': 502, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:06:44.328460', 'step': 502, 'epoch': 1} {'type': 'loss', 'content': 0.01657816581428051, 'timestamp': '2025-09-30 22:06:44.340399', 'step': 503, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:44.374189', 'step': 503, 'epoch': 1} {'type': 'loss', 'content': 0.022021254524588585, 'timestamp': '2025-09-30 22:06:44.399146', 'step': 504, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:44.432597', 'step': 504, 'epoch': 1} {'type': 'loss', 'content': 0.033169493079185486, 'timestamp': '2025-09-30 22:06:44.434888', 'step': 505, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:44.468761', 'step': 505, 'epoch': 1} {'type': 'loss', 'content': 0.029171505942940712, 'timestamp': '2025-09-30 22:06:44.473321', 'step': 506, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:44.506028', 'step': 506, 'epoch': 1} {'type': 'loss', 'content': 0.010595436207950115, 'timestamp': '2025-09-30 22:06:44.513711', 'step': 507, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:45.133572', 'step': 507, 'epoch': 1} {'type': 'pplx', 'content': 97756475.1005249, 'timestamp': '2025-09-30 22:06:45.135637', 'step': 507, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:45.165574', 'step': 507, 'epoch': 1} {'type': 'loss', 'content': 0.04366924986243248, 'timestamp': '2025-09-30 22:06:45.189954', 'step': 508, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:45.221574', 'step': 508, 'epoch': 1} {'type': 'loss', 'content': 0.008732793852686882, 'timestamp': '2025-09-30 22:06:45.223449', 'step': 509, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:45.254977', 'step': 509, 'epoch': 1} {'type': 'loss', 'content': 0.007448369171470404, 'timestamp': '2025-09-30 22:06:45.261355', 'step': 510, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:45.296559', 'step': 510, 'epoch': 1} {'type': 'loss', 'content': 0.016796164214611053, 'timestamp': '2025-09-30 22:06:45.298505', 'step': 511, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:45.330981', 'step': 511, 'epoch': 1} {'type': 'loss', 'content': 0.03324023261666298, 'timestamp': '2025-09-30 22:06:45.355674', 'step': 512, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:45.389308', 'step': 512, 'epoch': 1} {'type': 'loss', 'content': 0.047144342213869095, 'timestamp': '2025-09-30 22:06:45.393762', 'step': 513, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:45.427924', 'step': 513, 'epoch': 1} {'type': 'loss', 'content': 0.007025440223515034, 'timestamp': '2025-09-30 22:06:45.431507', 'step': 514, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:45.463062', 'step': 514, 'epoch': 1} {'type': 'loss', 'content': 0.00455810921266675, 'timestamp': '2025-09-30 22:06:45.469968', 'step': 515, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 2373281365952}, 'timestamp': '2025-09-30 22:06:45.515132', 'step': 515, 'epoch': 1} {'type': 'loss', 'content': 0.013665534555912018, 'timestamp': '2025-09-30 22:06:45.539072', 'step': 516, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:45.570113', 'step': 516, 'epoch': 1} {'type': 'loss', 'content': 0.025386126711964607, 'timestamp': '2025-09-30 22:06:45.573121', 'step': 517, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:45.603901', 'step': 517, 'epoch': 1} {'type': 'loss', 'content': 0.005903006065636873, 'timestamp': '2025-09-30 22:06:45.611019', 'step': 518, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:45.642060', 'step': 518, 'epoch': 1} {'type': 'loss', 'content': 0.016727006062865257, 'timestamp': '2025-09-30 22:06:45.646298', 'step': 519, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:45.677346', 'step': 519, 'epoch': 1} {'type': 'loss', 'content': 0.018626097589731216, 'timestamp': '2025-09-30 22:06:45.700973', 'step': 520, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:45.734530', 'step': 520, 'epoch': 1} {'type': 'loss', 'content': 0.01859509013593197, 'timestamp': '2025-09-30 22:06:45.736463', 'step': 521, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:45.767401', 'step': 521, 'epoch': 1} {'type': 'loss', 'content': 0.006714188493788242, 'timestamp': '2025-09-30 22:06:45.769391', 'step': 522, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:45.799905', 'step': 522, 'epoch': 1} {'type': 'loss', 'content': 0.03969809412956238, 'timestamp': '2025-09-30 22:06:45.804326', 'step': 523, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:45.834668', 'step': 523, 'epoch': 1} {'type': 'loss', 'content': 0.012870178557932377, 'timestamp': '2025-09-30 22:06:45.862622', 'step': 524, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:45.894090', 'step': 524, 'epoch': 1} {'type': 'loss', 'content': 0.01598569191992283, 'timestamp': '2025-09-30 22:06:45.896567', 'step': 525, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:45.929336', 'step': 525, 'epoch': 1} {'type': 'loss', 'content': 0.014287451282143593, 'timestamp': '2025-09-30 22:06:45.933695', 'step': 526, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:45.965407', 'step': 526, 'epoch': 1} {'type': 'loss', 'content': 0.023874444887042046, 'timestamp': '2025-09-30 22:06:45.972665', 'step': 527, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:46.008302', 'step': 527, 'epoch': 1} {'type': 'loss', 'content': 0.008932037279009819, 'timestamp': '2025-09-30 22:06:46.038681', 'step': 528, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:46.073979', 'step': 528, 'epoch': 1} {'type': 'loss', 'content': 0.02752871811389923, 'timestamp': '2025-09-30 22:06:46.075778', 'step': 529, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:46.106264', 'step': 529, 'epoch': 1} {'type': 'loss', 'content': 0.023428723216056824, 'timestamp': '2025-09-30 22:06:46.108639', 'step': 530, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:46.139631', 'step': 530, 'epoch': 1} {'type': 'loss', 'content': 0.00998811237514019, 'timestamp': '2025-09-30 22:06:46.141840', 'step': 531, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:46.172381', 'step': 531, 'epoch': 1} {'type': 'loss', 'content': 0.013783496804535389, 'timestamp': '2025-09-30 22:06:46.196515', 'step': 532, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:46.228967', 'step': 532, 'epoch': 1} {'type': 'loss', 'content': 0.023418204858899117, 'timestamp': '2025-09-30 22:06:46.231291', 'step': 533, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:46.262575', 'step': 533, 'epoch': 1} {'type': 'loss', 'content': 0.004461606964468956, 'timestamp': '2025-09-30 22:06:46.266706', 'step': 534, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:46.296963', 'step': 534, 'epoch': 1} {'type': 'loss', 'content': 0.009363941848278046, 'timestamp': '2025-09-30 22:06:46.299277', 'step': 535, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:46.330624', 'step': 535, 'epoch': 1} {'type': 'loss', 'content': 0.00438813166692853, 'timestamp': '2025-09-30 22:06:46.355677', 'step': 536, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:46.387169', 'step': 536, 'epoch': 1} {'type': 'loss', 'content': 0.014785653911530972, 'timestamp': '2025-09-30 22:06:46.389247', 'step': 537, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:46.420169', 'step': 537, 'epoch': 1} {'type': 'loss', 'content': 0.017323726788163185, 'timestamp': '2025-09-30 22:06:46.424442', 'step': 538, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:46.457355', 'step': 538, 'epoch': 1} {'type': 'loss', 'content': 0.009038321673870087, 'timestamp': '2025-09-30 22:06:46.464317', 'step': 539, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:46.495231', 'step': 539, 'epoch': 1} {'type': 'loss', 'content': 0.010455826297402382, 'timestamp': '2025-09-30 22:06:46.520585', 'step': 540, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:46.551367', 'step': 540, 'epoch': 1} {'type': 'loss', 'content': 0.028162604197859764, 'timestamp': '2025-09-30 22:06:46.555749', 'step': 541, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:46.586131', 'step': 541, 'epoch': 1} {'type': 'loss', 'content': 0.02013913355767727, 'timestamp': '2025-09-30 22:06:46.588948', 'step': 542, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:46.619608', 'step': 542, 'epoch': 1} {'type': 'loss', 'content': 0.01947060227394104, 'timestamp': '2025-09-30 22:06:46.623840', 'step': 543, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:46.655541', 'step': 543, 'epoch': 1} {'type': 'loss', 'content': 0.01323962863534689, 'timestamp': '2025-09-30 22:06:46.678950', 'step': 544, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:46.710010', 'step': 544, 'epoch': 1} {'type': 'loss', 'content': 0.01126060914248228, 'timestamp': '2025-09-30 22:06:46.712091', 'step': 545, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:46.742635', 'step': 545, 'epoch': 1} {'type': 'loss', 'content': 0.013927541673183441, 'timestamp': '2025-09-30 22:06:46.747131', 'step': 546, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:47.352604', 'step': 546, 'epoch': 1} {'type': 'pplx', 'content': 94871955.23744044, 'timestamp': '2025-09-30 22:06:47.354628', 'step': 546, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:47.383705', 'step': 546, 'epoch': 1} {'type': 'loss', 'content': 0.01951242797076702, 'timestamp': '2025-09-30 22:06:47.390754', 'step': 547, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:47.420889', 'step': 547, 'epoch': 1} {'type': 'loss', 'content': 0.015194809064269066, 'timestamp': '2025-09-30 22:06:47.444712', 'step': 548, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:47.475353', 'step': 548, 'epoch': 1} {'type': 'loss', 'content': 0.008021430112421513, 'timestamp': '2025-09-30 22:06:47.477531', 'step': 549, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:47.507989', 'step': 549, 'epoch': 1} {'type': 'loss', 'content': 0.0122072147205472, 'timestamp': '2025-09-30 22:06:47.509933', 'step': 550, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:47.539927', 'step': 550, 'epoch': 1} {'type': 'loss', 'content': 0.008672057650983334, 'timestamp': '2025-09-30 22:06:47.550966', 'step': 551, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:47.581627', 'step': 551, 'epoch': 1} {'type': 'loss', 'content': 0.012174529954791069, 'timestamp': '2025-09-30 22:06:47.609948', 'step': 552, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:47.644397', 'step': 552, 'epoch': 1} {'type': 'loss', 'content': 0.015683891251683235, 'timestamp': '2025-09-30 22:06:47.646238', 'step': 553, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:47.678675', 'step': 553, 'epoch': 1} {'type': 'loss', 'content': 0.013084937818348408, 'timestamp': '2025-09-30 22:06:47.682799', 'step': 554, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:47.718072', 'step': 554, 'epoch': 1} {'type': 'loss', 'content': 0.00986995454877615, 'timestamp': '2025-09-30 22:06:47.719894', 'step': 555, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:47.753102', 'step': 555, 'epoch': 1} {'type': 'loss', 'content': 0.018128668889403343, 'timestamp': '2025-09-30 22:06:47.778525', 'step': 556, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:47.811602', 'step': 556, 'epoch': 1} {'type': 'loss', 'content': 0.009639596566557884, 'timestamp': '2025-09-30 22:06:47.813796', 'step': 557, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:47.850136', 'step': 557, 'epoch': 1} {'type': 'loss', 'content': 0.007117317058146, 'timestamp': '2025-09-30 22:06:47.854701', 'step': 558, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:47.885597', 'step': 558, 'epoch': 1} {'type': 'loss', 'content': 0.011486091651022434, 'timestamp': '2025-09-30 22:06:47.888372', 'step': 559, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:47.921996', 'step': 559, 'epoch': 1} {'type': 'loss', 'content': 0.010793984867632389, 'timestamp': '2025-09-30 22:06:47.947000', 'step': 560, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:47.978081', 'step': 560, 'epoch': 1} {'type': 'loss', 'content': 0.020686279982328415, 'timestamp': '2025-09-30 22:06:47.982606', 'step': 561, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:48.013558', 'step': 561, 'epoch': 1} {'type': 'loss', 'content': 0.007002585101872683, 'timestamp': '2025-09-30 22:06:48.020464', 'step': 562, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:48.051491', 'step': 562, 'epoch': 1} {'type': 'loss', 'content': 0.020491817966103554, 'timestamp': '2025-09-30 22:06:48.053816', 'step': 563, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:48.085236', 'step': 563, 'epoch': 1} {'type': 'loss', 'content': 0.008508813567459583, 'timestamp': '2025-09-30 22:06:48.110669', 'step': 564, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:48.141790', 'step': 564, 'epoch': 1} {'type': 'loss', 'content': 0.011555018834769726, 'timestamp': '2025-09-30 22:06:48.146402', 'step': 565, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:48.178082', 'step': 565, 'epoch': 1} {'type': 'loss', 'content': 0.008152040652930737, 'timestamp': '2025-09-30 22:06:48.180043', 'step': 566, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:48.211126', 'step': 566, 'epoch': 1} {'type': 'loss', 'content': 0.008576711639761925, 'timestamp': '2025-09-30 22:06:48.213275', 'step': 567, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:48.244360', 'step': 567, 'epoch': 1} {'type': 'loss', 'content': 0.035935334861278534, 'timestamp': '2025-09-30 22:06:48.272396', 'step': 568, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:48.302643', 'step': 568, 'epoch': 1} {'type': 'loss', 'content': 0.0031208812724798918, 'timestamp': '2025-09-30 22:06:48.304735', 'step': 569, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:48.334856', 'step': 569, 'epoch': 1} {'type': 'loss', 'content': 0.01960751973092556, 'timestamp': '2025-09-30 22:06:48.337107', 'step': 570, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:48.367701', 'step': 570, 'epoch': 1} {'type': 'loss', 'content': 0.0027217951137572527, 'timestamp': '2025-09-30 22:06:48.374781', 'step': 571, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:48.407413', 'step': 571, 'epoch': 1} {'type': 'loss', 'content': 0.0036713494919240475, 'timestamp': '2025-09-30 22:06:48.435047', 'step': 572, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:48.468151', 'step': 572, 'epoch': 1} {'type': 'loss', 'content': 0.015845585614442825, 'timestamp': '2025-09-30 22:06:48.470177', 'step': 573, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:48.501027', 'step': 573, 'epoch': 1} {'type': 'loss', 'content': 0.0014490272151306272, 'timestamp': '2025-09-30 22:06:48.508020', 'step': 574, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:48.542698', 'step': 574, 'epoch': 1} {'type': 'loss', 'content': 0.01992383971810341, 'timestamp': '2025-09-30 22:06:48.550367', 'step': 575, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:48.581294', 'step': 575, 'epoch': 1} {'type': 'loss', 'content': 0.003482742002233863, 'timestamp': '2025-09-30 22:06:48.605318', 'step': 576, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:48.635302', 'step': 576, 'epoch': 1} {'type': 'loss', 'content': 0.0027481913566589355, 'timestamp': '2025-09-30 22:06:48.637620', 'step': 577, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:48.671067', 'step': 577, 'epoch': 1} {'type': 'loss', 'content': 0.005112276412546635, 'timestamp': '2025-09-30 22:06:48.675234', 'step': 578, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:48.706603', 'step': 578, 'epoch': 1} {'type': 'loss', 'content': 0.025796279311180115, 'timestamp': '2025-09-30 22:06:48.713700', 'step': 579, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:48.747278', 'step': 579, 'epoch': 1} {'type': 'loss', 'content': 0.007455120328813791, 'timestamp': '2025-09-30 22:06:48.772475', 'step': 580, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:48.803941', 'step': 580, 'epoch': 1} {'type': 'loss', 'content': 0.01217992790043354, 'timestamp': '2025-09-30 22:06:48.806195', 'step': 581, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:48.838109', 'step': 581, 'epoch': 1} {'type': 'loss', 'content': 0.008093575946986675, 'timestamp': '2025-09-30 22:06:48.845630', 'step': 582, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:48.877918', 'step': 582, 'epoch': 1} {'type': 'loss', 'content': 0.008411646820604801, 'timestamp': '2025-09-30 22:06:48.882043', 'step': 583, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:48.914372', 'step': 583, 'epoch': 1} {'type': 'loss', 'content': 0.012758438475430012, 'timestamp': '2025-09-30 22:06:48.941051', 'step': 584, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:48.974717', 'step': 584, 'epoch': 1} {'type': 'loss', 'content': 0.03811389580368996, 'timestamp': '2025-09-30 22:06:48.978889', 'step': 585, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:49.627435', 'step': 585, 'epoch': 1} {'type': 'pplx', 'content': 102180968.26728095, 'timestamp': '2025-09-30 22:06:49.632578', 'step': 585, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:49.664846', 'step': 585, 'epoch': 1} {'type': 'loss', 'content': 0.0035328001249581575, 'timestamp': '2025-09-30 22:06:49.666935', 'step': 586, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:49.700507', 'step': 586, 'epoch': 1} {'type': 'loss', 'content': 0.00914231687784195, 'timestamp': '2025-09-30 22:06:49.705252', 'step': 587, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:49.737368', 'step': 587, 'epoch': 1} {'type': 'loss', 'content': 0.04065397009253502, 'timestamp': '2025-09-30 22:06:49.760855', 'step': 588, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:49.792496', 'step': 588, 'epoch': 1} {'type': 'loss', 'content': 0.010214920155704021, 'timestamp': '2025-09-30 22:06:49.794593', 'step': 589, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:49.825056', 'step': 589, 'epoch': 1} {'type': 'loss', 'content': 0.0308766420930624, 'timestamp': '2025-09-30 22:06:49.827892', 'step': 590, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:49.858220', 'step': 590, 'epoch': 1} {'type': 'loss', 'content': 0.024278799071907997, 'timestamp': '2025-09-30 22:06:49.862898', 'step': 591, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:49.893886', 'step': 591, 'epoch': 1} {'type': 'loss', 'content': 0.016640285030007362, 'timestamp': '2025-09-30 22:06:49.918485', 'step': 592, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:49.950722', 'step': 592, 'epoch': 1} {'type': 'loss', 'content': 0.02836495451629162, 'timestamp': '2025-09-30 22:06:49.952780', 'step': 593, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:49.982820', 'step': 593, 'epoch': 1} {'type': 'loss', 'content': 0.01918208971619606, 'timestamp': '2025-09-30 22:06:49.985339', 'step': 594, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:50.017009', 'step': 594, 'epoch': 1} {'type': 'loss', 'content': 0.021310951560735703, 'timestamp': '2025-09-30 22:06:50.019831', 'step': 595, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:50.051700', 'step': 595, 'epoch': 1} {'type': 'loss', 'content': 0.017431020736694336, 'timestamp': '2025-09-30 22:06:50.075339', 'step': 596, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:50.105535', 'step': 596, 'epoch': 1} {'type': 'loss', 'content': 0.03336421772837639, 'timestamp': '2025-09-30 22:06:50.107671', 'step': 597, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:50.138886', 'step': 597, 'epoch': 1} {'type': 'loss', 'content': 0.005631003994494677, 'timestamp': '2025-09-30 22:06:50.141007', 'step': 598, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:50.171960', 'step': 598, 'epoch': 1} {'type': 'loss', 'content': 0.04496116191148758, 'timestamp': '2025-09-30 22:06:50.173807', 'step': 599, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:50.204838', 'step': 599, 'epoch': 1} {'type': 'loss', 'content': 0.0018515412230044603, 'timestamp': '2025-09-30 22:06:50.233766', 'step': 600, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:50.265278', 'step': 600, 'epoch': 1} {'type': 'loss', 'content': 0.02544139325618744, 'timestamp': '2025-09-30 22:06:50.267386', 'step': 601, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:50.299119', 'step': 601, 'epoch': 1} {'type': 'loss', 'content': 0.015904691070318222, 'timestamp': '2025-09-30 22:06:50.306245', 'step': 602, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:50.337712', 'step': 602, 'epoch': 1} {'type': 'loss', 'content': 0.010545952245593071, 'timestamp': '2025-09-30 22:06:50.340706', 'step': 603, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:50.372483', 'step': 603, 'epoch': 1} {'type': 'loss', 'content': 0.0034892302937805653, 'timestamp': '2025-09-30 22:06:50.397862', 'step': 604, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:50.430005', 'step': 604, 'epoch': 1} {'type': 'loss', 'content': 0.025599991902709007, 'timestamp': '2025-09-30 22:06:50.432156', 'step': 605, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:50.463610', 'step': 605, 'epoch': 1} {'type': 'loss', 'content': 0.038173574954271317, 'timestamp': '2025-09-30 22:06:50.465668', 'step': 606, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:50.498393', 'step': 606, 'epoch': 1} {'type': 'loss', 'content': 0.04549602046608925, 'timestamp': '2025-09-30 22:06:50.500506', 'step': 607, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:50.531694', 'step': 607, 'epoch': 1} {'type': 'loss', 'content': 0.022459452971816063, 'timestamp': '2025-09-30 22:06:50.555227', 'step': 608, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:50.586267', 'step': 608, 'epoch': 1} {'type': 'loss', 'content': 0.032032039016485214, 'timestamp': '2025-09-30 22:06:50.588709', 'step': 609, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:50.621468', 'step': 609, 'epoch': 1} {'type': 'loss', 'content': 0.009997963905334473, 'timestamp': '2025-09-30 22:06:50.623416', 'step': 610, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:50.653701', 'step': 610, 'epoch': 1} {'type': 'loss', 'content': 0.015141839161515236, 'timestamp': '2025-09-30 22:06:50.658433', 'step': 611, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:50.690009', 'step': 611, 'epoch': 1} {'type': 'loss', 'content': 0.005697866436094046, 'timestamp': '2025-09-30 22:06:50.718167', 'step': 612, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:50.749478', 'step': 612, 'epoch': 1} {'type': 'loss', 'content': 0.014740964397788048, 'timestamp': '2025-09-30 22:06:50.754150', 'step': 613, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:50.785880', 'step': 613, 'epoch': 1} {'type': 'loss', 'content': 0.01579163782298565, 'timestamp': '2025-09-30 22:06:50.787879', 'step': 614, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:50.821175', 'step': 614, 'epoch': 1} {'type': 'loss', 'content': 0.01162130106240511, 'timestamp': '2025-09-30 22:06:50.828468', 'step': 615, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:50.859442', 'step': 615, 'epoch': 1} {'type': 'loss', 'content': 0.03986949473619461, 'timestamp': '2025-09-30 22:06:50.883321', 'step': 616, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:50.916008', 'step': 616, 'epoch': 1} {'type': 'loss', 'content': 0.023310085758566856, 'timestamp': '2025-09-30 22:06:50.920911', 'step': 617, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:50.953321', 'step': 617, 'epoch': 1} {'type': 'loss', 'content': 0.030140681192278862, 'timestamp': '2025-09-30 22:06:50.955739', 'step': 618, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:50.987428', 'step': 618, 'epoch': 1} {'type': 'loss', 'content': 0.019153164699673653, 'timestamp': '2025-09-30 22:06:50.991908', 'step': 619, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:51.027299', 'step': 619, 'epoch': 1} {'type': 'loss', 'content': 0.009747117757797241, 'timestamp': '2025-09-30 22:06:51.054890', 'step': 620, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:51.086707', 'step': 620, 'epoch': 1} {'type': 'loss', 'content': 0.021668700501322746, 'timestamp': '2025-09-30 22:06:51.088771', 'step': 621, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:51.122022', 'step': 621, 'epoch': 1} {'type': 'loss', 'content': 0.004393186420202255, 'timestamp': '2025-09-30 22:06:51.129028', 'step': 622, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [2, 192], 'flops': 2847885110400}, 'timestamp': '2025-09-30 22:06:51.160010', 'step': 622, 'epoch': 1} {'type': 'loss', 'content': 0.027998492121696472, 'timestamp': '2025-09-30 22:06:51.162315', 'step': 623, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:51.211000', 'step': 623, 'epoch': 2} {'type': 'loss', 'content': 0.015790753066539764, 'timestamp': '2025-09-30 22:06:51.234771', 'step': 624, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:51.887027', 'step': 624, 'epoch': 2} {'type': 'pplx', 'content': 81566219.8820279, 'timestamp': '2025-09-30 22:06:51.892545', 'step': 624, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:51.925548', 'step': 624, 'epoch': 2} {'type': 'loss', 'content': 0.020654765889048576, 'timestamp': '2025-09-30 22:06:51.930505', 'step': 625, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:51.965820', 'step': 625, 'epoch': 2} {'type': 'loss', 'content': 0.011045219376683235, 'timestamp': '2025-09-30 22:06:51.972662', 'step': 626, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:52.006161', 'step': 626, 'epoch': 2} {'type': 'loss', 'content': 0.009478067047894001, 'timestamp': '2025-09-30 22:06:52.017673', 'step': 627, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:52.054213', 'step': 627, 'epoch': 2} {'type': 'loss', 'content': 0.004149326588958502, 'timestamp': '2025-09-30 22:06:52.082814', 'step': 628, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:52.115593', 'step': 628, 'epoch': 2} {'type': 'loss', 'content': 0.015558170154690742, 'timestamp': '2025-09-30 22:06:52.120693', 'step': 629, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:52.155400', 'step': 629, 'epoch': 2} {'type': 'loss', 'content': 0.012335834093391895, 'timestamp': '2025-09-30 22:06:52.160827', 'step': 630, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:52.194776', 'step': 630, 'epoch': 2} {'type': 'loss', 'content': 0.029784584417939186, 'timestamp': '2025-09-30 22:06:52.199499', 'step': 631, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:52.232499', 'step': 631, 'epoch': 2} {'type': 'loss', 'content': 0.015622490085661411, 'timestamp': '2025-09-30 22:06:52.256461', 'step': 632, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:52.291936', 'step': 632, 'epoch': 2} {'type': 'loss', 'content': 0.012185090221464634, 'timestamp': '2025-09-30 22:06:52.295534', 'step': 633, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:52.329014', 'step': 633, 'epoch': 2} {'type': 'loss', 'content': 0.040238264948129654, 'timestamp': '2025-09-30 22:06:52.336786', 'step': 634, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:52.369335', 'step': 634, 'epoch': 2} {'type': 'loss', 'content': 0.008953387849032879, 'timestamp': '2025-09-30 22:06:52.373707', 'step': 635, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:52.406464', 'step': 635, 'epoch': 2} {'type': 'loss', 'content': 0.008726871572434902, 'timestamp': '2025-09-30 22:06:52.433313', 'step': 636, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:52.471470', 'step': 636, 'epoch': 2} {'type': 'loss', 'content': 0.02807791158556938, 'timestamp': '2025-09-30 22:06:52.477576', 'step': 637, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:52.513510', 'step': 637, 'epoch': 2} {'type': 'loss', 'content': 0.021706465631723404, 'timestamp': '2025-09-30 22:06:52.519779', 'step': 638, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:52.555123', 'step': 638, 'epoch': 2} {'type': 'loss', 'content': 0.028942091390490532, 'timestamp': '2025-09-30 22:06:52.562247', 'step': 639, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:52.598303', 'step': 639, 'epoch': 2} {'type': 'loss', 'content': 0.021636173129081726, 'timestamp': '2025-09-30 22:06:52.623774', 'step': 640, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:52.658416', 'step': 640, 'epoch': 2} {'type': 'loss', 'content': 0.010957852937281132, 'timestamp': '2025-09-30 22:06:52.663084', 'step': 641, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:52.696076', 'step': 641, 'epoch': 2} {'type': 'loss', 'content': 0.015631377696990967, 'timestamp': '2025-09-30 22:06:52.699458', 'step': 642, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:52.733204', 'step': 642, 'epoch': 2} {'type': 'loss', 'content': 0.01614670641720295, 'timestamp': '2025-09-30 22:06:52.740166', 'step': 643, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:52.774097', 'step': 643, 'epoch': 2} {'type': 'loss', 'content': 0.020787807181477547, 'timestamp': '2025-09-30 22:06:52.802283', 'step': 644, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:52.833998', 'step': 644, 'epoch': 2} {'type': 'loss', 'content': 0.017705578356981277, 'timestamp': '2025-09-30 22:06:52.836302', 'step': 645, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:52.869466', 'step': 645, 'epoch': 2} {'type': 'loss', 'content': 0.022793063893914223, 'timestamp': '2025-09-30 22:06:52.873043', 'step': 646, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:52.906786', 'step': 646, 'epoch': 2} {'type': 'loss', 'content': 0.03414614126086235, 'timestamp': '2025-09-30 22:06:52.910099', 'step': 647, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:52.948576', 'step': 647, 'epoch': 2} {'type': 'loss', 'content': 0.006914376746863127, 'timestamp': '2025-09-30 22:06:52.976574', 'step': 648, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:53.016760', 'step': 648, 'epoch': 2} {'type': 'loss', 'content': 0.00787010695785284, 'timestamp': '2025-09-30 22:06:53.026794', 'step': 649, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:53.058331', 'step': 649, 'epoch': 2} {'type': 'loss', 'content': 0.037422794848680496, 'timestamp': '2025-09-30 22:06:53.064791', 'step': 650, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:53.099702', 'step': 650, 'epoch': 2} {'type': 'loss', 'content': 0.024876395240426064, 'timestamp': '2025-09-30 22:06:53.105407', 'step': 651, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:53.140219', 'step': 651, 'epoch': 2} {'type': 'loss', 'content': 0.013162984512746334, 'timestamp': '2025-09-30 22:06:53.168991', 'step': 652, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:53.207410', 'step': 652, 'epoch': 2} {'type': 'loss', 'content': 0.00827017705887556, 'timestamp': '2025-09-30 22:06:53.212882', 'step': 653, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:53.246307', 'step': 653, 'epoch': 2} {'type': 'loss', 'content': 0.005606896709650755, 'timestamp': '2025-09-30 22:06:53.252067', 'step': 654, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:53.287654', 'step': 654, 'epoch': 2} {'type': 'loss', 'content': 0.020719783380627632, 'timestamp': '2025-09-30 22:06:53.290458', 'step': 655, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:53.326242', 'step': 655, 'epoch': 2} {'type': 'loss', 'content': 0.018828241154551506, 'timestamp': '2025-09-30 22:06:53.354213', 'step': 656, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:53.385968', 'step': 656, 'epoch': 2} {'type': 'loss', 'content': 0.021952269598841667, 'timestamp': '2025-09-30 22:06:53.390774', 'step': 657, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:53.423627', 'step': 657, 'epoch': 2} {'type': 'loss', 'content': 0.012793275527656078, 'timestamp': '2025-09-30 22:06:53.429179', 'step': 658, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:53.464103', 'step': 658, 'epoch': 2} {'type': 'loss', 'content': 0.01424416247755289, 'timestamp': '2025-09-30 22:06:53.466970', 'step': 659, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:53.500331', 'step': 659, 'epoch': 2} {'type': 'loss', 'content': 0.020341385155916214, 'timestamp': '2025-09-30 22:06:53.525516', 'step': 660, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:53.556820', 'step': 660, 'epoch': 2} {'type': 'loss', 'content': 0.01396105531603098, 'timestamp': '2025-09-30 22:06:53.560714', 'step': 661, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:53.593855', 'step': 661, 'epoch': 2} {'type': 'loss', 'content': 0.015063964761793613, 'timestamp': '2025-09-30 22:06:53.597857', 'step': 662, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:53.630838', 'step': 662, 'epoch': 2} {'type': 'loss', 'content': 0.019370300695300102, 'timestamp': '2025-09-30 22:06:53.636632', 'step': 663, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:54.267786', 'step': 663, 'epoch': 2} {'type': 'pplx', 'content': 77489775.34361932, 'timestamp': '2025-09-30 22:06:54.273186', 'step': 663, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:54.306633', 'step': 663, 'epoch': 2} {'type': 'loss', 'content': 0.014374351128935814, 'timestamp': '2025-09-30 22:06:54.336794', 'step': 664, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:54.368160', 'step': 664, 'epoch': 2} {'type': 'loss', 'content': 0.027066349983215332, 'timestamp': '2025-09-30 22:06:54.371663', 'step': 665, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:54.403813', 'step': 665, 'epoch': 2} {'type': 'loss', 'content': 0.013484174385666847, 'timestamp': '2025-09-30 22:06:54.408358', 'step': 666, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:54.440538', 'step': 666, 'epoch': 2} {'type': 'loss', 'content': 0.045839279890060425, 'timestamp': '2025-09-30 22:06:54.447773', 'step': 667, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:54.479727', 'step': 667, 'epoch': 2} {'type': 'loss', 'content': 0.0061507937498390675, 'timestamp': '2025-09-30 22:06:54.504350', 'step': 668, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:54.535714', 'step': 668, 'epoch': 2} {'type': 'loss', 'content': 0.01860460638999939, 'timestamp': '2025-09-30 22:06:54.538570', 'step': 669, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:54.570544', 'step': 669, 'epoch': 2} {'type': 'loss', 'content': 0.02531057968735695, 'timestamp': '2025-09-30 22:06:54.575076', 'step': 670, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:54.609118', 'step': 670, 'epoch': 2} {'type': 'loss', 'content': 0.02880782261490822, 'timestamp': '2025-09-30 22:06:54.613357', 'step': 671, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:54.649347', 'step': 671, 'epoch': 2} {'type': 'loss', 'content': 0.02219448611140251, 'timestamp': '2025-09-30 22:06:54.673704', 'step': 672, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:54.704753', 'step': 672, 'epoch': 2} {'type': 'loss', 'content': 0.012237013317644596, 'timestamp': '2025-09-30 22:06:54.713283', 'step': 673, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:54.748688', 'step': 673, 'epoch': 2} {'type': 'loss', 'content': 0.020764613524079323, 'timestamp': '2025-09-30 22:06:54.750742', 'step': 674, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:54.781620', 'step': 674, 'epoch': 2} {'type': 'loss', 'content': 0.003978596068918705, 'timestamp': '2025-09-30 22:06:54.786220', 'step': 675, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:54.818060', 'step': 675, 'epoch': 2} {'type': 'loss', 'content': 0.028562063351273537, 'timestamp': '2025-09-30 22:06:54.842486', 'step': 676, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:54.876042', 'step': 676, 'epoch': 2} {'type': 'loss', 'content': 0.009312150999903679, 'timestamp': '2025-09-30 22:06:54.879318', 'step': 677, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:54.911098', 'step': 677, 'epoch': 2} {'type': 'loss', 'content': 0.014050443656742573, 'timestamp': '2025-09-30 22:06:54.916843', 'step': 678, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:54.951884', 'step': 678, 'epoch': 2} {'type': 'loss', 'content': 0.020990783348679543, 'timestamp': '2025-09-30 22:06:54.956016', 'step': 679, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:54.988202', 'step': 679, 'epoch': 2} {'type': 'loss', 'content': 0.01680334284901619, 'timestamp': '2025-09-30 22:06:55.012484', 'step': 680, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:55.046218', 'step': 680, 'epoch': 2} {'type': 'loss', 'content': 0.027634665369987488, 'timestamp': '2025-09-30 22:06:55.050194', 'step': 681, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:55.084154', 'step': 681, 'epoch': 2} {'type': 'loss', 'content': 0.017819708213210106, 'timestamp': '2025-09-30 22:06:55.086959', 'step': 682, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:55.117628', 'step': 682, 'epoch': 2} {'type': 'loss', 'content': 0.022069228813052177, 'timestamp': '2025-09-30 22:06:55.121770', 'step': 683, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:55.153780', 'step': 683, 'epoch': 2} {'type': 'loss', 'content': 0.013544796034693718, 'timestamp': '2025-09-30 22:06:55.178806', 'step': 684, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:55.211027', 'step': 684, 'epoch': 2} {'type': 'loss', 'content': 0.01913309469819069, 'timestamp': '2025-09-30 22:06:55.213480', 'step': 685, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:55.245152', 'step': 685, 'epoch': 2} {'type': 'loss', 'content': 0.009119086898863316, 'timestamp': '2025-09-30 22:06:55.249343', 'step': 686, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:55.281575', 'step': 686, 'epoch': 2} {'type': 'loss', 'content': 0.016176508739590645, 'timestamp': '2025-09-30 22:06:55.286110', 'step': 687, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:55.317259', 'step': 687, 'epoch': 2} {'type': 'loss', 'content': 0.020089006051421165, 'timestamp': '2025-09-30 22:06:55.342494', 'step': 688, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:55.373780', 'step': 688, 'epoch': 2} {'type': 'loss', 'content': 0.019316306337714195, 'timestamp': '2025-09-30 22:06:55.376470', 'step': 689, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:55.410938', 'step': 689, 'epoch': 2} {'type': 'loss', 'content': 0.012639305554330349, 'timestamp': '2025-09-30 22:06:55.416491', 'step': 690, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:55.450366', 'step': 690, 'epoch': 2} {'type': 'loss', 'content': 0.010887724347412586, 'timestamp': '2025-09-30 22:06:55.454657', 'step': 691, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:55.486573', 'step': 691, 'epoch': 2} {'type': 'loss', 'content': 0.029533464461565018, 'timestamp': '2025-09-30 22:06:55.511717', 'step': 692, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:55.543655', 'step': 692, 'epoch': 2} {'type': 'loss', 'content': 0.01161243673413992, 'timestamp': '2025-09-30 22:06:55.546033', 'step': 693, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:55.578061', 'step': 693, 'epoch': 2} {'type': 'loss', 'content': 0.014599710702896118, 'timestamp': '2025-09-30 22:06:55.580023', 'step': 694, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:55.613907', 'step': 694, 'epoch': 2} {'type': 'loss', 'content': 0.013431690633296967, 'timestamp': '2025-09-30 22:06:55.615914', 'step': 695, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:55.647063', 'step': 695, 'epoch': 2} {'type': 'loss', 'content': 0.015383687801659107, 'timestamp': '2025-09-30 22:06:55.674080', 'step': 696, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:55.707054', 'step': 696, 'epoch': 2} {'type': 'loss', 'content': 0.008616768755018711, 'timestamp': '2025-09-30 22:06:55.710421', 'step': 697, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:55.742911', 'step': 697, 'epoch': 2} {'type': 'loss', 'content': 0.004892031196504831, 'timestamp': '2025-09-30 22:06:55.745233', 'step': 698, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:55.776553', 'step': 698, 'epoch': 2} {'type': 'loss', 'content': 0.004173830151557922, 'timestamp': '2025-09-30 22:06:55.781131', 'step': 699, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:55.812352', 'step': 699, 'epoch': 2} {'type': 'loss', 'content': 0.019902020692825317, 'timestamp': '2025-09-30 22:06:55.836066', 'step': 700, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:55.867295', 'step': 700, 'epoch': 2} {'type': 'loss', 'content': 0.014127095229923725, 'timestamp': '2025-09-30 22:06:55.873163', 'step': 701, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:55.909001', 'step': 701, 'epoch': 2} {'type': 'loss', 'content': 0.018558593466877937, 'timestamp': '2025-09-30 22:06:55.916765', 'step': 702, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:56.536558', 'step': 702, 'epoch': 2} {'type': 'pplx', 'content': 78927595.24360396, 'timestamp': '2025-09-30 22:06:56.538322', 'step': 702, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:56.567337', 'step': 702, 'epoch': 2} {'type': 'loss', 'content': 0.006854670587927103, 'timestamp': '2025-09-30 22:06:56.571745', 'step': 703, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:56.604593', 'step': 703, 'epoch': 2} {'type': 'loss', 'content': 0.010047112591564655, 'timestamp': '2025-09-30 22:06:56.632147', 'step': 704, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:56.663550', 'step': 704, 'epoch': 2} {'type': 'loss', 'content': 0.007333940826356411, 'timestamp': '2025-09-30 22:06:56.665848', 'step': 705, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:56.696003', 'step': 705, 'epoch': 2} {'type': 'loss', 'content': 0.01448789145797491, 'timestamp': '2025-09-30 22:06:56.703808', 'step': 706, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:56.735622', 'step': 706, 'epoch': 2} {'type': 'loss', 'content': 0.016120653599500656, 'timestamp': '2025-09-30 22:06:56.739942', 'step': 707, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:56.770880', 'step': 707, 'epoch': 2} {'type': 'loss', 'content': 0.0023472048342227936, 'timestamp': '2025-09-30 22:06:56.799131', 'step': 708, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:56.829559', 'step': 708, 'epoch': 2} {'type': 'loss', 'content': 0.005357666406780481, 'timestamp': '2025-09-30 22:06:56.831489', 'step': 709, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:56.862387', 'step': 709, 'epoch': 2} {'type': 'loss', 'content': 0.016620632261037827, 'timestamp': '2025-09-30 22:06:56.870095', 'step': 710, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:56.900537', 'step': 710, 'epoch': 2} {'type': 'loss', 'content': 0.004693766124546528, 'timestamp': '2025-09-30 22:06:56.902712', 'step': 711, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:56.933407', 'step': 711, 'epoch': 2} {'type': 'loss', 'content': 0.020208735018968582, 'timestamp': '2025-09-30 22:06:56.961351', 'step': 712, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:56.994316', 'step': 712, 'epoch': 2} {'type': 'loss', 'content': 0.00886120367795229, 'timestamp': '2025-09-30 22:06:56.996243', 'step': 713, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:57.054146', 'step': 713, 'epoch': 2} {'type': 'loss', 'content': 0.008350329473614693, 'timestamp': '2025-09-30 22:06:57.057065', 'step': 714, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:57.089829', 'step': 714, 'epoch': 2} {'type': 'loss', 'content': 0.009579906240105629, 'timestamp': '2025-09-30 22:06:57.094350', 'step': 715, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:57.125239', 'step': 715, 'epoch': 2} {'type': 'loss', 'content': 0.02509910985827446, 'timestamp': '2025-09-30 22:06:57.150330', 'step': 716, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:57.181409', 'step': 716, 'epoch': 2} {'type': 'loss', 'content': 0.01436839159578085, 'timestamp': '2025-09-30 22:06:57.183643', 'step': 717, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:57.213826', 'step': 717, 'epoch': 2} {'type': 'loss', 'content': 0.0069790855050086975, 'timestamp': '2025-09-30 22:06:57.218317', 'step': 718, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:57.249048', 'step': 718, 'epoch': 2} {'type': 'loss', 'content': 0.023231515660881996, 'timestamp': '2025-09-30 22:06:57.253207', 'step': 719, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:57.284159', 'step': 719, 'epoch': 2} {'type': 'loss', 'content': 0.013670231215655804, 'timestamp': '2025-09-30 22:06:57.307905', 'step': 720, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:57.338303', 'step': 720, 'epoch': 2} {'type': 'loss', 'content': 0.025221526622772217, 'timestamp': '2025-09-30 22:06:57.340345', 'step': 721, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:57.371395', 'step': 721, 'epoch': 2} {'type': 'loss', 'content': 0.003092572558671236, 'timestamp': '2025-09-30 22:06:57.379091', 'step': 722, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:57.409678', 'step': 722, 'epoch': 2} {'type': 'loss', 'content': 0.0018878942355513573, 'timestamp': '2025-09-30 22:06:57.412535', 'step': 723, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:57.444380', 'step': 723, 'epoch': 2} {'type': 'loss', 'content': 0.005184180103242397, 'timestamp': '2025-09-30 22:06:57.472480', 'step': 724, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:57.503000', 'step': 724, 'epoch': 2} {'type': 'loss', 'content': 0.004255128558725119, 'timestamp': '2025-09-30 22:06:57.505485', 'step': 725, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:57.536205', 'step': 725, 'epoch': 2} {'type': 'loss', 'content': 0.01653963141143322, 'timestamp': '2025-09-30 22:06:57.538885', 'step': 726, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:57.569237', 'step': 726, 'epoch': 2} {'type': 'loss', 'content': 0.012326141819357872, 'timestamp': '2025-09-30 22:06:57.573907', 'step': 727, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:57.605730', 'step': 727, 'epoch': 2} {'type': 'loss', 'content': 0.00445201713591814, 'timestamp': '2025-09-30 22:06:57.634845', 'step': 728, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:57.668996', 'step': 728, 'epoch': 2} {'type': 'loss', 'content': 0.0036043657455593348, 'timestamp': '2025-09-30 22:06:57.671120', 'step': 729, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:57.701846', 'step': 729, 'epoch': 2} {'type': 'loss', 'content': 0.009894789196550846, 'timestamp': '2025-09-30 22:06:57.706193', 'step': 730, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:57.738040', 'step': 730, 'epoch': 2} {'type': 'loss', 'content': 0.00579670537263155, 'timestamp': '2025-09-30 22:06:57.740011', 'step': 731, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:57.771121', 'step': 731, 'epoch': 2} {'type': 'loss', 'content': 0.01987524703145027, 'timestamp': '2025-09-30 22:06:57.794816', 'step': 732, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:57.826959', 'step': 732, 'epoch': 2} {'type': 'loss', 'content': 0.01371761504560709, 'timestamp': '2025-09-30 22:06:57.829307', 'step': 733, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:57.860216', 'step': 733, 'epoch': 2} {'type': 'loss', 'content': 0.02026171050965786, 'timestamp': '2025-09-30 22:06:57.864912', 'step': 734, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:57.896655', 'step': 734, 'epoch': 2} {'type': 'loss', 'content': 0.009422077797353268, 'timestamp': '2025-09-30 22:06:57.899037', 'step': 735, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:57.931479', 'step': 735, 'epoch': 2} {'type': 'loss', 'content': 0.007785578723996878, 'timestamp': '2025-09-30 22:06:57.956695', 'step': 736, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:57.987650', 'step': 736, 'epoch': 2} {'type': 'loss', 'content': 0.01787823997437954, 'timestamp': '2025-09-30 22:06:57.992954', 'step': 737, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:58.022947', 'step': 737, 'epoch': 2} {'type': 'loss', 'content': 0.01570359617471695, 'timestamp': '2025-09-30 22:06:58.026151', 'step': 738, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:58.057153', 'step': 738, 'epoch': 2} {'type': 'loss', 'content': 0.005546352826058865, 'timestamp': '2025-09-30 22:06:58.061668', 'step': 739, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:58.093400', 'step': 739, 'epoch': 2} {'type': 'loss', 'content': 0.002934214426204562, 'timestamp': '2025-09-30 22:06:58.118087', 'step': 740, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:58.149705', 'step': 740, 'epoch': 2} {'type': 'loss', 'content': 0.017422713339328766, 'timestamp': '2025-09-30 22:06:58.154336', 'step': 741, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:06:58.767143', 'step': 741, 'epoch': 2} {'type': 'pplx', 'content': 89899745.22455357, 'timestamp': '2025-09-30 22:06:58.769114', 'step': 741, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:58.806015', 'step': 741, 'epoch': 2} {'type': 'loss', 'content': 0.009376317262649536, 'timestamp': '2025-09-30 22:06:58.810769', 'step': 742, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:06:58.841216', 'step': 742, 'epoch': 2} {'type': 'loss', 'content': 0.010158979333937168, 'timestamp': '2025-09-30 22:06:58.852371', 'step': 743, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:58.883114', 'step': 743, 'epoch': 2} {'type': 'loss', 'content': 0.00397266773506999, 'timestamp': '2025-09-30 22:06:58.908797', 'step': 744, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:58.939482', 'step': 744, 'epoch': 2} {'type': 'loss', 'content': 0.04606813192367554, 'timestamp': '2025-09-30 22:06:58.941486', 'step': 745, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:58.972123', 'step': 745, 'epoch': 2} {'type': 'loss', 'content': 0.03550694137811661, 'timestamp': '2025-09-30 22:06:58.976626', 'step': 746, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:06:59.007952', 'step': 746, 'epoch': 2} {'type': 'loss', 'content': 0.015329292044043541, 'timestamp': '2025-09-30 22:06:59.015089', 'step': 747, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:06:59.046113', 'step': 747, 'epoch': 2} {'type': 'loss', 'content': 0.019602473825216293, 'timestamp': '2025-09-30 22:06:59.070905', 'step': 748, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:59.101710', 'step': 748, 'epoch': 2} {'type': 'loss', 'content': 0.0035281218588352203, 'timestamp': '2025-09-30 22:06:59.106663', 'step': 749, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:59.140814', 'step': 749, 'epoch': 2} {'type': 'loss', 'content': 0.0005557361873798072, 'timestamp': '2025-09-30 22:06:59.145253', 'step': 750, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:59.179556', 'step': 750, 'epoch': 2} {'type': 'loss', 'content': 0.0015190315898507833, 'timestamp': '2025-09-30 22:06:59.183421', 'step': 751, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:59.216433', 'step': 751, 'epoch': 2} {'type': 'loss', 'content': 0.0053230589255690575, 'timestamp': '2025-09-30 22:06:59.243120', 'step': 752, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:06:59.275132', 'step': 752, 'epoch': 2} {'type': 'loss', 'content': 0.009202539920806885, 'timestamp': '2025-09-30 22:06:59.282949', 'step': 753, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:59.314704', 'step': 753, 'epoch': 2} {'type': 'loss', 'content': 0.0009166055242531002, 'timestamp': '2025-09-30 22:06:59.318987', 'step': 754, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:59.351436', 'step': 754, 'epoch': 2} {'type': 'loss', 'content': 0.0019864339847117662, 'timestamp': '2025-09-30 22:06:59.354462', 'step': 755, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:59.387145', 'step': 755, 'epoch': 2} {'type': 'loss', 'content': 0.0035643898881971836, 'timestamp': '2025-09-30 22:06:59.412191', 'step': 756, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:06:59.444629', 'step': 756, 'epoch': 2} {'type': 'loss', 'content': 0.006387303117662668, 'timestamp': '2025-09-30 22:06:59.449290', 'step': 757, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:06:59.481167', 'step': 757, 'epoch': 2} {'type': 'loss', 'content': 0.05764467641711235, 'timestamp': '2025-09-30 22:06:59.488748', 'step': 758, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:59.520634', 'step': 758, 'epoch': 2} {'type': 'loss', 'content': 0.0008298616739921272, 'timestamp': '2025-09-30 22:06:59.525089', 'step': 759, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:06:59.556392', 'step': 759, 'epoch': 2} {'type': 'loss', 'content': 0.01580825448036194, 'timestamp': '2025-09-30 22:06:59.580149', 'step': 760, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:59.610452', 'step': 760, 'epoch': 2} {'type': 'loss', 'content': 0.0028829979710280895, 'timestamp': '2025-09-30 22:06:59.613808', 'step': 761, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:06:59.649586', 'step': 761, 'epoch': 2} {'type': 'loss', 'content': 0.06186835095286369, 'timestamp': '2025-09-30 22:06:59.654187', 'step': 762, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:59.686470', 'step': 762, 'epoch': 2} {'type': 'loss', 'content': 0.0009510466479696333, 'timestamp': '2025-09-30 22:06:59.688793', 'step': 763, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:06:59.719817', 'step': 763, 'epoch': 2} {'type': 'loss', 'content': 0.001385956653393805, 'timestamp': '2025-09-30 22:06:59.743490', 'step': 764, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:06:59.774084', 'step': 764, 'epoch': 2} {'type': 'loss', 'content': 0.021189337596297264, 'timestamp': '2025-09-30 22:06:59.779826', 'step': 765, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:59.810373', 'step': 765, 'epoch': 2} {'type': 'loss', 'content': 0.001855433569289744, 'timestamp': '2025-09-30 22:06:59.812619', 'step': 766, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:59.843261', 'step': 766, 'epoch': 2} {'type': 'loss', 'content': 0.0033974931575357914, 'timestamp': '2025-09-30 22:06:59.845376', 'step': 767, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:59.875644', 'step': 767, 'epoch': 2} {'type': 'loss', 'content': 0.024456506595015526, 'timestamp': '2025-09-30 22:06:59.899138', 'step': 768, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:06:59.929626', 'step': 768, 'epoch': 2} {'type': 'loss', 'content': 0.009055771864950657, 'timestamp': '2025-09-30 22:06:59.931610', 'step': 769, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:06:59.962127', 'step': 769, 'epoch': 2} {'type': 'loss', 'content': 0.005355574190616608, 'timestamp': '2025-09-30 22:06:59.964077', 'step': 770, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:06:59.994278', 'step': 770, 'epoch': 2} {'type': 'loss', 'content': 0.006609226576983929, 'timestamp': '2025-09-30 22:06:59.998682', 'step': 771, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:00.030577', 'step': 771, 'epoch': 2} {'type': 'loss', 'content': 0.014681574888527393, 'timestamp': '2025-09-30 22:07:00.055805', 'step': 772, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:00.087352', 'step': 772, 'epoch': 2} {'type': 'loss', 'content': 0.04299915209412575, 'timestamp': '2025-09-30 22:07:00.089926', 'step': 773, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:00.120826', 'step': 773, 'epoch': 2} {'type': 'loss', 'content': 0.0018289716681465507, 'timestamp': '2025-09-30 22:07:00.123339', 'step': 774, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:00.154141', 'step': 774, 'epoch': 2} {'type': 'loss', 'content': 0.029209045693278313, 'timestamp': '2025-09-30 22:07:00.155948', 'step': 775, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:00.186556', 'step': 775, 'epoch': 2} {'type': 'loss', 'content': 0.010646182112395763, 'timestamp': '2025-09-30 22:07:00.210376', 'step': 776, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:00.240818', 'step': 776, 'epoch': 2} {'type': 'loss', 'content': 0.037542395293712616, 'timestamp': '2025-09-30 22:07:00.243006', 'step': 777, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:00.273017', 'step': 777, 'epoch': 2} {'type': 'loss', 'content': 0.008672518655657768, 'timestamp': '2025-09-30 22:07:00.277318', 'step': 778, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:00.308323', 'step': 778, 'epoch': 2} {'type': 'loss', 'content': 0.012219659052789211, 'timestamp': '2025-09-30 22:07:00.315325', 'step': 779, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:00.345963', 'step': 779, 'epoch': 2} {'type': 'loss', 'content': 0.001221206970512867, 'timestamp': '2025-09-30 22:07:00.369744', 'step': 780, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:00.970836', 'step': 780, 'epoch': 2} {'type': 'pplx', 'content': 90265809.0697191, 'timestamp': '2025-09-30 22:07:00.972470', 'step': 780, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:01.000328', 'step': 780, 'epoch': 2} {'type': 'loss', 'content': 0.05539524182677269, 'timestamp': '2025-09-30 22:07:01.003114', 'step': 781, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:01.035736', 'step': 781, 'epoch': 2} {'type': 'loss', 'content': 0.03203495219349861, 'timestamp': '2025-09-30 22:07:01.037895', 'step': 782, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:01.074428', 'step': 782, 'epoch': 2} {'type': 'loss', 'content': 0.013802976347506046, 'timestamp': '2025-09-30 22:07:01.081838', 'step': 783, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:01.113051', 'step': 783, 'epoch': 2} {'type': 'loss', 'content': 0.017052913084626198, 'timestamp': '2025-09-30 22:07:01.138288', 'step': 784, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:01.169140', 'step': 784, 'epoch': 2} {'type': 'loss', 'content': 0.007087051402777433, 'timestamp': '2025-09-30 22:07:01.171053', 'step': 785, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:01.202539', 'step': 785, 'epoch': 2} {'type': 'loss', 'content': 0.022962896153330803, 'timestamp': '2025-09-30 22:07:01.210296', 'step': 786, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:01.240714', 'step': 786, 'epoch': 2} {'type': 'loss', 'content': 0.0020143541041761637, 'timestamp': '2025-09-30 22:07:01.245325', 'step': 787, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:01.275591', 'step': 787, 'epoch': 2} {'type': 'loss', 'content': 0.004092036280781031, 'timestamp': '2025-09-30 22:07:01.299424', 'step': 788, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:01.329115', 'step': 788, 'epoch': 2} {'type': 'loss', 'content': 0.014684909954667091, 'timestamp': '2025-09-30 22:07:01.331208', 'step': 789, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:01.365841', 'step': 789, 'epoch': 2} {'type': 'loss', 'content': 0.003241106402128935, 'timestamp': '2025-09-30 22:07:01.370238', 'step': 790, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:01.400945', 'step': 790, 'epoch': 2} {'type': 'loss', 'content': 0.015645338222384453, 'timestamp': '2025-09-30 22:07:01.403706', 'step': 791, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:01.434597', 'step': 791, 'epoch': 2} {'type': 'loss', 'content': 0.0030750453006476164, 'timestamp': '2025-09-30 22:07:01.463080', 'step': 792, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:01.496741', 'step': 792, 'epoch': 2} {'type': 'loss', 'content': 0.0076875039376318455, 'timestamp': '2025-09-30 22:07:01.499595', 'step': 793, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:01.530808', 'step': 793, 'epoch': 2} {'type': 'loss', 'content': 0.02010907046496868, 'timestamp': '2025-09-30 22:07:01.533212', 'step': 794, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:01.565119', 'step': 794, 'epoch': 2} {'type': 'loss', 'content': 0.014395522885024548, 'timestamp': '2025-09-30 22:07:01.572445', 'step': 795, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:01.602870', 'step': 795, 'epoch': 2} {'type': 'loss', 'content': 0.019928177818655968, 'timestamp': '2025-09-30 22:07:01.628077', 'step': 796, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:01.660832', 'step': 796, 'epoch': 2} {'type': 'loss', 'content': 0.036851465702056885, 'timestamp': '2025-09-30 22:07:01.662989', 'step': 797, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:01.700539', 'step': 797, 'epoch': 2} {'type': 'loss', 'content': 0.010141278617084026, 'timestamp': '2025-09-30 22:07:01.705207', 'step': 798, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:01.737319', 'step': 798, 'epoch': 2} {'type': 'loss', 'content': 0.0025164049584418535, 'timestamp': '2025-09-30 22:07:01.740106', 'step': 799, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:01.771272', 'step': 799, 'epoch': 2} {'type': 'loss', 'content': 0.019155463203787804, 'timestamp': '2025-09-30 22:07:01.794744', 'step': 800, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:01.826250', 'step': 800, 'epoch': 2} {'type': 'loss', 'content': 0.001676939777098596, 'timestamp': '2025-09-30 22:07:01.829063', 'step': 801, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:01.862176', 'step': 801, 'epoch': 2} {'type': 'loss', 'content': 0.0038142234552651644, 'timestamp': '2025-09-30 22:07:01.866392', 'step': 802, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:01.898311', 'step': 802, 'epoch': 2} {'type': 'loss', 'content': 0.008628753945231438, 'timestamp': '2025-09-30 22:07:01.901063', 'step': 803, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:01.934177', 'step': 803, 'epoch': 2} {'type': 'loss', 'content': 0.022448832169175148, 'timestamp': '2025-09-30 22:07:01.959464', 'step': 804, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:01.994813', 'step': 804, 'epoch': 2} {'type': 'loss', 'content': 0.03241322934627533, 'timestamp': '2025-09-30 22:07:02.000017', 'step': 805, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:02.033172', 'step': 805, 'epoch': 2} {'type': 'loss', 'content': 0.005572688765823841, 'timestamp': '2025-09-30 22:07:02.035635', 'step': 806, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:02.066587', 'step': 806, 'epoch': 2} {'type': 'loss', 'content': 0.0072050197049975395, 'timestamp': '2025-09-30 22:07:02.070044', 'step': 807, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:02.103462', 'step': 807, 'epoch': 2} {'type': 'loss', 'content': 0.009295523166656494, 'timestamp': '2025-09-30 22:07:02.128733', 'step': 808, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:02.161898', 'step': 808, 'epoch': 2} {'type': 'loss', 'content': 0.014571547508239746, 'timestamp': '2025-09-30 22:07:02.163856', 'step': 809, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:02.197047', 'step': 809, 'epoch': 2} {'type': 'loss', 'content': 0.015193559229373932, 'timestamp': '2025-09-30 22:07:02.200539', 'step': 810, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:02.235139', 'step': 810, 'epoch': 2} {'type': 'loss', 'content': 0.005540668033063412, 'timestamp': '2025-09-30 22:07:02.242422', 'step': 811, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:02.274900', 'step': 811, 'epoch': 2} {'type': 'loss', 'content': 0.008927380666136742, 'timestamp': '2025-09-30 22:07:02.300147', 'step': 812, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:02.334365', 'step': 812, 'epoch': 2} {'type': 'loss', 'content': 0.006226464174687862, 'timestamp': '2025-09-30 22:07:02.337229', 'step': 813, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:02.368031', 'step': 813, 'epoch': 2} {'type': 'loss', 'content': 0.02287706360220909, 'timestamp': '2025-09-30 22:07:02.370472', 'step': 814, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:02.402790', 'step': 814, 'epoch': 2} {'type': 'loss', 'content': 0.02037571184337139, 'timestamp': '2025-09-30 22:07:02.405310', 'step': 815, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:02.437854', 'step': 815, 'epoch': 2} {'type': 'loss', 'content': 0.005328350700438023, 'timestamp': '2025-09-30 22:07:02.469644', 'step': 816, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:02.501613', 'step': 816, 'epoch': 2} {'type': 'loss', 'content': 0.019001901149749756, 'timestamp': '2025-09-30 22:07:02.505118', 'step': 817, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:02.538290', 'step': 817, 'epoch': 2} {'type': 'loss', 'content': 0.0028542112559080124, 'timestamp': '2025-09-30 22:07:02.540778', 'step': 818, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:02.572538', 'step': 818, 'epoch': 2} {'type': 'loss', 'content': 0.015463702380657196, 'timestamp': '2025-09-30 22:07:02.577204', 'step': 819, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:03.192026', 'step': 819, 'epoch': 2} {'type': 'pplx', 'content': 84273412.49839544, 'timestamp': '2025-09-30 22:07:03.195515', 'step': 819, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:03.226123', 'step': 819, 'epoch': 2} {'type': 'loss', 'content': 0.010286093689501286, 'timestamp': '2025-09-30 22:07:03.251787', 'step': 820, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:03.286105', 'step': 820, 'epoch': 2} {'type': 'loss', 'content': 0.040026549249887466, 'timestamp': '2025-09-30 22:07:03.288272', 'step': 821, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:03.320426', 'step': 821, 'epoch': 2} {'type': 'loss', 'content': 0.0017085708677768707, 'timestamp': '2025-09-30 22:07:03.327568', 'step': 822, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:03.360075', 'step': 822, 'epoch': 2} {'type': 'loss', 'content': 0.002073936630040407, 'timestamp': '2025-09-30 22:07:03.367245', 'step': 823, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:03.403097', 'step': 823, 'epoch': 2} {'type': 'loss', 'content': 0.012383386492729187, 'timestamp': '2025-09-30 22:07:03.431148', 'step': 824, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:03.465505', 'step': 824, 'epoch': 2} {'type': 'loss', 'content': 0.005613238550722599, 'timestamp': '2025-09-30 22:07:03.467662', 'step': 825, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:03.499264', 'step': 825, 'epoch': 2} {'type': 'loss', 'content': 0.010608835145831108, 'timestamp': '2025-09-30 22:07:03.501447', 'step': 826, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:03.533710', 'step': 826, 'epoch': 2} {'type': 'loss', 'content': 0.016609007492661476, 'timestamp': '2025-09-30 22:07:03.536995', 'step': 827, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:03.570057', 'step': 827, 'epoch': 2} {'type': 'loss', 'content': 0.01166166365146637, 'timestamp': '2025-09-30 22:07:03.594362', 'step': 828, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:03.626169', 'step': 828, 'epoch': 2} {'type': 'loss', 'content': 0.013758150860667229, 'timestamp': '2025-09-30 22:07:03.631126', 'step': 829, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:03.662702', 'step': 829, 'epoch': 2} {'type': 'loss', 'content': 0.03357986733317375, 'timestamp': '2025-09-30 22:07:03.665896', 'step': 830, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:03.697811', 'step': 830, 'epoch': 2} {'type': 'loss', 'content': 0.019499024376273155, 'timestamp': '2025-09-30 22:07:03.705171', 'step': 831, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:03.740713', 'step': 831, 'epoch': 2} {'type': 'loss', 'content': 0.01842236891388893, 'timestamp': '2025-09-30 22:07:03.764721', 'step': 832, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:03.797386', 'step': 832, 'epoch': 2} {'type': 'loss', 'content': 0.018398495391011238, 'timestamp': '2025-09-30 22:07:03.800051', 'step': 833, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:03.831348', 'step': 833, 'epoch': 2} {'type': 'loss', 'content': 0.019753288477659225, 'timestamp': '2025-09-30 22:07:03.838535', 'step': 834, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:03.873976', 'step': 834, 'epoch': 2} {'type': 'loss', 'content': 0.0059713092632591724, 'timestamp': '2025-09-30 22:07:03.876889', 'step': 835, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:03.908398', 'step': 835, 'epoch': 2} {'type': 'loss', 'content': 0.008703912608325481, 'timestamp': '2025-09-30 22:07:03.931557', 'step': 836, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:03.963534', 'step': 836, 'epoch': 2} {'type': 'loss', 'content': 0.022324806079268456, 'timestamp': '2025-09-30 22:07:03.966052', 'step': 837, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:03.996077', 'step': 837, 'epoch': 2} {'type': 'loss', 'content': 0.01000311691313982, 'timestamp': '2025-09-30 22:07:03.998268', 'step': 838, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:04.031057', 'step': 838, 'epoch': 2} {'type': 'loss', 'content': 0.0029391685966402292, 'timestamp': '2025-09-30 22:07:04.033715', 'step': 839, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:04.066955', 'step': 839, 'epoch': 2} {'type': 'loss', 'content': 0.005705764051526785, 'timestamp': '2025-09-30 22:07:04.090396', 'step': 840, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:04.122879', 'step': 840, 'epoch': 2} {'type': 'loss', 'content': 0.003614378860220313, 'timestamp': '2025-09-30 22:07:04.128426', 'step': 841, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:04.163958', 'step': 841, 'epoch': 2} {'type': 'loss', 'content': 0.0036180841270834208, 'timestamp': '2025-09-30 22:07:04.168311', 'step': 842, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:04.203861', 'step': 842, 'epoch': 2} {'type': 'loss', 'content': 0.015567810274660587, 'timestamp': '2025-09-30 22:07:04.211802', 'step': 843, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:04.242892', 'step': 843, 'epoch': 2} {'type': 'loss', 'content': 0.018757859244942665, 'timestamp': '2025-09-30 22:07:04.268474', 'step': 844, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:07:04.299760', 'step': 844, 'epoch': 2} {'type': 'loss', 'content': 0.012565111741423607, 'timestamp': '2025-09-30 22:07:04.307580', 'step': 845, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:04.338572', 'step': 845, 'epoch': 2} {'type': 'loss', 'content': 0.010797788389027119, 'timestamp': '2025-09-30 22:07:04.345768', 'step': 846, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:04.378499', 'step': 846, 'epoch': 2} {'type': 'loss', 'content': 0.015541709028184414, 'timestamp': '2025-09-30 22:07:04.385672', 'step': 847, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:04.416407', 'step': 847, 'epoch': 2} {'type': 'loss', 'content': 0.03071592189371586, 'timestamp': '2025-09-30 22:07:04.440055', 'step': 848, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:04.471043', 'step': 848, 'epoch': 2} {'type': 'loss', 'content': 0.00239894213154912, 'timestamp': '2025-09-30 22:07:04.472802', 'step': 849, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:04.512397', 'step': 849, 'epoch': 2} {'type': 'loss', 'content': 0.029214853420853615, 'timestamp': '2025-09-30 22:07:04.516927', 'step': 850, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:04.555011', 'step': 850, 'epoch': 2} {'type': 'loss', 'content': 0.017962682992219925, 'timestamp': '2025-09-30 22:07:04.556939', 'step': 851, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:04.587609', 'step': 851, 'epoch': 2} {'type': 'loss', 'content': 0.002266437280923128, 'timestamp': '2025-09-30 22:07:04.615918', 'step': 852, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:04.647708', 'step': 852, 'epoch': 2} {'type': 'loss', 'content': 0.011912612244486809, 'timestamp': '2025-09-30 22:07:04.649717', 'step': 853, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:04.680773', 'step': 853, 'epoch': 2} {'type': 'loss', 'content': 0.010540487244725227, 'timestamp': '2025-09-30 22:07:04.682893', 'step': 854, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:04.714393', 'step': 854, 'epoch': 2} {'type': 'loss', 'content': 0.0030493498779833317, 'timestamp': '2025-09-30 22:07:04.722045', 'step': 855, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:04.755199', 'step': 855, 'epoch': 2} {'type': 'loss', 'content': 0.007540632504969835, 'timestamp': '2025-09-30 22:07:04.783748', 'step': 856, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:04.817129', 'step': 856, 'epoch': 2} {'type': 'loss', 'content': 0.02822980284690857, 'timestamp': '2025-09-30 22:07:04.819244', 'step': 857, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:04.852753', 'step': 857, 'epoch': 2} {'type': 'loss', 'content': 0.02981366217136383, 'timestamp': '2025-09-30 22:07:04.855144', 'step': 858, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:05.549713', 'step': 858, 'epoch': 2} {'type': 'pplx', 'content': 86256591.69055031, 'timestamp': '2025-09-30 22:07:05.551227', 'step': 858, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:05.580616', 'step': 858, 'epoch': 2} {'type': 'loss', 'content': 0.0039618550799787045, 'timestamp': '2025-09-30 22:07:05.588128', 'step': 859, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:05.620463', 'step': 859, 'epoch': 2} {'type': 'loss', 'content': 0.030887721106410027, 'timestamp': '2025-09-30 22:07:05.645889', 'step': 860, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:05.680208', 'step': 860, 'epoch': 2} {'type': 'loss', 'content': 0.017998460680246353, 'timestamp': '2025-09-30 22:07:05.682308', 'step': 861, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:05.719932', 'step': 861, 'epoch': 2} {'type': 'loss', 'content': 0.0152524309232831, 'timestamp': '2025-09-30 22:07:05.722640', 'step': 862, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:05.761132', 'step': 862, 'epoch': 2} {'type': 'loss', 'content': 0.01634639874100685, 'timestamp': '2025-09-30 22:07:05.765485', 'step': 863, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:05.804411', 'step': 863, 'epoch': 2} {'type': 'loss', 'content': 0.010691942647099495, 'timestamp': '2025-09-30 22:07:05.832323', 'step': 864, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:05.864370', 'step': 864, 'epoch': 2} {'type': 'loss', 'content': 0.004576417151838541, 'timestamp': '2025-09-30 22:07:05.866430', 'step': 865, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:05.897941', 'step': 865, 'epoch': 2} {'type': 'loss', 'content': 0.02053973264992237, 'timestamp': '2025-09-30 22:07:05.905133', 'step': 866, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:05.943773', 'step': 866, 'epoch': 2} {'type': 'loss', 'content': 0.011172892525792122, 'timestamp': '2025-09-30 22:07:05.948064', 'step': 867, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:05.979520', 'step': 867, 'epoch': 2} {'type': 'loss', 'content': 0.00478165689855814, 'timestamp': '2025-09-30 22:07:06.007609', 'step': 868, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:06.046473', 'step': 868, 'epoch': 2} {'type': 'loss', 'content': 0.00597166595980525, 'timestamp': '2025-09-30 22:07:06.051835', 'step': 869, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:06.084572', 'step': 869, 'epoch': 2} {'type': 'loss', 'content': 0.014478781260550022, 'timestamp': '2025-09-30 22:07:06.088892', 'step': 870, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:06.121316', 'step': 870, 'epoch': 2} {'type': 'loss', 'content': 0.008051961660385132, 'timestamp': '2025-09-30 22:07:06.125304', 'step': 871, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:06.166173', 'step': 871, 'epoch': 2} {'type': 'loss', 'content': 0.0020913986954838037, 'timestamp': '2025-09-30 22:07:06.194128', 'step': 872, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:06.228988', 'step': 872, 'epoch': 2} {'type': 'loss', 'content': 0.004229079000651836, 'timestamp': '2025-09-30 22:07:06.230955', 'step': 873, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:06.262070', 'step': 873, 'epoch': 2} {'type': 'loss', 'content': 0.00472915219143033, 'timestamp': '2025-09-30 22:07:06.264968', 'step': 874, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:06.296056', 'step': 874, 'epoch': 2} {'type': 'loss', 'content': 0.009399129077792168, 'timestamp': '2025-09-30 22:07:06.297968', 'step': 875, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:06.337125', 'step': 875, 'epoch': 2} {'type': 'loss', 'content': 0.017205609008669853, 'timestamp': '2025-09-30 22:07:06.360732', 'step': 876, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:06.394265', 'step': 876, 'epoch': 2} {'type': 'loss', 'content': 0.019868528470396996, 'timestamp': '2025-09-30 22:07:06.396608', 'step': 877, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:06.430534', 'step': 877, 'epoch': 2} {'type': 'loss', 'content': 0.020020518451929092, 'timestamp': '2025-09-30 22:07:06.432585', 'step': 878, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:06.469670', 'step': 878, 'epoch': 2} {'type': 'loss', 'content': 0.0024727715644985437, 'timestamp': '2025-09-30 22:07:06.471774', 'step': 879, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:06.504053', 'step': 879, 'epoch': 2} {'type': 'loss', 'content': 0.002944986103102565, 'timestamp': '2025-09-30 22:07:06.531947', 'step': 880, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:06.568330', 'step': 880, 'epoch': 2} {'type': 'loss', 'content': 0.004082814324647188, 'timestamp': '2025-09-30 22:07:06.573756', 'step': 881, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:06.605310', 'step': 881, 'epoch': 2} {'type': 'loss', 'content': 0.004995688796043396, 'timestamp': '2025-09-30 22:07:06.607374', 'step': 882, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:06.641713', 'step': 882, 'epoch': 2} {'type': 'loss', 'content': 0.004161830525845289, 'timestamp': '2025-09-30 22:07:06.648484', 'step': 883, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:06.688386', 'step': 883, 'epoch': 2} {'type': 'loss', 'content': 0.005772753152996302, 'timestamp': '2025-09-30 22:07:06.712059', 'step': 884, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:06.743909', 'step': 884, 'epoch': 2} {'type': 'loss', 'content': 0.003844219958409667, 'timestamp': '2025-09-30 22:07:06.746374', 'step': 885, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:06.778035', 'step': 885, 'epoch': 2} {'type': 'loss', 'content': 0.00576426088809967, 'timestamp': '2025-09-30 22:07:06.780467', 'step': 886, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:06.819822', 'step': 886, 'epoch': 2} {'type': 'loss', 'content': 0.02476349100470543, 'timestamp': '2025-09-30 22:07:06.822369', 'step': 887, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:06.861347', 'step': 887, 'epoch': 2} {'type': 'loss', 'content': 0.005247528199106455, 'timestamp': '2025-09-30 22:07:06.885046', 'step': 888, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:06.916521', 'step': 888, 'epoch': 2} {'type': 'loss', 'content': 0.012756898067891598, 'timestamp': '2025-09-30 22:07:06.918584', 'step': 889, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:06.950976', 'step': 889, 'epoch': 2} {'type': 'loss', 'content': 0.0024588818196207285, 'timestamp': '2025-09-30 22:07:06.955126', 'step': 890, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:06.990935', 'step': 890, 'epoch': 2} {'type': 'loss', 'content': 0.004403135273605585, 'timestamp': '2025-09-30 22:07:06.992901', 'step': 891, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:07.027449', 'step': 891, 'epoch': 2} {'type': 'loss', 'content': 0.0157319363206625, 'timestamp': '2025-09-30 22:07:07.052689', 'step': 892, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:07.085998', 'step': 892, 'epoch': 2} {'type': 'loss', 'content': 0.0017558409599587321, 'timestamp': '2025-09-30 22:07:07.088108', 'step': 893, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:07.124712', 'step': 893, 'epoch': 2} {'type': 'loss', 'content': 0.004555154126137495, 'timestamp': '2025-09-30 22:07:07.129197', 'step': 894, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:07.163297', 'step': 894, 'epoch': 2} {'type': 'loss', 'content': 0.013950363732874393, 'timestamp': '2025-09-30 22:07:07.170138', 'step': 895, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:07.201824', 'step': 895, 'epoch': 2} {'type': 'loss', 'content': 0.0023241452872753143, 'timestamp': '2025-09-30 22:07:07.230501', 'step': 896, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:07.264682', 'step': 896, 'epoch': 2} {'type': 'loss', 'content': 0.0018783170962706208, 'timestamp': '2025-09-30 22:07:07.269445', 'step': 897, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:07.945628', 'step': 897, 'epoch': 2} {'type': 'pplx', 'content': 95768593.83513556, 'timestamp': '2025-09-30 22:07:07.947542', 'step': 897, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:07.977709', 'step': 897, 'epoch': 2} {'type': 'loss', 'content': 0.0018268562853336334, 'timestamp': '2025-09-30 22:07:07.979939', 'step': 898, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:07:08.016661', 'step': 898, 'epoch': 2} {'type': 'loss', 'content': 0.000786967168096453, 'timestamp': '2025-09-30 22:07:08.026834', 'step': 899, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:08.061996', 'step': 899, 'epoch': 2} {'type': 'loss', 'content': 0.0017735311994329095, 'timestamp': '2025-09-30 22:07:08.085681', 'step': 900, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:08.117983', 'step': 900, 'epoch': 2} {'type': 'loss', 'content': 0.0028034248389303684, 'timestamp': '2025-09-30 22:07:08.123092', 'step': 901, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:08.155456', 'step': 901, 'epoch': 2} {'type': 'loss', 'content': 0.0053888121619820595, 'timestamp': '2025-09-30 22:07:08.159956', 'step': 902, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:08.191966', 'step': 902, 'epoch': 2} {'type': 'loss', 'content': 0.0014559318078681827, 'timestamp': '2025-09-30 22:07:08.196228', 'step': 903, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:08.227596', 'step': 903, 'epoch': 2} {'type': 'loss', 'content': 0.0028973512817174196, 'timestamp': '2025-09-30 22:07:08.253063', 'step': 904, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:08.285797', 'step': 904, 'epoch': 2} {'type': 'loss', 'content': 0.0038677144329994917, 'timestamp': '2025-09-30 22:07:08.287802', 'step': 905, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:08.323927', 'step': 905, 'epoch': 2} {'type': 'loss', 'content': 0.0034440092276781797, 'timestamp': '2025-09-30 22:07:08.331827', 'step': 906, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:07:08.363498', 'step': 906, 'epoch': 2} {'type': 'loss', 'content': 0.0014499167446047068, 'timestamp': '2025-09-30 22:07:08.375723', 'step': 907, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:08.407391', 'step': 907, 'epoch': 2} {'type': 'loss', 'content': 0.0017258430598303676, 'timestamp': '2025-09-30 22:07:08.432998', 'step': 908, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:08.463947', 'step': 908, 'epoch': 2} {'type': 'loss', 'content': 0.0020710090175271034, 'timestamp': '2025-09-30 22:07:08.468648', 'step': 909, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:08.499735', 'step': 909, 'epoch': 2} {'type': 'loss', 'content': 0.0018089372897520661, 'timestamp': '2025-09-30 22:07:08.506547', 'step': 910, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:08.541538', 'step': 910, 'epoch': 2} {'type': 'loss', 'content': 0.0026987947057932615, 'timestamp': '2025-09-30 22:07:08.546322', 'step': 911, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:08.580949', 'step': 911, 'epoch': 2} {'type': 'loss', 'content': 0.0031114660669118166, 'timestamp': '2025-09-30 22:07:08.606421', 'step': 912, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:08.637806', 'step': 912, 'epoch': 2} {'type': 'loss', 'content': 0.008608967997133732, 'timestamp': '2025-09-30 22:07:08.640045', 'step': 913, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:08.673689', 'step': 913, 'epoch': 2} {'type': 'loss', 'content': 0.00260666711255908, 'timestamp': '2025-09-30 22:07:08.676010', 'step': 914, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:08.708330', 'step': 914, 'epoch': 2} {'type': 'loss', 'content': 0.0011994466185569763, 'timestamp': '2025-09-30 22:07:08.711152', 'step': 915, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:08.743385', 'step': 915, 'epoch': 2} {'type': 'loss', 'content': 0.004028329625725746, 'timestamp': '2025-09-30 22:07:08.767783', 'step': 916, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:08.802267', 'step': 916, 'epoch': 2} {'type': 'loss', 'content': 0.0007758959545753896, 'timestamp': '2025-09-30 22:07:08.804337', 'step': 917, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:08.836653', 'step': 917, 'epoch': 2} {'type': 'loss', 'content': 0.0007386510260403156, 'timestamp': '2025-09-30 22:07:08.843829', 'step': 918, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:08.883975', 'step': 918, 'epoch': 2} {'type': 'loss', 'content': 0.0027365325950086117, 'timestamp': '2025-09-30 22:07:08.886371', 'step': 919, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:08.924596', 'step': 919, 'epoch': 2} {'type': 'loss', 'content': 0.0016872246051207185, 'timestamp': '2025-09-30 22:07:08.948515', 'step': 920, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:08.980561', 'step': 920, 'epoch': 2} {'type': 'loss', 'content': 0.0009407888865098357, 'timestamp': '2025-09-30 22:07:08.983138', 'step': 921, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:09.017094', 'step': 921, 'epoch': 2} {'type': 'loss', 'content': 0.00036402264959178865, 'timestamp': '2025-09-30 22:07:09.021560', 'step': 922, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:09.055955', 'step': 922, 'epoch': 2} {'type': 'loss', 'content': 0.0008210391388274729, 'timestamp': '2025-09-30 22:07:09.063443', 'step': 923, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:09.097989', 'step': 923, 'epoch': 2} {'type': 'loss', 'content': 0.001234310562722385, 'timestamp': '2025-09-30 22:07:09.123744', 'step': 924, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:09.158919', 'step': 924, 'epoch': 2} {'type': 'loss', 'content': 0.004231620114296675, 'timestamp': '2025-09-30 22:07:09.161095', 'step': 925, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:09.193921', 'step': 925, 'epoch': 2} {'type': 'loss', 'content': 0.0013935185270383954, 'timestamp': '2025-09-30 22:07:09.197666', 'step': 926, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:09.229611', 'step': 926, 'epoch': 2} {'type': 'loss', 'content': 0.00570475310087204, 'timestamp': '2025-09-30 22:07:09.232139', 'step': 927, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:09.263920', 'step': 927, 'epoch': 2} {'type': 'loss', 'content': 0.00038181617856025696, 'timestamp': '2025-09-30 22:07:09.287318', 'step': 928, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:09.318675', 'step': 928, 'epoch': 2} {'type': 'loss', 'content': 0.0002547978365328163, 'timestamp': '2025-09-30 22:07:09.323157', 'step': 929, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:09.356607', 'step': 929, 'epoch': 2} {'type': 'loss', 'content': 0.003905776422470808, 'timestamp': '2025-09-30 22:07:09.363386', 'step': 930, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:09.394811', 'step': 930, 'epoch': 2} {'type': 'loss', 'content': 0.000579020765144378, 'timestamp': '2025-09-30 22:07:09.401676', 'step': 931, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:09.433557', 'step': 931, 'epoch': 2} {'type': 'loss', 'content': 0.005851599853485823, 'timestamp': '2025-09-30 22:07:09.458583', 'step': 932, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:09.489645', 'step': 932, 'epoch': 2} {'type': 'loss', 'content': 0.0011362012010067701, 'timestamp': '2025-09-30 22:07:09.494109', 'step': 933, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:09.533288', 'step': 933, 'epoch': 2} {'type': 'loss', 'content': 0.004097456112504005, 'timestamp': '2025-09-30 22:07:09.535419', 'step': 934, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:09.569467', 'step': 934, 'epoch': 2} {'type': 'loss', 'content': 0.0016320300055667758, 'timestamp': '2025-09-30 22:07:09.571675', 'step': 935, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:09.602463', 'step': 935, 'epoch': 2} {'type': 'loss', 'content': 0.0024072679225355387, 'timestamp': '2025-09-30 22:07:09.626031', 'step': 936, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:10.327478', 'step': 936, 'epoch': 2} {'type': 'pplx', 'content': 112929219.40033711, 'timestamp': '2025-09-30 22:07:10.329310', 'step': 936, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:10.360088', 'step': 936, 'epoch': 2} {'type': 'loss', 'content': 0.0004962026723660529, 'timestamp': '2025-09-30 22:07:10.362276', 'step': 937, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:10.406581', 'step': 937, 'epoch': 2} {'type': 'loss', 'content': 0.00023230533406604081, 'timestamp': '2025-09-30 22:07:10.410755', 'step': 938, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:10.451027', 'step': 938, 'epoch': 2} {'type': 'loss', 'content': 0.0002659259189385921, 'timestamp': '2025-09-30 22:07:10.455539', 'step': 939, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:10.490463', 'step': 939, 'epoch': 2} {'type': 'loss', 'content': 0.00033930627978406847, 'timestamp': '2025-09-30 22:07:10.514420', 'step': 940, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:10.556888', 'step': 940, 'epoch': 2} {'type': 'loss', 'content': 0.0016134243924170732, 'timestamp': '2025-09-30 22:07:10.559086', 'step': 941, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:10.590226', 'step': 941, 'epoch': 2} {'type': 'loss', 'content': 0.0004938853089697659, 'timestamp': '2025-09-30 22:07:10.592117', 'step': 942, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:10.631199', 'step': 942, 'epoch': 2} {'type': 'loss', 'content': 0.0002870448224712163, 'timestamp': '2025-09-30 22:07:10.633506', 'step': 943, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:10.670751', 'step': 943, 'epoch': 2} {'type': 'loss', 'content': 0.003749014111235738, 'timestamp': '2025-09-30 22:07:10.694301', 'step': 944, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:10.736657', 'step': 944, 'epoch': 2} {'type': 'loss', 'content': 0.0003916143032256514, 'timestamp': '2025-09-30 22:07:10.741782', 'step': 945, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:10.779978', 'step': 945, 'epoch': 2} {'type': 'loss', 'content': 0.00022652934421785176, 'timestamp': '2025-09-30 22:07:10.784524', 'step': 946, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:10.827099', 'step': 946, 'epoch': 2} {'type': 'loss', 'content': 0.0004099583311472088, 'timestamp': '2025-09-30 22:07:10.832155', 'step': 947, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:10.871557', 'step': 947, 'epoch': 2} {'type': 'loss', 'content': 0.000650174799375236, 'timestamp': '2025-09-30 22:07:10.896952', 'step': 948, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:10.942709', 'step': 948, 'epoch': 2} {'type': 'loss', 'content': 0.002298450330272317, 'timestamp': '2025-09-30 22:07:10.944667', 'step': 949, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:10.978710', 'step': 949, 'epoch': 2} {'type': 'loss', 'content': 0.00034443842014297843, 'timestamp': '2025-09-30 22:07:10.980518', 'step': 950, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:11.013027', 'step': 950, 'epoch': 2} {'type': 'loss', 'content': 0.00355920079164207, 'timestamp': '2025-09-30 22:07:11.015463', 'step': 951, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:11.047952', 'step': 951, 'epoch': 2} {'type': 'loss', 'content': 0.00011734906729543582, 'timestamp': '2025-09-30 22:07:11.076482', 'step': 952, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:11.107678', 'step': 952, 'epoch': 2} {'type': 'loss', 'content': 0.0001595637295395136, 'timestamp': '2025-09-30 22:07:11.110152', 'step': 953, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:11.147666', 'step': 953, 'epoch': 2} {'type': 'loss', 'content': 0.0003285036946181208, 'timestamp': '2025-09-30 22:07:11.150055', 'step': 954, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:11.188918', 'step': 954, 'epoch': 2} {'type': 'loss', 'content': 0.0007600878598168492, 'timestamp': '2025-09-30 22:07:11.190903', 'step': 955, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:11.224395', 'step': 955, 'epoch': 2} {'type': 'loss', 'content': 0.00014429469592869282, 'timestamp': '2025-09-30 22:07:11.252477', 'step': 956, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:11.289600', 'step': 956, 'epoch': 2} {'type': 'loss', 'content': 0.005687530618160963, 'timestamp': '2025-09-30 22:07:11.291958', 'step': 957, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:11.326919', 'step': 957, 'epoch': 2} {'type': 'loss', 'content': 0.001010962063446641, 'timestamp': '2025-09-30 22:07:11.329592', 'step': 958, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:11.364523', 'step': 958, 'epoch': 2} {'type': 'loss', 'content': 0.0005010986351408064, 'timestamp': '2025-09-30 22:07:11.367667', 'step': 959, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:11.402225', 'step': 959, 'epoch': 2} {'type': 'loss', 'content': 0.001585278776474297, 'timestamp': '2025-09-30 22:07:11.426244', 'step': 960, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:11.460298', 'step': 960, 'epoch': 2} {'type': 'loss', 'content': 0.00010611403558868915, 'timestamp': '2025-09-30 22:07:11.464889', 'step': 961, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:11.497324', 'step': 961, 'epoch': 2} {'type': 'loss', 'content': 0.00093518674839288, 'timestamp': '2025-09-30 22:07:11.499809', 'step': 962, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:11.539635', 'step': 962, 'epoch': 2} {'type': 'loss', 'content': 0.00011386203550500795, 'timestamp': '2025-09-30 22:07:11.542572', 'step': 963, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:11.576198', 'step': 963, 'epoch': 2} {'type': 'loss', 'content': 0.01964985392987728, 'timestamp': '2025-09-30 22:07:11.604288', 'step': 964, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:11.637144', 'step': 964, 'epoch': 2} {'type': 'loss', 'content': 0.002611854812130332, 'timestamp': '2025-09-30 22:07:11.638996', 'step': 965, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:11.671779', 'step': 965, 'epoch': 2} {'type': 'loss', 'content': 0.00027620504260994494, 'timestamp': '2025-09-30 22:07:11.676059', 'step': 966, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:11.710289', 'step': 966, 'epoch': 2} {'type': 'loss', 'content': 0.0017094091745093465, 'timestamp': '2025-09-30 22:07:11.713181', 'step': 967, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:11.746173', 'step': 967, 'epoch': 2} {'type': 'loss', 'content': 9.690089063951746e-05, 'timestamp': '2025-09-30 22:07:11.774308', 'step': 968, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:11.815616', 'step': 968, 'epoch': 2} {'type': 'loss', 'content': 0.0009474550024606287, 'timestamp': '2025-09-30 22:07:11.818783', 'step': 969, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:11.852885', 'step': 969, 'epoch': 2} {'type': 'loss', 'content': 0.00027702722582034767, 'timestamp': '2025-09-30 22:07:11.855700', 'step': 970, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:11.888806', 'step': 970, 'epoch': 2} {'type': 'loss', 'content': 9.48179658735171e-05, 'timestamp': '2025-09-30 22:07:11.893370', 'step': 971, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:11.927029', 'step': 971, 'epoch': 2} {'type': 'loss', 'content': 0.01434251107275486, 'timestamp': '2025-09-30 22:07:11.951056', 'step': 972, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:11.986648', 'step': 972, 'epoch': 2} {'type': 'loss', 'content': 0.00014298749738372862, 'timestamp': '2025-09-30 22:07:11.991438', 'step': 973, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:12.034567', 'step': 973, 'epoch': 2} {'type': 'loss', 'content': 8.121971768559888e-05, 'timestamp': '2025-09-30 22:07:12.038808', 'step': 974, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:12.073593', 'step': 974, 'epoch': 2} {'type': 'loss', 'content': 0.00010338701395085081, 'timestamp': '2025-09-30 22:07:12.080501', 'step': 975, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:12.812417', 'step': 975, 'epoch': 2} {'type': 'pplx', 'content': 121873066.00614506, 'timestamp': '2025-09-30 22:07:12.814660', 'step': 975, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:12.850025', 'step': 975, 'epoch': 2} {'type': 'loss', 'content': 0.007562555372714996, 'timestamp': '2025-09-30 22:07:12.873547', 'step': 976, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:12.913828', 'step': 976, 'epoch': 2} {'type': 'loss', 'content': 0.03160439059138298, 'timestamp': '2025-09-30 22:07:12.916224', 'step': 977, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:12.955925', 'step': 977, 'epoch': 2} {'type': 'loss', 'content': 0.000496004126034677, 'timestamp': '2025-09-30 22:07:12.958103', 'step': 978, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:12.995712', 'step': 978, 'epoch': 2} {'type': 'loss', 'content': 0.0037520842161029577, 'timestamp': '2025-09-30 22:07:13.003017', 'step': 979, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:13.036947', 'step': 979, 'epoch': 2} {'type': 'loss', 'content': 0.0006964383646845818, 'timestamp': '2025-09-30 22:07:13.060837', 'step': 980, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:13.095302', 'step': 980, 'epoch': 2} {'type': 'loss', 'content': 0.01626410521566868, 'timestamp': '2025-09-30 22:07:13.097441', 'step': 981, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:13.130837', 'step': 981, 'epoch': 2} {'type': 'loss', 'content': 0.00018333212938159704, 'timestamp': '2025-09-30 22:07:13.133238', 'step': 982, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:13.188556', 'step': 982, 'epoch': 2} {'type': 'loss', 'content': 7.79297188273631e-05, 'timestamp': '2025-09-30 22:07:13.196330', 'step': 983, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:13.241086', 'step': 983, 'epoch': 2} {'type': 'loss', 'content': 0.004288059659302235, 'timestamp': '2025-09-30 22:07:13.264670', 'step': 984, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:13.304500', 'step': 984, 'epoch': 2} {'type': 'loss', 'content': 0.0007159236702136695, 'timestamp': '2025-09-30 22:07:13.306981', 'step': 985, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:13.339972', 'step': 985, 'epoch': 2} {'type': 'loss', 'content': 0.00033975281985476613, 'timestamp': '2025-09-30 22:07:13.344327', 'step': 986, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:13.376246', 'step': 986, 'epoch': 2} {'type': 'loss', 'content': 0.03457098826766014, 'timestamp': '2025-09-30 22:07:13.378249', 'step': 987, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:13.420408', 'step': 987, 'epoch': 2} {'type': 'loss', 'content': 0.0003113284183200449, 'timestamp': '2025-09-30 22:07:13.443936', 'step': 988, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:13.483624', 'step': 988, 'epoch': 2} {'type': 'loss', 'content': 0.0004182211123406887, 'timestamp': '2025-09-30 22:07:13.485802', 'step': 989, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:13.526373', 'step': 989, 'epoch': 2} {'type': 'loss', 'content': 0.00030641566263511777, 'timestamp': '2025-09-30 22:07:13.528735', 'step': 990, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:13.568696', 'step': 990, 'epoch': 2} {'type': 'loss', 'content': 0.010180122219026089, 'timestamp': '2025-09-30 22:07:13.573135', 'step': 991, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:13.607697', 'step': 991, 'epoch': 2} {'type': 'loss', 'content': 0.00035741503234021366, 'timestamp': '2025-09-30 22:07:13.631468', 'step': 992, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:13.663681', 'step': 992, 'epoch': 2} {'type': 'loss', 'content': 0.003032468957826495, 'timestamp': '2025-09-30 22:07:13.665618', 'step': 993, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:13.697780', 'step': 993, 'epoch': 2} {'type': 'loss', 'content': 0.0021785900462418795, 'timestamp': '2025-09-30 22:07:13.702360', 'step': 994, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:13.736901', 'step': 994, 'epoch': 2} {'type': 'loss', 'content': 0.014290996827185154, 'timestamp': '2025-09-30 22:07:13.738898', 'step': 995, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:13.776796', 'step': 995, 'epoch': 2} {'type': 'loss', 'content': 0.002437558025121689, 'timestamp': '2025-09-30 22:07:13.800364', 'step': 996, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:13.834493', 'step': 996, 'epoch': 2} {'type': 'loss', 'content': 0.057113997638225555, 'timestamp': '2025-09-30 22:07:13.836581', 'step': 997, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:13.886520', 'step': 997, 'epoch': 2} {'type': 'loss', 'content': 0.010231812484562397, 'timestamp': '2025-09-30 22:07:13.891076', 'step': 998, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:13.925378', 'step': 998, 'epoch': 2} {'type': 'loss', 'content': 0.00013605566346086562, 'timestamp': '2025-09-30 22:07:13.932229', 'step': 999, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:13.980495', 'step': 999, 'epoch': 2} {'type': 'loss', 'content': 0.0021123080514371395, 'timestamp': '2025-09-30 22:07:14.005860', 'step': 1000, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1000', 'timestamp': '2025-09-30 22:07:19.036825', 'step': 1000, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:19.072988', 'step': 1000, 'epoch': 2} {'type': 'loss', 'content': 0.0002514692605473101, 'timestamp': '2025-09-30 22:07:19.077266', 'step': 1001, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:19.112144', 'step': 1001, 'epoch': 2} {'type': 'loss', 'content': 0.0008739094482734799, 'timestamp': '2025-09-30 22:07:19.116564', 'step': 1002, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:19.150748', 'step': 1002, 'epoch': 2} {'type': 'loss', 'content': 0.00011316355085000396, 'timestamp': '2025-09-30 22:07:19.152969', 'step': 1003, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:19.188394', 'step': 1003, 'epoch': 2} {'type': 'loss', 'content': 0.0005285521619953215, 'timestamp': '2025-09-30 22:07:19.212056', 'step': 1004, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:19.247276', 'step': 1004, 'epoch': 2} {'type': 'loss', 'content': 0.00322272558696568, 'timestamp': '2025-09-30 22:07:19.251941', 'step': 1005, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:19.289381', 'step': 1005, 'epoch': 2} {'type': 'loss', 'content': 0.01631017215549946, 'timestamp': '2025-09-30 22:07:19.296423', 'step': 1006, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:19.333402', 'step': 1006, 'epoch': 2} {'type': 'loss', 'content': 0.000126895189168863, 'timestamp': '2025-09-30 22:07:19.340347', 'step': 1007, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:19.375598', 'step': 1007, 'epoch': 2} {'type': 'loss', 'content': 0.0014090503100305796, 'timestamp': '2025-09-30 22:07:19.399080', 'step': 1008, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:19.434942', 'step': 1008, 'epoch': 2} {'type': 'loss', 'content': 0.006856707390397787, 'timestamp': '2025-09-30 22:07:19.440215', 'step': 1009, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:19.480328', 'step': 1009, 'epoch': 2} {'type': 'loss', 'content': 0.008845113217830658, 'timestamp': '2025-09-30 22:07:19.483602', 'step': 1010, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:19.519136', 'step': 1010, 'epoch': 2} {'type': 'loss', 'content': 0.012483632192015648, 'timestamp': '2025-09-30 22:07:19.521797', 'step': 1011, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:19.558005', 'step': 1011, 'epoch': 2} {'type': 'loss', 'content': 0.0018354732310399413, 'timestamp': '2025-09-30 22:07:19.582291', 'step': 1012, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:19.621414', 'step': 1012, 'epoch': 2} {'type': 'loss', 'content': 0.0015046221669763327, 'timestamp': '2025-09-30 22:07:19.626137', 'step': 1013, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:19.659456', 'step': 1013, 'epoch': 2} {'type': 'loss', 'content': 0.0003278079384472221, 'timestamp': '2025-09-30 22:07:19.662253', 'step': 1014, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:20.510339', 'step': 1014, 'epoch': 2} {'type': 'pplx', 'content': 121440267.68757698, 'timestamp': '2025-09-30 22:07:20.512317', 'step': 1014, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:20.552632', 'step': 1014, 'epoch': 2} {'type': 'loss', 'content': 0.02951066754758358, 'timestamp': '2025-09-30 22:07:20.554813', 'step': 1015, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:20.600361', 'step': 1015, 'epoch': 2} {'type': 'loss', 'content': 0.0009736703941598535, 'timestamp': '2025-09-30 22:07:20.628644', 'step': 1016, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:20.664907', 'step': 1016, 'epoch': 2} {'type': 'loss', 'content': 0.0012555711437016726, 'timestamp': '2025-09-30 22:07:20.666944', 'step': 1017, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:20.700237', 'step': 1017, 'epoch': 2} {'type': 'loss', 'content': 0.0004989461740478873, 'timestamp': '2025-09-30 22:07:20.704567', 'step': 1018, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:07:20.738301', 'step': 1018, 'epoch': 2} {'type': 'loss', 'content': 0.0001337010762654245, 'timestamp': '2025-09-30 22:07:20.748590', 'step': 1019, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:20.783354', 'step': 1019, 'epoch': 2} {'type': 'loss', 'content': 0.0010935006430372596, 'timestamp': '2025-09-30 22:07:20.806760', 'step': 1020, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:20.851261', 'step': 1020, 'epoch': 2} {'type': 'loss', 'content': 0.03073728457093239, 'timestamp': '2025-09-30 22:07:20.853710', 'step': 1021, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:20.891776', 'step': 1021, 'epoch': 2} {'type': 'loss', 'content': 0.0001944132527569309, 'timestamp': '2025-09-30 22:07:20.899496', 'step': 1022, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:20.933172', 'step': 1022, 'epoch': 2} {'type': 'loss', 'content': 0.0009663135861046612, 'timestamp': '2025-09-30 22:07:20.938619', 'step': 1023, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:20.971818', 'step': 1023, 'epoch': 2} {'type': 'loss', 'content': 0.024121267721056938, 'timestamp': '2025-09-30 22:07:20.995407', 'step': 1024, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:21.045645', 'step': 1024, 'epoch': 2} {'type': 'loss', 'content': 0.002432918641716242, 'timestamp': '2025-09-30 22:07:21.051039', 'step': 1025, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:21.100559', 'step': 1025, 'epoch': 2} {'type': 'loss', 'content': 0.00709154549986124, 'timestamp': '2025-09-30 22:07:21.108172', 'step': 1026, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:21.142739', 'step': 1026, 'epoch': 2} {'type': 'loss', 'content': 0.00028265800210647285, 'timestamp': '2025-09-30 22:07:21.144826', 'step': 1027, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:07:21.179158', 'step': 1027, 'epoch': 2} {'type': 'loss', 'content': 0.004591092932969332, 'timestamp': '2025-09-30 22:07:21.210372', 'step': 1028, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:21.249460', 'step': 1028, 'epoch': 2} {'type': 'loss', 'content': 0.0005322260549291968, 'timestamp': '2025-09-30 22:07:21.251782', 'step': 1029, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:21.290982', 'step': 1029, 'epoch': 2} {'type': 'loss', 'content': 0.012618852779269218, 'timestamp': '2025-09-30 22:07:21.298793', 'step': 1030, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:21.333382', 'step': 1030, 'epoch': 2} {'type': 'loss', 'content': 0.0010076664621010423, 'timestamp': '2025-09-30 22:07:21.335575', 'step': 1031, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:21.373691', 'step': 1031, 'epoch': 2} {'type': 'loss', 'content': 0.00020813469018321484, 'timestamp': '2025-09-30 22:07:21.397263', 'step': 1032, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:21.430997', 'step': 1032, 'epoch': 2} {'type': 'loss', 'content': 0.0012870494974777102, 'timestamp': '2025-09-30 22:07:21.433021', 'step': 1033, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:21.468487', 'step': 1033, 'epoch': 2} {'type': 'loss', 'content': 0.0004765233024954796, 'timestamp': '2025-09-30 22:07:21.474005', 'step': 1034, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:21.511431', 'step': 1034, 'epoch': 2} {'type': 'loss', 'content': 0.00030462947324849665, 'timestamp': '2025-09-30 22:07:21.513565', 'step': 1035, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:21.548095', 'step': 1035, 'epoch': 2} {'type': 'loss', 'content': 0.001975154737010598, 'timestamp': '2025-09-30 22:07:21.571628', 'step': 1036, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:21.606500', 'step': 1036, 'epoch': 2} {'type': 'loss', 'content': 0.00012028579658363014, 'timestamp': '2025-09-30 22:07:21.608830', 'step': 1037, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:21.665252', 'step': 1037, 'epoch': 2} {'type': 'loss', 'content': 0.0015023910673335195, 'timestamp': '2025-09-30 22:07:21.672172', 'step': 1038, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:21.719265', 'step': 1038, 'epoch': 2} {'type': 'loss', 'content': 0.001884570112451911, 'timestamp': '2025-09-30 22:07:21.721969', 'step': 1039, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:21.753944', 'step': 1039, 'epoch': 2} {'type': 'loss', 'content': 0.01647031679749489, 'timestamp': '2025-09-30 22:07:21.782545', 'step': 1040, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:21.821889', 'step': 1040, 'epoch': 2} {'type': 'loss', 'content': 0.004868580959737301, 'timestamp': '2025-09-30 22:07:21.823987', 'step': 1041, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:07:21.866728', 'step': 1041, 'epoch': 2} {'type': 'loss', 'content': 0.00018905648903455585, 'timestamp': '2025-09-30 22:07:21.880188', 'step': 1042, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:21.920644', 'step': 1042, 'epoch': 2} {'type': 'loss', 'content': 0.005073660518974066, 'timestamp': '2025-09-30 22:07:21.923095', 'step': 1043, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:21.955870', 'step': 1043, 'epoch': 2} {'type': 'loss', 'content': 0.0005219231243245304, 'timestamp': '2025-09-30 22:07:21.979452', 'step': 1044, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:22.014807', 'step': 1044, 'epoch': 2} {'type': 'loss', 'content': 0.0013824685011059046, 'timestamp': '2025-09-30 22:07:22.017015', 'step': 1045, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:22.051195', 'step': 1045, 'epoch': 2} {'type': 'loss', 'content': 0.008890213444828987, 'timestamp': '2025-09-30 22:07:22.058860', 'step': 1046, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:22.105804', 'step': 1046, 'epoch': 2} {'type': 'loss', 'content': 0.00017873049364425242, 'timestamp': '2025-09-30 22:07:22.107894', 'step': 1047, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:22.147997', 'step': 1047, 'epoch': 2} {'type': 'loss', 'content': 0.00027590824174694717, 'timestamp': '2025-09-30 22:07:22.172286', 'step': 1048, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:22.214891', 'step': 1048, 'epoch': 2} {'type': 'loss', 'content': 0.0001358627196168527, 'timestamp': '2025-09-30 22:07:22.216878', 'step': 1049, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:22.250266', 'step': 1049, 'epoch': 2} {'type': 'loss', 'content': 0.0011405263794586062, 'timestamp': '2025-09-30 22:07:22.254896', 'step': 1050, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:22.287435', 'step': 1050, 'epoch': 2} {'type': 'loss', 'content': 0.0006561552872881293, 'timestamp': '2025-09-30 22:07:22.289500', 'step': 1051, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:22.322065', 'step': 1051, 'epoch': 2} {'type': 'loss', 'content': 0.003038618015125394, 'timestamp': '2025-09-30 22:07:22.345655', 'step': 1052, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:22.387636', 'step': 1052, 'epoch': 2} {'type': 'loss', 'content': 0.004722389858216047, 'timestamp': '2025-09-30 22:07:22.389633', 'step': 1053, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:23.247533', 'step': 1053, 'epoch': 2} {'type': 'pplx', 'content': 117074198.67895347, 'timestamp': '2025-09-30 22:07:23.249615', 'step': 1053, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:23.284013', 'step': 1053, 'epoch': 2} {'type': 'loss', 'content': 0.0001479993516113609, 'timestamp': '2025-09-30 22:07:23.291544', 'step': 1054, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:23.324865', 'step': 1054, 'epoch': 2} {'type': 'loss', 'content': 0.005029443185776472, 'timestamp': '2025-09-30 22:07:23.326902', 'step': 1055, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:23.367102', 'step': 1055, 'epoch': 2} {'type': 'loss', 'content': 0.0011753159342333674, 'timestamp': '2025-09-30 22:07:23.394932', 'step': 1056, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:23.426882', 'step': 1056, 'epoch': 2} {'type': 'loss', 'content': 0.006908381823450327, 'timestamp': '2025-09-30 22:07:23.429042', 'step': 1057, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:23.462639', 'step': 1057, 'epoch': 2} {'type': 'loss', 'content': 0.00392345292493701, 'timestamp': '2025-09-30 22:07:23.465057', 'step': 1058, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:23.497575', 'step': 1058, 'epoch': 2} {'type': 'loss', 'content': 0.00748575059697032, 'timestamp': '2025-09-30 22:07:23.505539', 'step': 1059, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:23.545877', 'step': 1059, 'epoch': 2} {'type': 'loss', 'content': 0.0001961512753041461, 'timestamp': '2025-09-30 22:07:23.569687', 'step': 1060, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:23.610728', 'step': 1060, 'epoch': 2} {'type': 'loss', 'content': 0.0005363413947634399, 'timestamp': '2025-09-30 22:07:23.616178', 'step': 1061, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:23.649617', 'step': 1061, 'epoch': 2} {'type': 'loss', 'content': 0.00021425398881547153, 'timestamp': '2025-09-30 22:07:23.651795', 'step': 1062, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:23.685918', 'step': 1062, 'epoch': 2} {'type': 'loss', 'content': 0.016600722447037697, 'timestamp': '2025-09-30 22:07:23.688807', 'step': 1063, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:23.724286', 'step': 1063, 'epoch': 2} {'type': 'loss', 'content': 0.0002527764590922743, 'timestamp': '2025-09-30 22:07:23.752278', 'step': 1064, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:23.788709', 'step': 1064, 'epoch': 2} {'type': 'loss', 'content': 0.019825953990221024, 'timestamp': '2025-09-30 22:07:23.790782', 'step': 1065, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:23.825375', 'step': 1065, 'epoch': 2} {'type': 'loss', 'content': 0.00023776039597578347, 'timestamp': '2025-09-30 22:07:23.833317', 'step': 1066, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:23.867092', 'step': 1066, 'epoch': 2} {'type': 'loss', 'content': 7.995210035005584e-05, 'timestamp': '2025-09-30 22:07:23.869672', 'step': 1067, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:23.906103', 'step': 1067, 'epoch': 2} {'type': 'loss', 'content': 5.865958155482076e-05, 'timestamp': '2025-09-30 22:07:23.929777', 'step': 1068, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:23.965857', 'step': 1068, 'epoch': 2} {'type': 'loss', 'content': 0.00011427756544435397, 'timestamp': '2025-09-30 22:07:23.967844', 'step': 1069, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:24.008524', 'step': 1069, 'epoch': 2} {'type': 'loss', 'content': 0.00019449848332442343, 'timestamp': '2025-09-30 22:07:24.011372', 'step': 1070, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:24.047641', 'step': 1070, 'epoch': 2} {'type': 'loss', 'content': 8.977264951681718e-05, 'timestamp': '2025-09-30 22:07:24.049571', 'step': 1071, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:24.083353', 'step': 1071, 'epoch': 2} {'type': 'loss', 'content': 0.006148288957774639, 'timestamp': '2025-09-30 22:07:24.107018', 'step': 1072, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:24.141017', 'step': 1072, 'epoch': 2} {'type': 'loss', 'content': 0.0018119210144504905, 'timestamp': '2025-09-30 22:07:24.143432', 'step': 1073, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:24.178422', 'step': 1073, 'epoch': 2} {'type': 'loss', 'content': 0.00039394685882143676, 'timestamp': '2025-09-30 22:07:24.181312', 'step': 1074, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:24.217732', 'step': 1074, 'epoch': 2} {'type': 'loss', 'content': 0.030289221554994583, 'timestamp': '2025-09-30 22:07:24.220222', 'step': 1075, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:24.254775', 'step': 1075, 'epoch': 2} {'type': 'loss', 'content': 0.003143486799672246, 'timestamp': '2025-09-30 22:07:24.280327', 'step': 1076, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:24.315761', 'step': 1076, 'epoch': 2} {'type': 'loss', 'content': 0.006031307391822338, 'timestamp': '2025-09-30 22:07:24.317867', 'step': 1077, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:24.351722', 'step': 1077, 'epoch': 2} {'type': 'loss', 'content': 0.0009177754982374609, 'timestamp': '2025-09-30 22:07:24.355987', 'step': 1078, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:24.391929', 'step': 1078, 'epoch': 2} {'type': 'loss', 'content': 0.0003655412292573601, 'timestamp': '2025-09-30 22:07:24.399251', 'step': 1079, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:07:24.442941', 'step': 1079, 'epoch': 2} {'type': 'loss', 'content': 7.012840069364756e-05, 'timestamp': '2025-09-30 22:07:24.474362', 'step': 1080, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:24.517405', 'step': 1080, 'epoch': 2} {'type': 'loss', 'content': 0.001427992945536971, 'timestamp': '2025-09-30 22:07:24.522347', 'step': 1081, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:24.571105', 'step': 1081, 'epoch': 2} {'type': 'loss', 'content': 0.0016603464027866721, 'timestamp': '2025-09-30 22:07:24.573000', 'step': 1082, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:24.620007', 'step': 1082, 'epoch': 2} {'type': 'loss', 'content': 0.00029763669590465724, 'timestamp': '2025-09-30 22:07:24.622427', 'step': 1083, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:24.663728', 'step': 1083, 'epoch': 2} {'type': 'loss', 'content': 0.017944645136594772, 'timestamp': '2025-09-30 22:07:24.687249', 'step': 1084, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:24.726871', 'step': 1084, 'epoch': 2} {'type': 'loss', 'content': 0.024879327043890953, 'timestamp': '2025-09-30 22:07:24.728726', 'step': 1085, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:24.775816', 'step': 1085, 'epoch': 2} {'type': 'loss', 'content': 6.769696483388543e-05, 'timestamp': '2025-09-30 22:07:24.777895', 'step': 1086, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:24.817685', 'step': 1086, 'epoch': 2} {'type': 'loss', 'content': 0.000262332905549556, 'timestamp': '2025-09-30 22:07:24.820018', 'step': 1087, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:24.855976', 'step': 1087, 'epoch': 2} {'type': 'loss', 'content': 0.0017855723854154348, 'timestamp': '2025-09-30 22:07:24.881476', 'step': 1088, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:24.923211', 'step': 1088, 'epoch': 2} {'type': 'loss', 'content': 0.001035021268762648, 'timestamp': '2025-09-30 22:07:24.925463', 'step': 1089, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:24.967707', 'step': 1089, 'epoch': 2} {'type': 'loss', 'content': 0.00034603956737555563, 'timestamp': '2025-09-30 22:07:24.970499', 'step': 1090, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:25.017243', 'step': 1090, 'epoch': 2} {'type': 'loss', 'content': 0.0006762367556802928, 'timestamp': '2025-09-30 22:07:25.019681', 'step': 1091, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:25.054496', 'step': 1091, 'epoch': 2} {'type': 'loss', 'content': 0.00011217459541512653, 'timestamp': '2025-09-30 22:07:25.078273', 'step': 1092, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:25.942337', 'step': 1092, 'epoch': 2} {'type': 'pplx', 'content': 114812072.89414735, 'timestamp': '2025-09-30 22:07:25.944231', 'step': 1092, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:25.982267', 'step': 1092, 'epoch': 2} {'type': 'loss', 'content': 0.00041928747668862343, 'timestamp': '2025-09-30 22:07:25.984388', 'step': 1093, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:26.026509', 'step': 1093, 'epoch': 2} {'type': 'loss', 'content': 0.00014250872482080013, 'timestamp': '2025-09-30 22:07:26.028673', 'step': 1094, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:26.062588', 'step': 1094, 'epoch': 2} {'type': 'loss', 'content': 0.020229920744895935, 'timestamp': '2025-09-30 22:07:26.067197', 'step': 1095, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:26.110414', 'step': 1095, 'epoch': 2} {'type': 'loss', 'content': 0.0002887416922021657, 'timestamp': '2025-09-30 22:07:26.138645', 'step': 1096, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:26.171453', 'step': 1096, 'epoch': 2} {'type': 'loss', 'content': 0.00011543335858732462, 'timestamp': '2025-09-30 22:07:26.173391', 'step': 1097, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:26.211958', 'step': 1097, 'epoch': 2} {'type': 'loss', 'content': 0.0009078416624106467, 'timestamp': '2025-09-30 22:07:26.216542', 'step': 1098, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:26.252557', 'step': 1098, 'epoch': 2} {'type': 'loss', 'content': 0.0011346840765327215, 'timestamp': '2025-09-30 22:07:26.259679', 'step': 1099, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:26.303161', 'step': 1099, 'epoch': 2} {'type': 'loss', 'content': 0.02301344834268093, 'timestamp': '2025-09-30 22:07:26.326868', 'step': 1100, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:26.373972', 'step': 1100, 'epoch': 2} {'type': 'loss', 'content': 0.0005447498406283557, 'timestamp': '2025-09-30 22:07:26.376021', 'step': 1101, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:26.423088', 'step': 1101, 'epoch': 2} {'type': 'loss', 'content': 0.002310013398528099, 'timestamp': '2025-09-30 22:07:26.427404', 'step': 1102, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:26.466821', 'step': 1102, 'epoch': 2} {'type': 'loss', 'content': 0.0007707496988587081, 'timestamp': '2025-09-30 22:07:26.468888', 'step': 1103, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:26.509087', 'step': 1103, 'epoch': 2} {'type': 'loss', 'content': 0.003529587760567665, 'timestamp': '2025-09-30 22:07:26.533027', 'step': 1104, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:26.566816', 'step': 1104, 'epoch': 2} {'type': 'loss', 'content': 0.0007036178722046316, 'timestamp': '2025-09-30 22:07:26.569427', 'step': 1105, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:26.604263', 'step': 1105, 'epoch': 2} {'type': 'loss', 'content': 0.002396752592176199, 'timestamp': '2025-09-30 22:07:26.608622', 'step': 1106, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:26.641009', 'step': 1106, 'epoch': 2} {'type': 'loss', 'content': 0.0011795532191172242, 'timestamp': '2025-09-30 22:07:26.643490', 'step': 1107, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:07:26.687701', 'step': 1107, 'epoch': 2} {'type': 'loss', 'content': 0.005366952158510685, 'timestamp': '2025-09-30 22:07:26.719029', 'step': 1108, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:26.763493', 'step': 1108, 'epoch': 2} {'type': 'loss', 'content': 0.007472001016139984, 'timestamp': '2025-09-30 22:07:26.765459', 'step': 1109, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:26.816703', 'step': 1109, 'epoch': 2} {'type': 'loss', 'content': 0.005245849024504423, 'timestamp': '2025-09-30 22:07:26.819660', 'step': 1110, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:26.852233', 'step': 1110, 'epoch': 2} {'type': 'loss', 'content': 0.001431244076229632, 'timestamp': '2025-09-30 22:07:26.854268', 'step': 1111, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:26.897266', 'step': 1111, 'epoch': 2} {'type': 'loss', 'content': 0.00028347785701043904, 'timestamp': '2025-09-30 22:07:26.920936', 'step': 1112, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:26.964343', 'step': 1112, 'epoch': 2} {'type': 'loss', 'content': 0.0003852161462418735, 'timestamp': '2025-09-30 22:07:26.966426', 'step': 1113, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:26.999290', 'step': 1113, 'epoch': 2} {'type': 'loss', 'content': 0.00235289940610528, 'timestamp': '2025-09-30 22:07:27.001433', 'step': 1114, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:27.034182', 'step': 1114, 'epoch': 2} {'type': 'loss', 'content': 0.000366671709343791, 'timestamp': '2025-09-30 22:07:27.036369', 'step': 1115, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:27.081274', 'step': 1115, 'epoch': 2} {'type': 'loss', 'content': 0.00013203351409174502, 'timestamp': '2025-09-30 22:07:27.110286', 'step': 1116, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:27.165920', 'step': 1116, 'epoch': 2} {'type': 'loss', 'content': 0.0011101525742560625, 'timestamp': '2025-09-30 22:07:27.167921', 'step': 1117, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:27.199904', 'step': 1117, 'epoch': 2} {'type': 'loss', 'content': 0.005109868943691254, 'timestamp': '2025-09-30 22:07:27.202093', 'step': 1118, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:27.234411', 'step': 1118, 'epoch': 2} {'type': 'loss', 'content': 0.00020523167040664703, 'timestamp': '2025-09-30 22:07:27.239013', 'step': 1119, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:27.280895', 'step': 1119, 'epoch': 2} {'type': 'loss', 'content': 0.015378288924694061, 'timestamp': '2025-09-30 22:07:27.304752', 'step': 1120, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:27.340057', 'step': 1120, 'epoch': 2} {'type': 'loss', 'content': 0.0002846012939698994, 'timestamp': '2025-09-30 22:07:27.345741', 'step': 1121, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:27.383913', 'step': 1121, 'epoch': 2} {'type': 'loss', 'content': 0.0018362916307523847, 'timestamp': '2025-09-30 22:07:27.386049', 'step': 1122, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:27.435373', 'step': 1122, 'epoch': 2} {'type': 'loss', 'content': 0.004865389317274094, 'timestamp': '2025-09-30 22:07:27.442458', 'step': 1123, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:27.486170', 'step': 1123, 'epoch': 2} {'type': 'loss', 'content': 0.022158129140734673, 'timestamp': '2025-09-30 22:07:27.509829', 'step': 1124, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:27.551375', 'step': 1124, 'epoch': 2} {'type': 'loss', 'content': 0.00012032425729557872, 'timestamp': '2025-09-30 22:07:27.553428', 'step': 1125, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:07:27.601023', 'step': 1125, 'epoch': 2} {'type': 'loss', 'content': 0.0012263988610357046, 'timestamp': '2025-09-30 22:07:27.613415', 'step': 1126, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:27.663420', 'step': 1126, 'epoch': 2} {'type': 'loss', 'content': 8.227315993281081e-05, 'timestamp': '2025-09-30 22:07:27.667792', 'step': 1127, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:27.710674', 'step': 1127, 'epoch': 2} {'type': 'loss', 'content': 0.004805626813322306, 'timestamp': '2025-09-30 22:07:27.736004', 'step': 1128, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:27.783760', 'step': 1128, 'epoch': 2} {'type': 'loss', 'content': 0.039238281548023224, 'timestamp': '2025-09-30 22:07:27.785901', 'step': 1129, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:27.835489', 'step': 1129, 'epoch': 2} {'type': 'loss', 'content': 0.00029851426370441914, 'timestamp': '2025-09-30 22:07:27.843225', 'step': 1130, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:27.879179', 'step': 1130, 'epoch': 2} {'type': 'loss', 'content': 0.00048508719191886485, 'timestamp': '2025-09-30 22:07:27.881252', 'step': 1131, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:28.696294', 'step': 1131, 'epoch': 2} {'type': 'pplx', 'content': 111057330.30651917, 'timestamp': '2025-09-30 22:07:28.698984', 'step': 1131, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:28.728636', 'step': 1131, 'epoch': 2} {'type': 'loss', 'content': 0.00038018793566152453, 'timestamp': '2025-09-30 22:07:28.752198', 'step': 1132, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:28.795951', 'step': 1132, 'epoch': 2} {'type': 'loss', 'content': 0.00018479253049008548, 'timestamp': '2025-09-30 22:07:28.800746', 'step': 1133, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:28.836099', 'step': 1133, 'epoch': 2} {'type': 'loss', 'content': 0.00030270571005530655, 'timestamp': '2025-09-30 22:07:28.838307', 'step': 1134, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:28.876430', 'step': 1134, 'epoch': 2} {'type': 'loss', 'content': 0.00044822480413131416, 'timestamp': '2025-09-30 22:07:28.881175', 'step': 1135, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:28.915107', 'step': 1135, 'epoch': 2} {'type': 'loss', 'content': 0.004301538225263357, 'timestamp': '2025-09-30 22:07:28.944253', 'step': 1136, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:28.997685', 'step': 1136, 'epoch': 2} {'type': 'loss', 'content': 0.0001317481219302863, 'timestamp': '2025-09-30 22:07:29.000768', 'step': 1137, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:29.049135', 'step': 1137, 'epoch': 2} {'type': 'loss', 'content': 0.002347787842154503, 'timestamp': '2025-09-30 22:07:29.056453', 'step': 1138, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 2373281365952}, 'timestamp': '2025-09-30 22:07:29.098652', 'step': 1138, 'epoch': 2} {'type': 'loss', 'content': 0.0011587826302275062, 'timestamp': '2025-09-30 22:07:29.102108', 'step': 1139, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:29.136199', 'step': 1139, 'epoch': 2} {'type': 'loss', 'content': 0.0026315338909626007, 'timestamp': '2025-09-30 22:07:29.161088', 'step': 1140, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:29.194963', 'step': 1140, 'epoch': 2} {'type': 'loss', 'content': 0.0005633877008222044, 'timestamp': '2025-09-30 22:07:29.199920', 'step': 1141, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:29.232078', 'step': 1141, 'epoch': 2} {'type': 'loss', 'content': 0.006881002802401781, 'timestamp': '2025-09-30 22:07:29.236618', 'step': 1142, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:29.277385', 'step': 1142, 'epoch': 2} {'type': 'loss', 'content': 0.0006113244453445077, 'timestamp': '2025-09-30 22:07:29.280123', 'step': 1143, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:29.313899', 'step': 1143, 'epoch': 2} {'type': 'loss', 'content': 0.01047942228615284, 'timestamp': '2025-09-30 22:07:29.337920', 'step': 1144, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:29.376061', 'step': 1144, 'epoch': 2} {'type': 'loss', 'content': 0.0009232796146534383, 'timestamp': '2025-09-30 22:07:29.378580', 'step': 1145, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:29.413447', 'step': 1145, 'epoch': 2} {'type': 'loss', 'content': 0.0028549160342663527, 'timestamp': '2025-09-30 22:07:29.418173', 'step': 1146, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:29.459570', 'step': 1146, 'epoch': 2} {'type': 'loss', 'content': 0.00026438047643750906, 'timestamp': '2025-09-30 22:07:29.466695', 'step': 1147, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:29.516982', 'step': 1147, 'epoch': 2} {'type': 'loss', 'content': 0.006288810167461634, 'timestamp': '2025-09-30 22:07:29.541140', 'step': 1148, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:29.578736', 'step': 1148, 'epoch': 2} {'type': 'loss', 'content': 0.009455591440200806, 'timestamp': '2025-09-30 22:07:29.581409', 'step': 1149, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:29.618934', 'step': 1149, 'epoch': 2} {'type': 'loss', 'content': 0.003386803437024355, 'timestamp': '2025-09-30 22:07:29.626128', 'step': 1150, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:29.664128', 'step': 1150, 'epoch': 2} {'type': 'loss', 'content': 0.0006055928533896804, 'timestamp': '2025-09-30 22:07:29.671155', 'step': 1151, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:29.712255', 'step': 1151, 'epoch': 2} {'type': 'loss', 'content': 0.038205623626708984, 'timestamp': '2025-09-30 22:07:29.736319', 'step': 1152, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:29.770374', 'step': 1152, 'epoch': 2} {'type': 'loss', 'content': 0.003251886460930109, 'timestamp': '2025-09-30 22:07:29.773048', 'step': 1153, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:29.808933', 'step': 1153, 'epoch': 2} {'type': 'loss', 'content': 0.00016367484931834042, 'timestamp': '2025-09-30 22:07:29.811337', 'step': 1154, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:29.844482', 'step': 1154, 'epoch': 2} {'type': 'loss', 'content': 7.107849523890764e-05, 'timestamp': '2025-09-30 22:07:29.849187', 'step': 1155, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:29.883924', 'step': 1155, 'epoch': 2} {'type': 'loss', 'content': 0.002373484428972006, 'timestamp': '2025-09-30 22:07:29.908377', 'step': 1156, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:29.950141', 'step': 1156, 'epoch': 2} {'type': 'loss', 'content': 0.00015534063277300447, 'timestamp': '2025-09-30 22:07:29.952180', 'step': 1157, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:29.989428', 'step': 1157, 'epoch': 2} {'type': 'loss', 'content': 0.00014958075189497322, 'timestamp': '2025-09-30 22:07:29.992165', 'step': 1158, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:30.027404', 'step': 1158, 'epoch': 2} {'type': 'loss', 'content': 0.001462747110053897, 'timestamp': '2025-09-30 22:07:30.031747', 'step': 1159, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:30.065053', 'step': 1159, 'epoch': 2} {'type': 'loss', 'content': 0.0019475112203508615, 'timestamp': '2025-09-30 22:07:30.089231', 'step': 1160, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:30.124473', 'step': 1160, 'epoch': 2} {'type': 'loss', 'content': 0.01143405307084322, 'timestamp': '2025-09-30 22:07:30.127259', 'step': 1161, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:30.162715', 'step': 1161, 'epoch': 2} {'type': 'loss', 'content': 0.02949167788028717, 'timestamp': '2025-09-30 22:07:30.169782', 'step': 1162, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:30.206509', 'step': 1162, 'epoch': 2} {'type': 'loss', 'content': 0.0006551267579197884, 'timestamp': '2025-09-30 22:07:30.210897', 'step': 1163, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:30.245757', 'step': 1163, 'epoch': 2} {'type': 'loss', 'content': 0.025250211358070374, 'timestamp': '2025-09-30 22:07:30.273860', 'step': 1164, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:30.307190', 'step': 1164, 'epoch': 2} {'type': 'loss', 'content': 0.0009507841314189136, 'timestamp': '2025-09-30 22:07:30.311036', 'step': 1165, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:30.344818', 'step': 1165, 'epoch': 2} {'type': 'loss', 'content': 0.01949252560734749, 'timestamp': '2025-09-30 22:07:30.349033', 'step': 1166, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:30.384127', 'step': 1166, 'epoch': 2} {'type': 'loss', 'content': 0.0043541546911001205, 'timestamp': '2025-09-30 22:07:30.387046', 'step': 1167, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:30.422103', 'step': 1167, 'epoch': 2} {'type': 'loss', 'content': 0.00019489186524879187, 'timestamp': '2025-09-30 22:07:30.446610', 'step': 1168, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:30.479821', 'step': 1168, 'epoch': 2} {'type': 'loss', 'content': 0.00754409609362483, 'timestamp': '2025-09-30 22:07:30.482482', 'step': 1169, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:30.516951', 'step': 1169, 'epoch': 2} {'type': 'loss', 'content': 0.00020673531980719417, 'timestamp': '2025-09-30 22:07:30.524294', 'step': 1170, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:31.226245', 'step': 1170, 'epoch': 2} {'type': 'pplx', 'content': 107393841.23209043, 'timestamp': '2025-09-30 22:07:31.228308', 'step': 1170, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:31.258468', 'step': 1170, 'epoch': 2} {'type': 'loss', 'content': 0.004547130316495895, 'timestamp': '2025-09-30 22:07:31.260654', 'step': 1171, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:31.314875', 'step': 1171, 'epoch': 2} {'type': 'loss', 'content': 8.820913353702053e-05, 'timestamp': '2025-09-30 22:07:31.338414', 'step': 1172, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:31.372133', 'step': 1172, 'epoch': 2} {'type': 'loss', 'content': 0.0012921736342832446, 'timestamp': '2025-09-30 22:07:31.374214', 'step': 1173, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:31.412384', 'step': 1173, 'epoch': 2} {'type': 'loss', 'content': 0.00023375029559247196, 'timestamp': '2025-09-30 22:07:31.414570', 'step': 1174, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:31.448147', 'step': 1174, 'epoch': 2} {'type': 'loss', 'content': 0.0003004588943440467, 'timestamp': '2025-09-30 22:07:31.455362', 'step': 1175, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:31.487937', 'step': 1175, 'epoch': 2} {'type': 'loss', 'content': 0.00036343271494843066, 'timestamp': '2025-09-30 22:07:31.511390', 'step': 1176, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:31.550605', 'step': 1176, 'epoch': 2} {'type': 'loss', 'content': 0.000501815986353904, 'timestamp': '2025-09-30 22:07:31.552786', 'step': 1177, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:31.585625', 'step': 1177, 'epoch': 2} {'type': 'loss', 'content': 0.00023165717720985413, 'timestamp': '2025-09-30 22:07:31.587783', 'step': 1178, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:31.622101', 'step': 1178, 'epoch': 2} {'type': 'loss', 'content': 0.0015713156899437308, 'timestamp': '2025-09-30 22:07:31.626934', 'step': 1179, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:31.659455', 'step': 1179, 'epoch': 2} {'type': 'loss', 'content': 0.0007136641070246696, 'timestamp': '2025-09-30 22:07:31.684955', 'step': 1180, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:31.723796', 'step': 1180, 'epoch': 2} {'type': 'loss', 'content': 0.0010480453493073583, 'timestamp': '2025-09-30 22:07:31.726094', 'step': 1181, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:31.758210', 'step': 1181, 'epoch': 2} {'type': 'loss', 'content': 0.002461208263412118, 'timestamp': '2025-09-30 22:07:31.760498', 'step': 1182, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:31.799843', 'step': 1182, 'epoch': 2} {'type': 'loss', 'content': 0.0006307312869466841, 'timestamp': '2025-09-30 22:07:31.804128', 'step': 1183, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:31.836251', 'step': 1183, 'epoch': 2} {'type': 'loss', 'content': 0.0055591752752661705, 'timestamp': '2025-09-30 22:07:31.864519', 'step': 1184, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:31.903849', 'step': 1184, 'epoch': 2} {'type': 'loss', 'content': 0.0013035887386649847, 'timestamp': '2025-09-30 22:07:31.908539', 'step': 1185, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:31.941569', 'step': 1185, 'epoch': 2} {'type': 'loss', 'content': 0.0003693876205943525, 'timestamp': '2025-09-30 22:07:31.943777', 'step': 1186, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:31.977278', 'step': 1186, 'epoch': 2} {'type': 'loss', 'content': 0.0008305688970722258, 'timestamp': '2025-09-30 22:07:31.982309', 'step': 1187, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:32.015357', 'step': 1187, 'epoch': 2} {'type': 'loss', 'content': 0.0003829421184491366, 'timestamp': '2025-09-30 22:07:32.043487', 'step': 1188, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:32.077283', 'step': 1188, 'epoch': 2} {'type': 'loss', 'content': 0.00402789656072855, 'timestamp': '2025-09-30 22:07:32.079679', 'step': 1189, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:32.112944', 'step': 1189, 'epoch': 2} {'type': 'loss', 'content': 0.00023092723859008402, 'timestamp': '2025-09-30 22:07:32.115035', 'step': 1190, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:32.148948', 'step': 1190, 'epoch': 2} {'type': 'loss', 'content': 0.0004790943639818579, 'timestamp': '2025-09-30 22:07:32.156001', 'step': 1191, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:32.194346', 'step': 1191, 'epoch': 2} {'type': 'loss', 'content': 0.00016945795505307615, 'timestamp': '2025-09-30 22:07:32.217962', 'step': 1192, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:32.259407', 'step': 1192, 'epoch': 2} {'type': 'loss', 'content': 0.001122702145949006, 'timestamp': '2025-09-30 22:07:32.261909', 'step': 1193, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:32.294564', 'step': 1193, 'epoch': 2} {'type': 'loss', 'content': 0.00017199316062033176, 'timestamp': '2025-09-30 22:07:32.301669', 'step': 1194, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:32.336916', 'step': 1194, 'epoch': 2} {'type': 'loss', 'content': 0.0003308496088720858, 'timestamp': '2025-09-30 22:07:32.341319', 'step': 1195, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:32.373387', 'step': 1195, 'epoch': 2} {'type': 'loss', 'content': 0.001034219516441226, 'timestamp': '2025-09-30 22:07:32.396897', 'step': 1196, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:32.431249', 'step': 1196, 'epoch': 2} {'type': 'loss', 'content': 0.000114070026029367, 'timestamp': '2025-09-30 22:07:32.435927', 'step': 1197, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:32.479303', 'step': 1197, 'epoch': 2} {'type': 'loss', 'content': 0.0019510245183482766, 'timestamp': '2025-09-30 22:07:32.487092', 'step': 1198, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:32.519501', 'step': 1198, 'epoch': 2} {'type': 'loss', 'content': 0.00010530307190492749, 'timestamp': '2025-09-30 22:07:32.521706', 'step': 1199, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:32.557335', 'step': 1199, 'epoch': 2} {'type': 'loss', 'content': 0.00038648530608043075, 'timestamp': '2025-09-30 22:07:32.580927', 'step': 1200, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:32.622607', 'step': 1200, 'epoch': 2} {'type': 'loss', 'content': 0.0008311023120768368, 'timestamp': '2025-09-30 22:07:32.624938', 'step': 1201, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:32.664466', 'step': 1201, 'epoch': 2} {'type': 'loss', 'content': 0.005615743342787027, 'timestamp': '2025-09-30 22:07:32.671706', 'step': 1202, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:32.706151', 'step': 1202, 'epoch': 2} {'type': 'loss', 'content': 0.0002661571779754013, 'timestamp': '2025-09-30 22:07:32.710377', 'step': 1203, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:32.743688', 'step': 1203, 'epoch': 2} {'type': 'loss', 'content': 0.0001349939702777192, 'timestamp': '2025-09-30 22:07:32.769416', 'step': 1204, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:32.811244', 'step': 1204, 'epoch': 2} {'type': 'loss', 'content': 0.00019696114759426564, 'timestamp': '2025-09-30 22:07:32.816686', 'step': 1205, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:32.851179', 'step': 1205, 'epoch': 2} {'type': 'loss', 'content': 0.0005079305847175419, 'timestamp': '2025-09-30 22:07:32.855636', 'step': 1206, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:32.891640', 'step': 1206, 'epoch': 2} {'type': 'loss', 'content': 0.0006546974182128906, 'timestamp': '2025-09-30 22:07:32.896010', 'step': 1207, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:32.949034', 'step': 1207, 'epoch': 2} {'type': 'loss', 'content': 0.0014550471678376198, 'timestamp': '2025-09-30 22:07:32.972497', 'step': 1208, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:33.011042', 'step': 1208, 'epoch': 2} {'type': 'loss', 'content': 0.002267097821459174, 'timestamp': '2025-09-30 22:07:33.013074', 'step': 1209, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:33.696170', 'step': 1209, 'epoch': 2} {'type': 'pplx', 'content': 108422431.51281904, 'timestamp': '2025-09-30 22:07:33.698353', 'step': 1209, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:33.727709', 'step': 1209, 'epoch': 2} {'type': 'loss', 'content': 0.0005266775260679424, 'timestamp': '2025-09-30 22:07:33.732299', 'step': 1210, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:33.764619', 'step': 1210, 'epoch': 2} {'type': 'loss', 'content': 0.014525589533150196, 'timestamp': '2025-09-30 22:07:33.766776', 'step': 1211, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:33.807720', 'step': 1211, 'epoch': 2} {'type': 'loss', 'content': 0.0015358092496171594, 'timestamp': '2025-09-30 22:07:33.833018', 'step': 1212, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:33.870207', 'step': 1212, 'epoch': 2} {'type': 'loss', 'content': 0.00038857717299833894, 'timestamp': '2025-09-30 22:07:33.872719', 'step': 1213, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:33.908278', 'step': 1213, 'epoch': 2} {'type': 'loss', 'content': 0.00033371103927493095, 'timestamp': '2025-09-30 22:07:33.912847', 'step': 1214, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:33.946687', 'step': 1214, 'epoch': 2} {'type': 'loss', 'content': 7.424094656016678e-05, 'timestamp': '2025-09-30 22:07:33.949561', 'step': 1215, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:33.982696', 'step': 1215, 'epoch': 2} {'type': 'loss', 'content': 0.0003344232391100377, 'timestamp': '2025-09-30 22:07:34.006596', 'step': 1216, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:34.041977', 'step': 1216, 'epoch': 2} {'type': 'loss', 'content': 0.0001631371706025675, 'timestamp': '2025-09-30 22:07:34.044266', 'step': 1217, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:34.084628', 'step': 1217, 'epoch': 2} {'type': 'loss', 'content': 0.0026797533500939608, 'timestamp': '2025-09-30 22:07:34.087412', 'step': 1218, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:34.120397', 'step': 1218, 'epoch': 2} {'type': 'loss', 'content': 0.0005254180869087577, 'timestamp': '2025-09-30 22:07:34.122157', 'step': 1219, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:34.155920', 'step': 1219, 'epoch': 2} {'type': 'loss', 'content': 0.0001768097426975146, 'timestamp': '2025-09-30 22:07:34.179731', 'step': 1220, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:34.220980', 'step': 1220, 'epoch': 2} {'type': 'loss', 'content': 0.0001642854476813227, 'timestamp': '2025-09-30 22:07:34.223159', 'step': 1221, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:34.257299', 'step': 1221, 'epoch': 2} {'type': 'loss', 'content': 0.00038075828342698514, 'timestamp': '2025-09-30 22:07:34.259443', 'step': 1222, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:34.295921', 'step': 1222, 'epoch': 2} {'type': 'loss', 'content': 0.013797529973089695, 'timestamp': '2025-09-30 22:07:34.304067', 'step': 1223, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:34.336990', 'step': 1223, 'epoch': 2} {'type': 'loss', 'content': 0.019091341644525528, 'timestamp': '2025-09-30 22:07:34.360616', 'step': 1224, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:34.394341', 'step': 1224, 'epoch': 2} {'type': 'loss', 'content': 0.00017502431001048535, 'timestamp': '2025-09-30 22:07:34.398956', 'step': 1225, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:34.431879', 'step': 1225, 'epoch': 2} {'type': 'loss', 'content': 0.0009142042836174369, 'timestamp': '2025-09-30 22:07:34.434803', 'step': 1226, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:34.467132', 'step': 1226, 'epoch': 2} {'type': 'loss', 'content': 0.000977121526375413, 'timestamp': '2025-09-30 22:07:34.471775', 'step': 1227, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:34.514271', 'step': 1227, 'epoch': 2} {'type': 'loss', 'content': 0.0005587582127191126, 'timestamp': '2025-09-30 22:07:34.538008', 'step': 1228, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:34.588925', 'step': 1228, 'epoch': 2} {'type': 'loss', 'content': 0.001227530068717897, 'timestamp': '2025-09-30 22:07:34.591044', 'step': 1229, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:34.624813', 'step': 1229, 'epoch': 2} {'type': 'loss', 'content': 0.04056653380393982, 'timestamp': '2025-09-30 22:07:34.628727', 'step': 1230, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:34.668933', 'step': 1230, 'epoch': 2} {'type': 'loss', 'content': 0.00010975310578942299, 'timestamp': '2025-09-30 22:07:34.670920', 'step': 1231, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:34.704241', 'step': 1231, 'epoch': 2} {'type': 'loss', 'content': 0.0007277274271473289, 'timestamp': '2025-09-30 22:07:34.729772', 'step': 1232, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:34.762694', 'step': 1232, 'epoch': 2} {'type': 'loss', 'content': 0.0003397251130081713, 'timestamp': '2025-09-30 22:07:34.764836', 'step': 1233, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:34.798183', 'step': 1233, 'epoch': 2} {'type': 'loss', 'content': 0.007443286012858152, 'timestamp': '2025-09-30 22:07:34.802875', 'step': 1234, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:34.835821', 'step': 1234, 'epoch': 2} {'type': 'loss', 'content': 0.0015747037250548601, 'timestamp': '2025-09-30 22:07:34.842772', 'step': 1235, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:34.878376', 'step': 1235, 'epoch': 2} {'type': 'loss', 'content': 5.67116767342668e-05, 'timestamp': '2025-09-30 22:07:34.906530', 'step': 1236, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:34.941103', 'step': 1236, 'epoch': 2} {'type': 'loss', 'content': 0.0033072498627007008, 'timestamp': '2025-09-30 22:07:34.943249', 'step': 1237, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:34.976375', 'step': 1237, 'epoch': 2} {'type': 'loss', 'content': 0.0008226304198615253, 'timestamp': '2025-09-30 22:07:34.983683', 'step': 1238, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:35.016379', 'step': 1238, 'epoch': 2} {'type': 'loss', 'content': 0.013950667344033718, 'timestamp': '2025-09-30 22:07:35.019223', 'step': 1239, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:35.050865', 'step': 1239, 'epoch': 2} {'type': 'loss', 'content': 0.0024328443687409163, 'timestamp': '2025-09-30 22:07:35.079020', 'step': 1240, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:35.111767', 'step': 1240, 'epoch': 2} {'type': 'loss', 'content': 0.009844565764069557, 'timestamp': '2025-09-30 22:07:35.114111', 'step': 1241, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:35.148199', 'step': 1241, 'epoch': 2} {'type': 'loss', 'content': 0.0006344670546241105, 'timestamp': '2025-09-30 22:07:35.152928', 'step': 1242, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:35.187526', 'step': 1242, 'epoch': 2} {'type': 'loss', 'content': 0.0005711555131711066, 'timestamp': '2025-09-30 22:07:35.191857', 'step': 1243, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:35.227428', 'step': 1243, 'epoch': 2} {'type': 'loss', 'content': 0.008294356986880302, 'timestamp': '2025-09-30 22:07:35.252810', 'step': 1244, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:35.285127', 'step': 1244, 'epoch': 2} {'type': 'loss', 'content': 0.0008478214731439948, 'timestamp': '2025-09-30 22:07:35.289820', 'step': 1245, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [2, 192], 'flops': 2847885110400}, 'timestamp': '2025-09-30 22:07:35.331568', 'step': 1245, 'epoch': 2} {'type': 'loss', 'content': 0.00015602679923176765, 'timestamp': '2025-09-30 22:07:35.333611', 'step': 1246, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:35.383504', 'step': 1246, 'epoch': 3} {'type': 'loss', 'content': 0.0039008252788335085, 'timestamp': '2025-09-30 22:07:35.385600', 'step': 1247, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:35.419217', 'step': 1247, 'epoch': 3} {'type': 'loss', 'content': 0.004876903258264065, 'timestamp': '2025-09-30 22:07:35.443027', 'step': 1248, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:36.135041', 'step': 1248, 'epoch': 3} {'type': 'pplx', 'content': 109807121.1574246, 'timestamp': '2025-09-30 22:07:36.136996', 'step': 1248, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:36.176025', 'step': 1248, 'epoch': 3} {'type': 'loss', 'content': 0.000255336839472875, 'timestamp': '2025-09-30 22:07:36.178051', 'step': 1249, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:36.212961', 'step': 1249, 'epoch': 3} {'type': 'loss', 'content': 0.004327817354351282, 'timestamp': '2025-09-30 22:07:36.217256', 'step': 1250, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:36.252409', 'step': 1250, 'epoch': 3} {'type': 'loss', 'content': 0.0003256227064412087, 'timestamp': '2025-09-30 22:07:36.260120', 'step': 1251, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:36.295118', 'step': 1251, 'epoch': 3} {'type': 'loss', 'content': 0.0035193650983273983, 'timestamp': '2025-09-30 22:07:36.320782', 'step': 1252, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:36.354257', 'step': 1252, 'epoch': 3} {'type': 'loss', 'content': 0.0034225154668092728, 'timestamp': '2025-09-30 22:07:36.356290', 'step': 1253, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:36.395143', 'step': 1253, 'epoch': 3} {'type': 'loss', 'content': 0.028722798451781273, 'timestamp': '2025-09-30 22:07:36.400020', 'step': 1254, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:36.432660', 'step': 1254, 'epoch': 3} {'type': 'loss', 'content': 0.00042226066580042243, 'timestamp': '2025-09-30 22:07:36.434794', 'step': 1255, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:36.467524', 'step': 1255, 'epoch': 3} {'type': 'loss', 'content': 0.0007823723135516047, 'timestamp': '2025-09-30 22:07:36.491012', 'step': 1256, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:36.527852', 'step': 1256, 'epoch': 3} {'type': 'loss', 'content': 0.011989377439022064, 'timestamp': '2025-09-30 22:07:36.533147', 'step': 1257, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:36.566229', 'step': 1257, 'epoch': 3} {'type': 'loss', 'content': 0.00017260621825698763, 'timestamp': '2025-09-30 22:07:36.570419', 'step': 1258, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:36.603032', 'step': 1258, 'epoch': 3} {'type': 'loss', 'content': 0.00010432758426759392, 'timestamp': '2025-09-30 22:07:36.605767', 'step': 1259, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:36.637699', 'step': 1259, 'epoch': 3} {'type': 'loss', 'content': 0.008409126661717892, 'timestamp': '2025-09-30 22:07:36.661361', 'step': 1260, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:36.693688', 'step': 1260, 'epoch': 3} {'type': 'loss', 'content': 0.03930716961622238, 'timestamp': '2025-09-30 22:07:36.695933', 'step': 1261, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:36.728645', 'step': 1261, 'epoch': 3} {'type': 'loss', 'content': 0.004691932816058397, 'timestamp': '2025-09-30 22:07:36.731368', 'step': 1262, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:36.762632', 'step': 1262, 'epoch': 3} {'type': 'loss', 'content': 0.0003119763860013336, 'timestamp': '2025-09-30 22:07:36.765588', 'step': 1263, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:36.800587', 'step': 1263, 'epoch': 3} {'type': 'loss', 'content': 0.00028951704734936357, 'timestamp': '2025-09-30 22:07:36.828500', 'step': 1264, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:36.860604', 'step': 1264, 'epoch': 3} {'type': 'loss', 'content': 0.0010244931327179074, 'timestamp': '2025-09-30 22:07:36.862339', 'step': 1265, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:36.895817', 'step': 1265, 'epoch': 3} {'type': 'loss', 'content': 0.0013763883616775274, 'timestamp': '2025-09-30 22:07:36.903016', 'step': 1266, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:36.937081', 'step': 1266, 'epoch': 3} {'type': 'loss', 'content': 0.0009086270001716912, 'timestamp': '2025-09-30 22:07:36.944023', 'step': 1267, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:36.975857', 'step': 1267, 'epoch': 3} {'type': 'loss', 'content': 0.004262962378561497, 'timestamp': '2025-09-30 22:07:36.999675', 'step': 1268, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:37.031985', 'step': 1268, 'epoch': 3} {'type': 'loss', 'content': 0.02412317879498005, 'timestamp': '2025-09-30 22:07:37.033995', 'step': 1269, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:37.067156', 'step': 1269, 'epoch': 3} {'type': 'loss', 'content': 0.02252691611647606, 'timestamp': '2025-09-30 22:07:37.071793', 'step': 1270, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:37.108421', 'step': 1270, 'epoch': 3} {'type': 'loss', 'content': 0.00012152874114690349, 'timestamp': '2025-09-30 22:07:37.115419', 'step': 1271, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:37.149492', 'step': 1271, 'epoch': 3} {'type': 'loss', 'content': 0.00133028463460505, 'timestamp': '2025-09-30 22:07:37.178293', 'step': 1272, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:37.211271', 'step': 1272, 'epoch': 3} {'type': 'loss', 'content': 0.022769641131162643, 'timestamp': '2025-09-30 22:07:37.213314', 'step': 1273, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:37.249613', 'step': 1273, 'epoch': 3} {'type': 'loss', 'content': 0.012292742729187012, 'timestamp': '2025-09-30 22:07:37.253055', 'step': 1274, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:37.287530', 'step': 1274, 'epoch': 3} {'type': 'loss', 'content': 0.005768026225268841, 'timestamp': '2025-09-30 22:07:37.294740', 'step': 1275, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:37.329256', 'step': 1275, 'epoch': 3} {'type': 'loss', 'content': 0.00015319335216190666, 'timestamp': '2025-09-30 22:07:37.354968', 'step': 1276, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:37.396114', 'step': 1276, 'epoch': 3} {'type': 'loss', 'content': 9.743180271470919e-05, 'timestamp': '2025-09-30 22:07:37.398099', 'step': 1277, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:37.431604', 'step': 1277, 'epoch': 3} {'type': 'loss', 'content': 0.006411910522729158, 'timestamp': '2025-09-30 22:07:37.433438', 'step': 1278, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:37.466660', 'step': 1278, 'epoch': 3} {'type': 'loss', 'content': 0.010445569641888142, 'timestamp': '2025-09-30 22:07:37.473756', 'step': 1279, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:37.506255', 'step': 1279, 'epoch': 3} {'type': 'loss', 'content': 0.004939712584018707, 'timestamp': '2025-09-30 22:07:37.534338', 'step': 1280, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:37.566932', 'step': 1280, 'epoch': 3} {'type': 'loss', 'content': 0.000179723632754758, 'timestamp': '2025-09-30 22:07:37.569247', 'step': 1281, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:37.601976', 'step': 1281, 'epoch': 3} {'type': 'loss', 'content': 0.006414394360035658, 'timestamp': '2025-09-30 22:07:37.604761', 'step': 1282, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:37.636239', 'step': 1282, 'epoch': 3} {'type': 'loss', 'content': 0.01634913496673107, 'timestamp': '2025-09-30 22:07:37.638114', 'step': 1283, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:37.669132', 'step': 1283, 'epoch': 3} {'type': 'loss', 'content': 0.0030618479941040277, 'timestamp': '2025-09-30 22:07:37.692933', 'step': 1284, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:37.724375', 'step': 1284, 'epoch': 3} {'type': 'loss', 'content': 0.014753976836800575, 'timestamp': '2025-09-30 22:07:37.726062', 'step': 1285, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:37.757327', 'step': 1285, 'epoch': 3} {'type': 'loss', 'content': 0.00043953536078333855, 'timestamp': '2025-09-30 22:07:37.761731', 'step': 1286, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:37.794682', 'step': 1286, 'epoch': 3} {'type': 'loss', 'content': 0.0008979348349384964, 'timestamp': '2025-09-30 22:07:37.801834', 'step': 1287, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:38.470861', 'step': 1287, 'epoch': 3} {'type': 'pplx', 'content': 109989573.47868328, 'timestamp': '2025-09-30 22:07:38.473187', 'step': 1287, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:38.508925', 'step': 1287, 'epoch': 3} {'type': 'loss', 'content': 0.023780453950166702, 'timestamp': '2025-09-30 22:07:38.534259', 'step': 1288, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:38.569925', 'step': 1288, 'epoch': 3} {'type': 'loss', 'content': 0.0005712545826099813, 'timestamp': '2025-09-30 22:07:38.572178', 'step': 1289, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:38.604571', 'step': 1289, 'epoch': 3} {'type': 'loss', 'content': 0.014848102815449238, 'timestamp': '2025-09-30 22:07:38.611884', 'step': 1290, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:38.661965', 'step': 1290, 'epoch': 3} {'type': 'loss', 'content': 0.0018235408933833241, 'timestamp': '2025-09-30 22:07:38.664575', 'step': 1291, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:38.697886', 'step': 1291, 'epoch': 3} {'type': 'loss', 'content': 0.0031597481574863195, 'timestamp': '2025-09-30 22:07:38.723546', 'step': 1292, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:38.764873', 'step': 1292, 'epoch': 3} {'type': 'loss', 'content': 0.0017853927565738559, 'timestamp': '2025-09-30 22:07:38.767578', 'step': 1293, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:38.803281', 'step': 1293, 'epoch': 3} {'type': 'loss', 'content': 0.0030321648810058832, 'timestamp': '2025-09-30 22:07:38.806057', 'step': 1294, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:38.841128', 'step': 1294, 'epoch': 3} {'type': 'loss', 'content': 0.002925761044025421, 'timestamp': '2025-09-30 22:07:38.843318', 'step': 1295, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:38.878262', 'step': 1295, 'epoch': 3} {'type': 'loss', 'content': 0.0015968728112056851, 'timestamp': '2025-09-30 22:07:38.901759', 'step': 1296, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:38.943013', 'step': 1296, 'epoch': 3} {'type': 'loss', 'content': 0.005096987821161747, 'timestamp': '2025-09-30 22:07:38.945453', 'step': 1297, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:38.982097', 'step': 1297, 'epoch': 3} {'type': 'loss', 'content': 0.0007914546877145767, 'timestamp': '2025-09-30 22:07:38.984606', 'step': 1298, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:39.019090', 'step': 1298, 'epoch': 3} {'type': 'loss', 'content': 0.0022949494887143373, 'timestamp': '2025-09-30 22:07:39.021437', 'step': 1299, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:39.062138', 'step': 1299, 'epoch': 3} {'type': 'loss', 'content': 0.000646329834125936, 'timestamp': '2025-09-30 22:07:39.086371', 'step': 1300, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:39.119570', 'step': 1300, 'epoch': 3} {'type': 'loss', 'content': 0.006533071864396334, 'timestamp': '2025-09-30 22:07:39.121936', 'step': 1301, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:39.155908', 'step': 1301, 'epoch': 3} {'type': 'loss', 'content': 0.0032184135634452105, 'timestamp': '2025-09-30 22:07:39.158607', 'step': 1302, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:39.192997', 'step': 1302, 'epoch': 3} {'type': 'loss', 'content': 0.0008461990510113537, 'timestamp': '2025-09-30 22:07:39.195669', 'step': 1303, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:39.229134', 'step': 1303, 'epoch': 3} {'type': 'loss', 'content': 0.01580544374883175, 'timestamp': '2025-09-30 22:07:39.253492', 'step': 1304, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:39.286631', 'step': 1304, 'epoch': 3} {'type': 'loss', 'content': 0.015919901430606842, 'timestamp': '2025-09-30 22:07:39.288786', 'step': 1305, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:39.322651', 'step': 1305, 'epoch': 3} {'type': 'loss', 'content': 0.033842675387859344, 'timestamp': '2025-09-30 22:07:39.325255', 'step': 1306, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:39.357476', 'step': 1306, 'epoch': 3} {'type': 'loss', 'content': 0.003096401458606124, 'timestamp': '2025-09-30 22:07:39.360347', 'step': 1307, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:39.394566', 'step': 1307, 'epoch': 3} {'type': 'loss', 'content': 0.0005393567844294012, 'timestamp': '2025-09-30 22:07:39.420354', 'step': 1308, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:39.453395', 'step': 1308, 'epoch': 3} {'type': 'loss', 'content': 0.0008734531002119184, 'timestamp': '2025-09-30 22:07:39.456644', 'step': 1309, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:39.488834', 'step': 1309, 'epoch': 3} {'type': 'loss', 'content': 0.002081485465168953, 'timestamp': '2025-09-30 22:07:39.493369', 'step': 1310, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:39.526571', 'step': 1310, 'epoch': 3} {'type': 'loss', 'content': 0.011021971702575684, 'timestamp': '2025-09-30 22:07:39.530759', 'step': 1311, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:39.565568', 'step': 1311, 'epoch': 3} {'type': 'loss', 'content': 0.00900907814502716, 'timestamp': '2025-09-30 22:07:39.589812', 'step': 1312, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:39.620721', 'step': 1312, 'epoch': 3} {'type': 'loss', 'content': 0.011516647413372993, 'timestamp': '2025-09-30 22:07:39.624276', 'step': 1313, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:39.657077', 'step': 1313, 'epoch': 3} {'type': 'loss', 'content': 0.000806538388133049, 'timestamp': '2025-09-30 22:07:39.658901', 'step': 1314, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:39.691156', 'step': 1314, 'epoch': 3} {'type': 'loss', 'content': 0.010637268424034119, 'timestamp': '2025-09-30 22:07:39.695673', 'step': 1315, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:39.727487', 'step': 1315, 'epoch': 3} {'type': 'loss', 'content': 0.0007261876598931849, 'timestamp': '2025-09-30 22:07:39.750945', 'step': 1316, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:39.783255', 'step': 1316, 'epoch': 3} {'type': 'loss', 'content': 0.002348328474909067, 'timestamp': '2025-09-30 22:07:39.785684', 'step': 1317, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:39.818553', 'step': 1317, 'epoch': 3} {'type': 'loss', 'content': 0.005363943986594677, 'timestamp': '2025-09-30 22:07:39.821258', 'step': 1318, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:39.853727', 'step': 1318, 'epoch': 3} {'type': 'loss', 'content': 0.00031056805164553225, 'timestamp': '2025-09-30 22:07:39.856008', 'step': 1319, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:39.887622', 'step': 1319, 'epoch': 3} {'type': 'loss', 'content': 0.004423830658197403, 'timestamp': '2025-09-30 22:07:39.912275', 'step': 1320, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:39.945415', 'step': 1320, 'epoch': 3} {'type': 'loss', 'content': 8.320227061631158e-05, 'timestamp': '2025-09-30 22:07:39.948184', 'step': 1321, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:39.980603', 'step': 1321, 'epoch': 3} {'type': 'loss', 'content': 0.0002933432988356799, 'timestamp': '2025-09-30 22:07:39.985405', 'step': 1322, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:40.018838', 'step': 1322, 'epoch': 3} {'type': 'loss', 'content': 0.0042983428575098515, 'timestamp': '2025-09-30 22:07:40.022003', 'step': 1323, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:40.056300', 'step': 1323, 'epoch': 3} {'type': 'loss', 'content': 0.014138293452560902, 'timestamp': '2025-09-30 22:07:40.084488', 'step': 1324, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:40.118024', 'step': 1324, 'epoch': 3} {'type': 'loss', 'content': 0.03766119107604027, 'timestamp': '2025-09-30 22:07:40.123535', 'step': 1325, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:40.157052', 'step': 1325, 'epoch': 3} {'type': 'loss', 'content': 0.001065706484951079, 'timestamp': '2025-09-30 22:07:40.161626', 'step': 1326, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:40.814410', 'step': 1326, 'epoch': 3} {'type': 'pplx', 'content': 106030059.01171215, 'timestamp': '2025-09-30 22:07:40.816322', 'step': 1326, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:40.846018', 'step': 1326, 'epoch': 3} {'type': 'loss', 'content': 0.0014224022161215544, 'timestamp': '2025-09-30 22:07:40.853213', 'step': 1327, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:40.886027', 'step': 1327, 'epoch': 3} {'type': 'loss', 'content': 0.0001248378393938765, 'timestamp': '2025-09-30 22:07:40.911756', 'step': 1328, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:40.944563', 'step': 1328, 'epoch': 3} {'type': 'loss', 'content': 0.00013765999756287783, 'timestamp': '2025-09-30 22:07:40.950013', 'step': 1329, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:40.986025', 'step': 1329, 'epoch': 3} {'type': 'loss', 'content': 0.003243004670366645, 'timestamp': '2025-09-30 22:07:40.990606', 'step': 1330, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:41.021649', 'step': 1330, 'epoch': 3} {'type': 'loss', 'content': 0.0005152979865670204, 'timestamp': '2025-09-30 22:07:41.029170', 'step': 1331, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:41.064293', 'step': 1331, 'epoch': 3} {'type': 'loss', 'content': 0.0005544818704947829, 'timestamp': '2025-09-30 22:07:41.088257', 'step': 1332, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:41.121724', 'step': 1332, 'epoch': 3} {'type': 'loss', 'content': 0.001916523789986968, 'timestamp': '2025-09-30 22:07:41.127255', 'step': 1333, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:41.159608', 'step': 1333, 'epoch': 3} {'type': 'loss', 'content': 0.0005377818597480655, 'timestamp': '2025-09-30 22:07:41.161126', 'step': 1334, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:41.193703', 'step': 1334, 'epoch': 3} {'type': 'loss', 'content': 0.0011306932428851724, 'timestamp': '2025-09-30 22:07:41.200986', 'step': 1335, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:41.232954', 'step': 1335, 'epoch': 3} {'type': 'loss', 'content': 0.0022087269462645054, 'timestamp': '2025-09-30 22:07:41.258171', 'step': 1336, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:41.291315', 'step': 1336, 'epoch': 3} {'type': 'loss', 'content': 0.00083199079381302, 'timestamp': '2025-09-30 22:07:41.293552', 'step': 1337, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:41.327551', 'step': 1337, 'epoch': 3} {'type': 'loss', 'content': 0.0007499647326767445, 'timestamp': '2025-09-30 22:07:41.332457', 'step': 1338, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:41.368792', 'step': 1338, 'epoch': 3} {'type': 'loss', 'content': 0.02916637435555458, 'timestamp': '2025-09-30 22:07:41.373358', 'step': 1339, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:41.410206', 'step': 1339, 'epoch': 3} {'type': 'loss', 'content': 0.008801432326436043, 'timestamp': '2025-09-30 22:07:41.436123', 'step': 1340, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:41.467956', 'step': 1340, 'epoch': 3} {'type': 'loss', 'content': 0.0003800613048952073, 'timestamp': '2025-09-30 22:07:41.470444', 'step': 1341, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:41.502289', 'step': 1341, 'epoch': 3} {'type': 'loss', 'content': 0.00023526222503278404, 'timestamp': '2025-09-30 22:07:41.506904', 'step': 1342, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:41.537764', 'step': 1342, 'epoch': 3} {'type': 'loss', 'content': 0.0013486042153090239, 'timestamp': '2025-09-30 22:07:41.539519', 'step': 1343, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:41.571710', 'step': 1343, 'epoch': 3} {'type': 'loss', 'content': 0.0008594534010626376, 'timestamp': '2025-09-30 22:07:41.594882', 'step': 1344, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:41.628068', 'step': 1344, 'epoch': 3} {'type': 'loss', 'content': 0.002135425340384245, 'timestamp': '2025-09-30 22:07:41.633603', 'step': 1345, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:41.669158', 'step': 1345, 'epoch': 3} {'type': 'loss', 'content': 0.0004654375952668488, 'timestamp': '2025-09-30 22:07:41.672158', 'step': 1346, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:41.707133', 'step': 1346, 'epoch': 3} {'type': 'loss', 'content': 0.0007671394268982112, 'timestamp': '2025-09-30 22:07:41.714176', 'step': 1347, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:41.748009', 'step': 1347, 'epoch': 3} {'type': 'loss', 'content': 0.00014771531277801841, 'timestamp': '2025-09-30 22:07:41.773750', 'step': 1348, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:41.807576', 'step': 1348, 'epoch': 3} {'type': 'loss', 'content': 0.0006639771163463593, 'timestamp': '2025-09-30 22:07:41.809571', 'step': 1349, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:41.843263', 'step': 1349, 'epoch': 3} {'type': 'loss', 'content': 0.0008383739041164517, 'timestamp': '2025-09-30 22:07:41.847870', 'step': 1350, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:41.879607', 'step': 1350, 'epoch': 3} {'type': 'loss', 'content': 0.00017002799722831696, 'timestamp': '2025-09-30 22:07:41.887532', 'step': 1351, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:41.919217', 'step': 1351, 'epoch': 3} {'type': 'loss', 'content': 0.0007719314889982343, 'timestamp': '2025-09-30 22:07:41.942902', 'step': 1352, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:41.975422', 'step': 1352, 'epoch': 3} {'type': 'loss', 'content': 0.0007648206083104014, 'timestamp': '2025-09-30 22:07:41.977609', 'step': 1353, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:42.010502', 'step': 1353, 'epoch': 3} {'type': 'loss', 'content': 0.0007910222630016506, 'timestamp': '2025-09-30 22:07:42.012591', 'step': 1354, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:42.043880', 'step': 1354, 'epoch': 3} {'type': 'loss', 'content': 0.0020354478619992733, 'timestamp': '2025-09-30 22:07:42.045704', 'step': 1355, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:42.077166', 'step': 1355, 'epoch': 3} {'type': 'loss', 'content': 0.004846019204705954, 'timestamp': '2025-09-30 22:07:42.102588', 'step': 1356, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:42.135357', 'step': 1356, 'epoch': 3} {'type': 'loss', 'content': 0.004001092631369829, 'timestamp': '2025-09-30 22:07:42.137610', 'step': 1357, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:42.176312', 'step': 1357, 'epoch': 3} {'type': 'loss', 'content': 0.0003547620144672692, 'timestamp': '2025-09-30 22:07:42.178435', 'step': 1358, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:42.211823', 'step': 1358, 'epoch': 3} {'type': 'loss', 'content': 0.001670529949478805, 'timestamp': '2025-09-30 22:07:42.216033', 'step': 1359, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:42.248214', 'step': 1359, 'epoch': 3} {'type': 'loss', 'content': 0.020056426525115967, 'timestamp': '2025-09-30 22:07:42.276989', 'step': 1360, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:42.309095', 'step': 1360, 'epoch': 3} {'type': 'loss', 'content': 0.0006083712796680629, 'timestamp': '2025-09-30 22:07:42.311340', 'step': 1361, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:42.344216', 'step': 1361, 'epoch': 3} {'type': 'loss', 'content': 0.0007033172878436744, 'timestamp': '2025-09-30 22:07:42.348786', 'step': 1362, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:42.381169', 'step': 1362, 'epoch': 3} {'type': 'loss', 'content': 0.0025410365778952837, 'timestamp': '2025-09-30 22:07:42.383295', 'step': 1363, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:42.418777', 'step': 1363, 'epoch': 3} {'type': 'loss', 'content': 0.0006478140712715685, 'timestamp': '2025-09-30 22:07:42.446782', 'step': 1364, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:42.478821', 'step': 1364, 'epoch': 3} {'type': 'loss', 'content': 0.000585120462346822, 'timestamp': '2025-09-30 22:07:42.480984', 'step': 1365, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:43.129212', 'step': 1365, 'epoch': 3} {'type': 'pplx', 'content': 109053889.68612264, 'timestamp': '2025-09-30 22:07:43.130865', 'step': 1365, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:07:43.161899', 'step': 1365, 'epoch': 3} {'type': 'loss', 'content': 0.0001839351753005758, 'timestamp': '2025-09-30 22:07:43.172005', 'step': 1366, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:43.206699', 'step': 1366, 'epoch': 3} {'type': 'loss', 'content': 0.0008755376911722124, 'timestamp': '2025-09-30 22:07:43.211246', 'step': 1367, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:43.245998', 'step': 1367, 'epoch': 3} {'type': 'loss', 'content': 0.0005775559693574905, 'timestamp': '2025-09-30 22:07:43.269389', 'step': 1368, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:43.302158', 'step': 1368, 'epoch': 3} {'type': 'loss', 'content': 0.0018166237277910113, 'timestamp': '2025-09-30 22:07:43.304484', 'step': 1369, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:43.337709', 'step': 1369, 'epoch': 3} {'type': 'loss', 'content': 0.0011054730275645852, 'timestamp': '2025-09-30 22:07:43.344899', 'step': 1370, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:43.379219', 'step': 1370, 'epoch': 3} {'type': 'loss', 'content': 0.002334549557417631, 'timestamp': '2025-09-30 22:07:43.382150', 'step': 1371, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:43.415628', 'step': 1371, 'epoch': 3} {'type': 'loss', 'content': 0.000506529351696372, 'timestamp': '2025-09-30 22:07:43.441137', 'step': 1372, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:43.474812', 'step': 1372, 'epoch': 3} {'type': 'loss', 'content': 0.00016645164578221738, 'timestamp': '2025-09-30 22:07:43.477135', 'step': 1373, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:43.512473', 'step': 1373, 'epoch': 3} {'type': 'loss', 'content': 0.0001811900729080662, 'timestamp': '2025-09-30 22:07:43.514980', 'step': 1374, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:43.546709', 'step': 1374, 'epoch': 3} {'type': 'loss', 'content': 0.001400345703586936, 'timestamp': '2025-09-30 22:07:43.551305', 'step': 1375, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:07:43.583104', 'step': 1375, 'epoch': 3} {'type': 'loss', 'content': 0.00017988981562666595, 'timestamp': '2025-09-30 22:07:43.614522', 'step': 1376, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:43.646821', 'step': 1376, 'epoch': 3} {'type': 'loss', 'content': 0.000209273915970698, 'timestamp': '2025-09-30 22:07:43.649104', 'step': 1377, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:43.682137', 'step': 1377, 'epoch': 3} {'type': 'loss', 'content': 0.0006141887861303985, 'timestamp': '2025-09-30 22:07:43.684330', 'step': 1378, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:43.717491', 'step': 1378, 'epoch': 3} {'type': 'loss', 'content': 0.0010347587522119284, 'timestamp': '2025-09-30 22:07:43.721848', 'step': 1379, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:43.754013', 'step': 1379, 'epoch': 3} {'type': 'loss', 'content': 0.00014966915477998555, 'timestamp': '2025-09-30 22:07:43.782992', 'step': 1380, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:43.816666', 'step': 1380, 'epoch': 3} {'type': 'loss', 'content': 0.014756555669009686, 'timestamp': '2025-09-30 22:07:43.821789', 'step': 1381, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:43.854140', 'step': 1381, 'epoch': 3} {'type': 'loss', 'content': 0.00017656719137448817, 'timestamp': '2025-09-30 22:07:43.858815', 'step': 1382, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:43.891147', 'step': 1382, 'epoch': 3} {'type': 'loss', 'content': 0.011107811704277992, 'timestamp': '2025-09-30 22:07:43.893491', 'step': 1383, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:43.925131', 'step': 1383, 'epoch': 3} {'type': 'loss', 'content': 0.0002206839999416843, 'timestamp': '2025-09-30 22:07:43.949472', 'step': 1384, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:43.982366', 'step': 1384, 'epoch': 3} {'type': 'loss', 'content': 0.010964823886752129, 'timestamp': '2025-09-30 22:07:43.984532', 'step': 1385, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:44.017959', 'step': 1385, 'epoch': 3} {'type': 'loss', 'content': 7.031839777482674e-05, 'timestamp': '2025-09-30 22:07:44.020280', 'step': 1386, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:44.053815', 'step': 1386, 'epoch': 3} {'type': 'loss', 'content': 0.000803628470748663, 'timestamp': '2025-09-30 22:07:44.056177', 'step': 1387, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:44.096237', 'step': 1387, 'epoch': 3} {'type': 'loss', 'content': 0.0005003432161174715, 'timestamp': '2025-09-30 22:07:44.125286', 'step': 1388, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:44.164569', 'step': 1388, 'epoch': 3} {'type': 'loss', 'content': 0.0014421312371268868, 'timestamp': '2025-09-30 22:07:44.166908', 'step': 1389, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:44.198649', 'step': 1389, 'epoch': 3} {'type': 'loss', 'content': 0.0009203127701766789, 'timestamp': '2025-09-30 22:07:44.200711', 'step': 1390, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:44.233083', 'step': 1390, 'epoch': 3} {'type': 'loss', 'content': 0.001186627778224647, 'timestamp': '2025-09-30 22:07:44.235276', 'step': 1391, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:44.271707', 'step': 1391, 'epoch': 3} {'type': 'loss', 'content': 0.00014718440070282668, 'timestamp': '2025-09-30 22:07:44.295260', 'step': 1392, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:44.333181', 'step': 1392, 'epoch': 3} {'type': 'loss', 'content': 0.00036089521017856896, 'timestamp': '2025-09-30 22:07:44.335507', 'step': 1393, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:44.367482', 'step': 1393, 'epoch': 3} {'type': 'loss', 'content': 6.875755207147449e-05, 'timestamp': '2025-09-30 22:07:44.371873', 'step': 1394, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:44.404487', 'step': 1394, 'epoch': 3} {'type': 'loss', 'content': 0.0001918337366078049, 'timestamp': '2025-09-30 22:07:44.408881', 'step': 1395, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:44.450911', 'step': 1395, 'epoch': 3} {'type': 'loss', 'content': 0.00034902142942883074, 'timestamp': '2025-09-30 22:07:44.476425', 'step': 1396, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:44.523212', 'step': 1396, 'epoch': 3} {'type': 'loss', 'content': 0.0007774402620270848, 'timestamp': '2025-09-30 22:07:44.525276', 'step': 1397, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:44.560153', 'step': 1397, 'epoch': 3} {'type': 'loss', 'content': 0.0013828465016558766, 'timestamp': '2025-09-30 22:07:44.562494', 'step': 1398, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:44.595406', 'step': 1398, 'epoch': 3} {'type': 'loss', 'content': 0.015292760916054249, 'timestamp': '2025-09-30 22:07:44.598139', 'step': 1399, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:44.643739', 'step': 1399, 'epoch': 3} {'type': 'loss', 'content': 0.000216073400224559, 'timestamp': '2025-09-30 22:07:44.669046', 'step': 1400, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:44.704594', 'step': 1400, 'epoch': 3} {'type': 'loss', 'content': 0.0016167605062946677, 'timestamp': '2025-09-30 22:07:44.706908', 'step': 1401, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:44.750476', 'step': 1401, 'epoch': 3} {'type': 'loss', 'content': 7.860381447244436e-05, 'timestamp': '2025-09-30 22:07:44.757450', 'step': 1402, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:44.800280', 'step': 1402, 'epoch': 3} {'type': 'loss', 'content': 0.0003883039462380111, 'timestamp': '2025-09-30 22:07:44.802361', 'step': 1403, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:44.842323', 'step': 1403, 'epoch': 3} {'type': 'loss', 'content': 0.0006712899194099009, 'timestamp': '2025-09-30 22:07:44.868005', 'step': 1404, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:45.656702', 'step': 1404, 'epoch': 3} {'type': 'pplx', 'content': 111324626.72600262, 'timestamp': '2025-09-30 22:07:45.658624', 'step': 1404, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:45.688457', 'step': 1404, 'epoch': 3} {'type': 'loss', 'content': 0.0016097808256745338, 'timestamp': '2025-09-30 22:07:45.691034', 'step': 1405, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:45.728705', 'step': 1405, 'epoch': 3} {'type': 'loss', 'content': 0.00021898788691032678, 'timestamp': '2025-09-30 22:07:45.735867', 'step': 1406, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:45.770158', 'step': 1406, 'epoch': 3} {'type': 'loss', 'content': 0.0016417077276855707, 'timestamp': '2025-09-30 22:07:45.774112', 'step': 1407, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:45.808998', 'step': 1407, 'epoch': 3} {'type': 'loss', 'content': 0.00014548443141393363, 'timestamp': '2025-09-30 22:07:45.832929', 'step': 1408, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:45.868456', 'step': 1408, 'epoch': 3} {'type': 'loss', 'content': 0.002540131565183401, 'timestamp': '2025-09-30 22:07:45.873402', 'step': 1409, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:45.907205', 'step': 1409, 'epoch': 3} {'type': 'loss', 'content': 0.0002596612030174583, 'timestamp': '2025-09-30 22:07:45.911392', 'step': 1410, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:45.942423', 'step': 1410, 'epoch': 3} {'type': 'loss', 'content': 0.0001415082806488499, 'timestamp': '2025-09-30 22:07:45.944951', 'step': 1411, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:45.980839', 'step': 1411, 'epoch': 3} {'type': 'loss', 'content': 0.0008175332914106548, 'timestamp': '2025-09-30 22:07:46.005676', 'step': 1412, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:46.037564', 'step': 1412, 'epoch': 3} {'type': 'loss', 'content': 5.09199126099702e-05, 'timestamp': '2025-09-30 22:07:46.039734', 'step': 1413, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:46.072284', 'step': 1413, 'epoch': 3} {'type': 'loss', 'content': 0.056001435965299606, 'timestamp': '2025-09-30 22:07:46.075028', 'step': 1414, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:46.116722', 'step': 1414, 'epoch': 3} {'type': 'loss', 'content': 0.0008140717982314527, 'timestamp': '2025-09-30 22:07:46.124299', 'step': 1415, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:46.174037', 'step': 1415, 'epoch': 3} {'type': 'loss', 'content': 0.01871386542916298, 'timestamp': '2025-09-30 22:07:46.198975', 'step': 1416, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:46.239964', 'step': 1416, 'epoch': 3} {'type': 'loss', 'content': 0.0017616671975702047, 'timestamp': '2025-09-30 22:07:46.241845', 'step': 1417, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:46.282331', 'step': 1417, 'epoch': 3} {'type': 'loss', 'content': 5.8603611250873655e-05, 'timestamp': '2025-09-30 22:07:46.289616', 'step': 1418, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:46.327101', 'step': 1418, 'epoch': 3} {'type': 'loss', 'content': 0.00041110877646133304, 'timestamp': '2025-09-30 22:07:46.331307', 'step': 1419, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:46.369163', 'step': 1419, 'epoch': 3} {'type': 'loss', 'content': 8.381939551327378e-05, 'timestamp': '2025-09-30 22:07:46.392799', 'step': 1420, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:46.425008', 'step': 1420, 'epoch': 3} {'type': 'loss', 'content': 0.00018263938545715064, 'timestamp': '2025-09-30 22:07:46.427179', 'step': 1421, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:46.458869', 'step': 1421, 'epoch': 3} {'type': 'loss', 'content': 0.00024420252884738147, 'timestamp': '2025-09-30 22:07:46.461665', 'step': 1422, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:46.493460', 'step': 1422, 'epoch': 3} {'type': 'loss', 'content': 0.0002778305788524449, 'timestamp': '2025-09-30 22:07:46.499142', 'step': 1423, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:46.532510', 'step': 1423, 'epoch': 3} {'type': 'loss', 'content': 6.686317647108808e-05, 'timestamp': '2025-09-30 22:07:46.556191', 'step': 1424, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:46.590168', 'step': 1424, 'epoch': 3} {'type': 'loss', 'content': 0.00014924805145710707, 'timestamp': '2025-09-30 22:07:46.592375', 'step': 1425, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:46.623259', 'step': 1425, 'epoch': 3} {'type': 'loss', 'content': 0.00018138554878532887, 'timestamp': '2025-09-30 22:07:46.625203', 'step': 1426, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:46.660949', 'step': 1426, 'epoch': 3} {'type': 'loss', 'content': 0.0007415400468744338, 'timestamp': '2025-09-30 22:07:46.665233', 'step': 1427, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:46.695568', 'step': 1427, 'epoch': 3} {'type': 'loss', 'content': 0.005260678939521313, 'timestamp': '2025-09-30 22:07:46.724287', 'step': 1428, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:46.756642', 'step': 1428, 'epoch': 3} {'type': 'loss', 'content': 0.00022222170082386583, 'timestamp': '2025-09-30 22:07:46.759042', 'step': 1429, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:46.794819', 'step': 1429, 'epoch': 3} {'type': 'loss', 'content': 0.000168702652445063, 'timestamp': '2025-09-30 22:07:46.797581', 'step': 1430, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:46.841723', 'step': 1430, 'epoch': 3} {'type': 'loss', 'content': 0.0005504986620508134, 'timestamp': '2025-09-30 22:07:46.843896', 'step': 1431, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:46.878320', 'step': 1431, 'epoch': 3} {'type': 'loss', 'content': 0.0018384003778919578, 'timestamp': '2025-09-30 22:07:46.903319', 'step': 1432, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:46.936905', 'step': 1432, 'epoch': 3} {'type': 'loss', 'content': 0.007305443286895752, 'timestamp': '2025-09-30 22:07:46.940047', 'step': 1433, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:46.974532', 'step': 1433, 'epoch': 3} {'type': 'loss', 'content': 0.00016536428302060813, 'timestamp': '2025-09-30 22:07:46.981490', 'step': 1434, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:47.014908', 'step': 1434, 'epoch': 3} {'type': 'loss', 'content': 0.00028782375738956034, 'timestamp': '2025-09-30 22:07:47.019297', 'step': 1435, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:47.052292', 'step': 1435, 'epoch': 3} {'type': 'loss', 'content': 0.00013649475295096636, 'timestamp': '2025-09-30 22:07:47.077558', 'step': 1436, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:47.109343', 'step': 1436, 'epoch': 3} {'type': 'loss', 'content': 0.00019715628877747804, 'timestamp': '2025-09-30 22:07:47.111552', 'step': 1437, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:47.142209', 'step': 1437, 'epoch': 3} {'type': 'loss', 'content': 0.001548089669086039, 'timestamp': '2025-09-30 22:07:47.144674', 'step': 1438, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:47.176869', 'step': 1438, 'epoch': 3} {'type': 'loss', 'content': 0.00021238917543087155, 'timestamp': '2025-09-30 22:07:47.179722', 'step': 1439, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:47.210562', 'step': 1439, 'epoch': 3} {'type': 'loss', 'content': 0.0016702745342627168, 'timestamp': '2025-09-30 22:07:47.235882', 'step': 1440, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:47.267065', 'step': 1440, 'epoch': 3} {'type': 'loss', 'content': 7.280301360879093e-05, 'timestamp': '2025-09-30 22:07:47.270100', 'step': 1441, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:47.302565', 'step': 1441, 'epoch': 3} {'type': 'loss', 'content': 0.011224872432649136, 'timestamp': '2025-09-30 22:07:47.307249', 'step': 1442, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:47.340103', 'step': 1442, 'epoch': 3} {'type': 'loss', 'content': 0.0005737360916100442, 'timestamp': '2025-09-30 22:07:47.344784', 'step': 1443, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:48.038441', 'step': 1443, 'epoch': 3} {'type': 'pplx', 'content': 110657800.76337399, 'timestamp': '2025-09-30 22:07:48.040960', 'step': 1443, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:48.073885', 'step': 1443, 'epoch': 3} {'type': 'loss', 'content': 0.00037595274625346065, 'timestamp': '2025-09-30 22:07:48.097896', 'step': 1444, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:48.131159', 'step': 1444, 'epoch': 3} {'type': 'loss', 'content': 5.4645923228235915e-05, 'timestamp': '2025-09-30 22:07:48.136051', 'step': 1445, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:48.168214', 'step': 1445, 'epoch': 3} {'type': 'loss', 'content': 0.00010486804967513308, 'timestamp': '2025-09-30 22:07:48.175111', 'step': 1446, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:48.210247', 'step': 1446, 'epoch': 3} {'type': 'loss', 'content': 0.0005528793553821743, 'timestamp': '2025-09-30 22:07:48.217190', 'step': 1447, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:48.253649', 'step': 1447, 'epoch': 3} {'type': 'loss', 'content': 0.00015464976604562253, 'timestamp': '2025-09-30 22:07:48.278708', 'step': 1448, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:48.314094', 'step': 1448, 'epoch': 3} {'type': 'loss', 'content': 0.0005388990975916386, 'timestamp': '2025-09-30 22:07:48.316676', 'step': 1449, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:48.350531', 'step': 1449, 'epoch': 3} {'type': 'loss', 'content': 0.04156193509697914, 'timestamp': '2025-09-30 22:07:48.353251', 'step': 1450, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:48.394087', 'step': 1450, 'epoch': 3} {'type': 'loss', 'content': 0.0002184375043725595, 'timestamp': '2025-09-30 22:07:48.396812', 'step': 1451, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:48.430159', 'step': 1451, 'epoch': 3} {'type': 'loss', 'content': 0.0006777126109227538, 'timestamp': '2025-09-30 22:07:48.454481', 'step': 1452, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:48.491447', 'step': 1452, 'epoch': 3} {'type': 'loss', 'content': 0.01857731305062771, 'timestamp': '2025-09-30 22:07:48.493951', 'step': 1453, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:48.529874', 'step': 1453, 'epoch': 3} {'type': 'loss', 'content': 0.006827401462942362, 'timestamp': '2025-09-30 22:07:48.536715', 'step': 1454, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:48.572067', 'step': 1454, 'epoch': 3} {'type': 'loss', 'content': 0.000535592611413449, 'timestamp': '2025-09-30 22:07:48.574354', 'step': 1455, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:48.605918', 'step': 1455, 'epoch': 3} {'type': 'loss', 'content': 0.0012004311429336667, 'timestamp': '2025-09-30 22:07:48.629656', 'step': 1456, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:48.663068', 'step': 1456, 'epoch': 3} {'type': 'loss', 'content': 0.0042284573428332806, 'timestamp': '2025-09-30 22:07:48.667190', 'step': 1457, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:48.700364', 'step': 1457, 'epoch': 3} {'type': 'loss', 'content': 0.0034628177527338266, 'timestamp': '2025-09-30 22:07:48.703026', 'step': 1458, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:48.742729', 'step': 1458, 'epoch': 3} {'type': 'loss', 'content': 0.00019150369917042553, 'timestamp': '2025-09-30 22:07:48.747333', 'step': 1459, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:48.781159', 'step': 1459, 'epoch': 3} {'type': 'loss', 'content': 0.013045864179730415, 'timestamp': '2025-09-30 22:07:48.806102', 'step': 1460, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:48.837908', 'step': 1460, 'epoch': 3} {'type': 'loss', 'content': 0.00012074068217771128, 'timestamp': '2025-09-30 22:07:48.840251', 'step': 1461, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:48.877361', 'step': 1461, 'epoch': 3} {'type': 'loss', 'content': 8.07570613687858e-05, 'timestamp': '2025-09-30 22:07:48.880079', 'step': 1462, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:48.910600', 'step': 1462, 'epoch': 3} {'type': 'loss', 'content': 0.00021056282275822014, 'timestamp': '2025-09-30 22:07:48.912779', 'step': 1463, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:48.946052', 'step': 1463, 'epoch': 3} {'type': 'loss', 'content': 0.006550428923219442, 'timestamp': '2025-09-30 22:07:48.975086', 'step': 1464, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:49.006721', 'step': 1464, 'epoch': 3} {'type': 'loss', 'content': 0.000441436015535146, 'timestamp': '2025-09-30 22:07:49.009015', 'step': 1465, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:49.040591', 'step': 1465, 'epoch': 3} {'type': 'loss', 'content': 0.0004042996733915061, 'timestamp': '2025-09-30 22:07:49.048783', 'step': 1466, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:49.080751', 'step': 1466, 'epoch': 3} {'type': 'loss', 'content': 7.870148692745715e-05, 'timestamp': '2025-09-30 22:07:49.085230', 'step': 1467, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:07:49.118125', 'step': 1467, 'epoch': 3} {'type': 'loss', 'content': 0.005421972833573818, 'timestamp': '2025-09-30 22:07:49.149473', 'step': 1468, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:49.185397', 'step': 1468, 'epoch': 3} {'type': 'loss', 'content': 0.0005969242192804813, 'timestamp': '2025-09-30 22:07:49.190180', 'step': 1469, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:49.223876', 'step': 1469, 'epoch': 3} {'type': 'loss', 'content': 0.0004208089376334101, 'timestamp': '2025-09-30 22:07:49.230889', 'step': 1470, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:49.263973', 'step': 1470, 'epoch': 3} {'type': 'loss', 'content': 0.0075470260344445705, 'timestamp': '2025-09-30 22:07:49.266058', 'step': 1471, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:49.300310', 'step': 1471, 'epoch': 3} {'type': 'loss', 'content': 0.0005702655180357397, 'timestamp': '2025-09-30 22:07:49.325800', 'step': 1472, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:49.368781', 'step': 1472, 'epoch': 3} {'type': 'loss', 'content': 0.00041888977284543216, 'timestamp': '2025-09-30 22:07:49.371029', 'step': 1473, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:49.402474', 'step': 1473, 'epoch': 3} {'type': 'loss', 'content': 8.177950803656131e-05, 'timestamp': '2025-09-30 22:07:49.404944', 'step': 1474, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:49.437614', 'step': 1474, 'epoch': 3} {'type': 'loss', 'content': 7.957070920383558e-05, 'timestamp': '2025-09-30 22:07:49.444752', 'step': 1475, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:49.477866', 'step': 1475, 'epoch': 3} {'type': 'loss', 'content': 0.02284647338092327, 'timestamp': '2025-09-30 22:07:49.501905', 'step': 1476, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:49.538854', 'step': 1476, 'epoch': 3} {'type': 'loss', 'content': 0.006310875993221998, 'timestamp': '2025-09-30 22:07:49.540904', 'step': 1477, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:49.575727', 'step': 1477, 'epoch': 3} {'type': 'loss', 'content': 0.00017135801317635924, 'timestamp': '2025-09-30 22:07:49.583040', 'step': 1478, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:49.619405', 'step': 1478, 'epoch': 3} {'type': 'loss', 'content': 0.007718032691627741, 'timestamp': '2025-09-30 22:07:49.626760', 'step': 1479, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:49.663438', 'step': 1479, 'epoch': 3} {'type': 'loss', 'content': 0.00022026248916517943, 'timestamp': '2025-09-30 22:07:49.687202', 'step': 1480, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:49.718874', 'step': 1480, 'epoch': 3} {'type': 'loss', 'content': 0.0012717212084680796, 'timestamp': '2025-09-30 22:07:49.720862', 'step': 1481, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:49.752263', 'step': 1481, 'epoch': 3} {'type': 'loss', 'content': 0.001083889976143837, 'timestamp': '2025-09-30 22:07:49.760158', 'step': 1482, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:50.388619', 'step': 1482, 'epoch': 3} {'type': 'pplx', 'content': 109402435.07773964, 'timestamp': '2025-09-30 22:07:50.390663', 'step': 1482, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:50.421017', 'step': 1482, 'epoch': 3} {'type': 'loss', 'content': 0.0057127587497234344, 'timestamp': '2025-09-30 22:07:50.425123', 'step': 1483, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:50.457499', 'step': 1483, 'epoch': 3} {'type': 'loss', 'content': 0.0002413246693322435, 'timestamp': '2025-09-30 22:07:50.483002', 'step': 1484, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:50.520018', 'step': 1484, 'epoch': 3} {'type': 'loss', 'content': 0.016218166798353195, 'timestamp': '2025-09-30 22:07:50.522073', 'step': 1485, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:50.563361', 'step': 1485, 'epoch': 3} {'type': 'loss', 'content': 0.00025467827799730003, 'timestamp': '2025-09-30 22:07:50.567887', 'step': 1486, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:50.599267', 'step': 1486, 'epoch': 3} {'type': 'loss', 'content': 0.0005150886718183756, 'timestamp': '2025-09-30 22:07:50.606375', 'step': 1487, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:50.637765', 'step': 1487, 'epoch': 3} {'type': 'loss', 'content': 0.0004436791059561074, 'timestamp': '2025-09-30 22:07:50.663012', 'step': 1488, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:50.694451', 'step': 1488, 'epoch': 3} {'type': 'loss', 'content': 0.006838344968855381, 'timestamp': '2025-09-30 22:07:50.698960', 'step': 1489, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:50.730865', 'step': 1489, 'epoch': 3} {'type': 'loss', 'content': 0.015281050466001034, 'timestamp': '2025-09-30 22:07:50.735240', 'step': 1490, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:50.766604', 'step': 1490, 'epoch': 3} {'type': 'loss', 'content': 0.0014932630583643913, 'timestamp': '2025-09-30 22:07:50.773774', 'step': 1491, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:50.805710', 'step': 1491, 'epoch': 3} {'type': 'loss', 'content': 0.0003391748759895563, 'timestamp': '2025-09-30 22:07:50.834430', 'step': 1492, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:50.865966', 'step': 1492, 'epoch': 3} {'type': 'loss', 'content': 0.00013037241296842694, 'timestamp': '2025-09-30 22:07:50.868409', 'step': 1493, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:50.899827', 'step': 1493, 'epoch': 3} {'type': 'loss', 'content': 0.0005963169387541711, 'timestamp': '2025-09-30 22:07:50.903989', 'step': 1494, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:50.934506', 'step': 1494, 'epoch': 3} {'type': 'loss', 'content': 5.436746869236231e-05, 'timestamp': '2025-09-30 22:07:50.941603', 'step': 1495, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:50.973050', 'step': 1495, 'epoch': 3} {'type': 'loss', 'content': 0.00010126921552000567, 'timestamp': '2025-09-30 22:07:50.996877', 'step': 1496, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:51.028055', 'step': 1496, 'epoch': 3} {'type': 'loss', 'content': 0.0009686009143479168, 'timestamp': '2025-09-30 22:07:51.030132', 'step': 1497, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:51.061647', 'step': 1497, 'epoch': 3} {'type': 'loss', 'content': 0.0005619805306196213, 'timestamp': '2025-09-30 22:07:51.064170', 'step': 1498, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:51.095934', 'step': 1498, 'epoch': 3} {'type': 'loss', 'content': 0.0003612506261561066, 'timestamp': '2025-09-30 22:07:51.098090', 'step': 1499, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:51.129365', 'step': 1499, 'epoch': 3} {'type': 'loss', 'content': 0.008169925771653652, 'timestamp': '2025-09-30 22:07:51.153105', 'step': 1500, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 1500', 'timestamp': '2025-09-30 22:07:56.007630', 'step': 1500, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:56.056192', 'step': 1500, 'epoch': 3} {'type': 'loss', 'content': 0.011021244339644909, 'timestamp': '2025-09-30 22:07:56.066412', 'step': 1501, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:56.099565', 'step': 1501, 'epoch': 3} {'type': 'loss', 'content': 0.00015265133697539568, 'timestamp': '2025-09-30 22:07:56.105586', 'step': 1502, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:56.143258', 'step': 1502, 'epoch': 3} {'type': 'loss', 'content': 0.005286376923322678, 'timestamp': '2025-09-30 22:07:56.150186', 'step': 1503, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:56.189838', 'step': 1503, 'epoch': 3} {'type': 'loss', 'content': 0.00011992856889264658, 'timestamp': '2025-09-30 22:07:56.219725', 'step': 1504, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:56.263434', 'step': 1504, 'epoch': 3} {'type': 'loss', 'content': 0.006383276078850031, 'timestamp': '2025-09-30 22:07:56.266521', 'step': 1505, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:56.306137', 'step': 1505, 'epoch': 3} {'type': 'loss', 'content': 0.0003791770723182708, 'timestamp': '2025-09-30 22:07:56.314141', 'step': 1506, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:56.364410', 'step': 1506, 'epoch': 3} {'type': 'loss', 'content': 0.0002968795888591558, 'timestamp': '2025-09-30 22:07:56.369283', 'step': 1507, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:56.417705', 'step': 1507, 'epoch': 3} {'type': 'loss', 'content': 0.0003191308060195297, 'timestamp': '2025-09-30 22:07:56.442819', 'step': 1508, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:56.476820', 'step': 1508, 'epoch': 3} {'type': 'loss', 'content': 0.0005938306567259133, 'timestamp': '2025-09-30 22:07:56.480008', 'step': 1509, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:56.521308', 'step': 1509, 'epoch': 3} {'type': 'loss', 'content': 0.0016930060228332877, 'timestamp': '2025-09-30 22:07:56.526990', 'step': 1510, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:56.561894', 'step': 1510, 'epoch': 3} {'type': 'loss', 'content': 0.0003255842602811754, 'timestamp': '2025-09-30 22:07:56.572108', 'step': 1511, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:56.605706', 'step': 1511, 'epoch': 3} {'type': 'loss', 'content': 0.000320999271934852, 'timestamp': '2025-09-30 22:07:56.629936', 'step': 1512, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:56.662094', 'step': 1512, 'epoch': 3} {'type': 'loss', 'content': 0.00016117122140713036, 'timestamp': '2025-09-30 22:07:56.668838', 'step': 1513, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:56.707464', 'step': 1513, 'epoch': 3} {'type': 'loss', 'content': 0.0009694714681245387, 'timestamp': '2025-09-30 22:07:56.714135', 'step': 1514, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:56.753636', 'step': 1514, 'epoch': 3} {'type': 'loss', 'content': 0.0008338862680830061, 'timestamp': '2025-09-30 22:07:56.758078', 'step': 1515, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:56.799819', 'step': 1515, 'epoch': 3} {'type': 'loss', 'content': 8.57623599586077e-05, 'timestamp': '2025-09-30 22:07:56.826578', 'step': 1516, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:56.861369', 'step': 1516, 'epoch': 3} {'type': 'loss', 'content': 0.0021684980019927025, 'timestamp': '2025-09-30 22:07:56.873671', 'step': 1517, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:56.913424', 'step': 1517, 'epoch': 3} {'type': 'loss', 'content': 0.00260011269710958, 'timestamp': '2025-09-30 22:07:56.920385', 'step': 1518, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:56.959620', 'step': 1518, 'epoch': 3} {'type': 'loss', 'content': 0.0006647937698289752, 'timestamp': '2025-09-30 22:07:56.967446', 'step': 1519, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:56.999268', 'step': 1519, 'epoch': 3} {'type': 'loss', 'content': 0.0001569747255416587, 'timestamp': '2025-09-30 22:07:57.030104', 'step': 1520, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:57.078530', 'step': 1520, 'epoch': 3} {'type': 'loss', 'content': 0.0003687713178806007, 'timestamp': '2025-09-30 22:07:57.087089', 'step': 1521, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:07:57.765660', 'step': 1521, 'epoch': 3} {'type': 'pplx', 'content': 108316397.555252, 'timestamp': '2025-09-30 22:07:57.773309', 'step': 1521, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:07:57.809493', 'step': 1521, 'epoch': 3} {'type': 'loss', 'content': 7.841020851628855e-05, 'timestamp': '2025-09-30 22:07:57.819649', 'step': 1522, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:57.856081', 'step': 1522, 'epoch': 3} {'type': 'loss', 'content': 0.00047487596748396754, 'timestamp': '2025-09-30 22:07:57.863331', 'step': 1523, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:07:57.903662', 'step': 1523, 'epoch': 3} {'type': 'loss', 'content': 0.0012490659719333053, 'timestamp': '2025-09-30 22:07:57.932164', 'step': 1524, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:57.967105', 'step': 1524, 'epoch': 3} {'type': 'loss', 'content': 0.0005907863960601389, 'timestamp': '2025-09-30 22:07:57.972722', 'step': 1525, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:58.010496', 'step': 1525, 'epoch': 3} {'type': 'loss', 'content': 0.0002828697324730456, 'timestamp': '2025-09-30 22:07:58.014933', 'step': 1526, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:58.047621', 'step': 1526, 'epoch': 3} {'type': 'loss', 'content': 0.0006750501925125718, 'timestamp': '2025-09-30 22:07:58.053908', 'step': 1527, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:58.098564', 'step': 1527, 'epoch': 3} {'type': 'loss', 'content': 0.0007431631092913449, 'timestamp': '2025-09-30 22:07:58.128539', 'step': 1528, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:07:58.168866', 'step': 1528, 'epoch': 3} {'type': 'loss', 'content': 0.00021667192049790174, 'timestamp': '2025-09-30 22:07:58.176704', 'step': 1529, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:07:58.212352', 'step': 1529, 'epoch': 3} {'type': 'loss', 'content': 0.0002985773899126798, 'timestamp': '2025-09-30 22:07:58.224713', 'step': 1530, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:58.262613', 'step': 1530, 'epoch': 3} {'type': 'loss', 'content': 0.00027525509358383715, 'timestamp': '2025-09-30 22:07:58.269304', 'step': 1531, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:58.305293', 'step': 1531, 'epoch': 3} {'type': 'loss', 'content': 0.0006788480677641928, 'timestamp': '2025-09-30 22:07:58.334872', 'step': 1532, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:58.374429', 'step': 1532, 'epoch': 3} {'type': 'loss', 'content': 0.00018581384210847318, 'timestamp': '2025-09-30 22:07:58.381574', 'step': 1533, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:58.419942', 'step': 1533, 'epoch': 3} {'type': 'loss', 'content': 0.00016812498506624252, 'timestamp': '2025-09-30 22:07:58.432510', 'step': 1534, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:58.471750', 'step': 1534, 'epoch': 3} {'type': 'loss', 'content': 0.0006146311643533409, 'timestamp': '2025-09-30 22:07:58.476170', 'step': 1535, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:58.523246', 'step': 1535, 'epoch': 3} {'type': 'loss', 'content': 0.00039723937516100705, 'timestamp': '2025-09-30 22:07:58.553914', 'step': 1536, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:58.586459', 'step': 1536, 'epoch': 3} {'type': 'loss', 'content': 0.001720373285934329, 'timestamp': '2025-09-30 22:07:58.592902', 'step': 1537, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:58.628973', 'step': 1537, 'epoch': 3} {'type': 'loss', 'content': 0.0002852992038242519, 'timestamp': '2025-09-30 22:07:58.633065', 'step': 1538, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:58.674476', 'step': 1538, 'epoch': 3} {'type': 'loss', 'content': 0.0007187298615463078, 'timestamp': '2025-09-30 22:07:58.681138', 'step': 1539, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:58.720500', 'step': 1539, 'epoch': 3} {'type': 'loss', 'content': 0.0001630386832403019, 'timestamp': '2025-09-30 22:07:58.744828', 'step': 1540, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:58.777185', 'step': 1540, 'epoch': 3} {'type': 'loss', 'content': 0.00010030083649326116, 'timestamp': '2025-09-30 22:07:58.781824', 'step': 1541, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:07:58.827357', 'step': 1541, 'epoch': 3} {'type': 'loss', 'content': 0.0005012182518839836, 'timestamp': '2025-09-30 22:07:58.829764', 'step': 1542, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:58.862190', 'step': 1542, 'epoch': 3} {'type': 'loss', 'content': 0.00025205491692759097, 'timestamp': '2025-09-30 22:07:58.864801', 'step': 1543, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:58.913324', 'step': 1543, 'epoch': 3} {'type': 'loss', 'content': 0.00020313379354774952, 'timestamp': '2025-09-30 22:07:58.940418', 'step': 1544, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:58.981896', 'step': 1544, 'epoch': 3} {'type': 'loss', 'content': 4.318432183936238e-05, 'timestamp': '2025-09-30 22:07:58.986189', 'step': 1545, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:07:59.029376', 'step': 1545, 'epoch': 3} {'type': 'loss', 'content': 0.00014498786185868084, 'timestamp': '2025-09-30 22:07:59.036725', 'step': 1546, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:59.074469', 'step': 1546, 'epoch': 3} {'type': 'loss', 'content': 0.0003494999255053699, 'timestamp': '2025-09-30 22:07:59.079065', 'step': 1547, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:59.111455', 'step': 1547, 'epoch': 3} {'type': 'loss', 'content': 0.0018302559619769454, 'timestamp': '2025-09-30 22:07:59.138392', 'step': 1548, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:07:59.171635', 'step': 1548, 'epoch': 3} {'type': 'loss', 'content': 0.01349946204572916, 'timestamp': '2025-09-30 22:07:59.175050', 'step': 1549, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:07:59.219607', 'step': 1549, 'epoch': 3} {'type': 'loss', 'content': 0.00011048569285776466, 'timestamp': '2025-09-30 22:07:59.229055', 'step': 1550, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:59.275463', 'step': 1550, 'epoch': 3} {'type': 'loss', 'content': 0.00011431624443503097, 'timestamp': '2025-09-30 22:07:59.279372', 'step': 1551, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:59.321139', 'step': 1551, 'epoch': 3} {'type': 'loss', 'content': 6.0733193095074967e-05, 'timestamp': '2025-09-30 22:07:59.349157', 'step': 1552, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:59.390225', 'step': 1552, 'epoch': 3} {'type': 'loss', 'content': 0.0003296414215583354, 'timestamp': '2025-09-30 22:07:59.394857', 'step': 1553, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:59.427894', 'step': 1553, 'epoch': 3} {'type': 'loss', 'content': 0.000213971987250261, 'timestamp': '2025-09-30 22:07:59.436213', 'step': 1554, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:07:59.483135', 'step': 1554, 'epoch': 3} {'type': 'loss', 'content': 0.0001407633681083098, 'timestamp': '2025-09-30 22:07:59.488822', 'step': 1555, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:07:59.533912', 'step': 1555, 'epoch': 3} {'type': 'loss', 'content': 0.0003707043069880456, 'timestamp': '2025-09-30 22:07:59.561812', 'step': 1556, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:07:59.604114', 'step': 1556, 'epoch': 3} {'type': 'loss', 'content': 0.0002882958797272295, 'timestamp': '2025-09-30 22:07:59.610152', 'step': 1557, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:59.653434', 'step': 1557, 'epoch': 3} {'type': 'loss', 'content': 0.004305084235966206, 'timestamp': '2025-09-30 22:07:59.660444', 'step': 1558, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:07:59.698055', 'step': 1558, 'epoch': 3} {'type': 'loss', 'content': 0.0006565157091245055, 'timestamp': '2025-09-30 22:07:59.705079', 'step': 1559, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:07:59.738815', 'step': 1559, 'epoch': 3} {'type': 'loss', 'content': 0.00025418223231099546, 'timestamp': '2025-09-30 22:07:59.766899', 'step': 1560, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:08:00.508526', 'step': 1560, 'epoch': 3} {'type': 'pplx', 'content': 112198427.95504099, 'timestamp': '2025-09-30 22:08:00.510974', 'step': 1560, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:00.540871', 'step': 1560, 'epoch': 3} {'type': 'loss', 'content': 0.0014659542357549071, 'timestamp': '2025-09-30 22:08:00.543889', 'step': 1561, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:00.578213', 'step': 1561, 'epoch': 3} {'type': 'loss', 'content': 0.001355137093923986, 'timestamp': '2025-09-30 22:08:00.582836', 'step': 1562, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:00.615430', 'step': 1562, 'epoch': 3} {'type': 'loss', 'content': 0.0008022190886549652, 'timestamp': '2025-09-30 22:08:00.617561', 'step': 1563, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:00.652645', 'step': 1563, 'epoch': 3} {'type': 'loss', 'content': 0.05110324174165726, 'timestamp': '2025-09-30 22:08:00.677735', 'step': 1564, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:00.711624', 'step': 1564, 'epoch': 3} {'type': 'loss', 'content': 0.06121738627552986, 'timestamp': '2025-09-30 22:08:00.714775', 'step': 1565, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:00.763825', 'step': 1565, 'epoch': 3} {'type': 'loss', 'content': 0.0002549318887759, 'timestamp': '2025-09-30 22:08:00.765929', 'step': 1566, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:00.809761', 'step': 1566, 'epoch': 3} {'type': 'loss', 'content': 0.0022373481187969446, 'timestamp': '2025-09-30 22:08:00.812029', 'step': 1567, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:00.854177', 'step': 1567, 'epoch': 3} {'type': 'loss', 'content': 0.00020157112157903612, 'timestamp': '2025-09-30 22:08:00.882886', 'step': 1568, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:00.920926', 'step': 1568, 'epoch': 3} {'type': 'loss', 'content': 0.004094513598829508, 'timestamp': '2025-09-30 22:08:00.923616', 'step': 1569, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:00.965875', 'step': 1569, 'epoch': 3} {'type': 'loss', 'content': 0.00041190601768903434, 'timestamp': '2025-09-30 22:08:00.970108', 'step': 1570, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:01.011757', 'step': 1570, 'epoch': 3} {'type': 'loss', 'content': 6.220638169907033e-05, 'timestamp': '2025-09-30 22:08:01.016471', 'step': 1571, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:01.053329', 'step': 1571, 'epoch': 3} {'type': 'loss', 'content': 3.0949806387070566e-05, 'timestamp': '2025-09-30 22:08:01.078440', 'step': 1572, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:01.120823', 'step': 1572, 'epoch': 3} {'type': 'loss', 'content': 7.183482375694439e-05, 'timestamp': '2025-09-30 22:08:01.123523', 'step': 1573, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:01.160656', 'step': 1573, 'epoch': 3} {'type': 'loss', 'content': 6.708560977131128e-05, 'timestamp': '2025-09-30 22:08:01.163300', 'step': 1574, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:01.202125', 'step': 1574, 'epoch': 3} {'type': 'loss', 'content': 6.0947448218939826e-05, 'timestamp': '2025-09-30 22:08:01.209942', 'step': 1575, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:01.244074', 'step': 1575, 'epoch': 3} {'type': 'loss', 'content': 0.00033145371708087623, 'timestamp': '2025-09-30 22:08:01.270433', 'step': 1576, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:01.308732', 'step': 1576, 'epoch': 3} {'type': 'loss', 'content': 0.00012974298442713916, 'timestamp': '2025-09-30 22:08:01.320260', 'step': 1577, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:01.361271', 'step': 1577, 'epoch': 3} {'type': 'loss', 'content': 0.001361784408800304, 'timestamp': '2025-09-30 22:08:01.363606', 'step': 1578, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:01.404572', 'step': 1578, 'epoch': 3} {'type': 'loss', 'content': 7.593195186927915e-05, 'timestamp': '2025-09-30 22:08:01.413470', 'step': 1579, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:01.448140', 'step': 1579, 'epoch': 3} {'type': 'loss', 'content': 0.0013231473276391625, 'timestamp': '2025-09-30 22:08:01.471885', 'step': 1580, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:01.503369', 'step': 1580, 'epoch': 3} {'type': 'loss', 'content': 0.0015925122424960136, 'timestamp': '2025-09-30 22:08:01.507497', 'step': 1581, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:01.538654', 'step': 1581, 'epoch': 3} {'type': 'loss', 'content': 0.0001307614438701421, 'timestamp': '2025-09-30 22:08:01.541790', 'step': 1582, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:01.576886', 'step': 1582, 'epoch': 3} {'type': 'loss', 'content': 0.0006219060160219669, 'timestamp': '2025-09-30 22:08:01.581903', 'step': 1583, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:01.615786', 'step': 1583, 'epoch': 3} {'type': 'loss', 'content': 0.00012529719970189035, 'timestamp': '2025-09-30 22:08:01.642023', 'step': 1584, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:01.676258', 'step': 1584, 'epoch': 3} {'type': 'loss', 'content': 0.0003298810333944857, 'timestamp': '2025-09-30 22:08:01.679148', 'step': 1585, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:01.713366', 'step': 1585, 'epoch': 3} {'type': 'loss', 'content': 0.00010575191117823124, 'timestamp': '2025-09-30 22:08:01.718128', 'step': 1586, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:01.753223', 'step': 1586, 'epoch': 3} {'type': 'loss', 'content': 4.924646418658085e-05, 'timestamp': '2025-09-30 22:08:01.760425', 'step': 1587, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:01.794419', 'step': 1587, 'epoch': 3} {'type': 'loss', 'content': 0.0003786073357332498, 'timestamp': '2025-09-30 22:08:01.820784', 'step': 1588, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:01.859129', 'step': 1588, 'epoch': 3} {'type': 'loss', 'content': 0.001415244652889669, 'timestamp': '2025-09-30 22:08:01.864451', 'step': 1589, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:01.897159', 'step': 1589, 'epoch': 3} {'type': 'loss', 'content': 0.0006168781546875834, 'timestamp': '2025-09-30 22:08:01.902324', 'step': 1590, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:01.934825', 'step': 1590, 'epoch': 3} {'type': 'loss', 'content': 5.856368807144463e-05, 'timestamp': '2025-09-30 22:08:01.941885', 'step': 1591, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:01.978997', 'step': 1591, 'epoch': 3} {'type': 'loss', 'content': 0.0019562358502298594, 'timestamp': '2025-09-30 22:08:02.007784', 'step': 1592, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:02.043044', 'step': 1592, 'epoch': 3} {'type': 'loss', 'content': 0.0004735655675176531, 'timestamp': '2025-09-30 22:08:02.045587', 'step': 1593, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:02.081380', 'step': 1593, 'epoch': 3} {'type': 'loss', 'content': 0.00011099340918008238, 'timestamp': '2025-09-30 22:08:02.087946', 'step': 1594, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:02.125184', 'step': 1594, 'epoch': 3} {'type': 'loss', 'content': 0.00012295310443732888, 'timestamp': '2025-09-30 22:08:02.129602', 'step': 1595, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:02.166017', 'step': 1595, 'epoch': 3} {'type': 'loss', 'content': 0.0004471055290196091, 'timestamp': '2025-09-30 22:08:02.194200', 'step': 1596, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:02.235368', 'step': 1596, 'epoch': 3} {'type': 'loss', 'content': 0.00020820485951844603, 'timestamp': '2025-09-30 22:08:02.241209', 'step': 1597, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:02.279475', 'step': 1597, 'epoch': 3} {'type': 'loss', 'content': 0.0005493653588928282, 'timestamp': '2025-09-30 22:08:02.289794', 'step': 1598, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:02.328001', 'step': 1598, 'epoch': 3} {'type': 'loss', 'content': 0.00024865224258974195, 'timestamp': '2025-09-30 22:08:02.333983', 'step': 1599, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:08:03.013747', 'step': 1599, 'epoch': 3} {'type': 'pplx', 'content': 104521737.10689187, 'timestamp': '2025-09-30 22:08:03.019853', 'step': 1599, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:03.053604', 'step': 1599, 'epoch': 3} {'type': 'loss', 'content': 8.496097143506631e-05, 'timestamp': '2025-09-30 22:08:03.079978', 'step': 1600, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:03.113950', 'step': 1600, 'epoch': 3} {'type': 'loss', 'content': 0.0004024894442409277, 'timestamp': '2025-09-30 22:08:03.120001', 'step': 1601, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:03.160823', 'step': 1601, 'epoch': 3} {'type': 'loss', 'content': 0.002289453987032175, 'timestamp': '2025-09-30 22:08:03.169481', 'step': 1602, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:03.204493', 'step': 1602, 'epoch': 3} {'type': 'loss', 'content': 0.0002782022929750383, 'timestamp': '2025-09-30 22:08:03.207404', 'step': 1603, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:03.246178', 'step': 1603, 'epoch': 3} {'type': 'loss', 'content': 0.00017332390416413546, 'timestamp': '2025-09-30 22:08:03.274045', 'step': 1604, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:03.314144', 'step': 1604, 'epoch': 3} {'type': 'loss', 'content': 0.00021450161875691265, 'timestamp': '2025-09-30 22:08:03.319232', 'step': 1605, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:08:03.359821', 'step': 1605, 'epoch': 3} {'type': 'loss', 'content': 3.722803739947267e-05, 'timestamp': '2025-09-30 22:08:03.367781', 'step': 1606, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:03.398632', 'step': 1606, 'epoch': 3} {'type': 'loss', 'content': 0.00020089758618269116, 'timestamp': '2025-09-30 22:08:03.400602', 'step': 1607, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:03.434721', 'step': 1607, 'epoch': 3} {'type': 'loss', 'content': 0.0003215324250049889, 'timestamp': '2025-09-30 22:08:03.460118', 'step': 1608, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:03.491394', 'step': 1608, 'epoch': 3} {'type': 'loss', 'content': 0.0012815693626180291, 'timestamp': '2025-09-30 22:08:03.493533', 'step': 1609, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:03.524683', 'step': 1609, 'epoch': 3} {'type': 'loss', 'content': 0.004723010119050741, 'timestamp': '2025-09-30 22:08:03.526861', 'step': 1610, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:03.557305', 'step': 1610, 'epoch': 3} {'type': 'loss', 'content': 0.00014322168135549873, 'timestamp': '2025-09-30 22:08:03.559751', 'step': 1611, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:03.592700', 'step': 1611, 'epoch': 3} {'type': 'loss', 'content': 9.33581031858921e-05, 'timestamp': '2025-09-30 22:08:03.616537', 'step': 1612, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:03.647387', 'step': 1612, 'epoch': 3} {'type': 'loss', 'content': 0.00022943881049286574, 'timestamp': '2025-09-30 22:08:03.649671', 'step': 1613, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:03.686610', 'step': 1613, 'epoch': 3} {'type': 'loss', 'content': 0.008927569724619389, 'timestamp': '2025-09-30 22:08:03.691136', 'step': 1614, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:03.730745', 'step': 1614, 'epoch': 3} {'type': 'loss', 'content': 0.00228672637604177, 'timestamp': '2025-09-30 22:08:03.733198', 'step': 1615, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:03.772058', 'step': 1615, 'epoch': 3} {'type': 'loss', 'content': 0.00068600446684286, 'timestamp': '2025-09-30 22:08:03.797307', 'step': 1616, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:03.830402', 'step': 1616, 'epoch': 3} {'type': 'loss', 'content': 0.002715339185670018, 'timestamp': '2025-09-30 22:08:03.832684', 'step': 1617, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:03.863999', 'step': 1617, 'epoch': 3} {'type': 'loss', 'content': 3.630607534432784e-05, 'timestamp': '2025-09-30 22:08:03.866297', 'step': 1618, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:03.897609', 'step': 1618, 'epoch': 3} {'type': 'loss', 'content': 0.0015939169097691774, 'timestamp': '2025-09-30 22:08:03.899791', 'step': 1619, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:03.933202', 'step': 1619, 'epoch': 3} {'type': 'loss', 'content': 3.691700840136036e-05, 'timestamp': '2025-09-30 22:08:03.957020', 'step': 1620, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:03.994494', 'step': 1620, 'epoch': 3} {'type': 'loss', 'content': 0.0012790290638804436, 'timestamp': '2025-09-30 22:08:03.996772', 'step': 1621, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:04.030003', 'step': 1621, 'epoch': 3} {'type': 'loss', 'content': 0.005167053546756506, 'timestamp': '2025-09-30 22:08:04.037126', 'step': 1622, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:04.075078', 'step': 1622, 'epoch': 3} {'type': 'loss', 'content': 0.00023754978610668331, 'timestamp': '2025-09-30 22:08:04.079770', 'step': 1623, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:04.114642', 'step': 1623, 'epoch': 3} {'type': 'loss', 'content': 0.0023776155430823565, 'timestamp': '2025-09-30 22:08:04.143356', 'step': 1624, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:04.178019', 'step': 1624, 'epoch': 3} {'type': 'loss', 'content': 0.020028727129101753, 'timestamp': '2025-09-30 22:08:04.180324', 'step': 1625, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:04.215444', 'step': 1625, 'epoch': 3} {'type': 'loss', 'content': 0.002531526843085885, 'timestamp': '2025-09-30 22:08:04.217576', 'step': 1626, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:04.257311', 'step': 1626, 'epoch': 3} {'type': 'loss', 'content': 0.0007390738464891911, 'timestamp': '2025-09-30 22:08:04.259321', 'step': 1627, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:04.295905', 'step': 1627, 'epoch': 3} {'type': 'loss', 'content': 0.0002480483672115952, 'timestamp': '2025-09-30 22:08:04.323889', 'step': 1628, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:04.357678', 'step': 1628, 'epoch': 3} {'type': 'loss', 'content': 0.00014147233741823584, 'timestamp': '2025-09-30 22:08:04.362330', 'step': 1629, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:04.395767', 'step': 1629, 'epoch': 3} {'type': 'loss', 'content': 0.00017867713177111, 'timestamp': '2025-09-30 22:08:04.402891', 'step': 1630, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:04.440235', 'step': 1630, 'epoch': 3} {'type': 'loss', 'content': 0.00012707387213595212, 'timestamp': '2025-09-30 22:08:04.442387', 'step': 1631, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:04.478325', 'step': 1631, 'epoch': 3} {'type': 'loss', 'content': 0.000509004108607769, 'timestamp': '2025-09-30 22:08:04.506687', 'step': 1632, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:04.543312', 'step': 1632, 'epoch': 3} {'type': 'loss', 'content': 6.3887688156683e-05, 'timestamp': '2025-09-30 22:08:04.545290', 'step': 1633, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:04.589706', 'step': 1633, 'epoch': 3} {'type': 'loss', 'content': 0.00019534204329829663, 'timestamp': '2025-09-30 22:08:04.591941', 'step': 1634, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:04.623935', 'step': 1634, 'epoch': 3} {'type': 'loss', 'content': 9.548116941004992e-05, 'timestamp': '2025-09-30 22:08:04.626038', 'step': 1635, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:04.674140', 'step': 1635, 'epoch': 3} {'type': 'loss', 'content': 0.002590916818007827, 'timestamp': '2025-09-30 22:08:04.702182', 'step': 1636, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:04.734293', 'step': 1636, 'epoch': 3} {'type': 'loss', 'content': 0.0026767959352582693, 'timestamp': '2025-09-30 22:08:04.736317', 'step': 1637, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:04.770294', 'step': 1637, 'epoch': 3} {'type': 'loss', 'content': 8.954613440437242e-05, 'timestamp': '2025-09-30 22:08:04.772376', 'step': 1638, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:08:05.452373', 'step': 1638, 'epoch': 3} {'type': 'pplx', 'content': 115538961.0174772, 'timestamp': '2025-09-30 22:08:05.454369', 'step': 1638, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:05.483522', 'step': 1638, 'epoch': 3} {'type': 'loss', 'content': 3.402471338631585e-05, 'timestamp': '2025-09-30 22:08:05.490804', 'step': 1639, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:05.525581', 'step': 1639, 'epoch': 3} {'type': 'loss', 'content': 0.004754878580570221, 'timestamp': '2025-09-30 22:08:05.549459', 'step': 1640, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:05.581943', 'step': 1640, 'epoch': 3} {'type': 'loss', 'content': 0.003890208899974823, 'timestamp': '2025-09-30 22:08:05.583987', 'step': 1641, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:08:05.617330', 'step': 1641, 'epoch': 3} {'type': 'loss', 'content': 5.600903750746511e-05, 'timestamp': '2025-09-30 22:08:05.627829', 'step': 1642, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:05.659112', 'step': 1642, 'epoch': 3} {'type': 'loss', 'content': 0.0029056521598249674, 'timestamp': '2025-09-30 22:08:05.661117', 'step': 1643, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:05.692710', 'step': 1643, 'epoch': 3} {'type': 'loss', 'content': 0.0007899158517830074, 'timestamp': '2025-09-30 22:08:05.717988', 'step': 1644, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:05.763063', 'step': 1644, 'epoch': 3} {'type': 'loss', 'content': 0.0004726467013824731, 'timestamp': '2025-09-30 22:08:05.768454', 'step': 1645, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:05.803704', 'step': 1645, 'epoch': 3} {'type': 'loss', 'content': 0.00011156607797602192, 'timestamp': '2025-09-30 22:08:05.805874', 'step': 1646, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:05.843267', 'step': 1646, 'epoch': 3} {'type': 'loss', 'content': 0.0001994954509427771, 'timestamp': '2025-09-30 22:08:05.845573', 'step': 1647, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:05.880475', 'step': 1647, 'epoch': 3} {'type': 'loss', 'content': 0.0001336904097115621, 'timestamp': '2025-09-30 22:08:05.909260', 'step': 1648, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:05.940343', 'step': 1648, 'epoch': 3} {'type': 'loss', 'content': 0.00022927757527213544, 'timestamp': '2025-09-30 22:08:05.945564', 'step': 1649, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:05.981251', 'step': 1649, 'epoch': 3} {'type': 'loss', 'content': 5.889233580091968e-05, 'timestamp': '2025-09-30 22:08:05.984116', 'step': 1650, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:08:06.016082', 'step': 1650, 'epoch': 3} {'type': 'loss', 'content': 6.961514009162784e-05, 'timestamp': '2025-09-30 22:08:06.026407', 'step': 1651, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:06.059199', 'step': 1651, 'epoch': 3} {'type': 'loss', 'content': 0.00011490475299069658, 'timestamp': '2025-09-30 22:08:06.084648', 'step': 1652, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:06.122253', 'step': 1652, 'epoch': 3} {'type': 'loss', 'content': 4.839776738663204e-05, 'timestamp': '2025-09-30 22:08:06.127516', 'step': 1653, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:06.160802', 'step': 1653, 'epoch': 3} {'type': 'loss', 'content': 2.3726159270154312e-05, 'timestamp': '2025-09-30 22:08:06.163020', 'step': 1654, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:06.205253', 'step': 1654, 'epoch': 3} {'type': 'loss', 'content': 0.00011560900748008862, 'timestamp': '2025-09-30 22:08:06.208040', 'step': 1655, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:06.239929', 'step': 1655, 'epoch': 3} {'type': 'loss', 'content': 7.772482058499008e-05, 'timestamp': '2025-09-30 22:08:06.268057', 'step': 1656, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:06.303065', 'step': 1656, 'epoch': 3} {'type': 'loss', 'content': 0.0005724158836528659, 'timestamp': '2025-09-30 22:08:06.305384', 'step': 1657, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:06.338520', 'step': 1657, 'epoch': 3} {'type': 'loss', 'content': 3.609975101426244e-05, 'timestamp': '2025-09-30 22:08:06.340509', 'step': 1658, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:06.375056', 'step': 1658, 'epoch': 3} {'type': 'loss', 'content': 0.00023306449293158948, 'timestamp': '2025-09-30 22:08:06.378031', 'step': 1659, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:06.409148', 'step': 1659, 'epoch': 3} {'type': 'loss', 'content': 0.00013319592108018696, 'timestamp': '2025-09-30 22:08:06.435575', 'step': 1660, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:06.467952', 'step': 1660, 'epoch': 3} {'type': 'loss', 'content': 0.0014770907582715154, 'timestamp': '2025-09-30 22:08:06.472640', 'step': 1661, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:06.506661', 'step': 1661, 'epoch': 3} {'type': 'loss', 'content': 8.019845699891448e-05, 'timestamp': '2025-09-30 22:08:06.509503', 'step': 1662, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:06.545897', 'step': 1662, 'epoch': 3} {'type': 'loss', 'content': 4.1435498133068904e-05, 'timestamp': '2025-09-30 22:08:06.553635', 'step': 1663, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:06.588194', 'step': 1663, 'epoch': 3} {'type': 'loss', 'content': 0.0002740359923336655, 'timestamp': '2025-09-30 22:08:06.613532', 'step': 1664, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:08:06.647439', 'step': 1664, 'epoch': 3} {'type': 'loss', 'content': 6.516856956295669e-05, 'timestamp': '2025-09-30 22:08:06.660143', 'step': 1665, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:06.691015', 'step': 1665, 'epoch': 3} {'type': 'loss', 'content': 0.008054564706981182, 'timestamp': '2025-09-30 22:08:06.693141', 'step': 1666, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:06.724794', 'step': 1666, 'epoch': 3} {'type': 'loss', 'content': 5.055733709014021e-05, 'timestamp': '2025-09-30 22:08:06.727106', 'step': 1667, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:06.768751', 'step': 1667, 'epoch': 3} {'type': 'loss', 'content': 0.0001062467708834447, 'timestamp': '2025-09-30 22:08:06.792145', 'step': 1668, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:06.824186', 'step': 1668, 'epoch': 3} {'type': 'loss', 'content': 2.5943687433027662e-05, 'timestamp': '2025-09-30 22:08:06.829545', 'step': 1669, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:06.864663', 'step': 1669, 'epoch': 3} {'type': 'loss', 'content': 5.616279304376803e-05, 'timestamp': '2025-09-30 22:08:06.867073', 'step': 1670, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:06.900297', 'step': 1670, 'epoch': 3} {'type': 'loss', 'content': 4.885617090621963e-05, 'timestamp': '2025-09-30 22:08:06.902554', 'step': 1671, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:06.936246', 'step': 1671, 'epoch': 3} {'type': 'loss', 'content': 4.1574046917958185e-05, 'timestamp': '2025-09-30 22:08:06.961348', 'step': 1672, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:06.997844', 'step': 1672, 'epoch': 3} {'type': 'loss', 'content': 0.019031139090657234, 'timestamp': '2025-09-30 22:08:07.000638', 'step': 1673, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:07.042395', 'step': 1673, 'epoch': 3} {'type': 'loss', 'content': 0.00019428497762419283, 'timestamp': '2025-09-30 22:08:07.045189', 'step': 1674, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:07.076829', 'step': 1674, 'epoch': 3} {'type': 'loss', 'content': 0.0002930422779172659, 'timestamp': '2025-09-30 22:08:07.079216', 'step': 1675, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:07.112571', 'step': 1675, 'epoch': 3} {'type': 'loss', 'content': 0.0008218813454732299, 'timestamp': '2025-09-30 22:08:07.137668', 'step': 1676, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:07.171638', 'step': 1676, 'epoch': 3} {'type': 'loss', 'content': 2.3237113055074587e-05, 'timestamp': '2025-09-30 22:08:07.177005', 'step': 1677, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:08:07.816204', 'step': 1677, 'epoch': 3} {'type': 'pplx', 'content': 121808560.6157007, 'timestamp': '2025-09-30 22:08:07.818747', 'step': 1677, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:07.848201', 'step': 1677, 'epoch': 3} {'type': 'loss', 'content': 2.809312536555808e-05, 'timestamp': '2025-09-30 22:08:07.850567', 'step': 1678, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:07.882750', 'step': 1678, 'epoch': 3} {'type': 'loss', 'content': 0.00043884789920412004, 'timestamp': '2025-09-30 22:08:07.889749', 'step': 1679, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:07.922034', 'step': 1679, 'epoch': 3} {'type': 'loss', 'content': 0.002086315304040909, 'timestamp': '2025-09-30 22:08:07.947140', 'step': 1680, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:07.980525', 'step': 1680, 'epoch': 3} {'type': 'loss', 'content': 4.865376467932947e-05, 'timestamp': '2025-09-30 22:08:07.983369', 'step': 1681, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:08:08.017152', 'step': 1681, 'epoch': 3} {'type': 'loss', 'content': 7.374538836302236e-05, 'timestamp': '2025-09-30 22:08:08.024990', 'step': 1682, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:08.056481', 'step': 1682, 'epoch': 3} {'type': 'loss', 'content': 6.192681757966056e-05, 'timestamp': '2025-09-30 22:08:08.059948', 'step': 1683, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:08.092914', 'step': 1683, 'epoch': 3} {'type': 'loss', 'content': 0.0006117112934589386, 'timestamp': '2025-09-30 22:08:08.121736', 'step': 1684, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:08.154336', 'step': 1684, 'epoch': 3} {'type': 'loss', 'content': 4.967437780578621e-05, 'timestamp': '2025-09-30 22:08:08.157294', 'step': 1685, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:08.191086', 'step': 1685, 'epoch': 3} {'type': 'loss', 'content': 3.295835995231755e-05, 'timestamp': '2025-09-30 22:08:08.193954', 'step': 1686, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:08.224463', 'step': 1686, 'epoch': 3} {'type': 'loss', 'content': 4.771046951645985e-05, 'timestamp': '2025-09-30 22:08:08.231325', 'step': 1687, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:08.265360', 'step': 1687, 'epoch': 3} {'type': 'loss', 'content': 4.849955075769685e-05, 'timestamp': '2025-09-30 22:08:08.290450', 'step': 1688, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:08.331749', 'step': 1688, 'epoch': 3} {'type': 'loss', 'content': 0.0001034624656313099, 'timestamp': '2025-09-30 22:08:08.337217', 'step': 1689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:08.369308', 'step': 1689, 'epoch': 3} {'type': 'loss', 'content': 0.000108941356302239, 'timestamp': '2025-09-30 22:08:08.371908', 'step': 1690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:08.405890', 'step': 1690, 'epoch': 3} {'type': 'loss', 'content': 4.94919549964834e-05, 'timestamp': '2025-09-30 22:08:08.409277', 'step': 1691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:08.445119', 'step': 1691, 'epoch': 3} {'type': 'loss', 'content': 3.778712198254652e-05, 'timestamp': '2025-09-30 22:08:08.469009', 'step': 1692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:08.500808', 'step': 1692, 'epoch': 3} {'type': 'loss', 'content': 7.626505248481408e-05, 'timestamp': '2025-09-30 22:08:08.503411', 'step': 1693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:08.534715', 'step': 1693, 'epoch': 3} {'type': 'loss', 'content': 2.672936716408003e-05, 'timestamp': '2025-09-30 22:08:08.537958', 'step': 1694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:08.570418', 'step': 1694, 'epoch': 3} {'type': 'loss', 'content': 0.005537017714232206, 'timestamp': '2025-09-30 22:08:08.573775', 'step': 1695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:08.606232', 'step': 1695, 'epoch': 3} {'type': 'loss', 'content': 0.0002917372912634164, 'timestamp': '2025-09-30 22:08:08.631346', 'step': 1696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:08.664470', 'step': 1696, 'epoch': 3} {'type': 'loss', 'content': 6.110074173193425e-05, 'timestamp': '2025-09-30 22:08:08.667323', 'step': 1697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:08.699099', 'step': 1697, 'epoch': 3} {'type': 'loss', 'content': 0.00011303651990601793, 'timestamp': '2025-09-30 22:08:08.701837', 'step': 1698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:08.732756', 'step': 1698, 'epoch': 3} {'type': 'loss', 'content': 0.0005966713069938123, 'timestamp': '2025-09-30 22:08:08.737317', 'step': 1699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:08.769689', 'step': 1699, 'epoch': 3} {'type': 'loss', 'content': 6.444390601245686e-05, 'timestamp': '2025-09-30 22:08:08.801952', 'step': 1700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:08.834298', 'step': 1700, 'epoch': 3} {'type': 'loss', 'content': 4.669316331273876e-05, 'timestamp': '2025-09-30 22:08:08.836388', 'step': 1701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:08.869567', 'step': 1701, 'epoch': 3} {'type': 'loss', 'content': 0.0003925747296307236, 'timestamp': '2025-09-30 22:08:08.876623', 'step': 1702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:08:08.907036', 'step': 1702, 'epoch': 3} {'type': 'loss', 'content': 5.464601417770609e-05, 'timestamp': '2025-09-30 22:08:08.917280', 'step': 1703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:08.949598', 'step': 1703, 'epoch': 3} {'type': 'loss', 'content': 0.0007480279309675097, 'timestamp': '2025-09-30 22:08:08.977683', 'step': 1704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:09.011058', 'step': 1704, 'epoch': 3} {'type': 'loss', 'content': 0.0002451858308631927, 'timestamp': '2025-09-30 22:08:09.013383', 'step': 1705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:09.046894', 'step': 1705, 'epoch': 3} {'type': 'loss', 'content': 0.01712205819785595, 'timestamp': '2025-09-30 22:08:09.049384', 'step': 1706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:09.081727', 'step': 1706, 'epoch': 3} {'type': 'loss', 'content': 4.979677760275081e-05, 'timestamp': '2025-09-30 22:08:09.084051', 'step': 1707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:09.114820', 'step': 1707, 'epoch': 3} {'type': 'loss', 'content': 0.00010694911907194182, 'timestamp': '2025-09-30 22:08:09.138319', 'step': 1708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:09.170189', 'step': 1708, 'epoch': 3} {'type': 'loss', 'content': 8.407200948568061e-05, 'timestamp': '2025-09-30 22:08:09.173668', 'step': 1709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:09.217356', 'step': 1709, 'epoch': 3} {'type': 'loss', 'content': 3.759035098482855e-05, 'timestamp': '2025-09-30 22:08:09.219576', 'step': 1710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:09.251614', 'step': 1710, 'epoch': 3} {'type': 'loss', 'content': 6.0760878113796934e-05, 'timestamp': '2025-09-30 22:08:09.256141', 'step': 1711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:09.288638', 'step': 1711, 'epoch': 3} {'type': 'loss', 'content': 0.0006046485505066812, 'timestamp': '2025-09-30 22:08:09.312368', 'step': 1712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:09.343502', 'step': 1712, 'epoch': 3} {'type': 'loss', 'content': 4.117127173230983e-05, 'timestamp': '2025-09-30 22:08:09.345604', 'step': 1713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:09.383976', 'step': 1713, 'epoch': 3} {'type': 'loss', 'content': 0.0038941497914493084, 'timestamp': '2025-09-30 22:08:09.386523', 'step': 1714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:09.416535', 'step': 1714, 'epoch': 3} {'type': 'loss', 'content': 4.572251418721862e-05, 'timestamp': '2025-09-30 22:08:09.419070', 'step': 1715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:09.450798', 'step': 1715, 'epoch': 3} {'type': 'loss', 'content': 4.344709304859862e-05, 'timestamp': '2025-09-30 22:08:09.474705', 'step': 1716, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:08:10.121434', 'step': 1716, 'epoch': 3} {'type': 'pplx', 'content': 123022155.31750192, 'timestamp': '2025-09-30 22:08:10.123279', 'step': 1716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:10.152016', 'step': 1716, 'epoch': 3} {'type': 'loss', 'content': 2.7246298486716114e-05, 'timestamp': '2025-09-30 22:08:10.154210', 'step': 1717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:10.187641', 'step': 1717, 'epoch': 3} {'type': 'loss', 'content': 2.945954656752292e-05, 'timestamp': '2025-09-30 22:08:10.192406', 'step': 1718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:10.230016', 'step': 1718, 'epoch': 3} {'type': 'loss', 'content': 0.0006682882667519152, 'timestamp': '2025-09-30 22:08:10.237555', 'step': 1719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:10.274085', 'step': 1719, 'epoch': 3} {'type': 'loss', 'content': 0.00010243523138342425, 'timestamp': '2025-09-30 22:08:10.298031', 'step': 1720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:10.332529', 'step': 1720, 'epoch': 3} {'type': 'loss', 'content': 3.174094672431238e-05, 'timestamp': '2025-09-30 22:08:10.334866', 'step': 1721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:10.369414', 'step': 1721, 'epoch': 3} {'type': 'loss', 'content': 6.566093361470848e-05, 'timestamp': '2025-09-30 22:08:10.376545', 'step': 1722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:10.420688', 'step': 1722, 'epoch': 3} {'type': 'loss', 'content': 0.00014871249732095748, 'timestamp': '2025-09-30 22:08:10.423199', 'step': 1723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:10.453615', 'step': 1723, 'epoch': 3} {'type': 'loss', 'content': 0.0001852501736721024, 'timestamp': '2025-09-30 22:08:10.477119', 'step': 1724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:10.507896', 'step': 1724, 'epoch': 3} {'type': 'loss', 'content': 0.005387753248214722, 'timestamp': '2025-09-30 22:08:10.513194', 'step': 1725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:10.548504', 'step': 1725, 'epoch': 3} {'type': 'loss', 'content': 0.00018650003767106682, 'timestamp': '2025-09-30 22:08:10.550888', 'step': 1726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:10.586529', 'step': 1726, 'epoch': 3} {'type': 'loss', 'content': 2.3841199435992166e-05, 'timestamp': '2025-09-30 22:08:10.589189', 'step': 1727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:10.620099', 'step': 1727, 'epoch': 3} {'type': 'loss', 'content': 3.871664375765249e-05, 'timestamp': '2025-09-30 22:08:10.645377', 'step': 1728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:10.677597', 'step': 1728, 'epoch': 3} {'type': 'loss', 'content': 0.0007445144583471119, 'timestamp': '2025-09-30 22:08:10.679797', 'step': 1729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:10.711443', 'step': 1729, 'epoch': 3} {'type': 'loss', 'content': 3.987590753240511e-05, 'timestamp': '2025-09-30 22:08:10.713708', 'step': 1730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:08:10.744341', 'step': 1730, 'epoch': 3} {'type': 'loss', 'content': 0.0006975015858188272, 'timestamp': '2025-09-30 22:08:10.754725', 'step': 1731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:10.785891', 'step': 1731, 'epoch': 3} {'type': 'loss', 'content': 3.186930189258419e-05, 'timestamp': '2025-09-30 22:08:10.809266', 'step': 1732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:10.839143', 'step': 1732, 'epoch': 3} {'type': 'loss', 'content': 2.39141645579366e-05, 'timestamp': '2025-09-30 22:08:10.841347', 'step': 1733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:10.872641', 'step': 1733, 'epoch': 3} {'type': 'loss', 'content': 0.0007975990301929414, 'timestamp': '2025-09-30 22:08:10.878155', 'step': 1734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:10.909577', 'step': 1734, 'epoch': 3} {'type': 'loss', 'content': 3.0157307264744304e-05, 'timestamp': '2025-09-30 22:08:10.911579', 'step': 1735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:10.943570', 'step': 1735, 'epoch': 3} {'type': 'loss', 'content': 0.00013178681547287852, 'timestamp': '2025-09-30 22:08:10.968656', 'step': 1736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:11.002059', 'step': 1736, 'epoch': 3} {'type': 'loss', 'content': 0.000744854100048542, 'timestamp': '2025-09-30 22:08:11.004778', 'step': 1737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:11.036810', 'step': 1737, 'epoch': 3} {'type': 'loss', 'content': 7.578790973639116e-05, 'timestamp': '2025-09-30 22:08:11.039780', 'step': 1738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:08:11.077610', 'step': 1738, 'epoch': 3} {'type': 'loss', 'content': 4.3506599467946216e-05, 'timestamp': '2025-09-30 22:08:11.085666', 'step': 1739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:11.117434', 'step': 1739, 'epoch': 3} {'type': 'loss', 'content': 6.423755257856101e-05, 'timestamp': '2025-09-30 22:08:11.142553', 'step': 1740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:11.176165', 'step': 1740, 'epoch': 3} {'type': 'loss', 'content': 0.00023928364680614322, 'timestamp': '2025-09-30 22:08:11.178379', 'step': 1741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:11.210418', 'step': 1741, 'epoch': 3} {'type': 'loss', 'content': 6.30137074040249e-05, 'timestamp': '2025-09-30 22:08:11.214998', 'step': 1742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:11.247480', 'step': 1742, 'epoch': 3} {'type': 'loss', 'content': 0.0006644020904786885, 'timestamp': '2025-09-30 22:08:11.250348', 'step': 1743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:08:11.282008', 'step': 1743, 'epoch': 3} {'type': 'loss', 'content': 8.710688416613266e-05, 'timestamp': '2025-09-30 22:08:11.311018', 'step': 1744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:11.341815', 'step': 1744, 'epoch': 3} {'type': 'loss', 'content': 0.0001336832356173545, 'timestamp': '2025-09-30 22:08:11.344275', 'step': 1745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:11.375695', 'step': 1745, 'epoch': 3} {'type': 'loss', 'content': 0.006218440365046263, 'timestamp': '2025-09-30 22:08:11.382933', 'step': 1746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:11.413325', 'step': 1746, 'epoch': 3} {'type': 'loss', 'content': 0.002578115090727806, 'timestamp': '2025-09-30 22:08:11.415211', 'step': 1747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:11.447475', 'step': 1747, 'epoch': 3} {'type': 'loss', 'content': 0.0001024999437504448, 'timestamp': '2025-09-30 22:08:11.471344', 'step': 1748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:08:11.502763', 'step': 1748, 'epoch': 3} {'type': 'loss', 'content': 6.375632074195892e-05, 'timestamp': '2025-09-30 22:08:11.512589', 'step': 1749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:11.543597', 'step': 1749, 'epoch': 3} {'type': 'loss', 'content': 3.2842282962519675e-05, 'timestamp': '2025-09-30 22:08:11.547960', 'step': 1750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:11.580397', 'step': 1750, 'epoch': 3} {'type': 'loss', 'content': 0.0002622344472911209, 'timestamp': '2025-09-30 22:08:11.584669', 'step': 1751, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:11.615644', 'step': 1751, 'epoch': 3} {'type': 'loss', 'content': 5.3559277148451656e-05, 'timestamp': '2025-09-30 22:08:11.641124', 'step': 1752, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:11.678427', 'step': 1752, 'epoch': 3} {'type': 'loss', 'content': 0.0020374690648168325, 'timestamp': '2025-09-30 22:08:11.683692', 'step': 1753, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:11.714588', 'step': 1753, 'epoch': 3} {'type': 'loss', 'content': 2.5645465939305723e-05, 'timestamp': '2025-09-30 22:08:11.716983', 'step': 1754, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:11.747409', 'step': 1754, 'epoch': 3} {'type': 'loss', 'content': 0.0005786855472251773, 'timestamp': '2025-09-30 22:08:11.749624', 'step': 1755, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:08:12.585912', 'step': 1755, 'epoch': 3} {'type': 'pplx', 'content': 123261309.76453374, 'timestamp': '2025-09-30 22:08:12.587845', 'step': 1755, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:12.617583', 'step': 1755, 'epoch': 3} {'type': 'loss', 'content': 3.3810920285759494e-05, 'timestamp': '2025-09-30 22:08:12.646149', 'step': 1756, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:12.685768', 'step': 1756, 'epoch': 3} {'type': 'loss', 'content': 0.0003127233940176666, 'timestamp': '2025-09-30 22:08:12.688016', 'step': 1757, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:12.720260', 'step': 1757, 'epoch': 3} {'type': 'loss', 'content': 0.00022420093591790646, 'timestamp': '2025-09-30 22:08:12.724993', 'step': 1758, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:08:12.757111', 'step': 1758, 'epoch': 3} {'type': 'loss', 'content': 0.00010123888932866976, 'timestamp': '2025-09-30 22:08:12.765058', 'step': 1759, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:12.800033', 'step': 1759, 'epoch': 3} {'type': 'loss', 'content': 7.35086141503416e-05, 'timestamp': '2025-09-30 22:08:12.825405', 'step': 1760, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:12.865360', 'step': 1760, 'epoch': 3} {'type': 'loss', 'content': 3.3173182600876316e-05, 'timestamp': '2025-09-30 22:08:12.870461', 'step': 1761, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 2373281365952}, 'timestamp': '2025-09-30 22:08:12.904167', 'step': 1761, 'epoch': 3} {'type': 'loss', 'content': 4.140309101785533e-05, 'timestamp': '2025-09-30 22:08:12.906124', 'step': 1762, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:12.941723', 'step': 1762, 'epoch': 3} {'type': 'loss', 'content': 0.00019971518486272544, 'timestamp': '2025-09-30 22:08:12.943757', 'step': 1763, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:12.982711', 'step': 1763, 'epoch': 3} {'type': 'loss', 'content': 0.0005979278357699513, 'timestamp': '2025-09-30 22:08:13.010824', 'step': 1764, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:13.043654', 'step': 1764, 'epoch': 3} {'type': 'loss', 'content': 2.990214852616191e-05, 'timestamp': '2025-09-30 22:08:13.045882', 'step': 1765, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:13.077493', 'step': 1765, 'epoch': 3} {'type': 'loss', 'content': 0.0001495322067057714, 'timestamp': '2025-09-30 22:08:13.079602', 'step': 1766, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:13.119342', 'step': 1766, 'epoch': 3} {'type': 'loss', 'content': 0.0001800880127120763, 'timestamp': '2025-09-30 22:08:13.122330', 'step': 1767, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:13.174447', 'step': 1767, 'epoch': 3} {'type': 'loss', 'content': 0.00013568572467193007, 'timestamp': '2025-09-30 22:08:13.197813', 'step': 1768, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:13.236825', 'step': 1768, 'epoch': 3} {'type': 'loss', 'content': 0.0002641365572344512, 'timestamp': '2025-09-30 22:08:13.239161', 'step': 1769, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:13.277883', 'step': 1769, 'epoch': 3} {'type': 'loss', 'content': 0.018095744773745537, 'timestamp': '2025-09-30 22:08:13.284886', 'step': 1770, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:13.315789', 'step': 1770, 'epoch': 3} {'type': 'loss', 'content': 3.474125333013944e-05, 'timestamp': '2025-09-30 22:08:13.317887', 'step': 1771, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:13.350296', 'step': 1771, 'epoch': 3} {'type': 'loss', 'content': 5.6115230108844116e-05, 'timestamp': '2025-09-30 22:08:13.375844', 'step': 1772, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:13.420055', 'step': 1772, 'epoch': 3} {'type': 'loss', 'content': 3.4445827623130754e-05, 'timestamp': '2025-09-30 22:08:13.425114', 'step': 1773, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:13.459661', 'step': 1773, 'epoch': 3} {'type': 'loss', 'content': 7.985342381289229e-05, 'timestamp': '2025-09-30 22:08:13.466773', 'step': 1774, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:13.499306', 'step': 1774, 'epoch': 3} {'type': 'loss', 'content': 6.720200326526538e-05, 'timestamp': '2025-09-30 22:08:13.501629', 'step': 1775, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:13.539229', 'step': 1775, 'epoch': 3} {'type': 'loss', 'content': 3.170343188685365e-05, 'timestamp': '2025-09-30 22:08:13.562723', 'step': 1776, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:13.594808', 'step': 1776, 'epoch': 3} {'type': 'loss', 'content': 3.736267171916552e-05, 'timestamp': '2025-09-30 22:08:13.597121', 'step': 1777, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:13.638168', 'step': 1777, 'epoch': 3} {'type': 'loss', 'content': 3.414504317333922e-05, 'timestamp': '2025-09-30 22:08:13.640483', 'step': 1778, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:13.676919', 'step': 1778, 'epoch': 3} {'type': 'loss', 'content': 0.0002532243961468339, 'timestamp': '2025-09-30 22:08:13.679667', 'step': 1779, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:13.712968', 'step': 1779, 'epoch': 3} {'type': 'loss', 'content': 2.2894188077771105e-05, 'timestamp': '2025-09-30 22:08:13.738132', 'step': 1780, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:13.777012', 'step': 1780, 'epoch': 3} {'type': 'loss', 'content': 0.00030392984626814723, 'timestamp': '2025-09-30 22:08:13.779101', 'step': 1781, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:13.821398', 'step': 1781, 'epoch': 3} {'type': 'loss', 'content': 4.223351788823493e-05, 'timestamp': '2025-09-30 22:08:13.825786', 'step': 1782, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:13.865488', 'step': 1782, 'epoch': 3} {'type': 'loss', 'content': 1.772106406860985e-05, 'timestamp': '2025-09-30 22:08:13.869031', 'step': 1783, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:13.914524', 'step': 1783, 'epoch': 3} {'type': 'loss', 'content': 9.17787037906237e-05, 'timestamp': '2025-09-30 22:08:13.939747', 'step': 1784, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:13.979286', 'step': 1784, 'epoch': 3} {'type': 'loss', 'content': 0.00010865591320907697, 'timestamp': '2025-09-30 22:08:13.983792', 'step': 1785, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:14.021728', 'step': 1785, 'epoch': 3} {'type': 'loss', 'content': 3.610273779486306e-05, 'timestamp': '2025-09-30 22:08:14.026121', 'step': 1786, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:14.057167', 'step': 1786, 'epoch': 3} {'type': 'loss', 'content': 0.0003818259574472904, 'timestamp': '2025-09-30 22:08:14.064336', 'step': 1787, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:14.096565', 'step': 1787, 'epoch': 3} {'type': 'loss', 'content': 0.000534689286723733, 'timestamp': '2025-09-30 22:08:14.120234', 'step': 1788, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:14.161199', 'step': 1788, 'epoch': 3} {'type': 'loss', 'content': 0.0001270804350497201, 'timestamp': '2025-09-30 22:08:14.163416', 'step': 1789, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:14.201203', 'step': 1789, 'epoch': 3} {'type': 'loss', 'content': 0.00016834995767567307, 'timestamp': '2025-09-30 22:08:14.204065', 'step': 1790, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:14.252746', 'step': 1790, 'epoch': 3} {'type': 'loss', 'content': 3.9347731217276305e-05, 'timestamp': '2025-09-30 22:08:14.254923', 'step': 1791, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:14.293383', 'step': 1791, 'epoch': 3} {'type': 'loss', 'content': 3.699236913234927e-05, 'timestamp': '2025-09-30 22:08:14.318959', 'step': 1792, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:14.350695', 'step': 1792, 'epoch': 3} {'type': 'loss', 'content': 0.04252466931939125, 'timestamp': '2025-09-30 22:08:14.355466', 'step': 1793, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:14.389348', 'step': 1793, 'epoch': 3} {'type': 'loss', 'content': 0.003464995650574565, 'timestamp': '2025-09-30 22:08:14.391366', 'step': 1794, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:08:15.196274', 'step': 1794, 'epoch': 3} {'type': 'pplx', 'content': 120951084.46872666, 'timestamp': '2025-09-30 22:08:15.201859', 'step': 1794, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:15.252216', 'step': 1794, 'epoch': 3} {'type': 'loss', 'content': 2.1755853595095687e-05, 'timestamp': '2025-09-30 22:08:15.257417', 'step': 1795, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:15.302369', 'step': 1795, 'epoch': 3} {'type': 'loss', 'content': 5.0607039156602696e-05, 'timestamp': '2025-09-30 22:08:15.340142', 'step': 1796, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:15.389955', 'step': 1796, 'epoch': 3} {'type': 'loss', 'content': 0.00014252289838623255, 'timestamp': '2025-09-30 22:08:15.394160', 'step': 1797, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:15.428341', 'step': 1797, 'epoch': 3} {'type': 'loss', 'content': 4.311428710934706e-05, 'timestamp': '2025-09-30 22:08:15.443274', 'step': 1798, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:15.490139', 'step': 1798, 'epoch': 3} {'type': 'loss', 'content': 4.962775710737333e-05, 'timestamp': '2025-09-30 22:08:15.504957', 'step': 1799, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:15.541027', 'step': 1799, 'epoch': 3} {'type': 'loss', 'content': 7.520474173361436e-05, 'timestamp': '2025-09-30 22:08:15.576857', 'step': 1800, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:15.624365', 'step': 1800, 'epoch': 3} {'type': 'loss', 'content': 5.5460110161220655e-05, 'timestamp': '2025-09-30 22:08:15.627503', 'step': 1801, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:15.661828', 'step': 1801, 'epoch': 3} {'type': 'loss', 'content': 5.6858716561691836e-05, 'timestamp': '2025-09-30 22:08:15.666471', 'step': 1802, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:15.700968', 'step': 1802, 'epoch': 3} {'type': 'loss', 'content': 0.0005492048221640289, 'timestamp': '2025-09-30 22:08:15.705724', 'step': 1803, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:15.741314', 'step': 1803, 'epoch': 3} {'type': 'loss', 'content': 0.00012128066009609029, 'timestamp': '2025-09-30 22:08:15.767648', 'step': 1804, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:15.813858', 'step': 1804, 'epoch': 3} {'type': 'loss', 'content': 0.005712445825338364, 'timestamp': '2025-09-30 22:08:15.816537', 'step': 1805, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:15.849954', 'step': 1805, 'epoch': 3} {'type': 'loss', 'content': 0.0031685256399214268, 'timestamp': '2025-09-30 22:08:15.854239', 'step': 1806, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:15.887363', 'step': 1806, 'epoch': 3} {'type': 'loss', 'content': 2.441569813527167e-05, 'timestamp': '2025-09-30 22:08:15.894288', 'step': 1807, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:15.929882', 'step': 1807, 'epoch': 3} {'type': 'loss', 'content': 0.018170934170484543, 'timestamp': '2025-09-30 22:08:15.957911', 'step': 1808, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:15.998828', 'step': 1808, 'epoch': 3} {'type': 'loss', 'content': 2.65174348896835e-05, 'timestamp': '2025-09-30 22:08:16.004518', 'step': 1809, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:16.039339', 'step': 1809, 'epoch': 3} {'type': 'loss', 'content': 0.00011656737478915602, 'timestamp': '2025-09-30 22:08:16.043396', 'step': 1810, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:16.090084', 'step': 1810, 'epoch': 3} {'type': 'loss', 'content': 9.255034092348069e-05, 'timestamp': '2025-09-30 22:08:16.097167', 'step': 1811, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:16.130713', 'step': 1811, 'epoch': 3} {'type': 'loss', 'content': 7.447423558915034e-05, 'timestamp': '2025-09-30 22:08:16.154971', 'step': 1812, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:16.202653', 'step': 1812, 'epoch': 3} {'type': 'loss', 'content': 0.00020663348550442606, 'timestamp': '2025-09-30 22:08:16.209417', 'step': 1813, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:16.262784', 'step': 1813, 'epoch': 3} {'type': 'loss', 'content': 4.137990254093893e-05, 'timestamp': '2025-09-30 22:08:16.269464', 'step': 1814, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:16.315374', 'step': 1814, 'epoch': 3} {'type': 'loss', 'content': 0.0001488216657890007, 'timestamp': '2025-09-30 22:08:16.317890', 'step': 1815, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:16.356081', 'step': 1815, 'epoch': 3} {'type': 'loss', 'content': 9.109562961384654e-05, 'timestamp': '2025-09-30 22:08:16.381173', 'step': 1816, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:16.421995', 'step': 1816, 'epoch': 3} {'type': 'loss', 'content': 8.352736767847091e-05, 'timestamp': '2025-09-30 22:08:16.426263', 'step': 1817, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:16.466707', 'step': 1817, 'epoch': 3} {'type': 'loss', 'content': 2.977161784656346e-05, 'timestamp': '2025-09-30 22:08:16.470890', 'step': 1818, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:16.505994', 'step': 1818, 'epoch': 3} {'type': 'loss', 'content': 0.0001259616547031328, 'timestamp': '2025-09-30 22:08:16.508545', 'step': 1819, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:16.542695', 'step': 1819, 'epoch': 3} {'type': 'loss', 'content': 7.263097359100357e-05, 'timestamp': '2025-09-30 22:08:16.570206', 'step': 1820, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:16.608132', 'step': 1820, 'epoch': 3} {'type': 'loss', 'content': 8.77750717336312e-05, 'timestamp': '2025-09-30 22:08:16.612891', 'step': 1821, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:16.649859', 'step': 1821, 'epoch': 3} {'type': 'loss', 'content': 4.603018533089198e-05, 'timestamp': '2025-09-30 22:08:16.652882', 'step': 1822, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:16.688440', 'step': 1822, 'epoch': 3} {'type': 'loss', 'content': 0.0004101696249563247, 'timestamp': '2025-09-30 22:08:16.691465', 'step': 1823, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:16.726023', 'step': 1823, 'epoch': 3} {'type': 'loss', 'content': 8.775664173299447e-05, 'timestamp': '2025-09-30 22:08:16.750840', 'step': 1824, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:16.784702', 'step': 1824, 'epoch': 3} {'type': 'loss', 'content': 0.0006790863699279726, 'timestamp': '2025-09-30 22:08:16.789397', 'step': 1825, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:16.822832', 'step': 1825, 'epoch': 3} {'type': 'loss', 'content': 0.0005246041109785438, 'timestamp': '2025-09-30 22:08:16.827178', 'step': 1826, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:16.861392', 'step': 1826, 'epoch': 3} {'type': 'loss', 'content': 0.00011121475836262107, 'timestamp': '2025-09-30 22:08:16.866013', 'step': 1827, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:08:16.899442', 'step': 1827, 'epoch': 3} {'type': 'loss', 'content': 3.740801548701711e-05, 'timestamp': '2025-09-30 22:08:16.928008', 'step': 1828, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:16.963599', 'step': 1828, 'epoch': 3} {'type': 'loss', 'content': 0.00015850462659727782, 'timestamp': '2025-09-30 22:08:16.966796', 'step': 1829, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:17.005488', 'step': 1829, 'epoch': 3} {'type': 'loss', 'content': 4.086808621650562e-05, 'timestamp': '2025-09-30 22:08:17.009996', 'step': 1830, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:17.043938', 'step': 1830, 'epoch': 3} {'type': 'loss', 'content': 0.011201680637896061, 'timestamp': '2025-09-30 22:08:17.046266', 'step': 1831, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:17.082089', 'step': 1831, 'epoch': 3} {'type': 'loss', 'content': 8.120276470435783e-05, 'timestamp': '2025-09-30 22:08:17.109518', 'step': 1832, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:17.142928', 'step': 1832, 'epoch': 3} {'type': 'loss', 'content': 0.031549014151096344, 'timestamp': '2025-09-30 22:08:17.146834', 'step': 1833, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:08:17.894878', 'step': 1833, 'epoch': 3} {'type': 'pplx', 'content': 129112742.85206084, 'timestamp': '2025-09-30 22:08:17.897575', 'step': 1833, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:17.927259', 'step': 1833, 'epoch': 3} {'type': 'loss', 'content': 0.001220227568410337, 'timestamp': '2025-09-30 22:08:17.929394', 'step': 1834, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:17.971377', 'step': 1834, 'epoch': 3} {'type': 'loss', 'content': 0.00038733595283702016, 'timestamp': '2025-09-30 22:08:17.975599', 'step': 1835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:18.010328', 'step': 1835, 'epoch': 3} {'type': 'loss', 'content': 7.79922484070994e-05, 'timestamp': '2025-09-30 22:08:18.034170', 'step': 1836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:18.072204', 'step': 1836, 'epoch': 3} {'type': 'loss', 'content': 0.0011681662872433662, 'timestamp': '2025-09-30 22:08:18.074439', 'step': 1837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:18.121636', 'step': 1837, 'epoch': 3} {'type': 'loss', 'content': 2.5843259209068492e-05, 'timestamp': '2025-09-30 22:08:18.124455', 'step': 1838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:18.156820', 'step': 1838, 'epoch': 3} {'type': 'loss', 'content': 7.065497629810125e-05, 'timestamp': '2025-09-30 22:08:18.159914', 'step': 1839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:18.198259', 'step': 1839, 'epoch': 3} {'type': 'loss', 'content': 3.946414290112443e-05, 'timestamp': '2025-09-30 22:08:18.222454', 'step': 1840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:18.262948', 'step': 1840, 'epoch': 3} {'type': 'loss', 'content': 0.00021928052592556924, 'timestamp': '2025-09-30 22:08:18.265079', 'step': 1841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:18.297876', 'step': 1841, 'epoch': 3} {'type': 'loss', 'content': 0.00023360762861557305, 'timestamp': '2025-09-30 22:08:18.299793', 'step': 1842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:18.332105', 'step': 1842, 'epoch': 3} {'type': 'loss', 'content': 8.189439540728927e-05, 'timestamp': '2025-09-30 22:08:18.334603', 'step': 1843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:18.373101', 'step': 1843, 'epoch': 3} {'type': 'loss', 'content': 0.00012044947652611881, 'timestamp': '2025-09-30 22:08:18.396643', 'step': 1844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:18.437210', 'step': 1844, 'epoch': 3} {'type': 'loss', 'content': 0.00012855039676651359, 'timestamp': '2025-09-30 22:08:18.439438', 'step': 1845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:08:18.476909', 'step': 1845, 'epoch': 3} {'type': 'loss', 'content': 6.362907879520208e-05, 'timestamp': '2025-09-30 22:08:18.485087', 'step': 1846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:18.517382', 'step': 1846, 'epoch': 3} {'type': 'loss', 'content': 0.00016817821597214788, 'timestamp': '2025-09-30 22:08:18.519888', 'step': 1847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:18.558738', 'step': 1847, 'epoch': 3} {'type': 'loss', 'content': 0.0010157003998756409, 'timestamp': '2025-09-30 22:08:18.586910', 'step': 1848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:18.620972', 'step': 1848, 'epoch': 3} {'type': 'loss', 'content': 6.409407797036693e-05, 'timestamp': '2025-09-30 22:08:18.624429', 'step': 1849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:18.658250', 'step': 1849, 'epoch': 3} {'type': 'loss', 'content': 0.00013632301124744117, 'timestamp': '2025-09-30 22:08:18.662956', 'step': 1850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:18.695886', 'step': 1850, 'epoch': 3} {'type': 'loss', 'content': 0.01023082248866558, 'timestamp': '2025-09-30 22:08:18.699569', 'step': 1851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:18.733099', 'step': 1851, 'epoch': 3} {'type': 'loss', 'content': 0.008190372958779335, 'timestamp': '2025-09-30 22:08:18.757608', 'step': 1852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-30 22:08:18.790274', 'step': 1852, 'epoch': 3} {'type': 'loss', 'content': 0.00043314872891642153, 'timestamp': '2025-09-30 22:08:18.796711', 'step': 1853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:18.828987', 'step': 1853, 'epoch': 3} {'type': 'loss', 'content': 5.934419823461212e-05, 'timestamp': '2025-09-30 22:08:18.832296', 'step': 1854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:18.871379', 'step': 1854, 'epoch': 3} {'type': 'loss', 'content': 0.0002732239954639226, 'timestamp': '2025-09-30 22:08:18.875970', 'step': 1855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-30 22:08:18.910790', 'step': 1855, 'epoch': 3} {'type': 'loss', 'content': 4.7785659262444824e-05, 'timestamp': '2025-09-30 22:08:18.935233', 'step': 1856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:18.973620', 'step': 1856, 'epoch': 3} {'type': 'loss', 'content': 0.0002578292042016983, 'timestamp': '2025-09-30 22:08:18.975975', 'step': 1857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:19.016347', 'step': 1857, 'epoch': 3} {'type': 'loss', 'content': 0.00011785969400079921, 'timestamp': '2025-09-30 22:08:19.023376', 'step': 1858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:19.056914', 'step': 1858, 'epoch': 3} {'type': 'loss', 'content': 2.9439393983921036e-05, 'timestamp': '2025-09-30 22:08:19.063975', 'step': 1859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-30 22:08:19.096222', 'step': 1859, 'epoch': 3} {'type': 'loss', 'content': 0.0003162787761539221, 'timestamp': '2025-09-30 22:08:19.119985', 'step': 1860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:19.154241', 'step': 1860, 'epoch': 3} {'type': 'loss', 'content': 0.00015654593880753964, 'timestamp': '2025-09-30 22:08:19.159920', 'step': 1861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:08:19.192963', 'step': 1861, 'epoch': 3} {'type': 'loss', 'content': 0.0001386739022564143, 'timestamp': '2025-09-30 22:08:19.197589', 'step': 1862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:08:19.231116', 'step': 1862, 'epoch': 3} {'type': 'loss', 'content': 0.00011772316793212667, 'timestamp': '2025-09-30 22:08:19.243666', 'step': 1863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-30 22:08:19.290090', 'step': 1863, 'epoch': 3} {'type': 'loss', 'content': 0.04032059386372566, 'timestamp': '2025-09-30 22:08:19.316983', 'step': 1864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:08:19.358614', 'step': 1864, 'epoch': 3} {'type': 'loss', 'content': 0.07269955426454544, 'timestamp': '2025-09-30 22:08:19.364288', 'step': 1865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:19.406194', 'step': 1865, 'epoch': 3} {'type': 'loss', 'content': 0.00010652485798345879, 'timestamp': '2025-09-30 22:08:19.411477', 'step': 1866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:08:19.451042', 'step': 1866, 'epoch': 3} {'type': 'loss', 'content': 6.099715028540231e-05, 'timestamp': '2025-09-30 22:08:19.457453', 'step': 1867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:08:19.491157', 'step': 1867, 'epoch': 3} {'type': 'loss', 'content': 6.526331708300859e-05, 'timestamp': '2025-09-30 22:08:19.518987', 'step': 1868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [2, 192], 'flops': 2847885110400}, 'timestamp': '2025-09-30 22:08:19.558052', 'step': 1868, 'epoch': 3} {'type': 'loss', 'content': 4.0736955270403996e-05, 'timestamp': '2025-09-30 22:08:19.561941', 'step': 1869, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-30 22:08:20.317601', 'step': 1869, 'epoch': 3} {'type': 'pplx', 'content': 114416360.51964663, 'timestamp': '2025-09-30 22:08:20.323449', 'step': 1869, 'epoch': 3} {'type': 'best_pplx', 'content': 50567875.175425716, 'timestamp': '2025-09-30 22:08:20.327822', 'step': 1869, 'epoch': 3} {'type': 'best_step', 'content': 39, 'timestamp': '2025-09-30 22:08:20.331888', 'step': 1869, 'epoch': 3} {'type': 'total_pplx_flops', 'content': 5333250945655808, 'timestamp': '2025-09-30 22:08:20.338020', 'step': 1869, 'epoch': 3} {'type': 'total_train_flops', 'content': 9219668431260864, 'timestamp': '2025-09-30 22:08:20.343552', 'step': 1869, 'epoch': 3}