{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:12:50.849475', 'step': 0, 'epoch': 0} {'type': 'pplx', 'content': 9.2736354286407, 'timestamp': '2025-09-30 22:12:50.855801', 'step': 0, 'epoch': 0} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:12:50.926047', 'step': 0, 'epoch': 1} {'type': 'loss', 'content': 0.29884400963783264, 'timestamp': '2025-09-30 22:12:50.929185', 'step': 1, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:12:50.996988', 'step': 1, 'epoch': 1} {'type': 'loss', 'content': 0.14724677801132202, 'timestamp': '2025-09-30 22:12:51.005634', 'step': 2, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:12:51.040009', 'step': 2, 'epoch': 1} {'type': 'loss', 'content': 0.20179437100887299, 'timestamp': '2025-09-30 22:12:51.047554', 'step': 3, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:51.100118', 'step': 3, 'epoch': 1} {'type': 'loss', 'content': 0.21899153292179108, 'timestamp': '2025-09-30 22:12:51.149852', 'step': 4, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:12:51.200265', 'step': 4, 'epoch': 1} {'type': 'loss', 'content': 0.07468795776367188, 'timestamp': '2025-09-30 22:12:51.204880', 'step': 5, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:12:51.237744', 'step': 5, 'epoch': 1} {'type': 'loss', 'content': 0.0699843019247055, 'timestamp': '2025-09-30 22:12:51.245151', 'step': 6, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:12:51.286084', 'step': 6, 'epoch': 1} {'type': 'loss', 'content': 0.05495142564177513, 'timestamp': '2025-09-30 22:12:51.299347', 'step': 7, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:12:51.345859', 'step': 7, 'epoch': 1} {'type': 'loss', 'content': 0.05214886739850044, 'timestamp': '2025-09-30 22:12:51.375264', 'step': 8, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:12:51.430739', 'step': 8, 'epoch': 1} {'type': 'loss', 'content': 0.02578839845955372, 'timestamp': '2025-09-30 22:12:51.443160', 'step': 9, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:51.477039', 'step': 9, 'epoch': 1} {'type': 'loss', 'content': 0.020216139033436775, 'timestamp': '2025-09-30 22:12:51.488195', 'step': 10, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:12:51.524025', 'step': 10, 'epoch': 1} {'type': 'loss', 'content': 0.0157146155834198, 'timestamp': '2025-09-30 22:12:51.537275', 'step': 11, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:12:51.574585', 'step': 11, 'epoch': 1} {'type': 'loss', 'content': 0.029712101444602013, 'timestamp': '2025-09-30 22:12:51.608772', 'step': 12, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:12:51.642438', 'step': 12, 'epoch': 1} {'type': 'loss', 'content': 0.022363366559147835, 'timestamp': '2025-09-30 22:12:51.647653', 'step': 13, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:51.697402', 'step': 13, 'epoch': 1} {'type': 'loss', 'content': 0.026609627529978752, 'timestamp': '2025-09-30 22:12:51.701911', 'step': 14, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:51.740576', 'step': 14, 'epoch': 1} {'type': 'loss', 'content': 0.018716558814048767, 'timestamp': '2025-09-30 22:12:51.751439', 'step': 15, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:12:51.788868', 'step': 15, 'epoch': 1} {'type': 'loss', 'content': 0.014440740458667278, 'timestamp': '2025-09-30 22:12:51.823034', 'step': 16, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:12:51.873571', 'step': 16, 'epoch': 1} {'type': 'loss', 'content': 0.042656391859054565, 'timestamp': '2025-09-30 22:12:51.878131', 'step': 17, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:12:51.912452', 'step': 17, 'epoch': 1} {'type': 'loss', 'content': 0.02184998244047165, 'timestamp': '2025-09-30 22:12:51.924864', 'step': 18, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:12:51.975147', 'step': 18, 'epoch': 1} {'type': 'loss', 'content': 0.03424403443932533, 'timestamp': '2025-09-30 22:12:51.980685', 'step': 19, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:52.015920', 'step': 19, 'epoch': 1} {'type': 'loss', 'content': 0.03647345304489136, 'timestamp': '2025-09-30 22:12:52.044551', 'step': 20, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:52.085277', 'step': 20, 'epoch': 1} {'type': 'loss', 'content': 0.027352726086974144, 'timestamp': '2025-09-30 22:12:52.090570', 'step': 21, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:52.126546', 'step': 21, 'epoch': 1} {'type': 'loss', 'content': 0.035634566098451614, 'timestamp': '2025-09-30 22:12:52.134526', 'step': 22, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:12:52.179890', 'step': 22, 'epoch': 1} {'type': 'loss', 'content': 0.034287016838788986, 'timestamp': '2025-09-30 22:12:52.191422', 'step': 23, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:12:52.225650', 'step': 23, 'epoch': 1} {'type': 'loss', 'content': 0.036502204835414886, 'timestamp': '2025-09-30 22:12:52.256684', 'step': 24, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:12:52.304118', 'step': 24, 'epoch': 1} {'type': 'loss', 'content': 0.014664776623249054, 'timestamp': '2025-09-30 22:12:52.320999', 'step': 25, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:12:52.359135', 'step': 25, 'epoch': 1} {'type': 'loss', 'content': 0.03228773921728134, 'timestamp': '2025-09-30 22:12:52.366322', 'step': 26, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:12:52.408202', 'step': 26, 'epoch': 1} {'type': 'loss', 'content': 0.015143612399697304, 'timestamp': '2025-09-30 22:12:52.421938', 'step': 27, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:12:52.457339', 'step': 27, 'epoch': 1} {'type': 'loss', 'content': 0.025396913290023804, 'timestamp': '2025-09-30 22:12:52.490548', 'step': 28, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:12:52.525372', 'step': 28, 'epoch': 1} {'type': 'loss', 'content': 0.016927070915699005, 'timestamp': '2025-09-30 22:12:52.536003', 'step': 29, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:12:52.580823', 'step': 29, 'epoch': 1} {'type': 'loss', 'content': 0.01682448200881481, 'timestamp': '2025-09-30 22:12:52.594176', 'step': 30, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:52.626284', 'step': 30, 'epoch': 1} {'type': 'loss', 'content': 0.041514765471220016, 'timestamp': '2025-09-30 22:12:52.637451', 'step': 31, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:52.673962', 'step': 31, 'epoch': 1} {'type': 'loss', 'content': 0.009538545273244381, 'timestamp': '2025-09-30 22:12:52.702975', 'step': 32, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:12:52.746096', 'step': 32, 'epoch': 1} {'type': 'loss', 'content': 0.0234337467700243, 'timestamp': '2025-09-30 22:12:52.758998', 'step': 33, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:52.793611', 'step': 33, 'epoch': 1} {'type': 'loss', 'content': 0.031635984778404236, 'timestamp': '2025-09-30 22:12:52.804746', 'step': 34, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:12:52.842929', 'step': 34, 'epoch': 1} {'type': 'loss', 'content': 0.022598372772336006, 'timestamp': '2025-09-30 22:12:52.856537', 'step': 35, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:12:52.895367', 'step': 35, 'epoch': 1} {'type': 'loss', 'content': 0.019552720710635185, 'timestamp': '2025-09-30 22:12:52.929911', 'step': 36, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:52.965015', 'step': 36, 'epoch': 1} {'type': 'loss', 'content': 0.025182483717799187, 'timestamp': '2025-09-30 22:12:52.970612', 'step': 37, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:53.003075', 'step': 37, 'epoch': 1} {'type': 'loss', 'content': 0.017326978966593742, 'timestamp': '2025-09-30 22:12:53.014247', 'step': 38, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:12:53.046891', 'step': 38, 'epoch': 1} {'type': 'loss', 'content': 0.022275006398558617, 'timestamp': '2025-09-30 22:12:53.053931', 'step': 39, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:12:53.085671', 'step': 39, 'epoch': 1} {'type': 'loss', 'content': 0.032375190407037735, 'timestamp': '2025-09-30 22:12:53.113469', 'step': 40, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:53.146897', 'step': 40, 'epoch': 1} {'type': 'loss', 'content': 0.02389678917825222, 'timestamp': '2025-09-30 22:12:53.152448', 'step': 41, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:12:53.184436', 'step': 41, 'epoch': 1} {'type': 'loss', 'content': 0.025564173236489296, 'timestamp': '2025-09-30 22:12:53.194630', 'step': 42, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:12:53.242138', 'step': 42, 'epoch': 1} {'type': 'loss', 'content': 0.010295093059539795, 'timestamp': '2025-09-30 22:12:53.259901', 'step': 43, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:12:53.299768', 'step': 43, 'epoch': 1} {'type': 'loss', 'content': 0.011783291585743427, 'timestamp': '2025-09-30 22:12:53.334630', 'step': 44, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:12:53.385571', 'step': 44, 'epoch': 1} {'type': 'loss', 'content': 0.012898216024041176, 'timestamp': '2025-09-30 22:12:53.396166', 'step': 45, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:12:53.433002', 'step': 45, 'epoch': 1} {'type': 'loss', 'content': 0.023097814992070198, 'timestamp': '2025-09-30 22:12:53.445410', 'step': 46, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:53.479343', 'step': 46, 'epoch': 1} {'type': 'loss', 'content': 0.014813568443059921, 'timestamp': '2025-09-30 22:12:53.487286', 'step': 47, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:53.528526', 'step': 47, 'epoch': 1} {'type': 'loss', 'content': 0.018811291083693504, 'timestamp': '2025-09-30 22:12:53.560273', 'step': 48, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:53.595124', 'step': 48, 'epoch': 1} {'type': 'loss', 'content': 0.02083074115216732, 'timestamp': '2025-09-30 22:12:53.600481', 'step': 49, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:53.636287', 'step': 49, 'epoch': 1} {'type': 'loss', 'content': 0.02495446801185608, 'timestamp': '2025-09-30 22:12:53.647467', 'step': 50, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:12:53.681179', 'step': 50, 'epoch': 1} {'type': 'loss', 'content': 0.04506039246916771, 'timestamp': '2025-09-30 22:12:53.688585', 'step': 51, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:12:53.726307', 'step': 51, 'epoch': 1} {'type': 'loss', 'content': 0.018953822553157806, 'timestamp': '2025-09-30 22:12:53.760527', 'step': 52, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:12:53.795049', 'step': 52, 'epoch': 1} {'type': 'loss', 'content': 0.011422491632401943, 'timestamp': '2025-09-30 22:12:53.808178', 'step': 53, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:53.842054', 'step': 53, 'epoch': 1} {'type': 'loss', 'content': 0.01761559024453163, 'timestamp': '2025-09-30 22:12:53.853011', 'step': 54, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:53.898729', 'step': 54, 'epoch': 1} {'type': 'loss', 'content': 0.026785332709550858, 'timestamp': '2025-09-30 22:12:53.909680', 'step': 55, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:12:53.944445', 'step': 55, 'epoch': 1} {'type': 'loss', 'content': 0.04988979548215866, 'timestamp': '2025-09-30 22:12:53.977479', 'step': 56, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:12:54.011718', 'step': 56, 'epoch': 1} {'type': 'loss', 'content': 0.01828792318701744, 'timestamp': '2025-09-30 22:12:54.022110', 'step': 57, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:54.053405', 'step': 57, 'epoch': 1} {'type': 'loss', 'content': 0.015893779695034027, 'timestamp': '2025-09-30 22:12:54.064436', 'step': 58, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:12:54.105945', 'step': 58, 'epoch': 1} {'type': 'loss', 'content': 0.03437896445393562, 'timestamp': '2025-09-30 22:12:54.116986', 'step': 59, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:12:54.152445', 'step': 59, 'epoch': 1} {'type': 'loss', 'content': 0.02998773194849491, 'timestamp': '2025-09-30 22:12:54.180234', 'step': 60, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:12:54.226681', 'step': 60, 'epoch': 1} {'type': 'loss', 'content': 0.01960010826587677, 'timestamp': '2025-09-30 22:12:54.232880', 'step': 61, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:12:54.267808', 'step': 61, 'epoch': 1} {'type': 'loss', 'content': 0.018762405961751938, 'timestamp': '2025-09-30 22:12:54.275082', 'step': 62, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:12:54.311713', 'step': 62, 'epoch': 1} {'type': 'loss', 'content': 0.011765447445213795, 'timestamp': '2025-09-30 22:12:54.323976', 'step': 63, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:12:54.362167', 'step': 63, 'epoch': 1} {'type': 'loss', 'content': 0.010820180177688599, 'timestamp': '2025-09-30 22:12:54.396948', 'step': 64, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:12:54.439330', 'step': 64, 'epoch': 1} {'type': 'loss', 'content': 0.012286880984902382, 'timestamp': '2025-09-30 22:12:54.451483', 'step': 65, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:12:54.484117', 'step': 65, 'epoch': 1} {'type': 'loss', 'content': 0.022625945508480072, 'timestamp': '2025-09-30 22:12:54.491206', 'step': 66, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:12:54.531290', 'step': 66, 'epoch': 1} {'type': 'loss', 'content': 0.022299209609627724, 'timestamp': '2025-09-30 22:12:54.543461', 'step': 67, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:12:54.580701', 'step': 67, 'epoch': 1} {'type': 'loss', 'content': 0.011945872567594051, 'timestamp': '2025-09-30 22:12:54.613935', 'step': 68, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:54.648454', 'step': 68, 'epoch': 1} {'type': 'loss', 'content': 0.011568807996809483, 'timestamp': '2025-09-30 22:12:54.657077', 'step': 69, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:12:54.694076', 'step': 69, 'epoch': 1} {'type': 'loss', 'content': 0.012823755852878094, 'timestamp': '2025-09-30 22:12:54.707849', 'step': 70, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:54.743116', 'step': 70, 'epoch': 1} {'type': 'loss', 'content': 0.016860121861100197, 'timestamp': '2025-09-30 22:12:54.751007', 'step': 71, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:12:54.784561', 'step': 71, 'epoch': 1} {'type': 'loss', 'content': 0.018639255315065384, 'timestamp': '2025-09-30 22:12:54.817826', 'step': 72, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:12:54.851639', 'step': 72, 'epoch': 1} {'type': 'loss', 'content': 0.014421514235436916, 'timestamp': '2025-09-30 22:12:54.859619', 'step': 73, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:12:54.893271', 'step': 73, 'epoch': 1} {'type': 'loss', 'content': 0.012593203224241734, 'timestamp': '2025-09-30 22:12:54.905702', 'step': 74, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:54.939109', 'step': 74, 'epoch': 1} {'type': 'loss', 'content': 0.013217393308877945, 'timestamp': '2025-09-30 22:12:54.950062', 'step': 75, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:12:54.983172', 'step': 75, 'epoch': 1} {'type': 'loss', 'content': 0.019085409119725227, 'timestamp': '2025-09-30 22:12:55.011569', 'step': 76, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:12:55.048985', 'step': 76, 'epoch': 1} {'type': 'loss', 'content': 0.01689778082072735, 'timestamp': '2025-09-30 22:12:55.058655', 'step': 77, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:12:55.093483', 'step': 77, 'epoch': 1} {'type': 'loss', 'content': 0.018813522532582283, 'timestamp': '2025-09-30 22:12:55.105858', 'step': 78, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:55.146469', 'step': 78, 'epoch': 1} {'type': 'loss', 'content': 0.013966727070510387, 'timestamp': '2025-09-30 22:12:55.157176', 'step': 79, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:12:55.193670', 'step': 79, 'epoch': 1} {'type': 'loss', 'content': 0.019746946170926094, 'timestamp': '2025-09-30 22:12:55.221316', 'step': 80, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:12:55.256918', 'step': 80, 'epoch': 1} {'type': 'loss', 'content': 0.023544225841760635, 'timestamp': '2025-09-30 22:12:55.264560', 'step': 81, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:55.297812', 'step': 81, 'epoch': 1} {'type': 'loss', 'content': 0.016823606565594673, 'timestamp': '2025-09-30 22:12:55.305425', 'step': 82, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:12:55.348005', 'step': 82, 'epoch': 1} {'type': 'loss', 'content': 0.01522775087505579, 'timestamp': '2025-09-30 22:12:55.361382', 'step': 83, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:12:55.411320', 'step': 83, 'epoch': 1} {'type': 'loss', 'content': 0.008765741251409054, 'timestamp': '2025-09-30 22:12:55.445452', 'step': 84, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:12:55.488284', 'step': 84, 'epoch': 1} {'type': 'loss', 'content': 0.013270394876599312, 'timestamp': '2025-09-30 22:12:55.500902', 'step': 85, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:12:55.544283', 'step': 85, 'epoch': 1} {'type': 'loss', 'content': 0.017209267243742943, 'timestamp': '2025-09-30 22:12:55.558140', 'step': 86, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:12:55.601486', 'step': 86, 'epoch': 1} {'type': 'loss', 'content': 0.01655328832566738, 'timestamp': '2025-09-30 22:12:55.615091', 'step': 87, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:12:55.677369', 'step': 87, 'epoch': 1} {'type': 'loss', 'content': 0.014354526065289974, 'timestamp': '2025-09-30 22:12:55.711652', 'step': 88, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:12:55.789119', 'step': 88, 'epoch': 1} {'type': 'loss', 'content': 0.010505059733986855, 'timestamp': '2025-09-30 22:12:55.802304', 'step': 89, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:12:55.866843', 'step': 89, 'epoch': 1} {'type': 'loss', 'content': 0.03894902393221855, 'timestamp': '2025-09-30 22:12:55.873689', 'step': 90, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:12:55.934872', 'step': 90, 'epoch': 1} {'type': 'loss', 'content': 0.0164689589291811, 'timestamp': '2025-09-30 22:12:55.944977', 'step': 91, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:56.001555', 'step': 91, 'epoch': 1} {'type': 'loss', 'content': 0.014508247375488281, 'timestamp': '2025-09-30 22:12:56.030211', 'step': 92, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:12:56.077037', 'step': 92, 'epoch': 1} {'type': 'loss', 'content': 0.018277524039149284, 'timestamp': '2025-09-30 22:12:56.084836', 'step': 93, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:12:56.132303', 'step': 93, 'epoch': 1} {'type': 'loss', 'content': 0.018098050728440285, 'timestamp': '2025-09-30 22:12:56.143357', 'step': 94, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:12:56.197899', 'step': 94, 'epoch': 1} {'type': 'loss', 'content': 0.022050291299819946, 'timestamp': '2025-09-30 22:12:56.211512', 'step': 95, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:12:56.264204', 'step': 95, 'epoch': 1} {'type': 'loss', 'content': 0.017889603972434998, 'timestamp': '2025-09-30 22:12:56.297166', 'step': 96, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:12:56.352176', 'step': 96, 'epoch': 1} {'type': 'loss', 'content': 0.019431166350841522, 'timestamp': '2025-09-30 22:12:56.359950', 'step': 97, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:12:56.416477', 'step': 97, 'epoch': 1} {'type': 'loss', 'content': 0.031271129846572876, 'timestamp': '2025-09-30 22:12:56.429068', 'step': 98, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:12:56.479355', 'step': 98, 'epoch': 1} {'type': 'loss', 'content': 0.019475264474749565, 'timestamp': '2025-09-30 22:12:56.493070', 'step': 99, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:12:56.556543', 'step': 99, 'epoch': 1} {'type': 'loss', 'content': 0.013815326616168022, 'timestamp': '2025-09-30 22:12:56.590701', 'step': 100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:12:56.644790', 'step': 100, 'epoch': 1} {'type': 'loss', 'content': 0.010819070972502232, 'timestamp': '2025-09-30 22:12:56.657871', 'step': 101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:12:56.716229', 'step': 101, 'epoch': 1} {'type': 'loss', 'content': 0.021440595388412476, 'timestamp': '2025-09-30 22:12:56.728604', 'step': 102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:12:56.785093', 'step': 102, 'epoch': 1} {'type': 'loss', 'content': 0.012968943454325199, 'timestamp': '2025-09-30 22:12:56.797463', 'step': 103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:12:56.849593', 'step': 103, 'epoch': 1} {'type': 'loss', 'content': 0.02394869178533554, 'timestamp': '2025-09-30 22:12:56.879534', 'step': 104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:12:56.941585', 'step': 104, 'epoch': 1} {'type': 'loss', 'content': 0.023614834994077682, 'timestamp': '2025-09-30 22:12:56.949565', 'step': 105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:12:57.012662', 'step': 105, 'epoch': 1} {'type': 'loss', 'content': 0.018549881875514984, 'timestamp': '2025-09-30 22:12:57.026546', 'step': 106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:12:57.077639', 'step': 106, 'epoch': 1} {'type': 'loss', 'content': 0.014040866866707802, 'timestamp': '2025-09-30 22:12:57.090117', 'step': 107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:12:57.134079', 'step': 107, 'epoch': 1} {'type': 'loss', 'content': 0.01790779083967209, 'timestamp': '2025-09-30 22:12:57.168520', 'step': 108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:12:57.206739', 'step': 108, 'epoch': 1} {'type': 'loss', 'content': 0.013338050805032253, 'timestamp': '2025-09-30 22:12:57.211953', 'step': 109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:12:57.251146', 'step': 109, 'epoch': 1} {'type': 'loss', 'content': 0.02902776561677456, 'timestamp': '2025-09-30 22:12:57.263651', 'step': 110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:12:57.307299', 'step': 110, 'epoch': 1} {'type': 'loss', 'content': 0.013689517974853516, 'timestamp': '2025-09-30 22:12:57.323403', 'step': 111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:12:57.359819', 'step': 111, 'epoch': 1} {'type': 'loss', 'content': 0.01824032887816429, 'timestamp': '2025-09-30 22:12:57.392955', 'step': 112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:12:57.443807', 'step': 112, 'epoch': 1} {'type': 'loss', 'content': 0.01077995914965868, 'timestamp': '2025-09-30 22:12:57.457091', 'step': 113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:12:57.490932', 'step': 113, 'epoch': 1} {'type': 'loss', 'content': 0.01976594887673855, 'timestamp': '2025-09-30 22:12:57.499281', 'step': 114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:12:57.536443', 'step': 114, 'epoch': 1} {'type': 'loss', 'content': 0.03269365429878235, 'timestamp': '2025-09-30 22:12:57.545603', 'step': 115, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:13:00.036032', 'step': 115, 'epoch': 1} {'type': 'pplx', 'content': 5.660676883245411, 'timestamp': '2025-09-30 22:13:00.039028', 'step': 115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:00.077992', 'step': 115, 'epoch': 1} {'type': 'loss', 'content': 0.015604332089424133, 'timestamp': '2025-09-30 22:13:00.114670', 'step': 116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:00.148349', 'step': 116, 'epoch': 1} {'type': 'loss', 'content': 0.01104702427983284, 'timestamp': '2025-09-30 22:13:00.155849', 'step': 117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:00.188289', 'step': 117, 'epoch': 1} {'type': 'loss', 'content': 0.022734448313713074, 'timestamp': '2025-09-30 22:13:00.200449', 'step': 118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:00.237036', 'step': 118, 'epoch': 1} {'type': 'loss', 'content': 0.02765716426074505, 'timestamp': '2025-09-30 22:13:00.247954', 'step': 119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:00.287836', 'step': 119, 'epoch': 1} {'type': 'loss', 'content': 0.0134533466771245, 'timestamp': '2025-09-30 22:13:00.322507', 'step': 120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:00.363168', 'step': 120, 'epoch': 1} {'type': 'loss', 'content': 0.03625181317329407, 'timestamp': '2025-09-30 22:13:00.376245', 'step': 121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:00.411393', 'step': 121, 'epoch': 1} {'type': 'loss', 'content': 0.012412041425704956, 'timestamp': '2025-09-30 22:13:00.423924', 'step': 122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:00.461573', 'step': 122, 'epoch': 1} {'type': 'loss', 'content': 0.021080005913972855, 'timestamp': '2025-09-30 22:13:00.475408', 'step': 123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:00.517942', 'step': 123, 'epoch': 1} {'type': 'loss', 'content': 0.011560908518731594, 'timestamp': '2025-09-30 22:13:00.554678', 'step': 124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:00.597417', 'step': 124, 'epoch': 1} {'type': 'loss', 'content': 0.007222126703709364, 'timestamp': '2025-09-30 22:13:00.612857', 'step': 125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:13:00.667781', 'step': 125, 'epoch': 1} {'type': 'loss', 'content': 0.008931437507271767, 'timestamp': '2025-09-30 22:13:00.680456', 'step': 126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:00.716445', 'step': 126, 'epoch': 1} {'type': 'loss', 'content': 0.014612638391554356, 'timestamp': '2025-09-30 22:13:00.727428', 'step': 127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 16611393146432}, 'timestamp': '2025-09-30 22:13:00.789762', 'step': 127, 'epoch': 1} {'type': 'loss', 'content': 0.01625419408082962, 'timestamp': '2025-09-30 22:13:00.829962', 'step': 128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:13:00.871015', 'step': 128, 'epoch': 1} {'type': 'loss', 'content': 0.009241427294909954, 'timestamp': '2025-09-30 22:13:00.887706', 'step': 129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:00.925307', 'step': 129, 'epoch': 1} {'type': 'loss', 'content': 0.013992900028824806, 'timestamp': '2025-09-30 22:13:00.939249', 'step': 130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:00.979348', 'step': 130, 'epoch': 1} {'type': 'loss', 'content': 0.014244504272937775, 'timestamp': '2025-09-30 22:13:00.991479', 'step': 131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:01.026529', 'step': 131, 'epoch': 1} {'type': 'loss', 'content': 0.018886029720306396, 'timestamp': '2025-09-30 22:13:01.060707', 'step': 132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:13:01.107858', 'step': 132, 'epoch': 1} {'type': 'loss', 'content': 0.01401935238391161, 'timestamp': '2025-09-30 22:13:01.124565', 'step': 133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-09-30 22:13:01.174465', 'step': 133, 'epoch': 1} {'type': 'loss', 'content': 0.00951952300965786, 'timestamp': '2025-09-30 22:13:01.191956', 'step': 134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:01.235264', 'step': 134, 'epoch': 1} {'type': 'loss', 'content': 0.020773863419890404, 'timestamp': '2025-09-30 22:13:01.248979', 'step': 135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:01.295943', 'step': 135, 'epoch': 1} {'type': 'loss', 'content': 0.023069093003869057, 'timestamp': '2025-09-30 22:13:01.324159', 'step': 136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:01.359793', 'step': 136, 'epoch': 1} {'type': 'loss', 'content': 0.0129553796723485, 'timestamp': '2025-09-30 22:13:01.367781', 'step': 137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:01.408205', 'step': 137, 'epoch': 1} {'type': 'loss', 'content': 0.03496166691184044, 'timestamp': '2025-09-30 22:13:01.420778', 'step': 138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:01.454505', 'step': 138, 'epoch': 1} {'type': 'loss', 'content': 0.018751680850982666, 'timestamp': '2025-09-30 22:13:01.465528', 'step': 139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:01.510319', 'step': 139, 'epoch': 1} {'type': 'loss', 'content': 0.022187508642673492, 'timestamp': '2025-09-30 22:13:01.539067', 'step': 140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:01.575918', 'step': 140, 'epoch': 1} {'type': 'loss', 'content': 0.015233765356242657, 'timestamp': '2025-09-30 22:13:01.581613', 'step': 141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:01.621741', 'step': 141, 'epoch': 1} {'type': 'loss', 'content': 0.021425914019346237, 'timestamp': '2025-09-30 22:13:01.635086', 'step': 142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:13:01.679480', 'step': 142, 'epoch': 1} {'type': 'loss', 'content': 0.016796356067061424, 'timestamp': '2025-09-30 22:13:01.695600', 'step': 143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:01.740055', 'step': 143, 'epoch': 1} {'type': 'loss', 'content': 0.017127713188529015, 'timestamp': '2025-09-30 22:13:01.768395', 'step': 144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:01.801480', 'step': 144, 'epoch': 1} {'type': 'loss', 'content': 0.016997547820210457, 'timestamp': '2025-09-30 22:13:01.809420', 'step': 145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:01.847471', 'step': 145, 'epoch': 1} {'type': 'loss', 'content': 0.017635153606534004, 'timestamp': '2025-09-30 22:13:01.855288', 'step': 146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:01.890937', 'step': 146, 'epoch': 1} {'type': 'loss', 'content': 0.015158042311668396, 'timestamp': '2025-09-30 22:13:01.898766', 'step': 147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:01.933770', 'step': 147, 'epoch': 1} {'type': 'loss', 'content': 0.044107209891080856, 'timestamp': '2025-09-30 22:13:01.964766', 'step': 148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:01.999535', 'step': 148, 'epoch': 1} {'type': 'loss', 'content': 0.015536610037088394, 'timestamp': '2025-09-30 22:13:02.010143', 'step': 149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:02.052349', 'step': 149, 'epoch': 1} {'type': 'loss', 'content': 0.04728955402970314, 'timestamp': '2025-09-30 22:13:02.063475', 'step': 150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:02.101301', 'step': 150, 'epoch': 1} {'type': 'loss', 'content': 0.026954729110002518, 'timestamp': '2025-09-30 22:13:02.112414', 'step': 151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:02.153727', 'step': 151, 'epoch': 1} {'type': 'loss', 'content': 0.01647934690117836, 'timestamp': '2025-09-30 22:13:02.188629', 'step': 152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:02.229291', 'step': 152, 'epoch': 1} {'type': 'loss', 'content': 0.013347185216844082, 'timestamp': '2025-09-30 22:13:02.239777', 'step': 153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:02.299419', 'step': 153, 'epoch': 1} {'type': 'loss', 'content': 0.009047807194292545, 'timestamp': '2025-09-30 22:13:02.313227', 'step': 154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:02.355995', 'step': 154, 'epoch': 1} {'type': 'loss', 'content': 0.011396903544664383, 'timestamp': '2025-09-30 22:13:02.369721', 'step': 155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:13:02.415920', 'step': 155, 'epoch': 1} {'type': 'loss', 'content': 0.015042304992675781, 'timestamp': '2025-09-30 22:13:02.452462', 'step': 156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:02.498006', 'step': 156, 'epoch': 1} {'type': 'loss', 'content': 0.022125402465462685, 'timestamp': '2025-09-30 22:13:02.507765', 'step': 157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:02.552007', 'step': 157, 'epoch': 1} {'type': 'loss', 'content': 0.008051242679357529, 'timestamp': '2025-09-30 22:13:02.565862', 'step': 158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:02.599230', 'step': 158, 'epoch': 1} {'type': 'loss', 'content': 0.03221747651696205, 'timestamp': '2025-09-30 22:13:02.611517', 'step': 159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:02.662891', 'step': 159, 'epoch': 1} {'type': 'loss', 'content': 0.025403138250112534, 'timestamp': '2025-09-30 22:13:02.697835', 'step': 160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:02.733609', 'step': 160, 'epoch': 1} {'type': 'loss', 'content': 0.018038174137473106, 'timestamp': '2025-09-30 22:13:02.739223', 'step': 161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:02.772084', 'step': 161, 'epoch': 1} {'type': 'loss', 'content': 0.015591044910252094, 'timestamp': '2025-09-30 22:13:02.784563', 'step': 162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:02.817367', 'step': 162, 'epoch': 1} {'type': 'loss', 'content': 0.020338229835033417, 'timestamp': '2025-09-30 22:13:02.828444', 'step': 163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:02.862879', 'step': 163, 'epoch': 1} {'type': 'loss', 'content': 0.012969471514225006, 'timestamp': '2025-09-30 22:13:02.896344', 'step': 164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:02.929993', 'step': 164, 'epoch': 1} {'type': 'loss', 'content': 0.012057175859808922, 'timestamp': '2025-09-30 22:13:02.940689', 'step': 165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:02.975589', 'step': 165, 'epoch': 1} {'type': 'loss', 'content': 0.01438069622963667, 'timestamp': '2025-09-30 22:13:02.985925', 'step': 166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:03.027125', 'step': 166, 'epoch': 1} {'type': 'loss', 'content': 0.009808840230107307, 'timestamp': '2025-09-30 22:13:03.039356', 'step': 167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:03.083029', 'step': 167, 'epoch': 1} {'type': 'loss', 'content': 0.012758846394717693, 'timestamp': '2025-09-30 22:13:03.111707', 'step': 168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:03.146770', 'step': 168, 'epoch': 1} {'type': 'loss', 'content': 0.017916690558195114, 'timestamp': '2025-09-30 22:13:03.152263', 'step': 169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:03.197680', 'step': 169, 'epoch': 1} {'type': 'loss', 'content': 0.014809397980570793, 'timestamp': '2025-09-30 22:13:03.205675', 'step': 170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:03.244913', 'step': 170, 'epoch': 1} {'type': 'loss', 'content': 0.013050451874732971, 'timestamp': '2025-09-30 22:13:03.258603', 'step': 171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:03.294418', 'step': 171, 'epoch': 1} {'type': 'loss', 'content': 0.013798566535115242, 'timestamp': '2025-09-30 22:13:03.323050', 'step': 172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:03.356661', 'step': 172, 'epoch': 1} {'type': 'loss', 'content': 0.014890993945300579, 'timestamp': '2025-09-30 22:13:03.366580', 'step': 173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:03.399001', 'step': 173, 'epoch': 1} {'type': 'loss', 'content': 0.014647998847067356, 'timestamp': '2025-09-30 22:13:03.409578', 'step': 174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:03.447934', 'step': 174, 'epoch': 1} {'type': 'loss', 'content': 0.017779076471924782, 'timestamp': '2025-09-30 22:13:03.457024', 'step': 175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:03.498102', 'step': 175, 'epoch': 1} {'type': 'loss', 'content': 0.01286756806075573, 'timestamp': '2025-09-30 22:13:03.531262', 'step': 176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:03.570414', 'step': 176, 'epoch': 1} {'type': 'loss', 'content': 0.010901299305260181, 'timestamp': '2025-09-30 22:13:03.583547', 'step': 177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:03.622018', 'step': 177, 'epoch': 1} {'type': 'loss', 'content': 0.009808521717786789, 'timestamp': '2025-09-30 22:13:03.629907', 'step': 178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:13:03.675999', 'step': 178, 'epoch': 1} {'type': 'loss', 'content': 0.007272894959896803, 'timestamp': '2025-09-30 22:13:03.693961', 'step': 179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:03.737094', 'step': 179, 'epoch': 1} {'type': 'loss', 'content': 0.02019336074590683, 'timestamp': '2025-09-30 22:13:03.771301', 'step': 180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:03.806630', 'step': 180, 'epoch': 1} {'type': 'loss', 'content': 0.012186083011329174, 'timestamp': '2025-09-30 22:13:03.817107', 'step': 181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:03.855226', 'step': 181, 'epoch': 1} {'type': 'loss', 'content': 0.012741667218506336, 'timestamp': '2025-09-30 22:13:03.866387', 'step': 182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:03.908037', 'step': 182, 'epoch': 1} {'type': 'loss', 'content': 0.04050694778561592, 'timestamp': '2025-09-30 22:13:03.916065', 'step': 183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:03.950799', 'step': 183, 'epoch': 1} {'type': 'loss', 'content': 0.013532925397157669, 'timestamp': '2025-09-30 22:13:03.984233', 'step': 184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:13:04.035033', 'step': 184, 'epoch': 1} {'type': 'loss', 'content': 0.022425033152103424, 'timestamp': '2025-09-30 22:13:04.043562', 'step': 185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:04.082249', 'step': 185, 'epoch': 1} {'type': 'loss', 'content': 0.011872397735714912, 'timestamp': '2025-09-30 22:13:04.092620', 'step': 186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:04.134065', 'step': 186, 'epoch': 1} {'type': 'loss', 'content': 0.02194029651582241, 'timestamp': '2025-09-30 22:13:04.147721', 'step': 187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:04.182162', 'step': 187, 'epoch': 1} {'type': 'loss', 'content': 0.03069671429693699, 'timestamp': '2025-09-30 22:13:04.210421', 'step': 188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:04.245500', 'step': 188, 'epoch': 1} {'type': 'loss', 'content': 0.010479142889380455, 'timestamp': '2025-09-30 22:13:04.258121', 'step': 189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:04.290163', 'step': 189, 'epoch': 1} {'type': 'loss', 'content': 0.01092996820807457, 'timestamp': '2025-09-30 22:13:04.302722', 'step': 190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:04.342405', 'step': 190, 'epoch': 1} {'type': 'loss', 'content': 0.01756451278924942, 'timestamp': '2025-09-30 22:13:04.358207', 'step': 191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:04.392179', 'step': 191, 'epoch': 1} {'type': 'loss', 'content': 0.009152082726359367, 'timestamp': '2025-09-30 22:13:04.425573', 'step': 192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:04.456335', 'step': 192, 'epoch': 1} {'type': 'loss', 'content': 0.01600995659828186, 'timestamp': '2025-09-30 22:13:04.460885', 'step': 193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:04.501867', 'step': 193, 'epoch': 1} {'type': 'loss', 'content': 0.015879737213253975, 'timestamp': '2025-09-30 22:13:04.512284', 'step': 194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:04.555622', 'step': 194, 'epoch': 1} {'type': 'loss', 'content': 0.011121682822704315, 'timestamp': '2025-09-30 22:13:04.571589', 'step': 195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:04.608940', 'step': 195, 'epoch': 1} {'type': 'loss', 'content': 0.01052766665816307, 'timestamp': '2025-09-30 22:13:04.643546', 'step': 196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:04.680449', 'step': 196, 'epoch': 1} {'type': 'loss', 'content': 0.021621566265821457, 'timestamp': '2025-09-30 22:13:04.693157', 'step': 197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:04.726546', 'step': 197, 'epoch': 1} {'type': 'loss', 'content': 0.016959909349679947, 'timestamp': '2025-09-30 22:13:04.737045', 'step': 198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:04.768814', 'step': 198, 'epoch': 1} {'type': 'loss', 'content': 0.02073858492076397, 'timestamp': '2025-09-30 22:13:04.778992', 'step': 199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:04.822896', 'step': 199, 'epoch': 1} {'type': 'loss', 'content': 0.013792769983410835, 'timestamp': '2025-09-30 22:13:04.854174', 'step': 200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:04.893771', 'step': 200, 'epoch': 1} {'type': 'loss', 'content': 0.016457555815577507, 'timestamp': '2025-09-30 22:13:04.899269', 'step': 201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:04.945889', 'step': 201, 'epoch': 1} {'type': 'loss', 'content': 0.01595127210021019, 'timestamp': '2025-09-30 22:13:04.959324', 'step': 202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:04.996749', 'step': 202, 'epoch': 1} {'type': 'loss', 'content': 0.005044651683419943, 'timestamp': '2025-09-30 22:13:05.010621', 'step': 203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:05.053738', 'step': 203, 'epoch': 1} {'type': 'loss', 'content': 0.010557775385677814, 'timestamp': '2025-09-30 22:13:05.088410', 'step': 204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:05.127278', 'step': 204, 'epoch': 1} {'type': 'loss', 'content': 0.01743490993976593, 'timestamp': '2025-09-30 22:13:05.139857', 'step': 205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:05.173179', 'step': 205, 'epoch': 1} {'type': 'loss', 'content': 0.021075841039419174, 'timestamp': '2025-09-30 22:13:05.185473', 'step': 206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:05.219879', 'step': 206, 'epoch': 1} {'type': 'loss', 'content': 0.02082311548292637, 'timestamp': '2025-09-30 22:13:05.233265', 'step': 207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:05.269150', 'step': 207, 'epoch': 1} {'type': 'loss', 'content': 0.017491625621914864, 'timestamp': '2025-09-30 22:13:05.303842', 'step': 208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:05.342779', 'step': 208, 'epoch': 1} {'type': 'loss', 'content': 0.015582921914756298, 'timestamp': '2025-09-30 22:13:05.350836', 'step': 209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:05.392387', 'step': 209, 'epoch': 1} {'type': 'loss', 'content': 0.011036443524062634, 'timestamp': '2025-09-30 22:13:05.406008', 'step': 210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:05.440800', 'step': 210, 'epoch': 1} {'type': 'loss', 'content': 0.017281629145145416, 'timestamp': '2025-09-30 22:13:05.448849', 'step': 211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:05.485090', 'step': 211, 'epoch': 1} {'type': 'loss', 'content': 0.015614760108292103, 'timestamp': '2025-09-30 22:13:05.519272', 'step': 212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:05.558703', 'step': 212, 'epoch': 1} {'type': 'loss', 'content': 0.01294607762247324, 'timestamp': '2025-09-30 22:13:05.568541', 'step': 213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:05.601656', 'step': 213, 'epoch': 1} {'type': 'loss', 'content': 0.016707144677639008, 'timestamp': '2025-09-30 22:13:05.612699', 'step': 214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:05.645005', 'step': 214, 'epoch': 1} {'type': 'loss', 'content': 0.018367597833275795, 'timestamp': '2025-09-30 22:13:05.657282', 'step': 215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:05.699428', 'step': 215, 'epoch': 1} {'type': 'loss', 'content': 0.013285262510180473, 'timestamp': '2025-09-30 22:13:05.734046', 'step': 216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:05.769701', 'step': 216, 'epoch': 1} {'type': 'loss', 'content': 0.007377085275948048, 'timestamp': '2025-09-30 22:13:05.782322', 'step': 217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:05.836491', 'step': 217, 'epoch': 1} {'type': 'loss', 'content': 0.010679355822503567, 'timestamp': '2025-09-30 22:13:05.849033', 'step': 218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:05.897052', 'step': 218, 'epoch': 1} {'type': 'loss', 'content': 0.0184700358659029, 'timestamp': '2025-09-30 22:13:05.910380', 'step': 219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:05.947423', 'step': 219, 'epoch': 1} {'type': 'loss', 'content': 0.011153987608850002, 'timestamp': '2025-09-30 22:13:05.980686', 'step': 220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:06.015471', 'step': 220, 'epoch': 1} {'type': 'loss', 'content': 0.009359365329146385, 'timestamp': '2025-09-30 22:13:06.034416', 'step': 221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:06.076823', 'step': 221, 'epoch': 1} {'type': 'loss', 'content': 0.01882639341056347, 'timestamp': '2025-09-30 22:13:06.084237', 'step': 222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:06.131449', 'step': 222, 'epoch': 1} {'type': 'loss', 'content': 0.00970226339995861, 'timestamp': '2025-09-30 22:13:06.143890', 'step': 223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:06.189536', 'step': 223, 'epoch': 1} {'type': 'loss', 'content': 0.018550271168351173, 'timestamp': '2025-09-30 22:13:06.218377', 'step': 224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:06.254555', 'step': 224, 'epoch': 1} {'type': 'loss', 'content': 0.013726242817938328, 'timestamp': '2025-09-30 22:13:06.273913', 'step': 225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:06.312668', 'step': 225, 'epoch': 1} {'type': 'loss', 'content': 0.013593848794698715, 'timestamp': '2025-09-30 22:13:06.328512', 'step': 226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:06.366949', 'step': 226, 'epoch': 1} {'type': 'loss', 'content': 0.017183566465973854, 'timestamp': '2025-09-30 22:13:06.374517', 'step': 227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:13:06.424362', 'step': 227, 'epoch': 1} {'type': 'loss', 'content': 0.010725222527980804, 'timestamp': '2025-09-30 22:13:06.462340', 'step': 228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:06.494745', 'step': 228, 'epoch': 1} {'type': 'loss', 'content': 0.012593534775078297, 'timestamp': '2025-09-30 22:13:06.507437', 'step': 229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:06.551389', 'step': 229, 'epoch': 1} {'type': 'loss', 'content': 0.012523045763373375, 'timestamp': '2025-09-30 22:13:06.563627', 'step': 230, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:13:08.958662', 'step': 230, 'epoch': 1} {'type': 'pplx', 'content': 5.586642946954409, 'timestamp': '2025-09-30 22:13:08.962071', 'step': 230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:08.993642', 'step': 230, 'epoch': 1} {'type': 'loss', 'content': 0.01927979476749897, 'timestamp': '2025-09-30 22:13:09.000193', 'step': 231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:09.032526', 'step': 231, 'epoch': 1} {'type': 'loss', 'content': 0.026295769959688187, 'timestamp': '2025-09-30 22:13:09.065544', 'step': 232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:09.096533', 'step': 232, 'epoch': 1} {'type': 'loss', 'content': 0.01865268498659134, 'timestamp': '2025-09-30 22:13:09.104636', 'step': 233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:09.143951', 'step': 233, 'epoch': 1} {'type': 'loss', 'content': 0.012808283790946007, 'timestamp': '2025-09-30 22:13:09.156510', 'step': 234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:09.195595', 'step': 234, 'epoch': 1} {'type': 'loss', 'content': 0.01775694452226162, 'timestamp': '2025-09-30 22:13:09.203343', 'step': 235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:09.239789', 'step': 235, 'epoch': 1} {'type': 'loss', 'content': 0.010759370401501656, 'timestamp': '2025-09-30 22:13:09.274016', 'step': 236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:09.307573', 'step': 236, 'epoch': 1} {'type': 'loss', 'content': 0.01735626719892025, 'timestamp': '2025-09-30 22:13:09.316361', 'step': 237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:09.352777', 'step': 237, 'epoch': 1} {'type': 'loss', 'content': 0.01837928406894207, 'timestamp': '2025-09-30 22:13:09.365117', 'step': 238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:09.398660', 'step': 238, 'epoch': 1} {'type': 'loss', 'content': 0.015187880955636501, 'timestamp': '2025-09-30 22:13:09.411008', 'step': 239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:09.450674', 'step': 239, 'epoch': 1} {'type': 'loss', 'content': 0.013186373747885227, 'timestamp': '2025-09-30 22:13:09.484894', 'step': 240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:09.524107', 'step': 240, 'epoch': 1} {'type': 'loss', 'content': 0.015222107991576195, 'timestamp': '2025-09-30 22:13:09.537183', 'step': 241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:09.574072', 'step': 241, 'epoch': 1} {'type': 'loss', 'content': 0.011916671879589558, 'timestamp': '2025-09-30 22:13:09.585349', 'step': 242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:09.626203', 'step': 242, 'epoch': 1} {'type': 'loss', 'content': 0.019251955673098564, 'timestamp': '2025-09-30 22:13:09.637260', 'step': 243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:09.675465', 'step': 243, 'epoch': 1} {'type': 'loss', 'content': 0.007890812121331692, 'timestamp': '2025-09-30 22:13:09.710122', 'step': 244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:09.745171', 'step': 244, 'epoch': 1} {'type': 'loss', 'content': 0.018134785816073418, 'timestamp': '2025-09-30 22:13:09.758138', 'step': 245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:13:09.801735', 'step': 245, 'epoch': 1} {'type': 'loss', 'content': 0.01059873029589653, 'timestamp': '2025-09-30 22:13:09.817338', 'step': 246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:09.865109', 'step': 246, 'epoch': 1} {'type': 'loss', 'content': 0.009460191242396832, 'timestamp': '2025-09-30 22:13:09.878488', 'step': 247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:09.917552', 'step': 247, 'epoch': 1} {'type': 'loss', 'content': 0.021795684471726418, 'timestamp': '2025-09-30 22:13:09.950934', 'step': 248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:09.986449', 'step': 248, 'epoch': 1} {'type': 'loss', 'content': 0.01156249362975359, 'timestamp': '2025-09-30 22:13:09.999765', 'step': 249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:10.049108', 'step': 249, 'epoch': 1} {'type': 'loss', 'content': 0.011901349760591984, 'timestamp': '2025-09-30 22:13:10.065014', 'step': 250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:10.108282', 'step': 250, 'epoch': 1} {'type': 'loss', 'content': 0.012487679719924927, 'timestamp': '2025-09-30 22:13:10.122063', 'step': 251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:10.164909', 'step': 251, 'epoch': 1} {'type': 'loss', 'content': 0.005352849140763283, 'timestamp': '2025-09-30 22:13:10.201746', 'step': 252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:10.243540', 'step': 252, 'epoch': 1} {'type': 'loss', 'content': 0.01035391166806221, 'timestamp': '2025-09-30 22:13:10.253527', 'step': 253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:10.293386', 'step': 253, 'epoch': 1} {'type': 'loss', 'content': 0.015251671895384789, 'timestamp': '2025-09-30 22:13:10.306687', 'step': 254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:10.344026', 'step': 254, 'epoch': 1} {'type': 'loss', 'content': 0.010876107029616833, 'timestamp': '2025-09-30 22:13:10.357750', 'step': 255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:10.390728', 'step': 255, 'epoch': 1} {'type': 'loss', 'content': 0.022635027766227722, 'timestamp': '2025-09-30 22:13:10.422610', 'step': 256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:10.460109', 'step': 256, 'epoch': 1} {'type': 'loss', 'content': 0.017526494339108467, 'timestamp': '2025-09-30 22:13:10.473255', 'step': 257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:10.509052', 'step': 257, 'epoch': 1} {'type': 'loss', 'content': 0.013761814683675766, 'timestamp': '2025-09-30 22:13:10.521293', 'step': 258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:10.567794', 'step': 258, 'epoch': 1} {'type': 'loss', 'content': 0.017204131931066513, 'timestamp': '2025-09-30 22:13:10.581250', 'step': 259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:10.623168', 'step': 259, 'epoch': 1} {'type': 'loss', 'content': 0.01420542597770691, 'timestamp': '2025-09-30 22:13:10.657708', 'step': 260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:13:10.707682', 'step': 260, 'epoch': 1} {'type': 'loss', 'content': 0.011504081077873707, 'timestamp': '2025-09-30 22:13:10.724696', 'step': 261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:10.762850', 'step': 261, 'epoch': 1} {'type': 'loss', 'content': 0.024398203939199448, 'timestamp': '2025-09-30 22:13:10.776246', 'step': 262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:10.809490', 'step': 262, 'epoch': 1} {'type': 'loss', 'content': 0.028062906116247177, 'timestamp': '2025-09-30 22:13:10.817255', 'step': 263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:10.858904', 'step': 263, 'epoch': 1} {'type': 'loss', 'content': 0.030076058581471443, 'timestamp': '2025-09-30 22:13:10.890016', 'step': 264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:10.931202', 'step': 264, 'epoch': 1} {'type': 'loss', 'content': 0.03159669414162636, 'timestamp': '2025-09-30 22:13:10.936910', 'step': 265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:10.973296', 'step': 265, 'epoch': 1} {'type': 'loss', 'content': 0.015276629477739334, 'timestamp': '2025-09-30 22:13:10.985927', 'step': 266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:11.025178', 'step': 266, 'epoch': 1} {'type': 'loss', 'content': 0.01706337183713913, 'timestamp': '2025-09-30 22:13:11.038547', 'step': 267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:11.072954', 'step': 267, 'epoch': 1} {'type': 'loss', 'content': 0.020861618220806122, 'timestamp': '2025-09-30 22:13:11.106343', 'step': 268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:11.142600', 'step': 268, 'epoch': 1} {'type': 'loss', 'content': 0.023828132078051567, 'timestamp': '2025-09-30 22:13:11.153144', 'step': 269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:11.189906', 'step': 269, 'epoch': 1} {'type': 'loss', 'content': 0.014024309813976288, 'timestamp': '2025-09-30 22:13:11.203285', 'step': 270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:13:11.247665', 'step': 270, 'epoch': 1} {'type': 'loss', 'content': 0.00785061251372099, 'timestamp': '2025-09-30 22:13:11.264109', 'step': 271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:11.306283', 'step': 271, 'epoch': 1} {'type': 'loss', 'content': 0.012670233845710754, 'timestamp': '2025-09-30 22:13:11.343004', 'step': 272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:11.377886', 'step': 272, 'epoch': 1} {'type': 'loss', 'content': 0.012132198549807072, 'timestamp': '2025-09-30 22:13:11.391174', 'step': 273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:11.425283', 'step': 273, 'epoch': 1} {'type': 'loss', 'content': 0.022953467443585396, 'timestamp': '2025-09-30 22:13:11.435626', 'step': 274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:13:11.480483', 'step': 274, 'epoch': 1} {'type': 'loss', 'content': 0.007391286548227072, 'timestamp': '2025-09-30 22:13:11.496649', 'step': 275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:11.534934', 'step': 275, 'epoch': 1} {'type': 'loss', 'content': 0.014905553311109543, 'timestamp': '2025-09-30 22:13:11.569477', 'step': 276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:11.603011', 'step': 276, 'epoch': 1} {'type': 'loss', 'content': 0.01947295479476452, 'timestamp': '2025-09-30 22:13:11.612925', 'step': 277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:11.650933', 'step': 277, 'epoch': 1} {'type': 'loss', 'content': 0.014910265803337097, 'timestamp': '2025-09-30 22:13:11.658440', 'step': 278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:11.695534', 'step': 278, 'epoch': 1} {'type': 'loss', 'content': 0.019228165969252586, 'timestamp': '2025-09-30 22:13:11.706056', 'step': 279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:11.741078', 'step': 279, 'epoch': 1} {'type': 'loss', 'content': 0.023536305874586105, 'timestamp': '2025-09-30 22:13:11.772312', 'step': 280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:11.806752', 'step': 280, 'epoch': 1} {'type': 'loss', 'content': 0.03552548587322235, 'timestamp': '2025-09-30 22:13:11.819860', 'step': 281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:11.851986', 'step': 281, 'epoch': 1} {'type': 'loss', 'content': 0.013345292769372463, 'timestamp': '2025-09-30 22:13:11.862298', 'step': 282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:11.903289', 'step': 282, 'epoch': 1} {'type': 'loss', 'content': 0.017060376703739166, 'timestamp': '2025-09-30 22:13:11.917288', 'step': 283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:11.950807', 'step': 283, 'epoch': 1} {'type': 'loss', 'content': 0.013612824492156506, 'timestamp': '2025-09-30 22:13:11.983886', 'step': 284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:13:12.026228', 'step': 284, 'epoch': 1} {'type': 'loss', 'content': 0.011212517507374287, 'timestamp': '2025-09-30 22:13:12.041328', 'step': 285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:12.084694', 'step': 285, 'epoch': 1} {'type': 'loss', 'content': 0.019147543236613274, 'timestamp': '2025-09-30 22:13:12.100584', 'step': 286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:12.138217', 'step': 286, 'epoch': 1} {'type': 'loss', 'content': 0.022566957399249077, 'timestamp': '2025-09-30 22:13:12.148722', 'step': 287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:12.182427', 'step': 287, 'epoch': 1} {'type': 'loss', 'content': 0.011118652299046516, 'timestamp': '2025-09-30 22:13:12.213659', 'step': 288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:12.246350', 'step': 288, 'epoch': 1} {'type': 'loss', 'content': 0.026219571009278297, 'timestamp': '2025-09-30 22:13:12.255017', 'step': 289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:12.293929', 'step': 289, 'epoch': 1} {'type': 'loss', 'content': 0.011589091271162033, 'timestamp': '2025-09-30 22:13:12.307321', 'step': 290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:12.344614', 'step': 290, 'epoch': 1} {'type': 'loss', 'content': 0.008985496126115322, 'timestamp': '2025-09-30 22:13:12.357941', 'step': 291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:12.392452', 'step': 291, 'epoch': 1} {'type': 'loss', 'content': 0.014308687299489975, 'timestamp': '2025-09-30 22:13:12.425565', 'step': 292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:12.460078', 'step': 292, 'epoch': 1} {'type': 'loss', 'content': 0.02304118312895298, 'timestamp': '2025-09-30 22:13:12.469001', 'step': 293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:12.501653', 'step': 293, 'epoch': 1} {'type': 'loss', 'content': 0.013603178784251213, 'timestamp': '2025-09-30 22:13:12.513907', 'step': 294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:12.547749', 'step': 294, 'epoch': 1} {'type': 'loss', 'content': 0.022707384079694748, 'timestamp': '2025-09-30 22:13:12.554843', 'step': 295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:12.598183', 'step': 295, 'epoch': 1} {'type': 'loss', 'content': 0.00937237311154604, 'timestamp': '2025-09-30 22:13:12.634999', 'step': 296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:12.667321', 'step': 296, 'epoch': 1} {'type': 'loss', 'content': 0.02236713282763958, 'timestamp': '2025-09-30 22:13:12.672148', 'step': 297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:12.713502', 'step': 297, 'epoch': 1} {'type': 'loss', 'content': 0.017074013128876686, 'timestamp': '2025-09-30 22:13:12.720372', 'step': 298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:12.756555', 'step': 298, 'epoch': 1} {'type': 'loss', 'content': 0.0165159460157156, 'timestamp': '2025-09-30 22:13:12.766862', 'step': 299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:12.801445', 'step': 299, 'epoch': 1} {'type': 'loss', 'content': 0.01714632846415043, 'timestamp': '2025-09-30 22:13:12.833438', 'step': 300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:12.874918', 'step': 300, 'epoch': 1} {'type': 'loss', 'content': 0.012943536043167114, 'timestamp': '2025-09-30 22:13:12.888196', 'step': 301, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:12.919777', 'step': 301, 'epoch': 1} {'type': 'loss', 'content': 0.01896374672651291, 'timestamp': '2025-09-30 22:13:12.927849', 'step': 302, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:12.962782', 'step': 302, 'epoch': 1} {'type': 'loss', 'content': 0.03681395947933197, 'timestamp': '2025-09-30 22:13:12.969874', 'step': 303, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:13.002007', 'step': 303, 'epoch': 1} {'type': 'loss', 'content': 0.010998062789440155, 'timestamp': '2025-09-30 22:13:13.034066', 'step': 304, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:13.079304', 'step': 304, 'epoch': 1} {'type': 'loss', 'content': 0.010683653876185417, 'timestamp': '2025-09-30 22:13:13.091920', 'step': 305, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:13.124037', 'step': 305, 'epoch': 1} {'type': 'loss', 'content': 0.017140964046120644, 'timestamp': '2025-09-30 22:13:13.131979', 'step': 306, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:13.175756', 'step': 306, 'epoch': 1} {'type': 'loss', 'content': 0.011500844731926918, 'timestamp': '2025-09-30 22:13:13.183647', 'step': 307, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:13.221853', 'step': 307, 'epoch': 1} {'type': 'loss', 'content': 0.017765764147043228, 'timestamp': '2025-09-30 22:13:13.256527', 'step': 308, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:13.296566', 'step': 308, 'epoch': 1} {'type': 'loss', 'content': 0.0074893394485116005, 'timestamp': '2025-09-30 22:13:13.311921', 'step': 309, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:13:13.354503', 'step': 309, 'epoch': 1} {'type': 'loss', 'content': 0.011833159253001213, 'timestamp': '2025-09-30 22:13:13.371596', 'step': 310, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:13.411881', 'step': 310, 'epoch': 1} {'type': 'loss', 'content': 0.018905291333794594, 'timestamp': '2025-09-30 22:13:13.425743', 'step': 311, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:13.472333', 'step': 311, 'epoch': 1} {'type': 'loss', 'content': 0.009745593182742596, 'timestamp': '2025-09-30 22:13:13.507202', 'step': 312, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:13.545048', 'step': 312, 'epoch': 1} {'type': 'loss', 'content': 0.025344759225845337, 'timestamp': '2025-09-30 22:13:13.552972', 'step': 313, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:13.593207', 'step': 313, 'epoch': 1} {'type': 'loss', 'content': 0.01137094758450985, 'timestamp': '2025-09-30 22:13:13.606978', 'step': 314, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:13.640901', 'step': 314, 'epoch': 1} {'type': 'loss', 'content': 0.011146454140543938, 'timestamp': '2025-09-30 22:13:13.653310', 'step': 315, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:13.690327', 'step': 315, 'epoch': 1} {'type': 'loss', 'content': 0.0077347299084067345, 'timestamp': '2025-09-30 22:13:13.724511', 'step': 316, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:13.763431', 'step': 316, 'epoch': 1} {'type': 'loss', 'content': 0.00844904687255621, 'timestamp': '2025-09-30 22:13:13.776837', 'step': 317, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:13.818694', 'step': 317, 'epoch': 1} {'type': 'loss', 'content': 0.010122202336788177, 'timestamp': '2025-09-30 22:13:13.832083', 'step': 318, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:13.867450', 'step': 318, 'epoch': 1} {'type': 'loss', 'content': 0.011455384083092213, 'timestamp': '2025-09-30 22:13:13.880021', 'step': 319, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:13.917215', 'step': 319, 'epoch': 1} {'type': 'loss', 'content': 0.006118587218225002, 'timestamp': '2025-09-30 22:13:13.951870', 'step': 320, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:13.987479', 'step': 320, 'epoch': 1} {'type': 'loss', 'content': 0.022060874849557877, 'timestamp': '2025-09-30 22:13:14.003334', 'step': 321, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:14.039419', 'step': 321, 'epoch': 1} {'type': 'loss', 'content': 0.017150264233350754, 'timestamp': '2025-09-30 22:13:14.051980', 'step': 322, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:14.097350', 'step': 322, 'epoch': 1} {'type': 'loss', 'content': 0.019098486751317978, 'timestamp': '2025-09-30 22:13:14.111403', 'step': 323, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:14.144914', 'step': 323, 'epoch': 1} {'type': 'loss', 'content': 0.024245485663414, 'timestamp': '2025-09-30 22:13:14.172890', 'step': 324, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:14.215024', 'step': 324, 'epoch': 1} {'type': 'loss', 'content': 0.009648287668824196, 'timestamp': '2025-09-30 22:13:14.219837', 'step': 325, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:14.253632', 'step': 325, 'epoch': 1} {'type': 'loss', 'content': 0.014924516901373863, 'timestamp': '2025-09-30 22:13:14.260888', 'step': 326, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:14.307634', 'step': 326, 'epoch': 1} {'type': 'loss', 'content': 0.01756501942873001, 'timestamp': '2025-09-30 22:13:14.320225', 'step': 327, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:13:14.363150', 'step': 327, 'epoch': 1} {'type': 'loss', 'content': 0.03167622908949852, 'timestamp': '2025-09-30 22:13:14.391898', 'step': 328, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:14.427759', 'step': 328, 'epoch': 1} {'type': 'loss', 'content': 0.005993070546537638, 'timestamp': '2025-09-30 22:13:14.440870', 'step': 329, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:14.477194', 'step': 329, 'epoch': 1} {'type': 'loss', 'content': 0.017649002373218536, 'timestamp': '2025-09-30 22:13:14.487505', 'step': 330, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:14.532018', 'step': 330, 'epoch': 1} {'type': 'loss', 'content': 0.008263523690402508, 'timestamp': '2025-09-30 22:13:14.545874', 'step': 331, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:13:14.596560', 'step': 331, 'epoch': 1} {'type': 'loss', 'content': 0.011624597012996674, 'timestamp': '2025-09-30 22:13:14.633119', 'step': 332, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:14.672852', 'step': 332, 'epoch': 1} {'type': 'loss', 'content': 0.01413639448583126, 'timestamp': '2025-09-30 22:13:14.685908', 'step': 333, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:14.724927', 'step': 333, 'epoch': 1} {'type': 'loss', 'content': 0.014897001907229424, 'timestamp': '2025-09-30 22:13:14.738311', 'step': 334, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:14.786619', 'step': 334, 'epoch': 1} {'type': 'loss', 'content': 0.006835165433585644, 'timestamp': '2025-09-30 22:13:14.800639', 'step': 335, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:14.837125', 'step': 335, 'epoch': 1} {'type': 'loss', 'content': 0.023027604445815086, 'timestamp': '2025-09-30 22:13:14.871449', 'step': 336, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:14.907564', 'step': 336, 'epoch': 1} {'type': 'loss', 'content': 0.0157425869256258, 'timestamp': '2025-09-30 22:13:14.916375', 'step': 337, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:14.958049', 'step': 337, 'epoch': 1} {'type': 'loss', 'content': 0.012969511561095715, 'timestamp': '2025-09-30 22:13:14.971367', 'step': 338, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:15.010020', 'step': 338, 'epoch': 1} {'type': 'loss', 'content': 0.011469487100839615, 'timestamp': '2025-09-30 22:13:15.017177', 'step': 339, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:15.052390', 'step': 339, 'epoch': 1} {'type': 'loss', 'content': 0.029641326516866684, 'timestamp': '2025-09-30 22:13:15.084751', 'step': 340, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:15.124137', 'step': 340, 'epoch': 1} {'type': 'loss', 'content': 0.08696480840444565, 'timestamp': '2025-09-30 22:13:15.128974', 'step': 341, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:15.162128', 'step': 341, 'epoch': 1} {'type': 'loss', 'content': 0.019450241699814796, 'timestamp': '2025-09-30 22:13:15.173369', 'step': 342, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:15.212323', 'step': 342, 'epoch': 1} {'type': 'loss', 'content': 0.041848428547382355, 'timestamp': '2025-09-30 22:13:15.219444', 'step': 343, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:15.259029', 'step': 343, 'epoch': 1} {'type': 'loss', 'content': 0.01888955384492874, 'timestamp': '2025-09-30 22:13:15.290364', 'step': 344, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:15.323047', 'step': 344, 'epoch': 1} {'type': 'loss', 'content': 0.02714778482913971, 'timestamp': '2025-09-30 22:13:15.328488', 'step': 345, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:13:17.736554', 'step': 345, 'epoch': 1} {'type': 'pplx', 'content': 5.524948511337376, 'timestamp': '2025-09-30 22:13:17.746433', 'step': 345, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:17.780544', 'step': 345, 'epoch': 1} {'type': 'loss', 'content': 0.025112979114055634, 'timestamp': '2025-09-30 22:13:17.787437', 'step': 346, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:17.827523', 'step': 346, 'epoch': 1} {'type': 'loss', 'content': 0.019669389352202415, 'timestamp': '2025-09-30 22:13:17.835106', 'step': 347, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:17.882114', 'step': 347, 'epoch': 1} {'type': 'loss', 'content': 0.015546857379376888, 'timestamp': '2025-09-30 22:13:17.913962', 'step': 348, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:17.948929', 'step': 348, 'epoch': 1} {'type': 'loss', 'content': 0.01003229059278965, 'timestamp': '2025-09-30 22:13:17.962101', 'step': 349, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:17.995610', 'step': 349, 'epoch': 1} {'type': 'loss', 'content': 0.016755785793066025, 'timestamp': '2025-09-30 22:13:18.006773', 'step': 350, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:18.046062', 'step': 350, 'epoch': 1} {'type': 'loss', 'content': 0.010388433001935482, 'timestamp': '2025-09-30 22:13:18.057205', 'step': 351, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:18.093200', 'step': 351, 'epoch': 1} {'type': 'loss', 'content': 0.024582738056778908, 'timestamp': '2025-09-30 22:13:18.127540', 'step': 352, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:18.169371', 'step': 352, 'epoch': 1} {'type': 'loss', 'content': 0.0186713095754385, 'timestamp': '2025-09-30 22:13:18.182038', 'step': 353, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:18.224574', 'step': 353, 'epoch': 1} {'type': 'loss', 'content': 0.009301440790295601, 'timestamp': '2025-09-30 22:13:18.238338', 'step': 354, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:18.281817', 'step': 354, 'epoch': 1} {'type': 'loss', 'content': 0.01219329982995987, 'timestamp': '2025-09-30 22:13:18.295890', 'step': 355, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:18.332829', 'step': 355, 'epoch': 1} {'type': 'loss', 'content': 0.019256679341197014, 'timestamp': '2025-09-30 22:13:18.367100', 'step': 356, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:18.409278', 'step': 356, 'epoch': 1} {'type': 'loss', 'content': 0.01657373271882534, 'timestamp': '2025-09-30 22:13:18.419278', 'step': 357, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:18.465172', 'step': 357, 'epoch': 1} {'type': 'loss', 'content': 0.013607809320092201, 'timestamp': '2025-09-30 22:13:18.477640', 'step': 358, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:18.512777', 'step': 358, 'epoch': 1} {'type': 'loss', 'content': 0.025948015972971916, 'timestamp': '2025-09-30 22:13:18.525648', 'step': 359, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:18.569299', 'step': 359, 'epoch': 1} {'type': 'loss', 'content': 0.017592886462807655, 'timestamp': '2025-09-30 22:13:18.605981', 'step': 360, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:18.650331', 'step': 360, 'epoch': 1} {'type': 'loss', 'content': 0.013935086317360401, 'timestamp': '2025-09-30 22:13:18.663472', 'step': 361, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:18.700824', 'step': 361, 'epoch': 1} {'type': 'loss', 'content': 0.013167818076908588, 'timestamp': '2025-09-30 22:13:18.708808', 'step': 362, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:18.749408', 'step': 362, 'epoch': 1} {'type': 'loss', 'content': 0.014511304907500744, 'timestamp': '2025-09-30 22:13:18.765314', 'step': 363, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:13:18.817733', 'step': 363, 'epoch': 1} {'type': 'loss', 'content': 0.012271334417164326, 'timestamp': '2025-09-30 22:13:18.855926', 'step': 364, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:18.888882', 'step': 364, 'epoch': 1} {'type': 'loss', 'content': 0.024161912500858307, 'timestamp': '2025-09-30 22:13:18.898943', 'step': 365, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:18.933977', 'step': 365, 'epoch': 1} {'type': 'loss', 'content': 0.0163529422134161, 'timestamp': '2025-09-30 22:13:18.946607', 'step': 366, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:18.986572', 'step': 366, 'epoch': 1} {'type': 'loss', 'content': 0.015899039804935455, 'timestamp': '2025-09-30 22:13:18.998613', 'step': 367, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:19.034032', 'step': 367, 'epoch': 1} {'type': 'loss', 'content': 0.013269470073282719, 'timestamp': '2025-09-30 22:13:19.066896', 'step': 368, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:19.116795', 'step': 368, 'epoch': 1} {'type': 'loss', 'content': 0.014671982266008854, 'timestamp': '2025-09-30 22:13:19.126244', 'step': 369, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:19.164760', 'step': 369, 'epoch': 1} {'type': 'loss', 'content': 0.017914393916726112, 'timestamp': '2025-09-30 22:13:19.172440', 'step': 370, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:19.208243', 'step': 370, 'epoch': 1} {'type': 'loss', 'content': 0.013160360977053642, 'timestamp': '2025-09-30 22:13:19.219289', 'step': 371, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:19.261896', 'step': 371, 'epoch': 1} {'type': 'loss', 'content': 0.019966568797826767, 'timestamp': '2025-09-30 22:13:19.292980', 'step': 372, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:19.329699', 'step': 372, 'epoch': 1} {'type': 'loss', 'content': 0.017061376944184303, 'timestamp': '2025-09-30 22:13:19.343672', 'step': 373, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:19.386505', 'step': 373, 'epoch': 1} {'type': 'loss', 'content': 0.013974986970424652, 'timestamp': '2025-09-30 22:13:19.399188', 'step': 374, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:19.439080', 'step': 374, 'epoch': 1} {'type': 'loss', 'content': 0.010096900165081024, 'timestamp': '2025-09-30 22:13:19.452924', 'step': 375, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:19.485798', 'step': 375, 'epoch': 1} {'type': 'loss', 'content': 0.020089181140065193, 'timestamp': '2025-09-30 22:13:19.514617', 'step': 376, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:19.547293', 'step': 376, 'epoch': 1} {'type': 'loss', 'content': 0.016456788405776024, 'timestamp': '2025-09-30 22:13:19.552726', 'step': 377, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:19.587030', 'step': 377, 'epoch': 1} {'type': 'loss', 'content': 0.018948769196867943, 'timestamp': '2025-09-30 22:13:19.599551', 'step': 378, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:19.633793', 'step': 378, 'epoch': 1} {'type': 'loss', 'content': 0.034312453120946884, 'timestamp': '2025-09-30 22:13:19.641266', 'step': 379, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:19.679663', 'step': 379, 'epoch': 1} {'type': 'loss', 'content': 0.02766266278922558, 'timestamp': '2025-09-30 22:13:19.712814', 'step': 380, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:19.745727', 'step': 380, 'epoch': 1} {'type': 'loss', 'content': 0.01396411843597889, 'timestamp': '2025-09-30 22:13:19.753857', 'step': 381, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:19.789120', 'step': 381, 'epoch': 1} {'type': 'loss', 'content': 0.02729835733771324, 'timestamp': '2025-09-30 22:13:19.801393', 'step': 382, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:19.837563', 'step': 382, 'epoch': 1} {'type': 'loss', 'content': 0.01805323176085949, 'timestamp': '2025-09-30 22:13:19.850047', 'step': 383, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:19.890838', 'step': 383, 'epoch': 1} {'type': 'loss', 'content': 0.00845720712095499, 'timestamp': '2025-09-30 22:13:19.925097', 'step': 384, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:19.958952', 'step': 384, 'epoch': 1} {'type': 'loss', 'content': 0.010211989283561707, 'timestamp': '2025-09-30 22:13:19.969208', 'step': 385, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:20.005894', 'step': 385, 'epoch': 1} {'type': 'loss', 'content': 0.0188337080180645, 'timestamp': '2025-09-30 22:13:20.013679', 'step': 386, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:20.051634', 'step': 386, 'epoch': 1} {'type': 'loss', 'content': 0.014987856149673462, 'timestamp': '2025-09-30 22:13:20.065273', 'step': 387, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:20.106865', 'step': 387, 'epoch': 1} {'type': 'loss', 'content': 0.018355386331677437, 'timestamp': '2025-09-30 22:13:20.136337', 'step': 388, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:20.172331', 'step': 388, 'epoch': 1} {'type': 'loss', 'content': 0.01463927049189806, 'timestamp': '2025-09-30 22:13:20.177792', 'step': 389, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:20.212534', 'step': 389, 'epoch': 1} {'type': 'loss', 'content': 0.034404098987579346, 'timestamp': '2025-09-30 22:13:20.224814', 'step': 390, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:20.260739', 'step': 390, 'epoch': 1} {'type': 'loss', 'content': 0.01674843579530716, 'timestamp': '2025-09-30 22:13:20.270988', 'step': 391, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:20.319174', 'step': 391, 'epoch': 1} {'type': 'loss', 'content': 0.011764826253056526, 'timestamp': '2025-09-30 22:13:20.352559', 'step': 392, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:20.401462', 'step': 392, 'epoch': 1} {'type': 'loss', 'content': 0.01702144928276539, 'timestamp': '2025-09-30 22:13:20.414067', 'step': 393, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:20.452853', 'step': 393, 'epoch': 1} {'type': 'loss', 'content': 0.01283255685120821, 'timestamp': '2025-09-30 22:13:20.465402', 'step': 394, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:20.502923', 'step': 394, 'epoch': 1} {'type': 'loss', 'content': 0.027312854304909706, 'timestamp': '2025-09-30 22:13:20.510933', 'step': 395, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:20.553925', 'step': 395, 'epoch': 1} {'type': 'loss', 'content': 0.015591930598020554, 'timestamp': '2025-09-30 22:13:20.588589', 'step': 396, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:20.630990', 'step': 396, 'epoch': 1} {'type': 'loss', 'content': 0.02703159861266613, 'timestamp': '2025-09-30 22:13:20.640864', 'step': 397, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:20.682170', 'step': 397, 'epoch': 1} {'type': 'loss', 'content': 0.015556761994957924, 'timestamp': '2025-09-30 22:13:20.695582', 'step': 398, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:20.736559', 'step': 398, 'epoch': 1} {'type': 'loss', 'content': 0.009088275954127312, 'timestamp': '2025-09-30 22:13:20.750502', 'step': 399, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:20.789467', 'step': 399, 'epoch': 1} {'type': 'loss', 'content': 0.018833929672837257, 'timestamp': '2025-09-30 22:13:20.822884', 'step': 400, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:20.861469', 'step': 400, 'epoch': 1} {'type': 'loss', 'content': 0.016304094344377518, 'timestamp': '2025-09-30 22:13:20.873858', 'step': 401, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:20.910605', 'step': 401, 'epoch': 1} {'type': 'loss', 'content': 0.019747715443372726, 'timestamp': '2025-09-30 22:13:20.921026', 'step': 402, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:20.955389', 'step': 402, 'epoch': 1} {'type': 'loss', 'content': 0.016585690900683403, 'timestamp': '2025-09-30 22:13:20.962508', 'step': 403, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:20.997342', 'step': 403, 'epoch': 1} {'type': 'loss', 'content': 0.01574224978685379, 'timestamp': '2025-09-30 22:13:21.025193', 'step': 404, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:21.058273', 'step': 404, 'epoch': 1} {'type': 'loss', 'content': 0.023047136142849922, 'timestamp': '2025-09-30 22:13:21.063833', 'step': 405, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:21.107626', 'step': 405, 'epoch': 1} {'type': 'loss', 'content': 0.017483817413449287, 'timestamp': '2025-09-30 22:13:21.115650', 'step': 406, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:13:21.149237', 'step': 406, 'epoch': 1} {'type': 'loss', 'content': 0.02771250531077385, 'timestamp': '2025-09-30 22:13:21.153427', 'step': 407, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:21.197526', 'step': 407, 'epoch': 1} {'type': 'loss', 'content': 0.0231977179646492, 'timestamp': '2025-09-30 22:13:21.225260', 'step': 408, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:13:21.285570', 'step': 408, 'epoch': 1} {'type': 'loss', 'content': 0.009227665141224861, 'timestamp': '2025-09-30 22:13:21.302906', 'step': 409, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:21.349342', 'step': 409, 'epoch': 1} {'type': 'loss', 'content': 0.013030946254730225, 'timestamp': '2025-09-30 22:13:21.363059', 'step': 410, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:13:21.406770', 'step': 410, 'epoch': 1} {'type': 'loss', 'content': 0.030049873515963554, 'timestamp': '2025-09-30 22:13:21.411245', 'step': 411, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:21.444924', 'step': 411, 'epoch': 1} {'type': 'loss', 'content': 0.008911799639463425, 'timestamp': '2025-09-30 22:13:21.476707', 'step': 412, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:21.510565', 'step': 412, 'epoch': 1} {'type': 'loss', 'content': 0.023440878838300705, 'timestamp': '2025-09-30 22:13:21.515709', 'step': 413, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:21.554636', 'step': 413, 'epoch': 1} {'type': 'loss', 'content': 0.02695058099925518, 'timestamp': '2025-09-30 22:13:21.562464', 'step': 414, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-09-30 22:13:21.611009', 'step': 414, 'epoch': 1} {'type': 'loss', 'content': 0.007888389751315117, 'timestamp': '2025-09-30 22:13:21.628612', 'step': 415, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:21.662425', 'step': 415, 'epoch': 1} {'type': 'loss', 'content': 0.016789231449365616, 'timestamp': '2025-09-30 22:13:21.690813', 'step': 416, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:21.724247', 'step': 416, 'epoch': 1} {'type': 'loss', 'content': 0.009886451996862888, 'timestamp': '2025-09-30 22:13:21.729846', 'step': 417, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:21.762339', 'step': 417, 'epoch': 1} {'type': 'loss', 'content': 0.017675846815109253, 'timestamp': '2025-09-30 22:13:21.770244', 'step': 418, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:21.809806', 'step': 418, 'epoch': 1} {'type': 'loss', 'content': 0.013344887644052505, 'timestamp': '2025-09-30 22:13:21.817771', 'step': 419, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:21.861219', 'step': 419, 'epoch': 1} {'type': 'loss', 'content': 0.013717987574636936, 'timestamp': '2025-09-30 22:13:21.895778', 'step': 420, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:21.939313', 'step': 420, 'epoch': 1} {'type': 'loss', 'content': 0.025773746892809868, 'timestamp': '2025-09-30 22:13:21.944537', 'step': 421, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:21.984423', 'step': 421, 'epoch': 1} {'type': 'loss', 'content': 0.007907262071967125, 'timestamp': '2025-09-30 22:13:21.998102', 'step': 422, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:22.035292', 'step': 422, 'epoch': 1} {'type': 'loss', 'content': 0.013128413818776608, 'timestamp': '2025-09-30 22:13:22.048607', 'step': 423, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:22.085719', 'step': 423, 'epoch': 1} {'type': 'loss', 'content': 0.01196190994232893, 'timestamp': '2025-09-30 22:13:22.120267', 'step': 424, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:13:22.165576', 'step': 424, 'epoch': 1} {'type': 'loss', 'content': 0.008793395012617111, 'timestamp': '2025-09-30 22:13:22.182246', 'step': 425, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:22.221255', 'step': 425, 'epoch': 1} {'type': 'loss', 'content': 0.021444421261548996, 'timestamp': '2025-09-30 22:13:22.231554', 'step': 426, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:22.266069', 'step': 426, 'epoch': 1} {'type': 'loss', 'content': 0.03694969043135643, 'timestamp': '2025-09-30 22:13:22.273910', 'step': 427, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:22.307561', 'step': 427, 'epoch': 1} {'type': 'loss', 'content': 0.019183486700057983, 'timestamp': '2025-09-30 22:13:22.339317', 'step': 428, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:22.377199', 'step': 428, 'epoch': 1} {'type': 'loss', 'content': 0.019474482163786888, 'timestamp': '2025-09-30 22:13:22.382384', 'step': 429, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:22.429583', 'step': 429, 'epoch': 1} {'type': 'loss', 'content': 0.009424322284758091, 'timestamp': '2025-09-30 22:13:22.445400', 'step': 430, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:22.482833', 'step': 430, 'epoch': 1} {'type': 'loss', 'content': 0.011564414016902447, 'timestamp': '2025-09-30 22:13:22.496826', 'step': 431, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:22.536004', 'step': 431, 'epoch': 1} {'type': 'loss', 'content': 0.018689239397644997, 'timestamp': '2025-09-30 22:13:22.570532', 'step': 432, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:22.609955', 'step': 432, 'epoch': 1} {'type': 'loss', 'content': 0.016993923112750053, 'timestamp': '2025-09-30 22:13:22.618518', 'step': 433, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:13:22.651414', 'step': 433, 'epoch': 1} {'type': 'loss', 'content': 0.025945935398340225, 'timestamp': '2025-09-30 22:13:22.655890', 'step': 434, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:22.691394', 'step': 434, 'epoch': 1} {'type': 'loss', 'content': 0.015565545298159122, 'timestamp': '2025-09-30 22:13:22.703738', 'step': 435, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:22.738785', 'step': 435, 'epoch': 1} {'type': 'loss', 'content': 0.015718184411525726, 'timestamp': '2025-09-30 22:13:22.770729', 'step': 436, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:22.809199', 'step': 436, 'epoch': 1} {'type': 'loss', 'content': 0.019509142264723778, 'timestamp': '2025-09-30 22:13:22.821851', 'step': 437, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:22.855519', 'step': 437, 'epoch': 1} {'type': 'loss', 'content': 0.014073725789785385, 'timestamp': '2025-09-30 22:13:22.865411', 'step': 438, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:22.908783', 'step': 438, 'epoch': 1} {'type': 'loss', 'content': 0.02983592450618744, 'timestamp': '2025-09-30 22:13:22.919793', 'step': 439, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:22.961432', 'step': 439, 'epoch': 1} {'type': 'loss', 'content': 0.010194335132837296, 'timestamp': '2025-09-30 22:13:22.995571', 'step': 440, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:23.031184', 'step': 440, 'epoch': 1} {'type': 'loss', 'content': 0.014181242324411869, 'timestamp': '2025-09-30 22:13:23.041268', 'step': 441, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:23.083944', 'step': 441, 'epoch': 1} {'type': 'loss', 'content': 0.023306261748075485, 'timestamp': '2025-09-30 22:13:23.091842', 'step': 442, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:23.131270', 'step': 442, 'epoch': 1} {'type': 'loss', 'content': 0.016305014491081238, 'timestamp': '2025-09-30 22:13:23.143700', 'step': 443, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:23.181474', 'step': 443, 'epoch': 1} {'type': 'loss', 'content': 0.013568984344601631, 'timestamp': '2025-09-30 22:13:23.216185', 'step': 444, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:23.255134', 'step': 444, 'epoch': 1} {'type': 'loss', 'content': 0.006708019413053989, 'timestamp': '2025-09-30 22:13:23.268330', 'step': 445, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:23.305726', 'step': 445, 'epoch': 1} {'type': 'loss', 'content': 0.012450787238776684, 'timestamp': '2025-09-30 22:13:23.319507', 'step': 446, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:13:23.364332', 'step': 446, 'epoch': 1} {'type': 'loss', 'content': 0.008453257381916046, 'timestamp': '2025-09-30 22:13:23.380649', 'step': 447, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:23.418274', 'step': 447, 'epoch': 1} {'type': 'loss', 'content': 0.011293847113847733, 'timestamp': '2025-09-30 22:13:23.451621', 'step': 448, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:23.490542', 'step': 448, 'epoch': 1} {'type': 'loss', 'content': 0.012872311286628246, 'timestamp': '2025-09-30 22:13:23.503170', 'step': 449, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:23.547079', 'step': 449, 'epoch': 1} {'type': 'loss', 'content': 0.008859308436512947, 'timestamp': '2025-09-30 22:13:23.562927', 'step': 450, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:23.599425', 'step': 450, 'epoch': 1} {'type': 'loss', 'content': 0.021126611158251762, 'timestamp': '2025-09-30 22:13:23.609824', 'step': 451, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:23.642304', 'step': 451, 'epoch': 1} {'type': 'loss', 'content': 0.00786950346082449, 'timestamp': '2025-09-30 22:13:23.674344', 'step': 452, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:23.712524', 'step': 452, 'epoch': 1} {'type': 'loss', 'content': 0.016248704865574837, 'timestamp': '2025-09-30 22:13:23.720653', 'step': 453, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:23.759400', 'step': 453, 'epoch': 1} {'type': 'loss', 'content': 0.02498588338494301, 'timestamp': '2025-09-30 22:13:23.772762', 'step': 454, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:23.822089', 'step': 454, 'epoch': 1} {'type': 'loss', 'content': 0.010999486781656742, 'timestamp': '2025-09-30 22:13:23.836108', 'step': 455, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:23.869514', 'step': 455, 'epoch': 1} {'type': 'loss', 'content': 0.015023881569504738, 'timestamp': '2025-09-30 22:13:23.902942', 'step': 456, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:23.937911', 'step': 456, 'epoch': 1} {'type': 'loss', 'content': 0.009870980866253376, 'timestamp': '2025-09-30 22:13:23.947715', 'step': 457, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:23.983566', 'step': 457, 'epoch': 1} {'type': 'loss', 'content': 0.01719001494348049, 'timestamp': '2025-09-30 22:13:23.996153', 'step': 458, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:24.032498', 'step': 458, 'epoch': 1} {'type': 'loss', 'content': 0.01666310243308544, 'timestamp': '2025-09-30 22:13:24.040496', 'step': 459, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:13:24.088663', 'step': 459, 'epoch': 1} {'type': 'loss', 'content': 0.01084128674119711, 'timestamp': '2025-09-30 22:13:24.125130', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:13:26.609779', 'step': 460, 'epoch': 1} {'type': 'pplx', 'content': 5.547471103472087, 'timestamp': '2025-09-30 22:13:26.611919', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:26.651656', 'step': 460, 'epoch': 1} {'type': 'loss', 'content': 0.010452203452587128, 'timestamp': '2025-09-30 22:13:26.667004', 'step': 461, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:26.704799', 'step': 461, 'epoch': 1} {'type': 'loss', 'content': 0.011252378113567829, 'timestamp': '2025-09-30 22:13:26.718602', 'step': 462, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:26.759757', 'step': 462, 'epoch': 1} {'type': 'loss', 'content': 0.02305806800723076, 'timestamp': '2025-09-30 22:13:26.772321', 'step': 463, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:26.811072', 'step': 463, 'epoch': 1} {'type': 'loss', 'content': 0.009146085008978844, 'timestamp': '2025-09-30 22:13:26.845946', 'step': 464, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:26.881541', 'step': 464, 'epoch': 1} {'type': 'loss', 'content': 0.025301601737737656, 'timestamp': '2025-09-30 22:13:26.886529', 'step': 465, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:26.924240', 'step': 465, 'epoch': 1} {'type': 'loss', 'content': 0.01246948167681694, 'timestamp': '2025-09-30 22:13:26.937990', 'step': 466, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:26.977081', 'step': 466, 'epoch': 1} {'type': 'loss', 'content': 0.03232751041650772, 'timestamp': '2025-09-30 22:13:26.985132', 'step': 467, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:27.018357', 'step': 467, 'epoch': 1} {'type': 'loss', 'content': 0.025166211649775505, 'timestamp': '2025-09-30 22:13:27.047916', 'step': 468, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:13:27.089609', 'step': 468, 'epoch': 1} {'type': 'loss', 'content': 0.008584776893258095, 'timestamp': '2025-09-30 22:13:27.106621', 'step': 469, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:27.153880', 'step': 469, 'epoch': 1} {'type': 'loss', 'content': 0.02005261369049549, 'timestamp': '2025-09-30 22:13:27.161878', 'step': 470, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:27.194883', 'step': 470, 'epoch': 1} {'type': 'loss', 'content': 0.018570082262158394, 'timestamp': '2025-09-30 22:13:27.202310', 'step': 471, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:27.238764', 'step': 471, 'epoch': 1} {'type': 'loss', 'content': 0.01865404099225998, 'timestamp': '2025-09-30 22:13:27.269970', 'step': 472, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:27.311535', 'step': 472, 'epoch': 1} {'type': 'loss', 'content': 0.019251475110650063, 'timestamp': '2025-09-30 22:13:27.322124', 'step': 473, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:13:27.372295', 'step': 473, 'epoch': 1} {'type': 'loss', 'content': 0.009503374807536602, 'timestamp': '2025-09-30 22:13:27.388484', 'step': 474, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:27.420591', 'step': 474, 'epoch': 1} {'type': 'loss', 'content': 0.015804406255483627, 'timestamp': '2025-09-30 22:13:27.432758', 'step': 475, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:27.472484', 'step': 475, 'epoch': 1} {'type': 'loss', 'content': 0.01220368966460228, 'timestamp': '2025-09-30 22:13:27.505498', 'step': 476, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:27.543287', 'step': 476, 'epoch': 1} {'type': 'loss', 'content': 0.026538586243987083, 'timestamp': '2025-09-30 22:13:27.551290', 'step': 477, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:27.586846', 'step': 477, 'epoch': 1} {'type': 'loss', 'content': 0.015999168157577515, 'timestamp': '2025-09-30 22:13:27.594757', 'step': 478, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:27.631937', 'step': 478, 'epoch': 1} {'type': 'loss', 'content': 0.021727586165070534, 'timestamp': '2025-09-30 22:13:27.639478', 'step': 479, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:27.675526', 'step': 479, 'epoch': 1} {'type': 'loss', 'content': 0.008494660258293152, 'timestamp': '2025-09-30 22:13:27.708650', 'step': 480, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:27.748775', 'step': 480, 'epoch': 1} {'type': 'loss', 'content': 0.006171220447868109, 'timestamp': '2025-09-30 22:13:27.761452', 'step': 481, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:27.795638', 'step': 481, 'epoch': 1} {'type': 'loss', 'content': 0.016583433374762535, 'timestamp': '2025-09-30 22:13:27.808208', 'step': 482, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:27.850669', 'step': 482, 'epoch': 1} {'type': 'loss', 'content': 0.01650671847164631, 'timestamp': '2025-09-30 22:13:27.864345', 'step': 483, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:27.903833', 'step': 483, 'epoch': 1} {'type': 'loss', 'content': 0.012906375341117382, 'timestamp': '2025-09-30 22:13:27.935343', 'step': 484, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:27.970074', 'step': 484, 'epoch': 1} {'type': 'loss', 'content': 0.019743014127016068, 'timestamp': '2025-09-30 22:13:27.978350', 'step': 485, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:28.013347', 'step': 485, 'epoch': 1} {'type': 'loss', 'content': 0.027920933440327644, 'timestamp': '2025-09-30 22:13:28.024083', 'step': 486, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:28.064736', 'step': 486, 'epoch': 1} {'type': 'loss', 'content': 0.01832328550517559, 'timestamp': '2025-09-30 22:13:28.074965', 'step': 487, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:13:28.115912', 'step': 487, 'epoch': 1} {'type': 'loss', 'content': 0.012525717727839947, 'timestamp': '2025-09-30 22:13:28.152412', 'step': 488, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:28.189429', 'step': 488, 'epoch': 1} {'type': 'loss', 'content': 0.019297080114483833, 'timestamp': '2025-09-30 22:13:28.202034', 'step': 489, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:28.245096', 'step': 489, 'epoch': 1} {'type': 'loss', 'content': 0.011257869191467762, 'timestamp': '2025-09-30 22:13:28.258863', 'step': 490, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:28.299787', 'step': 490, 'epoch': 1} {'type': 'loss', 'content': 0.022702723741531372, 'timestamp': '2025-09-30 22:13:28.307924', 'step': 491, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:28.342762', 'step': 491, 'epoch': 1} {'type': 'loss', 'content': 0.014692867174744606, 'timestamp': '2025-09-30 22:13:28.373905', 'step': 492, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:28.413206', 'step': 492, 'epoch': 1} {'type': 'loss', 'content': 0.012005449272692204, 'timestamp': '2025-09-30 22:13:28.423746', 'step': 493, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:28.461744', 'step': 493, 'epoch': 1} {'type': 'loss', 'content': 0.012011608108878136, 'timestamp': '2025-09-30 22:13:28.475488', 'step': 494, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:28.517536', 'step': 494, 'epoch': 1} {'type': 'loss', 'content': 0.006444706581532955, 'timestamp': '2025-09-30 22:13:28.531238', 'step': 495, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:28.570258', 'step': 495, 'epoch': 1} {'type': 'loss', 'content': 0.007369877304881811, 'timestamp': '2025-09-30 22:13:28.604786', 'step': 496, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:28.641550', 'step': 496, 'epoch': 1} {'type': 'loss', 'content': 0.010467078536748886, 'timestamp': '2025-09-30 22:13:28.654717', 'step': 497, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:28.687517', 'step': 497, 'epoch': 1} {'type': 'loss', 'content': 0.015244974754750729, 'timestamp': '2025-09-30 22:13:28.699521', 'step': 498, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:28.732464', 'step': 498, 'epoch': 1} {'type': 'loss', 'content': 0.01717539317905903, 'timestamp': '2025-09-30 22:13:28.740334', 'step': 499, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:28.783711', 'step': 499, 'epoch': 1} {'type': 'loss', 'content': 0.013126603327691555, 'timestamp': '2025-09-30 22:13:28.816884', 'step': 500, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 500', 'timestamp': '2025-09-30 22:13:34.118711', 'step': 500, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:34.157718', 'step': 500, 'epoch': 1} {'type': 'loss', 'content': 0.015919821336865425, 'timestamp': '2025-09-30 22:13:34.166491', 'step': 501, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:34.205992', 'step': 501, 'epoch': 1} {'type': 'loss', 'content': 0.019572250545024872, 'timestamp': '2025-09-30 22:13:34.213856', 'step': 502, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:13:34.255280', 'step': 502, 'epoch': 1} {'type': 'loss', 'content': 0.009043821133673191, 'timestamp': '2025-09-30 22:13:34.270837', 'step': 503, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:34.308537', 'step': 503, 'epoch': 1} {'type': 'loss', 'content': 0.016094347462058067, 'timestamp': '2025-09-30 22:13:34.337220', 'step': 504, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:34.373861', 'step': 504, 'epoch': 1} {'type': 'loss', 'content': 0.014503546059131622, 'timestamp': '2025-09-30 22:13:34.386596', 'step': 505, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:34.419290', 'step': 505, 'epoch': 1} {'type': 'loss', 'content': 0.0143441092222929, 'timestamp': '2025-09-30 22:13:34.430355', 'step': 506, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:34.471809', 'step': 506, 'epoch': 1} {'type': 'loss', 'content': 0.009784374386072159, 'timestamp': '2025-09-30 22:13:34.485595', 'step': 507, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:34.521390', 'step': 507, 'epoch': 1} {'type': 'loss', 'content': 0.018428310751914978, 'timestamp': '2025-09-30 22:13:34.555645', 'step': 508, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:34.594874', 'step': 508, 'epoch': 1} {'type': 'loss', 'content': 0.01567312888801098, 'timestamp': '2025-09-30 22:13:34.605266', 'step': 509, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:34.643025', 'step': 509, 'epoch': 1} {'type': 'loss', 'content': 0.015023056417703629, 'timestamp': '2025-09-30 22:13:34.655362', 'step': 510, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:34.699084', 'step': 510, 'epoch': 1} {'type': 'loss', 'content': 0.010495069436728954, 'timestamp': '2025-09-30 22:13:34.715011', 'step': 511, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:34.754164', 'step': 511, 'epoch': 1} {'type': 'loss', 'content': 0.010150685906410217, 'timestamp': '2025-09-30 22:13:34.789030', 'step': 512, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:34.825356', 'step': 512, 'epoch': 1} {'type': 'loss', 'content': 0.019635174423456192, 'timestamp': '2025-09-30 22:13:34.833928', 'step': 513, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:34.876992', 'step': 513, 'epoch': 1} {'type': 'loss', 'content': 0.010541053488850594, 'timestamp': '2025-09-30 22:13:34.890729', 'step': 514, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:34.925435', 'step': 514, 'epoch': 1} {'type': 'loss', 'content': 0.019847339019179344, 'timestamp': '2025-09-30 22:13:34.937586', 'step': 515, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:34.971644', 'step': 515, 'epoch': 1} {'type': 'loss', 'content': 0.019181231036782265, 'timestamp': '2025-09-30 22:13:35.003437', 'step': 516, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:35.038271', 'step': 516, 'epoch': 1} {'type': 'loss', 'content': 0.01539756078273058, 'timestamp': '2025-09-30 22:13:35.043192', 'step': 517, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:35.080332', 'step': 517, 'epoch': 1} {'type': 'loss', 'content': 0.01914926804602146, 'timestamp': '2025-09-30 22:13:35.090603', 'step': 518, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:35.129685', 'step': 518, 'epoch': 1} {'type': 'loss', 'content': 0.008599474094808102, 'timestamp': '2025-09-30 22:13:35.140777', 'step': 519, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:35.183037', 'step': 519, 'epoch': 1} {'type': 'loss', 'content': 0.011888724751770496, 'timestamp': '2025-09-30 22:13:35.211061', 'step': 520, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:35.245377', 'step': 520, 'epoch': 1} {'type': 'loss', 'content': 0.012584488838911057, 'timestamp': '2025-09-30 22:13:35.251150', 'step': 521, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:35.283671', 'step': 521, 'epoch': 1} {'type': 'loss', 'content': 0.01680966280400753, 'timestamp': '2025-09-30 22:13:35.290988', 'step': 522, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:35.326768', 'step': 522, 'epoch': 1} {'type': 'loss', 'content': 0.014662979170680046, 'timestamp': '2025-09-30 22:13:35.340535', 'step': 523, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:35.372444', 'step': 523, 'epoch': 1} {'type': 'loss', 'content': 0.008960328064858913, 'timestamp': '2025-09-30 22:13:35.401915', 'step': 524, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:35.444449', 'step': 524, 'epoch': 1} {'type': 'loss', 'content': 0.008551303297281265, 'timestamp': '2025-09-30 22:13:35.457739', 'step': 525, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:35.497904', 'step': 525, 'epoch': 1} {'type': 'loss', 'content': 0.012693502940237522, 'timestamp': '2025-09-30 22:13:35.510516', 'step': 526, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:35.548323', 'step': 526, 'epoch': 1} {'type': 'loss', 'content': 0.014062950387597084, 'timestamp': '2025-09-30 22:13:35.556234', 'step': 527, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:35.597961', 'step': 527, 'epoch': 1} {'type': 'loss', 'content': 0.02073533460497856, 'timestamp': '2025-09-30 22:13:35.631141', 'step': 528, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:35.675027', 'step': 528, 'epoch': 1} {'type': 'loss', 'content': 0.015901118516921997, 'timestamp': '2025-09-30 22:13:35.688306', 'step': 529, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:35.727620', 'step': 529, 'epoch': 1} {'type': 'loss', 'content': 0.008050153031945229, 'timestamp': '2025-09-30 22:13:35.740244', 'step': 530, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:35.778005', 'step': 530, 'epoch': 1} {'type': 'loss', 'content': 0.005131041631102562, 'timestamp': '2025-09-30 22:13:35.791791', 'step': 531, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:35.824717', 'step': 531, 'epoch': 1} {'type': 'loss', 'content': 0.03233374282717705, 'timestamp': '2025-09-30 22:13:35.852577', 'step': 532, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:35.899146', 'step': 532, 'epoch': 1} {'type': 'loss', 'content': 0.04059325531125069, 'timestamp': '2025-09-30 22:13:35.903851', 'step': 533, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:35.939407', 'step': 533, 'epoch': 1} {'type': 'loss', 'content': 0.01124960370361805, 'timestamp': '2025-09-30 22:13:35.947345', 'step': 534, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:35.986463', 'step': 534, 'epoch': 1} {'type': 'loss', 'content': 0.010681557469069958, 'timestamp': '2025-09-30 22:13:35.999831', 'step': 535, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:36.035706', 'step': 535, 'epoch': 1} {'type': 'loss', 'content': 0.014229071326553822, 'timestamp': '2025-09-30 22:13:36.063884', 'step': 536, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:36.101348', 'step': 536, 'epoch': 1} {'type': 'loss', 'content': 0.024254217743873596, 'timestamp': '2025-09-30 22:13:36.106299', 'step': 537, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:36.146100', 'step': 537, 'epoch': 1} {'type': 'loss', 'content': 0.015680156648159027, 'timestamp': '2025-09-30 22:13:36.153817', 'step': 538, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:36.194299', 'step': 538, 'epoch': 1} {'type': 'loss', 'content': 0.010118505917489529, 'timestamp': '2025-09-30 22:13:36.207615', 'step': 539, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:36.243480', 'step': 539, 'epoch': 1} {'type': 'loss', 'content': 0.03139385208487511, 'timestamp': '2025-09-30 22:13:36.272215', 'step': 540, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:36.324438', 'step': 540, 'epoch': 1} {'type': 'loss', 'content': 0.0242606271058321, 'timestamp': '2025-09-30 22:13:36.337548', 'step': 541, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:36.374959', 'step': 541, 'epoch': 1} {'type': 'loss', 'content': 0.015055907890200615, 'timestamp': '2025-09-30 22:13:36.388683', 'step': 542, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:36.434284', 'step': 542, 'epoch': 1} {'type': 'loss', 'content': 0.015385239385068417, 'timestamp': '2025-09-30 22:13:36.447600', 'step': 543, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:36.485204', 'step': 543, 'epoch': 1} {'type': 'loss', 'content': 0.015145394951105118, 'timestamp': '2025-09-30 22:13:36.519976', 'step': 544, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:36.556807', 'step': 544, 'epoch': 1} {'type': 'loss', 'content': 0.026151733472943306, 'timestamp': '2025-09-30 22:13:36.566704', 'step': 545, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:36.608606', 'step': 545, 'epoch': 1} {'type': 'loss', 'content': 0.02166786976158619, 'timestamp': '2025-09-30 22:13:36.619705', 'step': 546, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:36.666620', 'step': 546, 'epoch': 1} {'type': 'loss', 'content': 0.015373307280242443, 'timestamp': '2025-09-30 22:13:36.679197', 'step': 547, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:36.715514', 'step': 547, 'epoch': 1} {'type': 'loss', 'content': 0.018361790105700493, 'timestamp': '2025-09-30 22:13:36.757031', 'step': 548, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:36.795980', 'step': 548, 'epoch': 1} {'type': 'loss', 'content': 0.010503076948225498, 'timestamp': '2025-09-30 22:13:36.804013', 'step': 549, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:36.850648', 'step': 549, 'epoch': 1} {'type': 'loss', 'content': 0.021157601848244667, 'timestamp': '2025-09-30 22:13:36.857881', 'step': 550, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:36.896797', 'step': 550, 'epoch': 1} {'type': 'loss', 'content': 0.026187656447291374, 'timestamp': '2025-09-30 22:13:36.903817', 'step': 551, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:36.951945', 'step': 551, 'epoch': 1} {'type': 'loss', 'content': 0.018559563905000687, 'timestamp': '2025-09-30 22:13:36.982928', 'step': 552, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:37.027735', 'step': 552, 'epoch': 1} {'type': 'loss', 'content': 0.014130979776382446, 'timestamp': '2025-09-30 22:13:37.033314', 'step': 553, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:37.073709', 'step': 553, 'epoch': 1} {'type': 'loss', 'content': 0.02166288159787655, 'timestamp': '2025-09-30 22:13:37.081388', 'step': 554, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:13:37.134201', 'step': 554, 'epoch': 1} {'type': 'loss', 'content': 0.017001010477542877, 'timestamp': '2025-09-30 22:13:37.149770', 'step': 555, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:37.187826', 'step': 555, 'epoch': 1} {'type': 'loss', 'content': 0.012803551740944386, 'timestamp': '2025-09-30 22:13:37.222427', 'step': 556, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:37.256705', 'step': 556, 'epoch': 1} {'type': 'loss', 'content': 0.016770636662840843, 'timestamp': '2025-09-30 22:13:37.262057', 'step': 557, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:13:37.304808', 'step': 557, 'epoch': 1} {'type': 'loss', 'content': 0.03533332794904709, 'timestamp': '2025-09-30 22:13:37.310757', 'step': 558, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:37.348671', 'step': 558, 'epoch': 1} {'type': 'loss', 'content': 0.017565395683050156, 'timestamp': '2025-09-30 22:13:37.356290', 'step': 559, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:37.396057', 'step': 559, 'epoch': 1} {'type': 'loss', 'content': 0.01131448894739151, 'timestamp': '2025-09-30 22:13:37.429428', 'step': 560, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:37.463433', 'step': 560, 'epoch': 1} {'type': 'loss', 'content': 0.015417898073792458, 'timestamp': '2025-09-30 22:13:37.470382', 'step': 561, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:37.513981', 'step': 561, 'epoch': 1} {'type': 'loss', 'content': 0.013023782521486282, 'timestamp': '2025-09-30 22:13:37.526341', 'step': 562, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:37.561114', 'step': 562, 'epoch': 1} {'type': 'loss', 'content': 0.014379228465259075, 'timestamp': '2025-09-30 22:13:37.569222', 'step': 563, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:37.604054', 'step': 563, 'epoch': 1} {'type': 'loss', 'content': 0.020593201741576195, 'timestamp': '2025-09-30 22:13:37.632917', 'step': 564, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:37.665735', 'step': 564, 'epoch': 1} {'type': 'loss', 'content': 0.019417183473706245, 'timestamp': '2025-09-30 22:13:37.671416', 'step': 565, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:37.708814', 'step': 565, 'epoch': 1} {'type': 'loss', 'content': 0.007432464510202408, 'timestamp': '2025-09-30 22:13:37.719190', 'step': 566, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:37.758493', 'step': 566, 'epoch': 1} {'type': 'loss', 'content': 0.01681244932115078, 'timestamp': '2025-09-30 22:13:37.772197', 'step': 567, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:13:37.825920', 'step': 567, 'epoch': 1} {'type': 'loss', 'content': 0.04454522579908371, 'timestamp': '2025-09-30 22:13:37.855407', 'step': 568, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:37.891774', 'step': 568, 'epoch': 1} {'type': 'loss', 'content': 0.012199216522276402, 'timestamp': '2025-09-30 22:13:37.899677', 'step': 569, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:37.932621', 'step': 569, 'epoch': 1} {'type': 'loss', 'content': 0.016835706308484077, 'timestamp': '2025-09-30 22:13:37.939892', 'step': 570, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:37.976639', 'step': 570, 'epoch': 1} {'type': 'loss', 'content': 0.017011551186442375, 'timestamp': '2025-09-30 22:13:37.990311', 'step': 571, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:38.029616', 'step': 571, 'epoch': 1} {'type': 'loss', 'content': 0.012688270770013332, 'timestamp': '2025-09-30 22:13:38.064153', 'step': 572, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:38.102966', 'step': 572, 'epoch': 1} {'type': 'loss', 'content': 0.012485667131841183, 'timestamp': '2025-09-30 22:13:38.115602', 'step': 573, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:38.152527', 'step': 573, 'epoch': 1} {'type': 'loss', 'content': 0.03470342978835106, 'timestamp': '2025-09-30 22:13:38.160515', 'step': 574, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:38.193863', 'step': 574, 'epoch': 1} {'type': 'loss', 'content': 0.015874244272708893, 'timestamp': '2025-09-30 22:13:38.201532', 'step': 575, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:13:40.651836', 'step': 575, 'epoch': 1} {'type': 'pplx', 'content': 5.542289261626032, 'timestamp': '2025-09-30 22:13:40.656733', 'step': 575, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:40.694481', 'step': 575, 'epoch': 1} {'type': 'loss', 'content': 0.014259914867579937, 'timestamp': '2025-09-30 22:13:40.725352', 'step': 576, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:13:40.774137', 'step': 576, 'epoch': 1} {'type': 'loss', 'content': 0.011850749142467976, 'timestamp': '2025-09-30 22:13:40.789978', 'step': 577, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:40.835881', 'step': 577, 'epoch': 1} {'type': 'loss', 'content': 0.014616033993661404, 'timestamp': '2025-09-30 22:13:40.847049', 'step': 578, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:40.883420', 'step': 578, 'epoch': 1} {'type': 'loss', 'content': 0.023280199617147446, 'timestamp': '2025-09-30 22:13:40.895406', 'step': 579, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:40.931029', 'step': 579, 'epoch': 1} {'type': 'loss', 'content': 0.02082938700914383, 'timestamp': '2025-09-30 22:13:40.959902', 'step': 580, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 17085996872448}, 'timestamp': '2025-09-30 22:13:41.008909', 'step': 580, 'epoch': 1} {'type': 'loss', 'content': 0.013227862305939198, 'timestamp': '2025-09-30 22:13:41.028178', 'step': 581, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:41.065658', 'step': 581, 'epoch': 1} {'type': 'loss', 'content': 0.009326431900262833, 'timestamp': '2025-09-30 22:13:41.079662', 'step': 582, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:41.117515', 'step': 582, 'epoch': 1} {'type': 'loss', 'content': 0.023252299055457115, 'timestamp': '2025-09-30 22:13:41.127683', 'step': 583, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:41.167813', 'step': 583, 'epoch': 1} {'type': 'loss', 'content': 0.02949347347021103, 'timestamp': '2025-09-30 22:13:41.198901', 'step': 584, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:41.234977', 'step': 584, 'epoch': 1} {'type': 'loss', 'content': 0.015549993142485619, 'timestamp': '2025-09-30 22:13:41.245442', 'step': 585, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:41.281783', 'step': 585, 'epoch': 1} {'type': 'loss', 'content': 0.004767420701682568, 'timestamp': '2025-09-30 22:13:41.295684', 'step': 586, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:41.333337', 'step': 586, 'epoch': 1} {'type': 'loss', 'content': 0.008911280892789364, 'timestamp': '2025-09-30 22:13:41.346963', 'step': 587, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:41.380866', 'step': 587, 'epoch': 1} {'type': 'loss', 'content': 0.01907944679260254, 'timestamp': '2025-09-30 22:13:41.408785', 'step': 588, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:41.449487', 'step': 588, 'epoch': 1} {'type': 'loss', 'content': 0.020382946357131004, 'timestamp': '2025-09-30 22:13:41.457957', 'step': 589, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:41.491429', 'step': 589, 'epoch': 1} {'type': 'loss', 'content': 0.014416304416954517, 'timestamp': '2025-09-30 22:13:41.502478', 'step': 590, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:41.536612', 'step': 590, 'epoch': 1} {'type': 'loss', 'content': 0.023991908878087997, 'timestamp': '2025-09-30 22:13:41.547626', 'step': 591, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:41.582475', 'step': 591, 'epoch': 1} {'type': 'loss', 'content': 0.010505203157663345, 'timestamp': '2025-09-30 22:13:41.613774', 'step': 592, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:13:41.664513', 'step': 592, 'epoch': 1} {'type': 'loss', 'content': 0.005415383726358414, 'timestamp': '2025-09-30 22:13:41.681220', 'step': 593, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:41.713701', 'step': 593, 'epoch': 1} {'type': 'loss', 'content': 0.016899965703487396, 'timestamp': '2025-09-30 22:13:41.723283', 'step': 594, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:41.764845', 'step': 594, 'epoch': 1} {'type': 'loss', 'content': 0.015798628330230713, 'timestamp': '2025-09-30 22:13:41.774961', 'step': 595, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:41.808146', 'step': 595, 'epoch': 1} {'type': 'loss', 'content': 0.027095021679997444, 'timestamp': '2025-09-30 22:13:41.836849', 'step': 596, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:41.871885', 'step': 596, 'epoch': 1} {'type': 'loss', 'content': 0.017347702756524086, 'timestamp': '2025-09-30 22:13:41.877564', 'step': 597, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:41.914114', 'step': 597, 'epoch': 1} {'type': 'loss', 'content': 0.0157342329621315, 'timestamp': '2025-09-30 22:13:41.925129', 'step': 598, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:41.960821', 'step': 598, 'epoch': 1} {'type': 'loss', 'content': 0.011855707503855228, 'timestamp': '2025-09-30 22:13:41.974239', 'step': 599, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:42.009285', 'step': 599, 'epoch': 1} {'type': 'loss', 'content': 0.021833263337612152, 'timestamp': '2025-09-30 22:13:42.041404', 'step': 600, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:42.075086', 'step': 600, 'epoch': 1} {'type': 'loss', 'content': 0.022402599453926086, 'timestamp': '2025-09-30 22:13:42.084943', 'step': 601, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:42.125604', 'step': 601, 'epoch': 1} {'type': 'loss', 'content': 0.013091500848531723, 'timestamp': '2025-09-30 22:13:42.134604', 'step': 602, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:42.176735', 'step': 602, 'epoch': 1} {'type': 'loss', 'content': 0.01641802117228508, 'timestamp': '2025-09-30 22:13:42.184313', 'step': 603, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:42.218204', 'step': 603, 'epoch': 1} {'type': 'loss', 'content': 0.017607904970645905, 'timestamp': '2025-09-30 22:13:42.250005', 'step': 604, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:42.284753', 'step': 604, 'epoch': 1} {'type': 'loss', 'content': 0.017780223861336708, 'timestamp': '2025-09-30 22:13:42.290138', 'step': 605, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:42.326782', 'step': 605, 'epoch': 1} {'type': 'loss', 'content': 0.02521861344575882, 'timestamp': '2025-09-30 22:13:42.337254', 'step': 606, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:42.379715', 'step': 606, 'epoch': 1} {'type': 'loss', 'content': 0.015294995158910751, 'timestamp': '2025-09-30 22:13:42.393584', 'step': 607, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:42.427099', 'step': 607, 'epoch': 1} {'type': 'loss', 'content': 0.03106614388525486, 'timestamp': '2025-09-30 22:13:42.457640', 'step': 608, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:42.495788', 'step': 608, 'epoch': 1} {'type': 'loss', 'content': 0.01641874760389328, 'timestamp': '2025-09-30 22:13:42.504501', 'step': 609, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:42.544586', 'step': 609, 'epoch': 1} {'type': 'loss', 'content': 0.024906247854232788, 'timestamp': '2025-09-30 22:13:42.554950', 'step': 610, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:42.592108', 'step': 610, 'epoch': 1} {'type': 'loss', 'content': 0.019705880433321, 'timestamp': '2025-09-30 22:13:42.603191', 'step': 611, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:42.637547', 'step': 611, 'epoch': 1} {'type': 'loss', 'content': 0.012636306695640087, 'timestamp': '2025-09-30 22:13:42.669395', 'step': 612, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:42.701195', 'step': 612, 'epoch': 1} {'type': 'loss', 'content': 0.0174989253282547, 'timestamp': '2025-09-30 22:13:42.710964', 'step': 613, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:42.749637', 'step': 613, 'epoch': 1} {'type': 'loss', 'content': 0.01590840145945549, 'timestamp': '2025-09-30 22:13:42.760667', 'step': 614, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:42.795312', 'step': 614, 'epoch': 1} {'type': 'loss', 'content': 0.01362869143486023, 'timestamp': '2025-09-30 22:13:42.806454', 'step': 615, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:42.848751', 'step': 615, 'epoch': 1} {'type': 'loss', 'content': 0.009033956564962864, 'timestamp': '2025-09-30 22:13:42.882156', 'step': 616, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:42.918397', 'step': 616, 'epoch': 1} {'type': 'loss', 'content': 0.021320108324289322, 'timestamp': '2025-09-30 22:13:42.924003', 'step': 617, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:42.956642', 'step': 617, 'epoch': 1} {'type': 'loss', 'content': 0.019740305840969086, 'timestamp': '2025-09-30 22:13:42.966911', 'step': 618, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:43.011329', 'step': 618, 'epoch': 1} {'type': 'loss', 'content': 0.022858453914523125, 'timestamp': '2025-09-30 22:13:43.022590', 'step': 619, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:43.054936', 'step': 619, 'epoch': 1} {'type': 'loss', 'content': 0.018595917150378227, 'timestamp': '2025-09-30 22:13:43.083658', 'step': 620, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:43.120775', 'step': 620, 'epoch': 1} {'type': 'loss', 'content': 0.011981564573943615, 'timestamp': '2025-09-30 22:13:43.133812', 'step': 621, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:13:43.176548', 'step': 621, 'epoch': 1} {'type': 'loss', 'content': 0.01760626956820488, 'timestamp': '2025-09-30 22:13:43.193848', 'step': 622, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:43.233906', 'step': 622, 'epoch': 1} {'type': 'loss', 'content': 0.008837936446070671, 'timestamp': '2025-09-30 22:13:43.248018', 'step': 623, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:43.284221', 'step': 623, 'epoch': 1} {'type': 'loss', 'content': 0.015375801362097263, 'timestamp': '2025-09-30 22:13:43.318691', 'step': 624, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:43.353806', 'step': 624, 'epoch': 1} {'type': 'loss', 'content': 0.014492068439722061, 'timestamp': '2025-09-30 22:13:43.362488', 'step': 625, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:43.402925', 'step': 625, 'epoch': 1} {'type': 'loss', 'content': 0.01539452001452446, 'timestamp': '2025-09-30 22:13:43.416672', 'step': 626, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:13:43.462546', 'step': 626, 'epoch': 1} {'type': 'loss', 'content': 0.009434428997337818, 'timestamp': '2025-09-30 22:13:43.479609', 'step': 627, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:43.518443', 'step': 627, 'epoch': 1} {'type': 'loss', 'content': 0.01171807199716568, 'timestamp': '2025-09-30 22:13:43.553061', 'step': 628, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:43.594337', 'step': 628, 'epoch': 1} {'type': 'loss', 'content': 0.014136611483991146, 'timestamp': '2025-09-30 22:13:43.607650', 'step': 629, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:13:43.656488', 'step': 629, 'epoch': 1} {'type': 'loss', 'content': 0.017313985154032707, 'timestamp': '2025-09-30 22:13:43.672879', 'step': 630, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:13:43.725036', 'step': 630, 'epoch': 1} {'type': 'loss', 'content': 0.0077360207214951515, 'timestamp': '2025-09-30 22:13:43.742352', 'step': 631, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:43.783843', 'step': 631, 'epoch': 1} {'type': 'loss', 'content': 0.012235143221914768, 'timestamp': '2025-09-30 22:13:43.818751', 'step': 632, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:13:43.864741', 'step': 632, 'epoch': 1} {'type': 'loss', 'content': 0.0079409284517169, 'timestamp': '2025-09-30 22:13:43.880618', 'step': 633, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:43.916500', 'step': 633, 'epoch': 1} {'type': 'loss', 'content': 0.026523813605308533, 'timestamp': '2025-09-30 22:13:43.925891', 'step': 634, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:43.962688', 'step': 634, 'epoch': 1} {'type': 'loss', 'content': 0.008819608949124813, 'timestamp': '2025-09-30 22:13:43.970518', 'step': 635, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:44.006185', 'step': 635, 'epoch': 1} {'type': 'loss', 'content': 0.011811159551143646, 'timestamp': '2025-09-30 22:13:44.039358', 'step': 636, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:44.079084', 'step': 636, 'epoch': 1} {'type': 'loss', 'content': 0.018270481377840042, 'timestamp': '2025-09-30 22:13:44.088829', 'step': 637, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:44.127641', 'step': 637, 'epoch': 1} {'type': 'loss', 'content': 0.014364159666001797, 'timestamp': '2025-09-30 22:13:44.139966', 'step': 638, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:44.184200', 'step': 638, 'epoch': 1} {'type': 'loss', 'content': 0.013673737645149231, 'timestamp': '2025-09-30 22:13:44.196801', 'step': 639, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:44.233771', 'step': 639, 'epoch': 1} {'type': 'loss', 'content': 0.013592913746833801, 'timestamp': '2025-09-30 22:13:44.264940', 'step': 640, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:44.300405', 'step': 640, 'epoch': 1} {'type': 'loss', 'content': 0.013886515982449055, 'timestamp': '2025-09-30 22:13:44.313121', 'step': 641, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:44.354448', 'step': 641, 'epoch': 1} {'type': 'loss', 'content': 0.017853496596217155, 'timestamp': '2025-09-30 22:13:44.367796', 'step': 642, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:44.409545', 'step': 642, 'epoch': 1} {'type': 'loss', 'content': 0.013504012487828732, 'timestamp': '2025-09-30 22:13:44.422805', 'step': 643, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:44.464796', 'step': 643, 'epoch': 1} {'type': 'loss', 'content': 0.024404358118772507, 'timestamp': '2025-09-30 22:13:44.492542', 'step': 644, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:44.534800', 'step': 644, 'epoch': 1} {'type': 'loss', 'content': 0.012465760111808777, 'timestamp': '2025-09-30 22:13:44.543478', 'step': 645, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:44.583035', 'step': 645, 'epoch': 1} {'type': 'loss', 'content': 0.009218334220349789, 'timestamp': '2025-09-30 22:13:44.595325', 'step': 646, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:44.640836', 'step': 646, 'epoch': 1} {'type': 'loss', 'content': 0.016696080565452576, 'timestamp': '2025-09-30 22:13:44.648495', 'step': 647, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:44.689146', 'step': 647, 'epoch': 1} {'type': 'loss', 'content': 0.015427506528794765, 'timestamp': '2025-09-30 22:13:44.722366', 'step': 648, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:44.761503', 'step': 648, 'epoch': 1} {'type': 'loss', 'content': 0.018487118184566498, 'timestamp': '2025-09-30 22:13:44.767816', 'step': 649, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:44.801172', 'step': 649, 'epoch': 1} {'type': 'loss', 'content': 0.019440632313489914, 'timestamp': '2025-09-30 22:13:44.811652', 'step': 650, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:44.842722', 'step': 650, 'epoch': 1} {'type': 'loss', 'content': 0.02813410945236683, 'timestamp': '2025-09-30 22:13:44.849701', 'step': 651, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:44.889282', 'step': 651, 'epoch': 1} {'type': 'loss', 'content': 0.0211077481508255, 'timestamp': '2025-09-30 22:13:44.920427', 'step': 652, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:44.958158', 'step': 652, 'epoch': 1} {'type': 'loss', 'content': 0.015093185007572174, 'timestamp': '2025-09-30 22:13:44.966021', 'step': 653, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:45.003385', 'step': 653, 'epoch': 1} {'type': 'loss', 'content': 0.020069634541869164, 'timestamp': '2025-09-30 22:13:45.015845', 'step': 654, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:45.049982', 'step': 654, 'epoch': 1} {'type': 'loss', 'content': 0.011099635623395443, 'timestamp': '2025-09-30 22:13:45.060301', 'step': 655, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:45.092528', 'step': 655, 'epoch': 1} {'type': 'loss', 'content': 0.012230321764945984, 'timestamp': '2025-09-30 22:13:45.121315', 'step': 656, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:45.154483', 'step': 656, 'epoch': 1} {'type': 'loss', 'content': 0.019323699176311493, 'timestamp': '2025-09-30 22:13:45.159780', 'step': 657, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:45.197662', 'step': 657, 'epoch': 1} {'type': 'loss', 'content': 0.01795719750225544, 'timestamp': '2025-09-30 22:13:45.204505', 'step': 658, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:45.236063', 'step': 658, 'epoch': 1} {'type': 'loss', 'content': 0.025249261409044266, 'timestamp': '2025-09-30 22:13:45.244008', 'step': 659, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:45.276674', 'step': 659, 'epoch': 1} {'type': 'loss', 'content': 0.017105920240283012, 'timestamp': '2025-09-30 22:13:45.305612', 'step': 660, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:45.343907', 'step': 660, 'epoch': 1} {'type': 'loss', 'content': 0.010925698094069958, 'timestamp': '2025-09-30 22:13:45.351844', 'step': 661, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:45.385988', 'step': 661, 'epoch': 1} {'type': 'loss', 'content': 0.009514889679849148, 'timestamp': '2025-09-30 22:13:45.398487', 'step': 662, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:13:45.435188', 'step': 662, 'epoch': 1} {'type': 'loss', 'content': 0.02289985679090023, 'timestamp': '2025-09-30 22:13:45.439812', 'step': 663, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:45.473761', 'step': 663, 'epoch': 1} {'type': 'loss', 'content': 0.03659540042281151, 'timestamp': '2025-09-30 22:13:45.501589', 'step': 664, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:45.533023', 'step': 664, 'epoch': 1} {'type': 'loss', 'content': 0.03184283524751663, 'timestamp': '2025-09-30 22:13:45.542741', 'step': 665, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:45.574688', 'step': 665, 'epoch': 1} {'type': 'loss', 'content': 0.00956232100725174, 'timestamp': '2025-09-30 22:13:45.585185', 'step': 666, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:45.622382', 'step': 666, 'epoch': 1} {'type': 'loss', 'content': 0.011457591317594051, 'timestamp': '2025-09-30 22:13:45.635753', 'step': 667, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:45.674621', 'step': 667, 'epoch': 1} {'type': 'loss', 'content': 0.024427276104688644, 'timestamp': '2025-09-30 22:13:45.703401', 'step': 668, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:45.743551', 'step': 668, 'epoch': 1} {'type': 'loss', 'content': 0.01626475900411606, 'timestamp': '2025-09-30 22:13:45.751576', 'step': 669, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:45.800286', 'step': 669, 'epoch': 1} {'type': 'loss', 'content': 0.010417117737233639, 'timestamp': '2025-09-30 22:13:45.816069', 'step': 670, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:45.862642', 'step': 670, 'epoch': 1} {'type': 'loss', 'content': 0.019533686339855194, 'timestamp': '2025-09-30 22:13:45.875952', 'step': 671, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:45.911280', 'step': 671, 'epoch': 1} {'type': 'loss', 'content': 0.010500402189791203, 'timestamp': '2025-09-30 22:13:45.945792', 'step': 672, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:45.979716', 'step': 672, 'epoch': 1} {'type': 'loss', 'content': 0.013673270121216774, 'timestamp': '2025-09-30 22:13:45.992800', 'step': 673, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:46.030483', 'step': 673, 'epoch': 1} {'type': 'loss', 'content': 0.009827575646340847, 'timestamp': '2025-09-30 22:13:46.042761', 'step': 674, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:46.075750', 'step': 674, 'epoch': 1} {'type': 'loss', 'content': 0.014296416193246841, 'timestamp': '2025-09-30 22:13:46.083817', 'step': 675, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:46.121583', 'step': 675, 'epoch': 1} {'type': 'loss', 'content': 0.01631656102836132, 'timestamp': '2025-09-30 22:13:46.156078', 'step': 676, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:46.191309', 'step': 676, 'epoch': 1} {'type': 'loss', 'content': 0.020268293097615242, 'timestamp': '2025-09-30 22:13:46.206119', 'step': 677, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:46.245281', 'step': 677, 'epoch': 1} {'type': 'loss', 'content': 0.013104826211929321, 'timestamp': '2025-09-30 22:13:46.258911', 'step': 678, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:46.290720', 'step': 678, 'epoch': 1} {'type': 'loss', 'content': 0.014018409885466099, 'timestamp': '2025-09-30 22:13:46.297705', 'step': 679, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:46.334566', 'step': 679, 'epoch': 1} {'type': 'loss', 'content': 0.015949858352541924, 'timestamp': '2025-09-30 22:13:46.363117', 'step': 680, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:13:46.400618', 'step': 680, 'epoch': 1} {'type': 'loss', 'content': 0.018937628716230392, 'timestamp': '2025-09-30 22:13:46.415725', 'step': 681, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:46.451324', 'step': 681, 'epoch': 1} {'type': 'loss', 'content': 0.016113348305225372, 'timestamp': '2025-09-30 22:13:46.461558', 'step': 682, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:46.497757', 'step': 682, 'epoch': 1} {'type': 'loss', 'content': 0.01504132803529501, 'timestamp': '2025-09-30 22:13:46.505006', 'step': 683, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:46.538979', 'step': 683, 'epoch': 1} {'type': 'loss', 'content': 0.021047841757535934, 'timestamp': '2025-09-30 22:13:46.570851', 'step': 684, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:46.605659', 'step': 684, 'epoch': 1} {'type': 'loss', 'content': 0.012815596535801888, 'timestamp': '2025-09-30 22:13:46.611235', 'step': 685, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:46.645055', 'step': 685, 'epoch': 1} {'type': 'loss', 'content': 0.02239864692091942, 'timestamp': '2025-09-30 22:13:46.657394', 'step': 686, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:46.691287', 'step': 686, 'epoch': 1} {'type': 'loss', 'content': 0.05522322282195091, 'timestamp': '2025-09-30 22:13:46.698954', 'step': 687, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:46.736092', 'step': 687, 'epoch': 1} {'type': 'loss', 'content': 0.047288164496421814, 'timestamp': '2025-09-30 22:13:46.769520', 'step': 688, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:46.806086', 'step': 688, 'epoch': 1} {'type': 'loss', 'content': 0.019594833254814148, 'timestamp': '2025-09-30 22:13:46.819103', 'step': 689, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:46.860683', 'step': 689, 'epoch': 1} {'type': 'loss', 'content': 0.018771354109048843, 'timestamp': '2025-09-30 22:13:46.871037', 'step': 690, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:13:49.364533', 'step': 690, 'epoch': 1} {'type': 'pplx', 'content': 5.484566580135139, 'timestamp': '2025-09-30 22:13:49.368457', 'step': 690, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:49.412394', 'step': 690, 'epoch': 1} {'type': 'loss', 'content': 0.025742974132299423, 'timestamp': '2025-09-30 22:13:49.421766', 'step': 691, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:49.459471', 'step': 691, 'epoch': 1} {'type': 'loss', 'content': 0.013385286554694176, 'timestamp': '2025-09-30 22:13:49.491090', 'step': 692, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:49.526721', 'step': 692, 'epoch': 1} {'type': 'loss', 'content': 0.00965004600584507, 'timestamp': '2025-09-30 22:13:49.534601', 'step': 693, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:49.570431', 'step': 693, 'epoch': 1} {'type': 'loss', 'content': 0.018168870359659195, 'timestamp': '2025-09-30 22:13:49.577668', 'step': 694, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:49.613251', 'step': 694, 'epoch': 1} {'type': 'loss', 'content': 0.013506671413779259, 'timestamp': '2025-09-30 22:13:49.624130', 'step': 695, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:13:49.670015', 'step': 695, 'epoch': 1} {'type': 'loss', 'content': 0.009025247767567635, 'timestamp': '2025-09-30 22:13:49.707090', 'step': 696, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:13:49.745185', 'step': 696, 'epoch': 1} {'type': 'loss', 'content': 0.041763078421354294, 'timestamp': '2025-09-30 22:13:49.756531', 'step': 697, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:49.799941', 'step': 697, 'epoch': 1} {'type': 'loss', 'content': 0.04631451517343521, 'timestamp': '2025-09-30 22:13:49.813388', 'step': 698, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:49.861839', 'step': 698, 'epoch': 1} {'type': 'loss', 'content': 0.01658572070300579, 'timestamp': '2025-09-30 22:13:49.869480', 'step': 699, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:49.906859', 'step': 699, 'epoch': 1} {'type': 'loss', 'content': 0.009366752579808235, 'timestamp': '2025-09-30 22:13:49.938726', 'step': 700, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:49.973301', 'step': 700, 'epoch': 1} {'type': 'loss', 'content': 0.020774291828274727, 'timestamp': '2025-09-30 22:13:49.984670', 'step': 701, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:50.022185', 'step': 701, 'epoch': 1} {'type': 'loss', 'content': 0.015729684382677078, 'timestamp': '2025-09-30 22:13:50.033250', 'step': 702, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:50.067552', 'step': 702, 'epoch': 1} {'type': 'loss', 'content': 0.011922284960746765, 'timestamp': '2025-09-30 22:13:50.079759', 'step': 703, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:50.134609', 'step': 703, 'epoch': 1} {'type': 'loss', 'content': 0.013412105850875378, 'timestamp': '2025-09-30 22:13:50.162720', 'step': 704, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:50.203110', 'step': 704, 'epoch': 1} {'type': 'loss', 'content': 0.027656683698296547, 'timestamp': '2025-09-30 22:13:50.211735', 'step': 705, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:50.246411', 'step': 705, 'epoch': 1} {'type': 'loss', 'content': 0.017529543489217758, 'timestamp': '2025-09-30 22:13:50.253359', 'step': 706, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:50.286731', 'step': 706, 'epoch': 1} {'type': 'loss', 'content': 0.012763168662786484, 'timestamp': '2025-09-30 22:13:50.293502', 'step': 707, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:50.327321', 'step': 707, 'epoch': 1} {'type': 'loss', 'content': 0.023422418162226677, 'timestamp': '2025-09-30 22:13:50.355691', 'step': 708, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:50.390168', 'step': 708, 'epoch': 1} {'type': 'loss', 'content': 0.020299313589930534, 'timestamp': '2025-09-30 22:13:50.395455', 'step': 709, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:50.430736', 'step': 709, 'epoch': 1} {'type': 'loss', 'content': 0.01444624736905098, 'timestamp': '2025-09-30 22:13:50.437911', 'step': 710, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:50.473148', 'step': 710, 'epoch': 1} {'type': 'loss', 'content': 0.02190915122628212, 'timestamp': '2025-09-30 22:13:50.482260', 'step': 711, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:50.517666', 'step': 711, 'epoch': 1} {'type': 'loss', 'content': 0.01795337162911892, 'timestamp': '2025-09-30 22:13:50.549058', 'step': 712, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:50.591108', 'step': 712, 'epoch': 1} {'type': 'loss', 'content': 0.022492265328764915, 'timestamp': '2025-09-30 22:13:50.595952', 'step': 713, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:50.636614', 'step': 713, 'epoch': 1} {'type': 'loss', 'content': 0.018829617649316788, 'timestamp': '2025-09-30 22:13:50.647014', 'step': 714, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:13:50.683152', 'step': 714, 'epoch': 1} {'type': 'loss', 'content': 0.024367261677980423, 'timestamp': '2025-09-30 22:13:50.687425', 'step': 715, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:13:50.721032', 'step': 715, 'epoch': 1} {'type': 'loss', 'content': 0.01960578002035618, 'timestamp': '2025-09-30 22:13:50.748399', 'step': 716, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:50.783905', 'step': 716, 'epoch': 1} {'type': 'loss', 'content': 0.021256817504763603, 'timestamp': '2025-09-30 22:13:50.788518', 'step': 717, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:50.829916', 'step': 717, 'epoch': 1} {'type': 'loss', 'content': 0.03248652443289757, 'timestamp': '2025-09-30 22:13:50.839728', 'step': 718, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:50.885312', 'step': 718, 'epoch': 1} {'type': 'loss', 'content': 0.028153620660305023, 'timestamp': '2025-09-30 22:13:50.893291', 'step': 719, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:50.931729', 'step': 719, 'epoch': 1} {'type': 'loss', 'content': 0.006349663250148296, 'timestamp': '2025-09-30 22:13:50.959774', 'step': 720, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:50.994981', 'step': 720, 'epoch': 1} {'type': 'loss', 'content': 0.018377935513854027, 'timestamp': '2025-09-30 22:13:51.000468', 'step': 721, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:51.038006', 'step': 721, 'epoch': 1} {'type': 'loss', 'content': 0.01730031706392765, 'timestamp': '2025-09-30 22:13:51.046409', 'step': 722, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:51.079171', 'step': 722, 'epoch': 1} {'type': 'loss', 'content': 0.022774288430809975, 'timestamp': '2025-09-30 22:13:51.086378', 'step': 723, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:51.142088', 'step': 723, 'epoch': 1} {'type': 'loss', 'content': 0.020183557644486427, 'timestamp': '2025-09-30 22:13:51.178808', 'step': 724, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:51.213986', 'step': 724, 'epoch': 1} {'type': 'loss', 'content': 0.023975694552063942, 'timestamp': '2025-09-30 22:13:51.219234', 'step': 725, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:51.262463', 'step': 725, 'epoch': 1} {'type': 'loss', 'content': 0.011222700588405132, 'timestamp': '2025-09-30 22:13:51.269714', 'step': 726, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:51.306276', 'step': 726, 'epoch': 1} {'type': 'loss', 'content': 0.011557246558368206, 'timestamp': '2025-09-30 22:13:51.317352', 'step': 727, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:51.356064', 'step': 727, 'epoch': 1} {'type': 'loss', 'content': 0.010076114907860756, 'timestamp': '2025-09-30 22:13:51.390759', 'step': 728, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:51.421783', 'step': 728, 'epoch': 1} {'type': 'loss', 'content': 0.018097903579473495, 'timestamp': '2025-09-30 22:13:51.430402', 'step': 729, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:51.468030', 'step': 729, 'epoch': 1} {'type': 'loss', 'content': 0.01810041442513466, 'timestamp': '2025-09-30 22:13:51.475202', 'step': 730, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:51.510176', 'step': 730, 'epoch': 1} {'type': 'loss', 'content': 0.014912882819771767, 'timestamp': '2025-09-30 22:13:51.517885', 'step': 731, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:51.554991', 'step': 731, 'epoch': 1} {'type': 'loss', 'content': 0.030342083424329758, 'timestamp': '2025-09-30 22:13:51.586228', 'step': 732, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:51.623186', 'step': 732, 'epoch': 1} {'type': 'loss', 'content': 0.020088672637939453, 'timestamp': '2025-09-30 22:13:51.628865', 'step': 733, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:51.669345', 'step': 733, 'epoch': 1} {'type': 'loss', 'content': 0.02861752174794674, 'timestamp': '2025-09-30 22:13:51.681846', 'step': 734, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:51.717305', 'step': 734, 'epoch': 1} {'type': 'loss', 'content': 0.018066808581352234, 'timestamp': '2025-09-30 22:13:51.728252', 'step': 735, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:51.765716', 'step': 735, 'epoch': 1} {'type': 'loss', 'content': 0.01734330505132675, 'timestamp': '2025-09-30 22:13:51.793853', 'step': 736, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:13:51.835557', 'step': 736, 'epoch': 1} {'type': 'loss', 'content': 0.01800304837524891, 'timestamp': '2025-09-30 22:13:51.850671', 'step': 737, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:51.883073', 'step': 737, 'epoch': 1} {'type': 'loss', 'content': 0.013623161241412163, 'timestamp': '2025-09-30 22:13:51.891018', 'step': 738, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:51.924341', 'step': 738, 'epoch': 1} {'type': 'loss', 'content': 0.013523743487894535, 'timestamp': '2025-09-30 22:13:51.934770', 'step': 739, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:51.969832', 'step': 739, 'epoch': 1} {'type': 'loss', 'content': 0.017236221581697464, 'timestamp': '2025-09-30 22:13:52.001212', 'step': 740, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:52.045376', 'step': 740, 'epoch': 1} {'type': 'loss', 'content': 0.023654861375689507, 'timestamp': '2025-09-30 22:13:52.051027', 'step': 741, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:13:52.096205', 'step': 741, 'epoch': 1} {'type': 'loss', 'content': 0.010722161270678043, 'timestamp': '2025-09-30 22:13:52.113567', 'step': 742, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:52.155172', 'step': 742, 'epoch': 1} {'type': 'loss', 'content': 0.013899749144911766, 'timestamp': '2025-09-30 22:13:52.167517', 'step': 743, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:52.206419', 'step': 743, 'epoch': 1} {'type': 'loss', 'content': 0.020585909485816956, 'timestamp': '2025-09-30 22:13:52.234992', 'step': 744, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:52.269850', 'step': 744, 'epoch': 1} {'type': 'loss', 'content': 0.01175573468208313, 'timestamp': '2025-09-30 22:13:52.280511', 'step': 745, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:52.321935', 'step': 745, 'epoch': 1} {'type': 'loss', 'content': 0.023122472688555717, 'timestamp': '2025-09-30 22:13:52.329965', 'step': 746, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:52.367523', 'step': 746, 'epoch': 1} {'type': 'loss', 'content': 0.019044872373342514, 'timestamp': '2025-09-30 22:13:52.379841', 'step': 747, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:52.422914', 'step': 747, 'epoch': 1} {'type': 'loss', 'content': 0.013782423920929432, 'timestamp': '2025-09-30 22:13:52.451727', 'step': 748, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:52.489172', 'step': 748, 'epoch': 1} {'type': 'loss', 'content': 0.020801430568099022, 'timestamp': '2025-09-30 22:13:52.497779', 'step': 749, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:52.531028', 'step': 749, 'epoch': 1} {'type': 'loss', 'content': 0.023138711228966713, 'timestamp': '2025-09-30 22:13:52.542056', 'step': 750, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:52.580836', 'step': 750, 'epoch': 1} {'type': 'loss', 'content': 0.007247679401189089, 'timestamp': '2025-09-30 22:13:52.591421', 'step': 751, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:52.628640', 'step': 751, 'epoch': 1} {'type': 'loss', 'content': 0.011202228255569935, 'timestamp': '2025-09-30 22:13:52.663217', 'step': 752, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:52.702720', 'step': 752, 'epoch': 1} {'type': 'loss', 'content': 0.013906356878578663, 'timestamp': '2025-09-30 22:13:52.711409', 'step': 753, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:52.743736', 'step': 753, 'epoch': 1} {'type': 'loss', 'content': 0.012691138312220573, 'timestamp': '2025-09-30 22:13:52.754963', 'step': 754, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:52.795711', 'step': 754, 'epoch': 1} {'type': 'loss', 'content': 0.016555367037653923, 'timestamp': '2025-09-30 22:13:52.806841', 'step': 755, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:52.841675', 'step': 755, 'epoch': 1} {'type': 'loss', 'content': 0.01773860678076744, 'timestamp': '2025-09-30 22:13:52.875041', 'step': 756, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:52.910277', 'step': 756, 'epoch': 1} {'type': 'loss', 'content': 0.02634359337389469, 'timestamp': '2025-09-30 22:13:52.918145', 'step': 757, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:52.959796', 'step': 757, 'epoch': 1} {'type': 'loss', 'content': 0.016910066828131676, 'timestamp': '2025-09-30 22:13:52.970933', 'step': 758, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:53.008546', 'step': 758, 'epoch': 1} {'type': 'loss', 'content': 0.0069466689601540565, 'timestamp': '2025-09-30 22:13:53.022322', 'step': 759, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:53.062812', 'step': 759, 'epoch': 1} {'type': 'loss', 'content': 0.020431499928236008, 'timestamp': '2025-09-30 22:13:53.097408', 'step': 760, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:53.139348', 'step': 760, 'epoch': 1} {'type': 'loss', 'content': 0.015317758545279503, 'timestamp': '2025-09-30 22:13:53.152448', 'step': 761, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:53.192466', 'step': 761, 'epoch': 1} {'type': 'loss', 'content': 0.0235912948846817, 'timestamp': '2025-09-30 22:13:53.202840', 'step': 762, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:53.241983', 'step': 762, 'epoch': 1} {'type': 'loss', 'content': 0.009301051497459412, 'timestamp': '2025-09-30 22:13:53.255674', 'step': 763, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:53.289280', 'step': 763, 'epoch': 1} {'type': 'loss', 'content': 0.021411392837762833, 'timestamp': '2025-09-30 22:13:53.321105', 'step': 764, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:53.355219', 'step': 764, 'epoch': 1} {'type': 'loss', 'content': 0.017297813668847084, 'timestamp': '2025-09-30 22:13:53.364951', 'step': 765, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 18984411776512}, 'timestamp': '2025-09-30 22:13:53.420524', 'step': 765, 'epoch': 1} {'type': 'loss', 'content': 0.007814758457243443, 'timestamp': '2025-09-30 22:13:53.442153', 'step': 766, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:53.492343', 'step': 766, 'epoch': 1} {'type': 'loss', 'content': 0.00955934077501297, 'timestamp': '2025-09-30 22:13:53.502769', 'step': 767, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:13:53.541235', 'step': 767, 'epoch': 1} {'type': 'loss', 'content': 0.013868585228919983, 'timestamp': '2025-09-30 22:13:53.567216', 'step': 768, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:53.611092', 'step': 768, 'epoch': 1} {'type': 'loss', 'content': 0.04621420055627823, 'timestamp': '2025-09-30 22:13:53.615640', 'step': 769, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:53.649584', 'step': 769, 'epoch': 1} {'type': 'loss', 'content': 0.012514598667621613, 'timestamp': '2025-09-30 22:13:53.657151', 'step': 770, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:53.694645', 'step': 770, 'epoch': 1} {'type': 'loss', 'content': 0.02450910024344921, 'timestamp': '2025-09-30 22:13:53.701862', 'step': 771, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:53.745935', 'step': 771, 'epoch': 1} {'type': 'loss', 'content': 0.013912992551922798, 'timestamp': '2025-09-30 22:13:53.780527', 'step': 772, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:53.822118', 'step': 772, 'epoch': 1} {'type': 'loss', 'content': 0.014330840669572353, 'timestamp': '2025-09-30 22:13:53.830134', 'step': 773, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:53.873290', 'step': 773, 'epoch': 1} {'type': 'loss', 'content': 0.009840606711804867, 'timestamp': '2025-09-30 22:13:53.886591', 'step': 774, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:53.926114', 'step': 774, 'epoch': 1} {'type': 'loss', 'content': 0.01082277949899435, 'timestamp': '2025-09-30 22:13:53.939447', 'step': 775, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:53.978363', 'step': 775, 'epoch': 1} {'type': 'loss', 'content': 0.03701702505350113, 'timestamp': '2025-09-30 22:13:54.012899', 'step': 776, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:54.055578', 'step': 776, 'epoch': 1} {'type': 'loss', 'content': 0.017155220732092857, 'timestamp': '2025-09-30 22:13:54.068873', 'step': 777, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:54.105009', 'step': 777, 'epoch': 1} {'type': 'loss', 'content': 0.017413174733519554, 'timestamp': '2025-09-30 22:13:54.113036', 'step': 778, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:54.159895', 'step': 778, 'epoch': 1} {'type': 'loss', 'content': 0.02033637836575508, 'timestamp': '2025-09-30 22:13:54.167541', 'step': 779, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:54.203556', 'step': 779, 'epoch': 1} {'type': 'loss', 'content': 0.021800467744469643, 'timestamp': '2025-09-30 22:13:54.236903', 'step': 780, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:13:54.288664', 'step': 780, 'epoch': 1} {'type': 'loss', 'content': 0.010870695114135742, 'timestamp': '2025-09-30 22:13:54.304095', 'step': 781, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:54.339513', 'step': 781, 'epoch': 1} {'type': 'loss', 'content': 0.02554204687476158, 'timestamp': '2025-09-30 22:13:54.347298', 'step': 782, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:13:54.388721', 'step': 782, 'epoch': 1} {'type': 'loss', 'content': 0.016182314604520798, 'timestamp': '2025-09-30 22:13:54.402104', 'step': 783, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:54.462080', 'step': 783, 'epoch': 1} {'type': 'loss', 'content': 0.016113195568323135, 'timestamp': '2025-09-30 22:13:54.495450', 'step': 784, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:54.532332', 'step': 784, 'epoch': 1} {'type': 'loss', 'content': 0.010909128934144974, 'timestamp': '2025-09-30 22:13:54.540352', 'step': 785, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:54.584057', 'step': 785, 'epoch': 1} {'type': 'loss', 'content': 0.021256666630506516, 'timestamp': '2025-09-30 22:13:54.591379', 'step': 786, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-09-30 22:13:54.647614', 'step': 786, 'epoch': 1} {'type': 'loss', 'content': 0.006734177935868502, 'timestamp': '2025-09-30 22:13:54.666657', 'step': 787, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:54.699301', 'step': 787, 'epoch': 1} {'type': 'loss', 'content': 0.031248245388269424, 'timestamp': '2025-09-30 22:13:54.727530', 'step': 788, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:54.769108', 'step': 788, 'epoch': 1} {'type': 'loss', 'content': 0.012545477598905563, 'timestamp': '2025-09-30 22:13:54.774636', 'step': 789, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:54.815393', 'step': 789, 'epoch': 1} {'type': 'loss', 'content': 0.02404787205159664, 'timestamp': '2025-09-30 22:13:54.829196', 'step': 790, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:54.883566', 'step': 790, 'epoch': 1} {'type': 'loss', 'content': 0.021722253412008286, 'timestamp': '2025-09-30 22:13:54.891348', 'step': 791, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:54.931166', 'step': 791, 'epoch': 1} {'type': 'loss', 'content': 0.02735544927418232, 'timestamp': '2025-09-30 22:13:54.960073', 'step': 792, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:54.996228', 'step': 792, 'epoch': 1} {'type': 'loss', 'content': 0.015375000424683094, 'timestamp': '2025-09-30 22:13:55.006871', 'step': 793, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:55.043368', 'step': 793, 'epoch': 1} {'type': 'loss', 'content': 0.015890859067440033, 'timestamp': '2025-09-30 22:13:55.053473', 'step': 794, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:55.085328', 'step': 794, 'epoch': 1} {'type': 'loss', 'content': 0.029422825202345848, 'timestamp': '2025-09-30 22:13:55.093192', 'step': 795, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:55.136339', 'step': 795, 'epoch': 1} {'type': 'loss', 'content': 0.01636488176882267, 'timestamp': '2025-09-30 22:13:55.167474', 'step': 796, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:55.206512', 'step': 796, 'epoch': 1} {'type': 'loss', 'content': 0.019432881847023964, 'timestamp': '2025-09-30 22:13:55.217005', 'step': 797, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:13:55.251024', 'step': 797, 'epoch': 1} {'type': 'loss', 'content': 0.036731038242578506, 'timestamp': '2025-09-30 22:13:55.257974', 'step': 798, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:55.292715', 'step': 798, 'epoch': 1} {'type': 'loss', 'content': 0.025194579735398293, 'timestamp': '2025-09-30 22:13:55.300541', 'step': 799, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:55.336626', 'step': 799, 'epoch': 1} {'type': 'loss', 'content': 0.019482208415865898, 'timestamp': '2025-09-30 22:13:55.364778', 'step': 800, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:55.401924', 'step': 800, 'epoch': 1} {'type': 'loss', 'content': 0.026140403002500534, 'timestamp': '2025-09-30 22:13:55.410604', 'step': 801, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:55.443029', 'step': 801, 'epoch': 1} {'type': 'loss', 'content': 0.024593614041805267, 'timestamp': '2025-09-30 22:13:55.451048', 'step': 802, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:55.484657', 'step': 802, 'epoch': 1} {'type': 'loss', 'content': 0.03376416862010956, 'timestamp': '2025-09-30 22:13:55.496961', 'step': 803, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:55.532399', 'step': 803, 'epoch': 1} {'type': 'loss', 'content': 0.01586739346385002, 'timestamp': '2025-09-30 22:13:55.563784', 'step': 804, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:55.597543', 'step': 804, 'epoch': 1} {'type': 'loss', 'content': 0.019470948725938797, 'timestamp': '2025-09-30 22:13:55.605561', 'step': 805, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:13:58.084212', 'step': 805, 'epoch': 1} {'type': 'pplx', 'content': 5.446366031615489, 'timestamp': '2025-09-30 22:13:58.086789', 'step': 805, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:58.118236', 'step': 805, 'epoch': 1} {'type': 'loss', 'content': 0.013950072228908539, 'timestamp': '2025-09-30 22:13:58.125408', 'step': 806, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:13:58.164289', 'step': 806, 'epoch': 1} {'type': 'loss', 'content': 0.012152438051998615, 'timestamp': '2025-09-30 22:13:58.179806', 'step': 807, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:58.212773', 'step': 807, 'epoch': 1} {'type': 'loss', 'content': 0.026414604857563972, 'timestamp': '2025-09-30 22:13:58.240797', 'step': 808, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:58.275425', 'step': 808, 'epoch': 1} {'type': 'loss', 'content': 0.012953411787748337, 'timestamp': '2025-09-30 22:13:58.288691', 'step': 809, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:13:58.335386', 'step': 809, 'epoch': 1} {'type': 'loss', 'content': 0.005233216565102339, 'timestamp': '2025-09-30 22:13:58.352633', 'step': 810, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:58.385541', 'step': 810, 'epoch': 1} {'type': 'loss', 'content': 0.017348093912005424, 'timestamp': '2025-09-30 22:13:58.393372', 'step': 811, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:58.424947', 'step': 811, 'epoch': 1} {'type': 'loss', 'content': 0.008451183326542377, 'timestamp': '2025-09-30 22:13:58.456178', 'step': 812, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:13:58.490172', 'step': 812, 'epoch': 1} {'type': 'loss', 'content': 0.009706801734864712, 'timestamp': '2025-09-30 22:13:58.503224', 'step': 813, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:58.543435', 'step': 813, 'epoch': 1} {'type': 'loss', 'content': 0.016594383865594864, 'timestamp': '2025-09-30 22:13:58.554437', 'step': 814, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:58.589296', 'step': 814, 'epoch': 1} {'type': 'loss', 'content': 0.02336704172194004, 'timestamp': '2025-09-30 22:13:58.597199', 'step': 815, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:58.636105', 'step': 815, 'epoch': 1} {'type': 'loss', 'content': 0.013078423216938972, 'timestamp': '2025-09-30 22:13:58.670959', 'step': 816, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:58.707170', 'step': 816, 'epoch': 1} {'type': 'loss', 'content': 0.009744293987751007, 'timestamp': '2025-09-30 22:13:58.717792', 'step': 817, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:58.751176', 'step': 817, 'epoch': 1} {'type': 'loss', 'content': 0.01637791283428669, 'timestamp': '2025-09-30 22:13:58.762227', 'step': 818, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:13:58.802043', 'step': 818, 'epoch': 1} {'type': 'loss', 'content': 0.012836102396249771, 'timestamp': '2025-09-30 22:13:58.816018', 'step': 819, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:58.854037', 'step': 819, 'epoch': 1} {'type': 'loss', 'content': 0.012655460275709629, 'timestamp': '2025-09-30 22:13:58.882444', 'step': 820, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:58.922930', 'step': 820, 'epoch': 1} {'type': 'loss', 'content': 0.015463345684111118, 'timestamp': '2025-09-30 22:13:58.933348', 'step': 821, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:58.971831', 'step': 821, 'epoch': 1} {'type': 'loss', 'content': 0.023430321365594864, 'timestamp': '2025-09-30 22:13:58.979432', 'step': 822, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:59.017428', 'step': 822, 'epoch': 1} {'type': 'loss', 'content': 0.021295029670000076, 'timestamp': '2025-09-30 22:13:59.024919', 'step': 823, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:59.058596', 'step': 823, 'epoch': 1} {'type': 'loss', 'content': 0.010640447959303856, 'timestamp': '2025-09-30 22:13:59.089903', 'step': 824, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:59.125512', 'step': 824, 'epoch': 1} {'type': 'loss', 'content': 0.016809722408652306, 'timestamp': '2025-09-30 22:13:59.131025', 'step': 825, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:59.165649', 'step': 825, 'epoch': 1} {'type': 'loss', 'content': 0.011587375774979591, 'timestamp': '2025-09-30 22:13:59.178196', 'step': 826, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:59.216681', 'step': 826, 'epoch': 1} {'type': 'loss', 'content': 0.015174712985754013, 'timestamp': '2025-09-30 22:13:59.229261', 'step': 827, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:59.264302', 'step': 827, 'epoch': 1} {'type': 'loss', 'content': 0.009945693425834179, 'timestamp': '2025-09-30 22:13:59.296429', 'step': 828, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:59.329270', 'step': 828, 'epoch': 1} {'type': 'loss', 'content': 0.02524462342262268, 'timestamp': '2025-09-30 22:13:59.337420', 'step': 829, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:59.375305', 'step': 829, 'epoch': 1} {'type': 'loss', 'content': 0.017129966989159584, 'timestamp': '2025-09-30 22:13:59.386425', 'step': 830, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:13:59.425848', 'step': 830, 'epoch': 1} {'type': 'loss', 'content': 0.012208385393023491, 'timestamp': '2025-09-30 22:13:59.439564', 'step': 831, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:59.486831', 'step': 831, 'epoch': 1} {'type': 'loss', 'content': 0.014922388829290867, 'timestamp': '2025-09-30 22:13:59.517928', 'step': 832, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:59.559079', 'step': 832, 'epoch': 1} {'type': 'loss', 'content': 0.01866101287305355, 'timestamp': '2025-09-30 22:13:59.564409', 'step': 833, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:59.600361', 'step': 833, 'epoch': 1} {'type': 'loss', 'content': 0.009599830955266953, 'timestamp': '2025-09-30 22:13:59.611407', 'step': 834, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:13:59.644298', 'step': 834, 'epoch': 1} {'type': 'loss', 'content': 0.03083634376525879, 'timestamp': '2025-09-30 22:13:59.654533', 'step': 835, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:59.693662', 'step': 835, 'epoch': 1} {'type': 'loss', 'content': 0.010529820807278156, 'timestamp': '2025-09-30 22:13:59.726829', 'step': 836, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:13:59.763676', 'step': 836, 'epoch': 1} {'type': 'loss', 'content': 0.015880852937698364, 'timestamp': '2025-09-30 22:13:59.772306', 'step': 837, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:13:59.808123', 'step': 837, 'epoch': 1} {'type': 'loss', 'content': 0.02119717001914978, 'timestamp': '2025-09-30 22:13:59.816002', 'step': 838, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:13:59.847169', 'step': 838, 'epoch': 1} {'type': 'loss', 'content': 0.01195271871984005, 'timestamp': '2025-09-30 22:13:59.854327', 'step': 839, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:13:59.887415', 'step': 839, 'epoch': 1} {'type': 'loss', 'content': 0.013231132179498672, 'timestamp': '2025-09-30 22:13:59.915775', 'step': 840, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:13:59.953962', 'step': 840, 'epoch': 1} {'type': 'loss', 'content': 0.0175318643450737, 'timestamp': '2025-09-30 22:13:59.963724', 'step': 841, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:13:59.998336', 'step': 841, 'epoch': 1} {'type': 'loss', 'content': 0.0097137950360775, 'timestamp': '2025-09-30 22:14:00.010847', 'step': 842, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:00.052076', 'step': 842, 'epoch': 1} {'type': 'loss', 'content': 0.011096476577222347, 'timestamp': '2025-09-30 22:14:00.065803', 'step': 843, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:00.101517', 'step': 843, 'epoch': 1} {'type': 'loss', 'content': 0.02286028116941452, 'timestamp': '2025-09-30 22:14:00.130130', 'step': 844, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:00.166872', 'step': 844, 'epoch': 1} {'type': 'loss', 'content': 0.00962063018232584, 'timestamp': '2025-09-30 22:14:00.172624', 'step': 845, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:00.204656', 'step': 845, 'epoch': 1} {'type': 'loss', 'content': 0.022263722494244576, 'timestamp': '2025-09-30 22:14:00.214984', 'step': 846, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:00.247781', 'step': 846, 'epoch': 1} {'type': 'loss', 'content': 0.00966158602386713, 'timestamp': '2025-09-30 22:14:00.258658', 'step': 847, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:00.296570', 'step': 847, 'epoch': 1} {'type': 'loss', 'content': 0.024794315919280052, 'timestamp': '2025-09-30 22:14:00.325349', 'step': 848, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 15662185694400}, 'timestamp': '2025-09-30 22:14:00.385396', 'step': 848, 'epoch': 1} {'type': 'loss', 'content': 0.012166989967226982, 'timestamp': '2025-09-30 22:14:00.401736', 'step': 849, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:00.439571', 'step': 849, 'epoch': 1} {'type': 'loss', 'content': 0.008486066944897175, 'timestamp': '2025-09-30 22:14:00.452127', 'step': 850, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:00.492815', 'step': 850, 'epoch': 1} {'type': 'loss', 'content': 0.010692514479160309, 'timestamp': '2025-09-30 22:14:00.506623', 'step': 851, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:00.541829', 'step': 851, 'epoch': 1} {'type': 'loss', 'content': 0.011581682600080967, 'timestamp': '2025-09-30 22:14:00.576075', 'step': 852, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:00.625311', 'step': 852, 'epoch': 1} {'type': 'loss', 'content': 0.016717543825507164, 'timestamp': '2025-09-30 22:14:00.635069', 'step': 853, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:00.673558', 'step': 853, 'epoch': 1} {'type': 'loss', 'content': 0.027172185480594635, 'timestamp': '2025-09-30 22:14:00.687255', 'step': 854, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:00.720660', 'step': 854, 'epoch': 1} {'type': 'loss', 'content': 0.023855775594711304, 'timestamp': '2025-09-30 22:14:00.732711', 'step': 855, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:00.764832', 'step': 855, 'epoch': 1} {'type': 'loss', 'content': 0.018536483868956566, 'timestamp': '2025-09-30 22:14:00.797864', 'step': 856, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:00.835583', 'step': 856, 'epoch': 1} {'type': 'loss', 'content': 0.030458739027380943, 'timestamp': '2025-09-30 22:14:00.845442', 'step': 857, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:00.881114', 'step': 857, 'epoch': 1} {'type': 'loss', 'content': 0.014870762825012207, 'timestamp': '2025-09-30 22:14:00.893227', 'step': 858, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:00.930939', 'step': 858, 'epoch': 1} {'type': 'loss', 'content': 0.017727408558130264, 'timestamp': '2025-09-30 22:14:00.944878', 'step': 859, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:00.979186', 'step': 859, 'epoch': 1} {'type': 'loss', 'content': 0.012422531843185425, 'timestamp': '2025-09-30 22:14:01.007999', 'step': 860, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:01.045455', 'step': 860, 'epoch': 1} {'type': 'loss', 'content': 0.008018754422664642, 'timestamp': '2025-09-30 22:14:01.058828', 'step': 861, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:01.095177', 'step': 861, 'epoch': 1} {'type': 'loss', 'content': 0.01912866346538067, 'timestamp': '2025-09-30 22:14:01.107642', 'step': 862, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:01.152731', 'step': 862, 'epoch': 1} {'type': 'loss', 'content': 0.01784403808414936, 'timestamp': '2025-09-30 22:14:01.160292', 'step': 863, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:01.196779', 'step': 863, 'epoch': 1} {'type': 'loss', 'content': 0.022543074563145638, 'timestamp': '2025-09-30 22:14:01.225476', 'step': 864, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:01.258613', 'step': 864, 'epoch': 1} {'type': 'loss', 'content': 0.014795443043112755, 'timestamp': '2025-09-30 22:14:01.269110', 'step': 865, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:01.307700', 'step': 865, 'epoch': 1} {'type': 'loss', 'content': 0.01816575601696968, 'timestamp': '2025-09-30 22:14:01.321515', 'step': 866, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:01.357297', 'step': 866, 'epoch': 1} {'type': 'loss', 'content': 0.015042226761579514, 'timestamp': '2025-09-30 22:14:01.369627', 'step': 867, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:01.402584', 'step': 867, 'epoch': 1} {'type': 'loss', 'content': 0.010017563588917255, 'timestamp': '2025-09-30 22:14:01.435797', 'step': 868, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:01.476134', 'step': 868, 'epoch': 1} {'type': 'loss', 'content': 0.022046193480491638, 'timestamp': '2025-09-30 22:14:01.486044', 'step': 869, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:01.519190', 'step': 869, 'epoch': 1} {'type': 'loss', 'content': 0.016485564410686493, 'timestamp': '2025-09-30 22:14:01.531321', 'step': 870, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:01.573302', 'step': 870, 'epoch': 1} {'type': 'loss', 'content': 0.014110715128481388, 'timestamp': '2025-09-30 22:14:01.584371', 'step': 871, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:01.618712', 'step': 871, 'epoch': 1} {'type': 'loss', 'content': 0.024449339136481285, 'timestamp': '2025-09-30 22:14:01.647066', 'step': 872, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:01.692885', 'step': 872, 'epoch': 1} {'type': 'loss', 'content': 0.012250715866684914, 'timestamp': '2025-09-30 22:14:01.706201', 'step': 873, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:01.742703', 'step': 873, 'epoch': 1} {'type': 'loss', 'content': 0.015656866133213043, 'timestamp': '2025-09-30 22:14:01.756425', 'step': 874, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:01.794283', 'step': 874, 'epoch': 1} {'type': 'loss', 'content': 0.017466336488723755, 'timestamp': '2025-09-30 22:14:01.806792', 'step': 875, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:01.845856', 'step': 875, 'epoch': 1} {'type': 'loss', 'content': 0.019561603665351868, 'timestamp': '2025-09-30 22:14:01.880085', 'step': 876, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:01.920849', 'step': 876, 'epoch': 1} {'type': 'loss', 'content': 0.012877970933914185, 'timestamp': '2025-09-30 22:14:01.930750', 'step': 877, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:01.963363', 'step': 877, 'epoch': 1} {'type': 'loss', 'content': 0.018890580162405968, 'timestamp': '2025-09-30 22:14:01.973699', 'step': 878, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:02.005660', 'step': 878, 'epoch': 1} {'type': 'loss', 'content': 0.02163480408489704, 'timestamp': '2025-09-30 22:14:02.012736', 'step': 879, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:02.052514', 'step': 879, 'epoch': 1} {'type': 'loss', 'content': 0.008167365565896034, 'timestamp': '2025-09-30 22:14:02.087367', 'step': 880, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:02.120831', 'step': 880, 'epoch': 1} {'type': 'loss', 'content': 0.024901211261749268, 'timestamp': '2025-09-30 22:14:02.130637', 'step': 881, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:02.168472', 'step': 881, 'epoch': 1} {'type': 'loss', 'content': 0.013065425679087639, 'timestamp': '2025-09-30 22:14:02.182132', 'step': 882, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:02.216822', 'step': 882, 'epoch': 1} {'type': 'loss', 'content': 0.012894055806100368, 'timestamp': '2025-09-30 22:14:02.229099', 'step': 883, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:02.268315', 'step': 883, 'epoch': 1} {'type': 'loss', 'content': 0.015100360848009586, 'timestamp': '2025-09-30 22:14:02.300080', 'step': 884, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:02.336762', 'step': 884, 'epoch': 1} {'type': 'loss', 'content': 0.0276936162263155, 'timestamp': '2025-09-30 22:14:02.342318', 'step': 885, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:02.380093', 'step': 885, 'epoch': 1} {'type': 'loss', 'content': 0.012446722947061062, 'timestamp': '2025-09-30 22:14:02.393757', 'step': 886, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:02.427769', 'step': 886, 'epoch': 1} {'type': 'loss', 'content': 0.013708610087633133, 'timestamp': '2025-09-30 22:14:02.439867', 'step': 887, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:14:02.482510', 'step': 887, 'epoch': 1} {'type': 'loss', 'content': 0.011930789798498154, 'timestamp': '2025-09-30 22:14:02.519009', 'step': 888, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:02.552566', 'step': 888, 'epoch': 1} {'type': 'loss', 'content': 0.016422098502516747, 'timestamp': '2025-09-30 22:14:02.562655', 'step': 889, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:02.595648', 'step': 889, 'epoch': 1} {'type': 'loss', 'content': 0.018035048618912697, 'timestamp': '2025-09-30 22:14:02.607803', 'step': 890, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:02.648594', 'step': 890, 'epoch': 1} {'type': 'loss', 'content': 0.01066044345498085, 'timestamp': '2025-09-30 22:14:02.653057', 'step': 891, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:02.686245', 'step': 891, 'epoch': 1} {'type': 'loss', 'content': 0.015982037410140038, 'timestamp': '2025-09-30 22:14:02.714212', 'step': 892, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:02.749087', 'step': 892, 'epoch': 1} {'type': 'loss', 'content': 0.013905158266425133, 'timestamp': '2025-09-30 22:14:02.753517', 'step': 893, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:02.789733', 'step': 893, 'epoch': 1} {'type': 'loss', 'content': 0.011530783027410507, 'timestamp': '2025-09-30 22:14:02.797570', 'step': 894, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:02.836499', 'step': 894, 'epoch': 1} {'type': 'loss', 'content': 0.01866287738084793, 'timestamp': '2025-09-30 22:14:02.846416', 'step': 895, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:02.881824', 'step': 895, 'epoch': 1} {'type': 'loss', 'content': 0.011501681990921497, 'timestamp': '2025-09-30 22:14:02.913512', 'step': 896, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:02.957801', 'step': 896, 'epoch': 1} {'type': 'loss', 'content': 0.014294018037617207, 'timestamp': '2025-09-30 22:14:02.966162', 'step': 897, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:03.002258', 'step': 897, 'epoch': 1} {'type': 'loss', 'content': 0.01338213961571455, 'timestamp': '2025-09-30 22:14:03.015620', 'step': 898, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:03.048946', 'step': 898, 'epoch': 1} {'type': 'loss', 'content': 0.019916707649827003, 'timestamp': '2025-09-30 22:14:03.059806', 'step': 899, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:03.092905', 'step': 899, 'epoch': 1} {'type': 'loss', 'content': 0.019757641479372978, 'timestamp': '2025-09-30 22:14:03.120774', 'step': 900, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:03.153911', 'step': 900, 'epoch': 1} {'type': 'loss', 'content': 0.01972472108900547, 'timestamp': '2025-09-30 22:14:03.162400', 'step': 901, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:03.199982', 'step': 901, 'epoch': 1} {'type': 'loss', 'content': 0.022137483581900597, 'timestamp': '2025-09-30 22:14:03.210684', 'step': 902, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:03.250364', 'step': 902, 'epoch': 1} {'type': 'loss', 'content': 0.017975429072976112, 'timestamp': '2025-09-30 22:14:03.260436', 'step': 903, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:03.293549', 'step': 903, 'epoch': 1} {'type': 'loss', 'content': 0.020977221429347992, 'timestamp': '2025-09-30 22:14:03.321326', 'step': 904, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:03.360691', 'step': 904, 'epoch': 1} {'type': 'loss', 'content': 0.00997572485357523, 'timestamp': '2025-09-30 22:14:03.370403', 'step': 905, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:03.405374', 'step': 905, 'epoch': 1} {'type': 'loss', 'content': 0.015123856253921986, 'timestamp': '2025-09-30 22:14:03.409939', 'step': 906, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:03.447373', 'step': 906, 'epoch': 1} {'type': 'loss', 'content': 0.016774147748947144, 'timestamp': '2025-09-30 22:14:03.455205', 'step': 907, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:03.493286', 'step': 907, 'epoch': 1} {'type': 'loss', 'content': 0.014639715664088726, 'timestamp': '2025-09-30 22:14:03.518567', 'step': 908, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:03.552368', 'step': 908, 'epoch': 1} {'type': 'loss', 'content': 0.014328272081911564, 'timestamp': '2025-09-30 22:14:03.557041', 'step': 909, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:03.590231', 'step': 909, 'epoch': 1} {'type': 'loss', 'content': 0.02620774507522583, 'timestamp': '2025-09-30 22:14:03.597776', 'step': 910, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:03.638426', 'step': 910, 'epoch': 1} {'type': 'loss', 'content': 0.015904365107417107, 'timestamp': '2025-09-30 22:14:03.646266', 'step': 911, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:03.680075', 'step': 911, 'epoch': 1} {'type': 'loss', 'content': 0.01991444267332554, 'timestamp': '2025-09-30 22:14:03.711154', 'step': 912, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:03.747038', 'step': 912, 'epoch': 1} {'type': 'loss', 'content': 0.01666913367807865, 'timestamp': '2025-09-30 22:14:03.754701', 'step': 913, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:03.788682', 'step': 913, 'epoch': 1} {'type': 'loss', 'content': 0.015176458284258842, 'timestamp': '2025-09-30 22:14:03.796404', 'step': 914, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:14:03.847683', 'step': 914, 'epoch': 1} {'type': 'loss', 'content': 0.007855835370719433, 'timestamp': '2025-09-30 22:14:03.865326', 'step': 915, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:03.902357', 'step': 915, 'epoch': 1} {'type': 'loss', 'content': 0.01512453518807888, 'timestamp': '2025-09-30 22:14:03.935847', 'step': 916, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:03.980180', 'step': 916, 'epoch': 1} {'type': 'loss', 'content': 0.018737010657787323, 'timestamp': '2025-09-30 22:14:03.992767', 'step': 917, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:04.046413', 'step': 917, 'epoch': 1} {'type': 'loss', 'content': 0.017014063894748688, 'timestamp': '2025-09-30 22:14:04.056561', 'step': 918, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:04.089726', 'step': 918, 'epoch': 1} {'type': 'loss', 'content': 0.012434338219463825, 'timestamp': '2025-09-30 22:14:04.100068', 'step': 919, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:04.139334', 'step': 919, 'epoch': 1} {'type': 'loss', 'content': 0.010637388564646244, 'timestamp': '2025-09-30 22:14:04.168055', 'step': 920, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:14:06.753923', 'step': 920, 'epoch': 1} {'type': 'pplx', 'content': 5.437166011987229, 'timestamp': '2025-09-30 22:14:06.757145', 'step': 920, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:06.791183', 'step': 920, 'epoch': 1} {'type': 'loss', 'content': 0.016253093257546425, 'timestamp': '2025-09-30 22:14:06.797640', 'step': 921, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:06.832012', 'step': 921, 'epoch': 1} {'type': 'loss', 'content': 0.008861783891916275, 'timestamp': '2025-09-30 22:14:06.844066', 'step': 922, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:14:06.889231', 'step': 922, 'epoch': 1} {'type': 'loss', 'content': 0.006681237835437059, 'timestamp': '2025-09-30 22:14:06.906336', 'step': 923, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:06.938193', 'step': 923, 'epoch': 1} {'type': 'loss', 'content': 0.005157914943993092, 'timestamp': '2025-09-30 22:14:06.969313', 'step': 924, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:07.009562', 'step': 924, 'epoch': 1} {'type': 'loss', 'content': 0.012271142564713955, 'timestamp': '2025-09-30 22:14:07.014756', 'step': 925, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:07.048654', 'step': 925, 'epoch': 1} {'type': 'loss', 'content': 0.010279586538672447, 'timestamp': '2025-09-30 22:14:07.055987', 'step': 926, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:07.091280', 'step': 926, 'epoch': 1} {'type': 'loss', 'content': 0.017878876999020576, 'timestamp': '2025-09-30 22:14:07.101571', 'step': 927, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:07.142443', 'step': 927, 'epoch': 1} {'type': 'loss', 'content': 0.04153796657919884, 'timestamp': '2025-09-30 22:14:07.167889', 'step': 928, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:07.203007', 'step': 928, 'epoch': 1} {'type': 'loss', 'content': 0.019228620454669, 'timestamp': '2025-09-30 22:14:07.207926', 'step': 929, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:07.252292', 'step': 929, 'epoch': 1} {'type': 'loss', 'content': 0.014299347065389156, 'timestamp': '2025-09-30 22:14:07.266084', 'step': 930, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:07.304441', 'step': 930, 'epoch': 1} {'type': 'loss', 'content': 0.005411448422819376, 'timestamp': '2025-09-30 22:14:07.311632', 'step': 931, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:07.350062', 'step': 931, 'epoch': 1} {'type': 'loss', 'content': 0.021051695570349693, 'timestamp': '2025-09-30 22:14:07.378792', 'step': 932, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:07.412185', 'step': 932, 'epoch': 1} {'type': 'loss', 'content': 0.007767108269035816, 'timestamp': '2025-09-30 22:14:07.422580', 'step': 933, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:07.456182', 'step': 933, 'epoch': 1} {'type': 'loss', 'content': 0.02131766267120838, 'timestamp': '2025-09-30 22:14:07.460614', 'step': 934, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:07.492316', 'step': 934, 'epoch': 1} {'type': 'loss', 'content': 0.009594579227268696, 'timestamp': '2025-09-30 22:14:07.499404', 'step': 935, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:07.535128', 'step': 935, 'epoch': 1} {'type': 'loss', 'content': 0.010118049569427967, 'timestamp': '2025-09-30 22:14:07.569331', 'step': 936, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:14:07.610894', 'step': 936, 'epoch': 1} {'type': 'loss', 'content': 0.008677459321916103, 'timestamp': '2025-09-30 22:14:07.626273', 'step': 937, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:07.661459', 'step': 937, 'epoch': 1} {'type': 'loss', 'content': 0.018763471394777298, 'timestamp': '2025-09-30 22:14:07.671831', 'step': 938, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:07.709458', 'step': 938, 'epoch': 1} {'type': 'loss', 'content': 0.016597526147961617, 'timestamp': '2025-09-30 22:14:07.723117', 'step': 939, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:14:07.774652', 'step': 939, 'epoch': 1} {'type': 'loss', 'content': 0.011049261316657066, 'timestamp': '2025-09-30 22:14:07.812623', 'step': 940, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:07.847425', 'step': 940, 'epoch': 1} {'type': 'loss', 'content': 0.02055833861231804, 'timestamp': '2025-09-30 22:14:07.857235', 'step': 941, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:07.890794', 'step': 941, 'epoch': 1} {'type': 'loss', 'content': 0.01371944323182106, 'timestamp': '2025-09-30 22:14:07.901176', 'step': 942, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:07.942349', 'step': 942, 'epoch': 1} {'type': 'loss', 'content': 0.01326888706535101, 'timestamp': '2025-09-30 22:14:07.956080', 'step': 943, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:07.991352', 'step': 943, 'epoch': 1} {'type': 'loss', 'content': 0.027512535452842712, 'timestamp': '2025-09-30 22:14:08.022584', 'step': 944, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:08.057180', 'step': 944, 'epoch': 1} {'type': 'loss', 'content': 0.014796360395848751, 'timestamp': '2025-09-30 22:14:08.065225', 'step': 945, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:14:08.106347', 'step': 945, 'epoch': 1} {'type': 'loss', 'content': 0.01242104358971119, 'timestamp': '2025-09-30 22:14:08.121991', 'step': 946, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:08.158128', 'step': 946, 'epoch': 1} {'type': 'loss', 'content': 0.017054343596100807, 'timestamp': '2025-09-30 22:14:08.170708', 'step': 947, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:08.211536', 'step': 947, 'epoch': 1} {'type': 'loss', 'content': 0.016550425440073013, 'timestamp': '2025-09-30 22:14:08.246131', 'step': 948, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:08.280433', 'step': 948, 'epoch': 1} {'type': 'loss', 'content': 0.02881843037903309, 'timestamp': '2025-09-30 22:14:08.290053', 'step': 949, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:08.333504', 'step': 949, 'epoch': 1} {'type': 'loss', 'content': 0.02783220261335373, 'timestamp': '2025-09-30 22:14:08.340996', 'step': 950, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:08.377478', 'step': 950, 'epoch': 1} {'type': 'loss', 'content': 0.013467947021126747, 'timestamp': '2025-09-30 22:14:08.385268', 'step': 951, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:08.423240', 'step': 951, 'epoch': 1} {'type': 'loss', 'content': 0.02384248748421669, 'timestamp': '2025-09-30 22:14:08.451107', 'step': 952, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:08.492528', 'step': 952, 'epoch': 1} {'type': 'loss', 'content': 0.016103466972708702, 'timestamp': '2025-09-30 22:14:08.497755', 'step': 953, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:08.537106', 'step': 953, 'epoch': 1} {'type': 'loss', 'content': 0.017157990485429764, 'timestamp': '2025-09-30 22:14:08.547992', 'step': 954, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:08.583104', 'step': 954, 'epoch': 1} {'type': 'loss', 'content': 0.023314381018280983, 'timestamp': '2025-09-30 22:14:08.590843', 'step': 955, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:08.628707', 'step': 955, 'epoch': 1} {'type': 'loss', 'content': 0.013151245191693306, 'timestamp': '2025-09-30 22:14:08.660630', 'step': 956, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:08.692364', 'step': 956, 'epoch': 1} {'type': 'loss', 'content': 0.010166237130761147, 'timestamp': '2025-09-30 22:14:08.702583', 'step': 957, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:08.738726', 'step': 957, 'epoch': 1} {'type': 'loss', 'content': 0.01939929835498333, 'timestamp': '2025-09-30 22:14:08.749959', 'step': 958, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:08.785551', 'step': 958, 'epoch': 1} {'type': 'loss', 'content': 0.016085408627986908, 'timestamp': '2025-09-30 22:14:08.798115', 'step': 959, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:08.833039', 'step': 959, 'epoch': 1} {'type': 'loss', 'content': 0.020628873258829117, 'timestamp': '2025-09-30 22:14:08.861350', 'step': 960, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:08.898337', 'step': 960, 'epoch': 1} {'type': 'loss', 'content': 0.016140276566147804, 'timestamp': '2025-09-30 22:14:08.906784', 'step': 961, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:08.945578', 'step': 961, 'epoch': 1} {'type': 'loss', 'content': 0.012015962973237038, 'timestamp': '2025-09-30 22:14:08.952890', 'step': 962, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:08.992178', 'step': 962, 'epoch': 1} {'type': 'loss', 'content': 0.02607676386833191, 'timestamp': '2025-09-30 22:14:09.003105', 'step': 963, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:09.038816', 'step': 963, 'epoch': 1} {'type': 'loss', 'content': 0.011443495750427246, 'timestamp': '2025-09-30 22:14:09.066628', 'step': 964, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:09.116595', 'step': 964, 'epoch': 1} {'type': 'loss', 'content': 0.015275280922651291, 'timestamp': '2025-09-30 22:14:09.129973', 'step': 965, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:09.163837', 'step': 965, 'epoch': 1} {'type': 'loss', 'content': 0.01944708079099655, 'timestamp': '2025-09-30 22:14:09.171904', 'step': 966, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:09.210415', 'step': 966, 'epoch': 1} {'type': 'loss', 'content': 0.021834930405020714, 'timestamp': '2025-09-30 22:14:09.217237', 'step': 967, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:09.262204', 'step': 967, 'epoch': 1} {'type': 'loss', 'content': 0.010521315038204193, 'timestamp': '2025-09-30 22:14:09.296805', 'step': 968, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:09.332128', 'step': 968, 'epoch': 1} {'type': 'loss', 'content': 0.015339354053139687, 'timestamp': '2025-09-30 22:14:09.344689', 'step': 969, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:09.391933', 'step': 969, 'epoch': 1} {'type': 'loss', 'content': 0.0168760959059, 'timestamp': '2025-09-30 22:14:09.404235', 'step': 970, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:09.437467', 'step': 970, 'epoch': 1} {'type': 'loss', 'content': 0.026797406375408173, 'timestamp': '2025-09-30 22:14:09.449979', 'step': 971, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:09.488520', 'step': 971, 'epoch': 1} {'type': 'loss', 'content': 0.016474634408950806, 'timestamp': '2025-09-30 22:14:09.523156', 'step': 972, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 15662185694400}, 'timestamp': '2025-09-30 22:14:09.569953', 'step': 972, 'epoch': 1} {'type': 'loss', 'content': 0.00674683041870594, 'timestamp': '2025-09-30 22:14:09.589236', 'step': 973, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:14:09.631120', 'step': 973, 'epoch': 1} {'type': 'loss', 'content': 0.013944888487458229, 'timestamp': '2025-09-30 22:14:09.646707', 'step': 974, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:09.681364', 'step': 974, 'epoch': 1} {'type': 'loss', 'content': 0.01417811494320631, 'timestamp': '2025-09-30 22:14:09.692359', 'step': 975, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:09.733428', 'step': 975, 'epoch': 1} {'type': 'loss', 'content': 0.019666852429509163, 'timestamp': '2025-09-30 22:14:09.767942', 'step': 976, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:09.800486', 'step': 976, 'epoch': 1} {'type': 'loss', 'content': 0.013981659896671772, 'timestamp': '2025-09-30 22:14:09.806180', 'step': 977, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:09.848161', 'step': 977, 'epoch': 1} {'type': 'loss', 'content': 0.01465094555169344, 'timestamp': '2025-09-30 22:14:09.860685', 'step': 978, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:14:09.902854', 'step': 978, 'epoch': 1} {'type': 'loss', 'content': 0.009940768592059612, 'timestamp': '2025-09-30 22:14:09.918442', 'step': 979, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:09.967549', 'step': 979, 'epoch': 1} {'type': 'loss', 'content': 0.006746287923306227, 'timestamp': '2025-09-30 22:14:10.002313', 'step': 980, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:10.040655', 'step': 980, 'epoch': 1} {'type': 'loss', 'content': 0.03245190531015396, 'timestamp': '2025-09-30 22:14:10.049316', 'step': 981, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:10.084868', 'step': 981, 'epoch': 1} {'type': 'loss', 'content': 0.015351356938481331, 'timestamp': '2025-09-30 22:14:10.098249', 'step': 982, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:10.131692', 'step': 982, 'epoch': 1} {'type': 'loss', 'content': 0.03203147277235985, 'timestamp': '2025-09-30 22:14:10.138649', 'step': 983, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:10.172607', 'step': 983, 'epoch': 1} {'type': 'loss', 'content': 0.012618161737918854, 'timestamp': '2025-09-30 22:14:10.203729', 'step': 984, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:10.245929', 'step': 984, 'epoch': 1} {'type': 'loss', 'content': 0.008273485116660595, 'timestamp': '2025-09-30 22:14:10.256392', 'step': 985, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:10.292334', 'step': 985, 'epoch': 1} {'type': 'loss', 'content': 0.012804933823645115, 'timestamp': '2025-09-30 22:14:10.299541', 'step': 986, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:10.332614', 'step': 986, 'epoch': 1} {'type': 'loss', 'content': 0.025396456941962242, 'timestamp': '2025-09-30 22:14:10.343503', 'step': 987, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-09-30 22:14:10.395794', 'step': 987, 'epoch': 1} {'type': 'loss', 'content': 0.019918175414204597, 'timestamp': '2025-09-30 22:14:10.435706', 'step': 988, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:10.469009', 'step': 988, 'epoch': 1} {'type': 'loss', 'content': 0.017784535884857178, 'timestamp': '2025-09-30 22:14:10.477776', 'step': 989, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:10.515598', 'step': 989, 'epoch': 1} {'type': 'loss', 'content': 0.010058222338557243, 'timestamp': '2025-09-30 22:14:10.522721', 'step': 990, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:14:10.571820', 'step': 990, 'epoch': 1} {'type': 'loss', 'content': 0.00819582398980856, 'timestamp': '2025-09-30 22:14:10.588129', 'step': 991, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:10.624132', 'step': 991, 'epoch': 1} {'type': 'loss', 'content': 0.01214824989438057, 'timestamp': '2025-09-30 22:14:10.650659', 'step': 992, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:10.686899', 'step': 992, 'epoch': 1} {'type': 'loss', 'content': 0.010886823758482933, 'timestamp': '2025-09-30 22:14:10.696450', 'step': 993, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:10.729350', 'step': 993, 'epoch': 1} {'type': 'loss', 'content': 0.013221736997365952, 'timestamp': '2025-09-30 22:14:10.737085', 'step': 994, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:10.772734', 'step': 994, 'epoch': 1} {'type': 'loss', 'content': 0.015556025318801403, 'timestamp': '2025-09-30 22:14:10.780494', 'step': 995, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:10.831426', 'step': 995, 'epoch': 1} {'type': 'loss', 'content': 0.014741851948201656, 'timestamp': '2025-09-30 22:14:10.865611', 'step': 996, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:10.897764', 'step': 996, 'epoch': 1} {'type': 'loss', 'content': 0.014557472430169582, 'timestamp': '2025-09-30 22:14:10.903148', 'step': 997, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:10.936950', 'step': 997, 'epoch': 1} {'type': 'loss', 'content': 0.011851382441818714, 'timestamp': '2025-09-30 22:14:10.944239', 'step': 998, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:10.991666', 'step': 998, 'epoch': 1} {'type': 'loss', 'content': 0.01922006718814373, 'timestamp': '2025-09-30 22:14:10.999261', 'step': 999, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:11.033925', 'step': 999, 'epoch': 1} {'type': 'loss', 'content': 0.016480710357427597, 'timestamp': '2025-09-30 22:14:11.065573', 'step': 1000, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1000', 'timestamp': '2025-09-30 22:14:16.235356', 'step': 1000, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:16.274591', 'step': 1000, 'epoch': 1} {'type': 'loss', 'content': 0.027280423790216446, 'timestamp': '2025-09-30 22:14:16.278282', 'step': 1001, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:16.318809', 'step': 1001, 'epoch': 1} {'type': 'loss', 'content': 0.023930471390485764, 'timestamp': '2025-09-30 22:14:16.329324', 'step': 1002, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:14:16.379352', 'step': 1002, 'epoch': 1} {'type': 'loss', 'content': 0.02310759946703911, 'timestamp': '2025-09-30 22:14:16.383280', 'step': 1003, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:16.426238', 'step': 1003, 'epoch': 1} {'type': 'loss', 'content': 0.019107503816485405, 'timestamp': '2025-09-30 22:14:16.463706', 'step': 1004, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:16.507751', 'step': 1004, 'epoch': 1} {'type': 'loss', 'content': 0.01613452471792698, 'timestamp': '2025-09-30 22:14:16.512275', 'step': 1005, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:16.552761', 'step': 1005, 'epoch': 1} {'type': 'loss', 'content': 0.025932583957910538, 'timestamp': '2025-09-30 22:14:16.557234', 'step': 1006, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:16.597795', 'step': 1006, 'epoch': 1} {'type': 'loss', 'content': 0.017133804038167, 'timestamp': '2025-09-30 22:14:16.607631', 'step': 1007, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:16.659475', 'step': 1007, 'epoch': 1} {'type': 'loss', 'content': 0.006820287089794874, 'timestamp': '2025-09-30 22:14:16.694159', 'step': 1008, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:16.729978', 'step': 1008, 'epoch': 1} {'type': 'loss', 'content': 0.00994724128395319, 'timestamp': '2025-09-30 22:14:16.743007', 'step': 1009, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:16.781840', 'step': 1009, 'epoch': 1} {'type': 'loss', 'content': 0.015249098651111126, 'timestamp': '2025-09-30 22:14:16.788935', 'step': 1010, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:16.823322', 'step': 1010, 'epoch': 1} {'type': 'loss', 'content': 0.02501874603331089, 'timestamp': '2025-09-30 22:14:16.830721', 'step': 1011, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:16.870840', 'step': 1011, 'epoch': 1} {'type': 'loss', 'content': 0.017700329422950745, 'timestamp': '2025-09-30 22:14:16.902658', 'step': 1012, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:16.939259', 'step': 1012, 'epoch': 1} {'type': 'loss', 'content': 0.0090402290225029, 'timestamp': '2025-09-30 22:14:16.947896', 'step': 1013, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:17.001464', 'step': 1013, 'epoch': 1} {'type': 'loss', 'content': 0.018181640654802322, 'timestamp': '2025-09-30 22:14:17.012675', 'step': 1014, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:17.053290', 'step': 1014, 'epoch': 1} {'type': 'loss', 'content': 0.017796620726585388, 'timestamp': '2025-09-30 22:14:17.066946', 'step': 1015, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:17.105038', 'step': 1015, 'epoch': 1} {'type': 'loss', 'content': 0.013271898031234741, 'timestamp': '2025-09-30 22:14:17.137089', 'step': 1016, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:17.178282', 'step': 1016, 'epoch': 1} {'type': 'loss', 'content': 0.038167428225278854, 'timestamp': '2025-09-30 22:14:17.183741', 'step': 1017, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:17.220171', 'step': 1017, 'epoch': 1} {'type': 'loss', 'content': 0.019977156072854996, 'timestamp': '2025-09-30 22:14:17.227610', 'step': 1018, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:17.267100', 'step': 1018, 'epoch': 1} {'type': 'loss', 'content': 0.011846660636365414, 'timestamp': '2025-09-30 22:14:17.281116', 'step': 1019, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 17560600598464}, 'timestamp': '2025-09-30 22:14:17.338390', 'step': 1019, 'epoch': 1} {'type': 'loss', 'content': 0.007278237491846085, 'timestamp': '2025-09-30 22:14:17.380347', 'step': 1020, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:17.419895', 'step': 1020, 'epoch': 1} {'type': 'loss', 'content': 0.025377754122018814, 'timestamp': '2025-09-30 22:14:17.425261', 'step': 1021, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:17.461539', 'step': 1021, 'epoch': 1} {'type': 'loss', 'content': 0.021037183701992035, 'timestamp': '2025-09-30 22:14:17.475197', 'step': 1022, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:17.511585', 'step': 1022, 'epoch': 1} {'type': 'loss', 'content': 0.015959495678544044, 'timestamp': '2025-09-30 22:14:17.523649', 'step': 1023, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:17.564747', 'step': 1023, 'epoch': 1} {'type': 'loss', 'content': 0.019597096368670464, 'timestamp': '2025-09-30 22:14:17.595875', 'step': 1024, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:17.630134', 'step': 1024, 'epoch': 1} {'type': 'loss', 'content': 0.01724843867123127, 'timestamp': '2025-09-30 22:14:17.635208', 'step': 1025, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:17.672197', 'step': 1025, 'epoch': 1} {'type': 'loss', 'content': 0.012914427556097507, 'timestamp': '2025-09-30 22:14:17.679794', 'step': 1026, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:17.715518', 'step': 1026, 'epoch': 1} {'type': 'loss', 'content': 0.021281026303768158, 'timestamp': '2025-09-30 22:14:17.722737', 'step': 1027, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:17.760909', 'step': 1027, 'epoch': 1} {'type': 'loss', 'content': 0.016101287677884102, 'timestamp': '2025-09-30 22:14:17.794311', 'step': 1028, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:17.829972', 'step': 1028, 'epoch': 1} {'type': 'loss', 'content': 0.029025474563241005, 'timestamp': '2025-09-30 22:14:17.838718', 'step': 1029, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:17.878883', 'step': 1029, 'epoch': 1} {'type': 'loss', 'content': 0.01193151529878378, 'timestamp': '2025-09-30 22:14:17.891444', 'step': 1030, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:17.934011', 'step': 1030, 'epoch': 1} {'type': 'loss', 'content': 0.016847174614667892, 'timestamp': '2025-09-30 22:14:17.941620', 'step': 1031, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:17.982090', 'step': 1031, 'epoch': 1} {'type': 'loss', 'content': 0.021021408960223198, 'timestamp': '2025-09-30 22:14:18.013199', 'step': 1032, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:14:18.045658', 'step': 1032, 'epoch': 1} {'type': 'loss', 'content': 0.030079005286097527, 'timestamp': '2025-09-30 22:14:18.048279', 'step': 1033, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:18.087168', 'step': 1033, 'epoch': 1} {'type': 'loss', 'content': 0.012578541412949562, 'timestamp': '2025-09-30 22:14:18.094750', 'step': 1034, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:18.137064', 'step': 1034, 'epoch': 1} {'type': 'loss', 'content': 0.008698937483131886, 'timestamp': '2025-09-30 22:14:18.150393', 'step': 1035, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:14:20.662055', 'step': 1035, 'epoch': 1} {'type': 'pplx', 'content': 5.4880656407917385, 'timestamp': '2025-09-30 22:14:20.666086', 'step': 1035, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:20.697900', 'step': 1035, 'epoch': 1} {'type': 'loss', 'content': 0.022248901426792145, 'timestamp': '2025-09-30 22:14:20.725630', 'step': 1036, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:20.759674', 'step': 1036, 'epoch': 1} {'type': 'loss', 'content': 0.018398774787783623, 'timestamp': '2025-09-30 22:14:20.767564', 'step': 1037, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:20.801646', 'step': 1037, 'epoch': 1} {'type': 'loss', 'content': 0.01914854720234871, 'timestamp': '2025-09-30 22:14:20.809670', 'step': 1038, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:20.842123', 'step': 1038, 'epoch': 1} {'type': 'loss', 'content': 0.013537915423512459, 'timestamp': '2025-09-30 22:14:20.854234', 'step': 1039, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:20.890682', 'step': 1039, 'epoch': 1} {'type': 'loss', 'content': 0.013754663057625294, 'timestamp': '2025-09-30 22:14:20.924934', 'step': 1040, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:20.958233', 'step': 1040, 'epoch': 1} {'type': 'loss', 'content': 0.015263702720403671, 'timestamp': '2025-09-30 22:14:20.963757', 'step': 1041, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:20.998954', 'step': 1041, 'epoch': 1} {'type': 'loss', 'content': 0.015004323795437813, 'timestamp': '2025-09-30 22:14:21.006844', 'step': 1042, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:21.040676', 'step': 1042, 'epoch': 1} {'type': 'loss', 'content': 0.009397340938448906, 'timestamp': '2025-09-30 22:14:21.048606', 'step': 1043, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:21.086414', 'step': 1043, 'epoch': 1} {'type': 'loss', 'content': 0.011286666616797447, 'timestamp': '2025-09-30 22:14:21.119836', 'step': 1044, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:14:21.161164', 'step': 1044, 'epoch': 1} {'type': 'loss', 'content': 0.008173218928277493, 'timestamp': '2025-09-30 22:14:21.176278', 'step': 1045, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:21.214987', 'step': 1045, 'epoch': 1} {'type': 'loss', 'content': 0.013183176517486572, 'timestamp': '2025-09-30 22:14:21.228663', 'step': 1046, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:21.261374', 'step': 1046, 'epoch': 1} {'type': 'loss', 'content': 0.020087797194719315, 'timestamp': '2025-09-30 22:14:21.268268', 'step': 1047, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:14:21.312141', 'step': 1047, 'epoch': 1} {'type': 'loss', 'content': 0.005610055290162563, 'timestamp': '2025-09-30 22:14:21.349422', 'step': 1048, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:21.382345', 'step': 1048, 'epoch': 1} {'type': 'loss', 'content': 0.01692376285791397, 'timestamp': '2025-09-30 22:14:21.394954', 'step': 1049, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:21.433831', 'step': 1049, 'epoch': 1} {'type': 'loss', 'content': 0.008124461397528648, 'timestamp': '2025-09-30 22:14:21.447604', 'step': 1050, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:21.490576', 'step': 1050, 'epoch': 1} {'type': 'loss', 'content': 0.014909453690052032, 'timestamp': '2025-09-30 22:14:21.498319', 'step': 1051, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:21.534871', 'step': 1051, 'epoch': 1} {'type': 'loss', 'content': 0.011585352011024952, 'timestamp': '2025-09-30 22:14:21.569621', 'step': 1052, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:21.602979', 'step': 1052, 'epoch': 1} {'type': 'loss', 'content': 0.020000595599412918, 'timestamp': '2025-09-30 22:14:21.608170', 'step': 1053, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:21.640600', 'step': 1053, 'epoch': 1} {'type': 'loss', 'content': 0.006880574394017458, 'timestamp': '2025-09-30 22:14:21.648492', 'step': 1054, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:21.681540', 'step': 1054, 'epoch': 1} {'type': 'loss', 'content': 0.01206766813993454, 'timestamp': '2025-09-30 22:14:21.691869', 'step': 1055, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:21.726364', 'step': 1055, 'epoch': 1} {'type': 'loss', 'content': 0.0255148746073246, 'timestamp': '2025-09-30 22:14:21.759671', 'step': 1056, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:14:21.800043', 'step': 1056, 'epoch': 1} {'type': 'loss', 'content': 0.02468947134912014, 'timestamp': '2025-09-30 22:14:21.815889', 'step': 1057, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:21.850739', 'step': 1057, 'epoch': 1} {'type': 'loss', 'content': 0.02892993949353695, 'timestamp': '2025-09-30 22:14:21.861195', 'step': 1058, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:21.894119', 'step': 1058, 'epoch': 1} {'type': 'loss', 'content': 0.018823428079485893, 'timestamp': '2025-09-30 22:14:21.902073', 'step': 1059, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:21.939819', 'step': 1059, 'epoch': 1} {'type': 'loss', 'content': 0.01066051796078682, 'timestamp': '2025-09-30 22:14:21.968210', 'step': 1060, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:22.003935', 'step': 1060, 'epoch': 1} {'type': 'loss', 'content': 0.015652211382985115, 'timestamp': '2025-09-30 22:14:22.017042', 'step': 1061, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:22.055062', 'step': 1061, 'epoch': 1} {'type': 'loss', 'content': 0.019767604768276215, 'timestamp': '2025-09-30 22:14:22.061976', 'step': 1062, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:22.097613', 'step': 1062, 'epoch': 1} {'type': 'loss', 'content': 0.039111003279685974, 'timestamp': '2025-09-30 22:14:22.102027', 'step': 1063, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:22.139543', 'step': 1063, 'epoch': 1} {'type': 'loss', 'content': 0.018683698028326035, 'timestamp': '2025-09-30 22:14:22.171464', 'step': 1064, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:22.209539', 'step': 1064, 'epoch': 1} {'type': 'loss', 'content': 0.010801514610648155, 'timestamp': '2025-09-30 22:14:22.215028', 'step': 1065, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:22.250031', 'step': 1065, 'epoch': 1} {'type': 'loss', 'content': 0.02404649555683136, 'timestamp': '2025-09-30 22:14:22.260474', 'step': 1066, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:22.293860', 'step': 1066, 'epoch': 1} {'type': 'loss', 'content': 0.03047175146639347, 'timestamp': '2025-09-30 22:14:22.301612', 'step': 1067, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:22.353397', 'step': 1067, 'epoch': 1} {'type': 'loss', 'content': 0.007441402412950993, 'timestamp': '2025-09-30 22:14:22.387552', 'step': 1068, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:22.422255', 'step': 1068, 'epoch': 1} {'type': 'loss', 'content': 0.02320132404565811, 'timestamp': '2025-09-30 22:14:22.430206', 'step': 1069, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:22.465093', 'step': 1069, 'epoch': 1} {'type': 'loss', 'content': 0.010687381029129028, 'timestamp': '2025-09-30 22:14:22.472739', 'step': 1070, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:22.508105', 'step': 1070, 'epoch': 1} {'type': 'loss', 'content': 0.012789708562195301, 'timestamp': '2025-09-30 22:14:22.512548', 'step': 1071, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:22.544388', 'step': 1071, 'epoch': 1} {'type': 'loss', 'content': 0.018357696011662483, 'timestamp': '2025-09-30 22:14:22.575551', 'step': 1072, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:22.610625', 'step': 1072, 'epoch': 1} {'type': 'loss', 'content': 0.031760621815919876, 'timestamp': '2025-09-30 22:14:22.614326', 'step': 1073, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:22.650908', 'step': 1073, 'epoch': 1} {'type': 'loss', 'content': 0.02676416002213955, 'timestamp': '2025-09-30 22:14:22.657894', 'step': 1074, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:14:22.691092', 'step': 1074, 'epoch': 1} {'type': 'loss', 'content': 0.07043999433517456, 'timestamp': '2025-09-30 22:14:22.695307', 'step': 1075, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:22.729537', 'step': 1075, 'epoch': 1} {'type': 'loss', 'content': 0.007570445071905851, 'timestamp': '2025-09-30 22:14:22.763005', 'step': 1076, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:14:22.795821', 'step': 1076, 'epoch': 1} {'type': 'loss', 'content': 0.015886131674051285, 'timestamp': '2025-09-30 22:14:22.797984', 'step': 1077, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:22.835782', 'step': 1077, 'epoch': 1} {'type': 'loss', 'content': 0.018651477992534637, 'timestamp': '2025-09-30 22:14:22.842026', 'step': 1078, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:22.873555', 'step': 1078, 'epoch': 1} {'type': 'loss', 'content': 0.01997782289981842, 'timestamp': '2025-09-30 22:14:22.878077', 'step': 1079, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:22.912966', 'step': 1079, 'epoch': 1} {'type': 'loss', 'content': 0.03567449748516083, 'timestamp': '2025-09-30 22:14:22.941766', 'step': 1080, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:22.975113', 'step': 1080, 'epoch': 1} {'type': 'loss', 'content': 0.01590901054441929, 'timestamp': '2025-09-30 22:14:22.980351', 'step': 1081, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:23.013375', 'step': 1081, 'epoch': 1} {'type': 'loss', 'content': 0.013110446743667126, 'timestamp': '2025-09-30 22:14:23.020551', 'step': 1082, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:23.052080', 'step': 1082, 'epoch': 1} {'type': 'loss', 'content': 0.014219080097973347, 'timestamp': '2025-09-30 22:14:23.059653', 'step': 1083, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:23.090820', 'step': 1083, 'epoch': 1} {'type': 'loss', 'content': 0.02950241044163704, 'timestamp': '2025-09-30 22:14:23.118585', 'step': 1084, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:23.151869', 'step': 1084, 'epoch': 1} {'type': 'loss', 'content': 0.025333222001791, 'timestamp': '2025-09-30 22:14:23.156496', 'step': 1085, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:23.192823', 'step': 1085, 'epoch': 1} {'type': 'loss', 'content': 0.03170863166451454, 'timestamp': '2025-09-30 22:14:23.203879', 'step': 1086, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:23.247110', 'step': 1086, 'epoch': 1} {'type': 'loss', 'content': 0.022048788145184517, 'timestamp': '2025-09-30 22:14:23.259463', 'step': 1087, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:23.297996', 'step': 1087, 'epoch': 1} {'type': 'loss', 'content': 0.014543569646775723, 'timestamp': '2025-09-30 22:14:23.332845', 'step': 1088, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:23.366483', 'step': 1088, 'epoch': 1} {'type': 'loss', 'content': 0.022739170119166374, 'timestamp': '2025-09-30 22:14:23.376429', 'step': 1089, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:23.412118', 'step': 1089, 'epoch': 1} {'type': 'loss', 'content': 0.01327939610928297, 'timestamp': '2025-09-30 22:14:23.423364', 'step': 1090, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:23.454545', 'step': 1090, 'epoch': 1} {'type': 'loss', 'content': 0.02299661375582218, 'timestamp': '2025-09-30 22:14:23.465036', 'step': 1091, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:23.497521', 'step': 1091, 'epoch': 1} {'type': 'loss', 'content': 0.04441500082612038, 'timestamp': '2025-09-30 22:14:23.525608', 'step': 1092, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:14:23.567168', 'step': 1092, 'epoch': 1} {'type': 'loss', 'content': 0.010146488435566425, 'timestamp': '2025-09-30 22:14:23.582237', 'step': 1093, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:23.624756', 'step': 1093, 'epoch': 1} {'type': 'loss', 'content': 0.00972510315477848, 'timestamp': '2025-09-30 22:14:23.638585', 'step': 1094, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:23.671340', 'step': 1094, 'epoch': 1} {'type': 'loss', 'content': 0.028590157628059387, 'timestamp': '2025-09-30 22:14:23.681833', 'step': 1095, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:23.717589', 'step': 1095, 'epoch': 1} {'type': 'loss', 'content': 0.014557278715074062, 'timestamp': '2025-09-30 22:14:23.746408', 'step': 1096, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:23.780576', 'step': 1096, 'epoch': 1} {'type': 'loss', 'content': 0.022265970706939697, 'timestamp': '2025-09-30 22:14:23.785867', 'step': 1097, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:23.820745', 'step': 1097, 'epoch': 1} {'type': 'loss', 'content': 0.025301869958639145, 'timestamp': '2025-09-30 22:14:23.828313', 'step': 1098, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:23.867633', 'step': 1098, 'epoch': 1} {'type': 'loss', 'content': 0.024350661784410477, 'timestamp': '2025-09-30 22:14:23.881637', 'step': 1099, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:23.914261', 'step': 1099, 'epoch': 1} {'type': 'loss', 'content': 0.016744161024689674, 'timestamp': '2025-09-30 22:14:23.942857', 'step': 1100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:14:23.990841', 'step': 1100, 'epoch': 1} {'type': 'loss', 'content': 0.01173662394285202, 'timestamp': '2025-09-30 22:14:24.007775', 'step': 1101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:24.045520', 'step': 1101, 'epoch': 1} {'type': 'loss', 'content': 0.007207825314253569, 'timestamp': '2025-09-30 22:14:24.053476', 'step': 1102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:24.096192', 'step': 1102, 'epoch': 1} {'type': 'loss', 'content': 0.028237415477633476, 'timestamp': '2025-09-30 22:14:24.103158', 'step': 1103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:24.138708', 'step': 1103, 'epoch': 1} {'type': 'loss', 'content': 0.014486867934465408, 'timestamp': '2025-09-30 22:14:24.167034', 'step': 1104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:24.203458', 'step': 1104, 'epoch': 1} {'type': 'loss', 'content': 0.021397685632109642, 'timestamp': '2025-09-30 22:14:24.209080', 'step': 1105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:24.240322', 'step': 1105, 'epoch': 1} {'type': 'loss', 'content': 0.01721569150686264, 'timestamp': '2025-09-30 22:14:24.247564', 'step': 1106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:24.281449', 'step': 1106, 'epoch': 1} {'type': 'loss', 'content': 0.01823001727461815, 'timestamp': '2025-09-30 22:14:24.293727', 'step': 1107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:24.334220', 'step': 1107, 'epoch': 1} {'type': 'loss', 'content': 0.01598893292248249, 'timestamp': '2025-09-30 22:14:24.362286', 'step': 1108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:24.396173', 'step': 1108, 'epoch': 1} {'type': 'loss', 'content': 0.02451957017183304, 'timestamp': '2025-09-30 22:14:24.401384', 'step': 1109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:24.434371', 'step': 1109, 'epoch': 1} {'type': 'loss', 'content': 0.01124908123165369, 'timestamp': '2025-09-30 22:14:24.446933', 'step': 1110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:24.478816', 'step': 1110, 'epoch': 1} {'type': 'loss', 'content': 0.02102508395910263, 'timestamp': '2025-09-30 22:14:24.489038', 'step': 1111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:24.520432', 'step': 1111, 'epoch': 1} {'type': 'loss', 'content': 0.0185481458902359, 'timestamp': '2025-09-30 22:14:24.548176', 'step': 1112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:24.593310', 'step': 1112, 'epoch': 1} {'type': 'loss', 'content': 0.01727437786757946, 'timestamp': '2025-09-30 22:14:24.606416', 'step': 1113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:24.641066', 'step': 1113, 'epoch': 1} {'type': 'loss', 'content': 0.017133377492427826, 'timestamp': '2025-09-30 22:14:24.648033', 'step': 1114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:24.682384', 'step': 1114, 'epoch': 1} {'type': 'loss', 'content': 0.028735356405377388, 'timestamp': '2025-09-30 22:14:24.692630', 'step': 1115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:24.733377', 'step': 1115, 'epoch': 1} {'type': 'loss', 'content': 0.03614767640829086, 'timestamp': '2025-09-30 22:14:24.760995', 'step': 1116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:14:24.804627', 'step': 1116, 'epoch': 1} {'type': 'loss', 'content': 0.00506022060289979, 'timestamp': '2025-09-30 22:14:24.820451', 'step': 1117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:24.855854', 'step': 1117, 'epoch': 1} {'type': 'loss', 'content': 0.025322485715150833, 'timestamp': '2025-09-30 22:14:24.862801', 'step': 1118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:24.900363', 'step': 1118, 'epoch': 1} {'type': 'loss', 'content': 0.01264124270528555, 'timestamp': '2025-09-30 22:14:24.912983', 'step': 1119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:24.957337', 'step': 1119, 'epoch': 1} {'type': 'loss', 'content': 0.03440183028578758, 'timestamp': '2025-09-30 22:14:24.988650', 'step': 1120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:25.023799', 'step': 1120, 'epoch': 1} {'type': 'loss', 'content': 0.016403350979089737, 'timestamp': '2025-09-30 22:14:25.037178', 'step': 1121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:25.075003', 'step': 1121, 'epoch': 1} {'type': 'loss', 'content': 0.007725847885012627, 'timestamp': '2025-09-30 22:14:25.089002', 'step': 1122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:25.124913', 'step': 1122, 'epoch': 1} {'type': 'loss', 'content': 0.04689624905586243, 'timestamp': '2025-09-30 22:14:25.137259', 'step': 1123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:25.174114', 'step': 1123, 'epoch': 1} {'type': 'loss', 'content': 0.03898750990629196, 'timestamp': '2025-09-30 22:14:25.203193', 'step': 1124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:25.240590', 'step': 1124, 'epoch': 1} {'type': 'loss', 'content': 0.013921770267188549, 'timestamp': '2025-09-30 22:14:25.253914', 'step': 1125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:25.288376', 'step': 1125, 'epoch': 1} {'type': 'loss', 'content': 0.012004831805825233, 'timestamp': '2025-09-30 22:14:25.300924', 'step': 1126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:14:25.344075', 'step': 1126, 'epoch': 1} {'type': 'loss', 'content': 0.03051520325243473, 'timestamp': '2025-09-30 22:14:25.359653', 'step': 1127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:25.393904', 'step': 1127, 'epoch': 1} {'type': 'loss', 'content': 0.025966256856918335, 'timestamp': '2025-09-30 22:14:25.422752', 'step': 1128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:25.457467', 'step': 1128, 'epoch': 1} {'type': 'loss', 'content': 0.008847977966070175, 'timestamp': '2025-09-30 22:14:25.470427', 'step': 1129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:25.506799', 'step': 1129, 'epoch': 1} {'type': 'loss', 'content': 0.011017872951924801, 'timestamp': '2025-09-30 22:14:25.520608', 'step': 1130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:25.554943', 'step': 1130, 'epoch': 1} {'type': 'loss', 'content': 0.03611636534333229, 'timestamp': '2025-09-30 22:14:25.562121', 'step': 1131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:25.596341', 'step': 1131, 'epoch': 1} {'type': 'loss', 'content': 0.014314553700387478, 'timestamp': '2025-09-30 22:14:25.630509', 'step': 1132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:25.668815', 'step': 1132, 'epoch': 1} {'type': 'loss', 'content': 0.018850572407245636, 'timestamp': '2025-09-30 22:14:25.679336', 'step': 1133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:25.718865', 'step': 1133, 'epoch': 1} {'type': 'loss', 'content': 0.02781030349433422, 'timestamp': '2025-09-30 22:14:25.731201', 'step': 1134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:25.767129', 'step': 1134, 'epoch': 1} {'type': 'loss', 'content': 0.009797481819987297, 'timestamp': '2025-09-30 22:14:25.778366', 'step': 1135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:25.816442', 'step': 1135, 'epoch': 1} {'type': 'loss', 'content': 0.02433471381664276, 'timestamp': '2025-09-30 22:14:25.844584', 'step': 1136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:25.875939', 'step': 1136, 'epoch': 1} {'type': 'loss', 'content': 0.006771281361579895, 'timestamp': '2025-09-30 22:14:25.880475', 'step': 1137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:25.914584', 'step': 1137, 'epoch': 1} {'type': 'loss', 'content': 0.012895852327346802, 'timestamp': '2025-09-30 22:14:25.926821', 'step': 1138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:25.963894', 'step': 1138, 'epoch': 1} {'type': 'loss', 'content': 0.01526438444852829, 'timestamp': '2025-09-30 22:14:25.975018', 'step': 1139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:26.012594', 'step': 1139, 'epoch': 1} {'type': 'loss', 'content': 0.029251690953969955, 'timestamp': '2025-09-30 22:14:26.049276', 'step': 1140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:26.083490', 'step': 1140, 'epoch': 1} {'type': 'loss', 'content': 0.007598114665597677, 'timestamp': '2025-09-30 22:14:26.096484', 'step': 1141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:26.136465', 'step': 1141, 'epoch': 1} {'type': 'loss', 'content': 0.011994223110377789, 'timestamp': '2025-09-30 22:14:26.144054', 'step': 1142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:26.199170', 'step': 1142, 'epoch': 1} {'type': 'loss', 'content': 0.011340709403157234, 'timestamp': '2025-09-30 22:14:26.211463', 'step': 1143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:26.255168', 'step': 1143, 'epoch': 1} {'type': 'loss', 'content': 0.018428653478622437, 'timestamp': '2025-09-30 22:14:26.287209', 'step': 1144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:26.322571', 'step': 1144, 'epoch': 1} {'type': 'loss', 'content': 0.030467689037322998, 'timestamp': '2025-09-30 22:14:26.328109', 'step': 1145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:26.363426', 'step': 1145, 'epoch': 1} {'type': 'loss', 'content': 0.018375704064965248, 'timestamp': '2025-09-30 22:14:26.375743', 'step': 1146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:26.412068', 'step': 1146, 'epoch': 1} {'type': 'loss', 'content': 0.019979046657681465, 'timestamp': '2025-09-30 22:14:26.419092', 'step': 1147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:26.452579', 'step': 1147, 'epoch': 1} {'type': 'loss', 'content': 0.023155096918344498, 'timestamp': '2025-09-30 22:14:26.485760', 'step': 1148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:26.518356', 'step': 1148, 'epoch': 1} {'type': 'loss', 'content': 0.008853713050484657, 'timestamp': '2025-09-30 22:14:26.527040', 'step': 1149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:26.561309', 'step': 1149, 'epoch': 1} {'type': 'loss', 'content': 0.020854298025369644, 'timestamp': '2025-09-30 22:14:26.569089', 'step': 1150, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:14:29.078713', 'step': 1150, 'epoch': 1} {'type': 'pplx', 'content': 5.631088735057543, 'timestamp': '2025-09-30 22:14:29.081608', 'step': 1150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:29.120320', 'step': 1150, 'epoch': 1} {'type': 'loss', 'content': 0.02393251657485962, 'timestamp': '2025-09-30 22:14:29.130476', 'step': 1151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:29.166702', 'step': 1151, 'epoch': 1} {'type': 'loss', 'content': 0.03351360186934471, 'timestamp': '2025-09-30 22:14:29.198568', 'step': 1152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:14:29.243341', 'step': 1152, 'epoch': 1} {'type': 'loss', 'content': 0.005868937820196152, 'timestamp': '2025-09-30 22:14:29.260017', 'step': 1153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:29.301274', 'step': 1153, 'epoch': 1} {'type': 'loss', 'content': 0.00861198641359806, 'timestamp': '2025-09-30 22:14:29.314661', 'step': 1154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:29.353983', 'step': 1154, 'epoch': 1} {'type': 'loss', 'content': 0.012496614828705788, 'timestamp': '2025-09-30 22:14:29.367678', 'step': 1155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:29.407525', 'step': 1155, 'epoch': 1} {'type': 'loss', 'content': 0.012300015427172184, 'timestamp': '2025-09-30 22:14:29.441709', 'step': 1156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:29.479946', 'step': 1156, 'epoch': 1} {'type': 'loss', 'content': 0.009164150804281235, 'timestamp': '2025-09-30 22:14:29.493085', 'step': 1157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:29.528327', 'step': 1157, 'epoch': 1} {'type': 'loss', 'content': 0.013321910984814167, 'timestamp': '2025-09-30 22:14:29.540883', 'step': 1158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:29.574927', 'step': 1158, 'epoch': 1} {'type': 'loss', 'content': 0.013937913812696934, 'timestamp': '2025-09-30 22:14:29.587268', 'step': 1159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:29.626438', 'step': 1159, 'epoch': 1} {'type': 'loss', 'content': 0.009495115838944912, 'timestamp': '2025-09-30 22:14:29.661047', 'step': 1160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:14:29.700907', 'step': 1160, 'epoch': 1} {'type': 'loss', 'content': 0.01062469556927681, 'timestamp': '2025-09-30 22:14:29.716571', 'step': 1161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:29.758185', 'step': 1161, 'epoch': 1} {'type': 'loss', 'content': 0.02869350276887417, 'timestamp': '2025-09-30 22:14:29.769352', 'step': 1162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:29.805584', 'step': 1162, 'epoch': 1} {'type': 'loss', 'content': 0.009210729040205479, 'timestamp': '2025-09-30 22:14:29.818147', 'step': 1163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:29.858283', 'step': 1163, 'epoch': 1} {'type': 'loss', 'content': 0.011261607520282269, 'timestamp': '2025-09-30 22:14:29.892954', 'step': 1164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:29.935282', 'step': 1164, 'epoch': 1} {'type': 'loss', 'content': 0.020356422290205956, 'timestamp': '2025-09-30 22:14:29.945868', 'step': 1165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:29.987048', 'step': 1165, 'epoch': 1} {'type': 'loss', 'content': 0.015439392998814583, 'timestamp': '2025-09-30 22:14:29.999613', 'step': 1166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:30.034919', 'step': 1166, 'epoch': 1} {'type': 'loss', 'content': 0.016605112701654434, 'timestamp': '2025-09-30 22:14:30.047465', 'step': 1167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:30.081251', 'step': 1167, 'epoch': 1} {'type': 'loss', 'content': 0.012671479023993015, 'timestamp': '2025-09-30 22:14:30.114709', 'step': 1168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:30.149852', 'step': 1168, 'epoch': 1} {'type': 'loss', 'content': 0.012448888272047043, 'timestamp': '2025-09-30 22:14:30.160565', 'step': 1169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:30.197732', 'step': 1169, 'epoch': 1} {'type': 'loss', 'content': 0.019691286608576775, 'timestamp': '2025-09-30 22:14:30.211539', 'step': 1170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:14:30.254743', 'step': 1170, 'epoch': 1} {'type': 'loss', 'content': 0.0102019552141428, 'timestamp': '2025-09-30 22:14:30.270309', 'step': 1171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:30.313658', 'step': 1171, 'epoch': 1} {'type': 'loss', 'content': 0.011992924846708775, 'timestamp': '2025-09-30 22:14:30.348271', 'step': 1172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:14:30.392234', 'step': 1172, 'epoch': 1} {'type': 'loss', 'content': 0.008996584452688694, 'timestamp': '2025-09-30 22:14:30.409569', 'step': 1173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:30.454367', 'step': 1173, 'epoch': 1} {'type': 'loss', 'content': 0.018789565190672874, 'timestamp': '2025-09-30 22:14:30.467717', 'step': 1174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:30.501875', 'step': 1174, 'epoch': 1} {'type': 'loss', 'content': 0.01498965360224247, 'timestamp': '2025-09-30 22:14:30.512971', 'step': 1175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:30.547409', 'step': 1175, 'epoch': 1} {'type': 'loss', 'content': 0.01690087839961052, 'timestamp': '2025-09-30 22:14:30.580578', 'step': 1176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:30.623001', 'step': 1176, 'epoch': 1} {'type': 'loss', 'content': 0.014346581883728504, 'timestamp': '2025-09-30 22:14:30.635623', 'step': 1177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:30.674179', 'step': 1177, 'epoch': 1} {'type': 'loss', 'content': 0.010005800984799862, 'timestamp': '2025-09-30 22:14:30.687990', 'step': 1178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:30.726189', 'step': 1178, 'epoch': 1} {'type': 'loss', 'content': 0.01065401453524828, 'timestamp': '2025-09-30 22:14:30.740030', 'step': 1179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:30.776887', 'step': 1179, 'epoch': 1} {'type': 'loss', 'content': 0.013994435779750347, 'timestamp': '2025-09-30 22:14:30.811093', 'step': 1180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:30.842113', 'step': 1180, 'epoch': 1} {'type': 'loss', 'content': 0.015725145116448402, 'timestamp': '2025-09-30 22:14:30.852237', 'step': 1181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:30.884891', 'step': 1181, 'epoch': 1} {'type': 'loss', 'content': 0.01210531685501337, 'timestamp': '2025-09-30 22:14:30.896074', 'step': 1182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:30.933914', 'step': 1182, 'epoch': 1} {'type': 'loss', 'content': 0.018449081107974052, 'timestamp': '2025-09-30 22:14:30.945014', 'step': 1183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:30.979632', 'step': 1183, 'epoch': 1} {'type': 'loss', 'content': 0.022223036736249924, 'timestamp': '2025-09-30 22:14:31.012832', 'step': 1184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:31.045348', 'step': 1184, 'epoch': 1} {'type': 'loss', 'content': 0.013939021155238152, 'timestamp': '2025-09-30 22:14:31.050217', 'step': 1185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:31.086672', 'step': 1185, 'epoch': 1} {'type': 'loss', 'content': 0.01341447327286005, 'timestamp': '2025-09-30 22:14:31.098002', 'step': 1186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:14:31.144403', 'step': 1186, 'epoch': 1} {'type': 'loss', 'content': 0.019850490614771843, 'timestamp': '2025-09-30 22:14:31.161418', 'step': 1187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:31.197931', 'step': 1187, 'epoch': 1} {'type': 'loss', 'content': 0.013188116252422333, 'timestamp': '2025-09-30 22:14:31.231109', 'step': 1188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:31.266289', 'step': 1188, 'epoch': 1} {'type': 'loss', 'content': 0.015668513253331184, 'timestamp': '2025-09-30 22:14:31.278951', 'step': 1189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:14:31.331133', 'step': 1189, 'epoch': 1} {'type': 'loss', 'content': 0.007118385750800371, 'timestamp': '2025-09-30 22:14:31.346845', 'step': 1190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-09-30 22:14:31.405785', 'step': 1190, 'epoch': 1} {'type': 'loss', 'content': 0.00617865938693285, 'timestamp': '2025-09-30 22:14:31.424889', 'step': 1191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:31.461054', 'step': 1191, 'epoch': 1} {'type': 'loss', 'content': 0.026619745418429375, 'timestamp': '2025-09-30 22:14:31.495860', 'step': 1192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:31.533161', 'step': 1192, 'epoch': 1} {'type': 'loss', 'content': 0.031537625938653946, 'timestamp': '2025-09-30 22:14:31.543031', 'step': 1193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:31.575561', 'step': 1193, 'epoch': 1} {'type': 'loss', 'content': 0.01197479572147131, 'timestamp': '2025-09-30 22:14:31.587903', 'step': 1194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:31.627188', 'step': 1194, 'epoch': 1} {'type': 'loss', 'content': 0.008407039567828178, 'timestamp': '2025-09-30 22:14:31.640941', 'step': 1195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:31.679357', 'step': 1195, 'epoch': 1} {'type': 'loss', 'content': 0.011829300783574581, 'timestamp': '2025-09-30 22:14:31.714096', 'step': 1196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:14:31.759203', 'step': 1196, 'epoch': 1} {'type': 'loss', 'content': 0.0072440290823578835, 'timestamp': '2025-09-30 22:14:31.774582', 'step': 1197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:31.814995', 'step': 1197, 'epoch': 1} {'type': 'loss', 'content': 0.014621003530919552, 'timestamp': '2025-09-30 22:14:31.828696', 'step': 1198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:31.865096', 'step': 1198, 'epoch': 1} {'type': 'loss', 'content': 0.014671620912849903, 'timestamp': '2025-09-30 22:14:31.875936', 'step': 1199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:31.911932', 'step': 1199, 'epoch': 1} {'type': 'loss', 'content': 0.016262758523225784, 'timestamp': '2025-09-30 22:14:31.945212', 'step': 1200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:14:31.995437', 'step': 1200, 'epoch': 1} {'type': 'loss', 'content': 0.017557835206389427, 'timestamp': '2025-09-30 22:14:32.011241', 'step': 1201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:32.058041', 'step': 1201, 'epoch': 1} {'type': 'loss', 'content': 0.02594558708369732, 'timestamp': '2025-09-30 22:14:32.068004', 'step': 1202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:32.105487', 'step': 1202, 'epoch': 1} {'type': 'loss', 'content': 0.014456644654273987, 'timestamp': '2025-09-30 22:14:32.118885', 'step': 1203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:14:32.162977', 'step': 1203, 'epoch': 1} {'type': 'loss', 'content': 0.005826851818710566, 'timestamp': '2025-09-30 22:14:32.199991', 'step': 1204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:32.238766', 'step': 1204, 'epoch': 1} {'type': 'loss', 'content': 0.01122973021119833, 'timestamp': '2025-09-30 22:14:32.251887', 'step': 1205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:32.289668', 'step': 1205, 'epoch': 1} {'type': 'loss', 'content': 0.013107744045555592, 'timestamp': '2025-09-30 22:14:32.302134', 'step': 1206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:14:32.352305', 'step': 1206, 'epoch': 1} {'type': 'loss', 'content': 0.009503448382019997, 'timestamp': '2025-09-30 22:14:32.369615', 'step': 1207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:14:32.419930', 'step': 1207, 'epoch': 1} {'type': 'loss', 'content': 0.00695630582049489, 'timestamp': '2025-09-30 22:14:32.454828', 'step': 1208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:32.491661', 'step': 1208, 'epoch': 1} {'type': 'loss', 'content': 0.02943275310099125, 'timestamp': '2025-09-30 22:14:32.496914', 'step': 1209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:32.531863', 'step': 1209, 'epoch': 1} {'type': 'loss', 'content': 0.026188671588897705, 'timestamp': '2025-09-30 22:14:32.545403', 'step': 1210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:32.582654', 'step': 1210, 'epoch': 1} {'type': 'loss', 'content': 0.02500862441956997, 'timestamp': '2025-09-30 22:14:32.590451', 'step': 1211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:32.627491', 'step': 1211, 'epoch': 1} {'type': 'loss', 'content': 0.027926690876483917, 'timestamp': '2025-09-30 22:14:32.656152', 'step': 1212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:32.697633', 'step': 1212, 'epoch': 1} {'type': 'loss', 'content': 0.02507833205163479, 'timestamp': '2025-09-30 22:14:32.703018', 'step': 1213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:32.737873', 'step': 1213, 'epoch': 1} {'type': 'loss', 'content': 0.01936890184879303, 'timestamp': '2025-09-30 22:14:32.745479', 'step': 1214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:32.782811', 'step': 1214, 'epoch': 1} {'type': 'loss', 'content': 0.02364104799926281, 'timestamp': '2025-09-30 22:14:32.790444', 'step': 1215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:32.829715', 'step': 1215, 'epoch': 1} {'type': 'loss', 'content': 0.018656298518180847, 'timestamp': '2025-09-30 22:14:32.860840', 'step': 1216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:32.897430', 'step': 1216, 'epoch': 1} {'type': 'loss', 'content': 0.030074406415224075, 'timestamp': '2025-09-30 22:14:32.905252', 'step': 1217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:32.945989', 'step': 1217, 'epoch': 1} {'type': 'loss', 'content': 0.02442289888858795, 'timestamp': '2025-09-30 22:14:32.956477', 'step': 1218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:32.994799', 'step': 1218, 'epoch': 1} {'type': 'loss', 'content': 0.0336812362074852, 'timestamp': '2025-09-30 22:14:33.007365', 'step': 1219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:33.043199', 'step': 1219, 'epoch': 1} {'type': 'loss', 'content': 0.034401409327983856, 'timestamp': '2025-09-30 22:14:33.072080', 'step': 1220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:33.107277', 'step': 1220, 'epoch': 1} {'type': 'loss', 'content': 0.035166334360837936, 'timestamp': '2025-09-30 22:14:33.112603', 'step': 1221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:33.154645', 'step': 1221, 'epoch': 1} {'type': 'loss', 'content': 0.03142448142170906, 'timestamp': '2025-09-30 22:14:33.165080', 'step': 1222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:33.201292', 'step': 1222, 'epoch': 1} {'type': 'loss', 'content': 0.025196803733706474, 'timestamp': '2025-09-30 22:14:33.208397', 'step': 1223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:33.262664', 'step': 1223, 'epoch': 1} {'type': 'loss', 'content': 0.020468074828386307, 'timestamp': '2025-09-30 22:14:33.290524', 'step': 1224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:33.324942', 'step': 1224, 'epoch': 1} {'type': 'loss', 'content': 0.013399248011410236, 'timestamp': '2025-09-30 22:14:33.329252', 'step': 1225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:33.382658', 'step': 1225, 'epoch': 1} {'type': 'loss', 'content': 0.012052931822836399, 'timestamp': '2025-09-30 22:14:33.390425', 'step': 1226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:33.438194', 'step': 1226, 'epoch': 1} {'type': 'loss', 'content': 0.02524169534444809, 'timestamp': '2025-09-30 22:14:33.449895', 'step': 1227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:33.492781', 'step': 1227, 'epoch': 1} {'type': 'loss', 'content': 0.017910048365592957, 'timestamp': '2025-09-30 22:14:33.521444', 'step': 1228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:33.566482', 'step': 1228, 'epoch': 1} {'type': 'loss', 'content': 0.02475748024880886, 'timestamp': '2025-09-30 22:14:33.572577', 'step': 1229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:33.606170', 'step': 1229, 'epoch': 1} {'type': 'loss', 'content': 0.011676200665533543, 'timestamp': '2025-09-30 22:14:33.618458', 'step': 1230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:33.656791', 'step': 1230, 'epoch': 1} {'type': 'loss', 'content': 0.02215118147432804, 'timestamp': '2025-09-30 22:14:33.670176', 'step': 1231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:33.720673', 'step': 1231, 'epoch': 1} {'type': 'loss', 'content': 0.03192958980798721, 'timestamp': '2025-09-30 22:14:33.756721', 'step': 1232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:33.804183', 'step': 1232, 'epoch': 1} {'type': 'loss', 'content': 0.01824454963207245, 'timestamp': '2025-09-30 22:14:33.809803', 'step': 1233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:33.854045', 'step': 1233, 'epoch': 1} {'type': 'loss', 'content': 0.027758145704865456, 'timestamp': '2025-09-30 22:14:33.862041', 'step': 1234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:33.899491', 'step': 1234, 'epoch': 1} {'type': 'loss', 'content': 0.0229659266769886, 'timestamp': '2025-09-30 22:14:33.910584', 'step': 1235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:33.951116', 'step': 1235, 'epoch': 1} {'type': 'loss', 'content': 0.023478014394640923, 'timestamp': '2025-09-30 22:14:33.982309', 'step': 1236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:34.022237', 'step': 1236, 'epoch': 1} {'type': 'loss', 'content': 0.037181347608566284, 'timestamp': '2025-09-30 22:14:34.027942', 'step': 1237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:34.074477', 'step': 1237, 'epoch': 1} {'type': 'loss', 'content': 0.02985082007944584, 'timestamp': '2025-09-30 22:14:34.081615', 'step': 1238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:34.116074', 'step': 1238, 'epoch': 1} {'type': 'loss', 'content': 0.018528126180171967, 'timestamp': '2025-09-30 22:14:34.123241', 'step': 1239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:34.156297', 'step': 1239, 'epoch': 1} {'type': 'loss', 'content': 0.02232883870601654, 'timestamp': '2025-09-30 22:14:34.189770', 'step': 1240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:34.223282', 'step': 1240, 'epoch': 1} {'type': 'loss', 'content': 0.015140527859330177, 'timestamp': '2025-09-30 22:14:34.231488', 'step': 1241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:34.270300', 'step': 1241, 'epoch': 1} {'type': 'loss', 'content': 0.01367161888629198, 'timestamp': '2025-09-30 22:14:34.277822', 'step': 1242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:34.316622', 'step': 1242, 'epoch': 1} {'type': 'loss', 'content': 0.020984509959816933, 'timestamp': '2025-09-30 22:14:34.324075', 'step': 1243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:34.357271', 'step': 1243, 'epoch': 1} {'type': 'loss', 'content': 0.02360849268734455, 'timestamp': '2025-09-30 22:14:34.386089', 'step': 1244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:34.420326', 'step': 1244, 'epoch': 1} {'type': 'loss', 'content': 0.022554300725460052, 'timestamp': '2025-09-30 22:14:34.424955', 'step': 1245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:34.458835', 'step': 1245, 'epoch': 1} {'type': 'loss', 'content': 0.033704500645399094, 'timestamp': '2025-09-30 22:14:34.466752', 'step': 1246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:34.502718', 'step': 1246, 'epoch': 1} {'type': 'loss', 'content': 0.03208060562610626, 'timestamp': '2025-09-30 22:14:34.513758', 'step': 1247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:34.547728', 'step': 1247, 'epoch': 1} {'type': 'loss', 'content': 0.025324244052171707, 'timestamp': '2025-09-30 22:14:34.578881', 'step': 1248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:34.619602', 'step': 1248, 'epoch': 1} {'type': 'loss', 'content': 0.02882443554699421, 'timestamp': '2025-09-30 22:14:34.628351', 'step': 1249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:34.666145', 'step': 1249, 'epoch': 1} {'type': 'loss', 'content': 0.020039178431034088, 'timestamp': '2025-09-30 22:14:34.673931', 'step': 1250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:34.705421', 'step': 1250, 'epoch': 1} {'type': 'loss', 'content': 0.031007476150989532, 'timestamp': '2025-09-30 22:14:34.712649', 'step': 1251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:34.747486', 'step': 1251, 'epoch': 1} {'type': 'loss', 'content': 0.01620536856353283, 'timestamp': '2025-09-30 22:14:34.776275', 'step': 1252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:34.812433', 'step': 1252, 'epoch': 1} {'type': 'loss', 'content': 0.0327889584004879, 'timestamp': '2025-09-30 22:14:34.820569', 'step': 1253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:34.854146', 'step': 1253, 'epoch': 1} {'type': 'loss', 'content': 0.018718326464295387, 'timestamp': '2025-09-30 22:14:34.864299', 'step': 1254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:34.902071', 'step': 1254, 'epoch': 1} {'type': 'loss', 'content': 0.013132144697010517, 'timestamp': '2025-09-30 22:14:34.912528', 'step': 1255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:34.950600', 'step': 1255, 'epoch': 1} {'type': 'loss', 'content': 0.016166796907782555, 'timestamp': '2025-09-30 22:14:34.982665', 'step': 1256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:35.017807', 'step': 1256, 'epoch': 1} {'type': 'loss', 'content': 0.04190756008028984, 'timestamp': '2025-09-30 22:14:35.026037', 'step': 1257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:35.063925', 'step': 1257, 'epoch': 1} {'type': 'loss', 'content': 0.02226782590150833, 'timestamp': '2025-09-30 22:14:35.075370', 'step': 1258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:35.110791', 'step': 1258, 'epoch': 1} {'type': 'loss', 'content': 0.015998445451259613, 'timestamp': '2025-09-30 22:14:35.118812', 'step': 1259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:35.152127', 'step': 1259, 'epoch': 1} {'type': 'loss', 'content': 0.018795574083924294, 'timestamp': '2025-09-30 22:14:35.184045', 'step': 1260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:35.220739', 'step': 1260, 'epoch': 1} {'type': 'loss', 'content': 0.014654216356575489, 'timestamp': '2025-09-30 22:14:35.229523', 'step': 1261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:35.265657', 'step': 1261, 'epoch': 1} {'type': 'loss', 'content': 0.026398485526442528, 'timestamp': '2025-09-30 22:14:35.278217', 'step': 1262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:35.311735', 'step': 1262, 'epoch': 1} {'type': 'loss', 'content': 0.02187102660536766, 'timestamp': '2025-09-30 22:14:35.324343', 'step': 1263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:35.356785', 'step': 1263, 'epoch': 1} {'type': 'loss', 'content': 0.012795783579349518, 'timestamp': '2025-09-30 22:14:35.388589', 'step': 1264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:35.425154', 'step': 1264, 'epoch': 1} {'type': 'loss', 'content': 0.019397811964154243, 'timestamp': '2025-09-30 22:14:35.433992', 'step': 1265, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:14:37.879632', 'step': 1265, 'epoch': 1} {'type': 'pplx', 'content': 5.495163605757066, 'timestamp': '2025-09-30 22:14:37.884261', 'step': 1265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:37.922690', 'step': 1265, 'epoch': 1} {'type': 'loss', 'content': 0.01661515049636364, 'timestamp': '2025-09-30 22:14:37.932598', 'step': 1266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:37.969570', 'step': 1266, 'epoch': 1} {'type': 'loss', 'content': 0.016776934266090393, 'timestamp': '2025-09-30 22:14:37.981879', 'step': 1267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:38.014273', 'step': 1267, 'epoch': 1} {'type': 'loss', 'content': 0.026616569608449936, 'timestamp': '2025-09-30 22:14:38.042927', 'step': 1268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:38.081740', 'step': 1268, 'epoch': 1} {'type': 'loss', 'content': 0.02219020016491413, 'timestamp': '2025-09-30 22:14:38.094395', 'step': 1269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:38.129588', 'step': 1269, 'epoch': 1} {'type': 'loss', 'content': 0.016495492309331894, 'timestamp': '2025-09-30 22:14:38.140702', 'step': 1270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:38.173871', 'step': 1270, 'epoch': 1} {'type': 'loss', 'content': 0.019733276218175888, 'timestamp': '2025-09-30 22:14:38.186251', 'step': 1271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:38.223489', 'step': 1271, 'epoch': 1} {'type': 'loss', 'content': 0.016676638275384903, 'timestamp': '2025-09-30 22:14:38.257710', 'step': 1272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:38.290270', 'step': 1272, 'epoch': 1} {'type': 'loss', 'content': 0.02926846593618393, 'timestamp': '2025-09-30 22:14:38.298062', 'step': 1273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:38.332042', 'step': 1273, 'epoch': 1} {'type': 'loss', 'content': 0.018852192908525467, 'timestamp': '2025-09-30 22:14:38.344616', 'step': 1274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:38.378349', 'step': 1274, 'epoch': 1} {'type': 'loss', 'content': 0.039707720279693604, 'timestamp': '2025-09-30 22:14:38.390685', 'step': 1275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:38.426839', 'step': 1275, 'epoch': 1} {'type': 'loss', 'content': 0.022720344364643097, 'timestamp': '2025-09-30 22:14:38.460267', 'step': 1276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:38.496673', 'step': 1276, 'epoch': 1} {'type': 'loss', 'content': 0.018529046326875687, 'timestamp': '2025-09-30 22:14:38.509329', 'step': 1277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:38.547858', 'step': 1277, 'epoch': 1} {'type': 'loss', 'content': 0.010352366603910923, 'timestamp': '2025-09-30 22:14:38.558888', 'step': 1278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:38.599350', 'step': 1278, 'epoch': 1} {'type': 'loss', 'content': 0.02025136537849903, 'timestamp': '2025-09-30 22:14:38.613005', 'step': 1279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:38.658819', 'step': 1279, 'epoch': 1} {'type': 'loss', 'content': 0.043575845658779144, 'timestamp': '2025-09-30 22:14:38.692118', 'step': 1280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:38.728383', 'step': 1280, 'epoch': 1} {'type': 'loss', 'content': 0.024662936106324196, 'timestamp': '2025-09-30 22:14:38.736414', 'step': 1281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:38.772107', 'step': 1281, 'epoch': 1} {'type': 'loss', 'content': 0.037019431591033936, 'timestamp': '2025-09-30 22:14:38.783355', 'step': 1282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:38.817932', 'step': 1282, 'epoch': 1} {'type': 'loss', 'content': 0.025818834081292152, 'timestamp': '2025-09-30 22:14:38.830510', 'step': 1283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:38.865102', 'step': 1283, 'epoch': 1} {'type': 'loss', 'content': 0.023233827203512192, 'timestamp': '2025-09-30 22:14:38.897257', 'step': 1284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:38.929162', 'step': 1284, 'epoch': 1} {'type': 'loss', 'content': 0.019260883331298828, 'timestamp': '2025-09-30 22:14:38.939907', 'step': 1285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:38.978083', 'step': 1285, 'epoch': 1} {'type': 'loss', 'content': 0.03170757740736008, 'timestamp': '2025-09-30 22:14:38.988566', 'step': 1286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:39.021346', 'step': 1286, 'epoch': 1} {'type': 'loss', 'content': 0.03846264258027077, 'timestamp': '2025-09-30 22:14:39.028749', 'step': 1287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:39.066305', 'step': 1287, 'epoch': 1} {'type': 'loss', 'content': 0.019059956073760986, 'timestamp': '2025-09-30 22:14:39.099473', 'step': 1288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:39.136954', 'step': 1288, 'epoch': 1} {'type': 'loss', 'content': 0.03577815741300583, 'timestamp': '2025-09-30 22:14:39.145843', 'step': 1289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:39.181812', 'step': 1289, 'epoch': 1} {'type': 'loss', 'content': 0.014517645351588726, 'timestamp': '2025-09-30 22:14:39.189757', 'step': 1290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:39.226189', 'step': 1290, 'epoch': 1} {'type': 'loss', 'content': 0.024149658158421516, 'timestamp': '2025-09-30 22:14:39.238513', 'step': 1291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:39.272275', 'step': 1291, 'epoch': 1} {'type': 'loss', 'content': 0.020498551428318024, 'timestamp': '2025-09-30 22:14:39.301028', 'step': 1292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:39.334014', 'step': 1292, 'epoch': 1} {'type': 'loss', 'content': 0.02780788764357567, 'timestamp': '2025-09-30 22:14:39.344882', 'step': 1293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:39.376736', 'step': 1293, 'epoch': 1} {'type': 'loss', 'content': 0.0203973688185215, 'timestamp': '2025-09-30 22:14:39.389145', 'step': 1294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:39.423129', 'step': 1294, 'epoch': 1} {'type': 'loss', 'content': 0.02105989307165146, 'timestamp': '2025-09-30 22:14:39.435507', 'step': 1295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:39.471633', 'step': 1295, 'epoch': 1} {'type': 'loss', 'content': 0.03150323033332825, 'timestamp': '2025-09-30 22:14:39.505139', 'step': 1296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:39.538556', 'step': 1296, 'epoch': 1} {'type': 'loss', 'content': 0.016306867823004723, 'timestamp': '2025-09-30 22:14:39.551240', 'step': 1297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:39.584222', 'step': 1297, 'epoch': 1} {'type': 'loss', 'content': 0.02700851857662201, 'timestamp': '2025-09-30 22:14:39.596869', 'step': 1298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:39.634537', 'step': 1298, 'epoch': 1} {'type': 'loss', 'content': 0.016827302053570747, 'timestamp': '2025-09-30 22:14:39.645725', 'step': 1299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:39.678289', 'step': 1299, 'epoch': 1} {'type': 'loss', 'content': 0.017379000782966614, 'timestamp': '2025-09-30 22:14:39.711492', 'step': 1300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:39.748075', 'step': 1300, 'epoch': 1} {'type': 'loss', 'content': 0.018542049452662468, 'timestamp': '2025-09-30 22:14:39.760723', 'step': 1301, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:39.800412', 'step': 1301, 'epoch': 1} {'type': 'loss', 'content': 0.05579542741179466, 'timestamp': '2025-09-30 22:14:39.814118', 'step': 1302, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:39.850470', 'step': 1302, 'epoch': 1} {'type': 'loss', 'content': 0.025843879207968712, 'timestamp': '2025-09-30 22:14:39.863029', 'step': 1303, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:39.898563', 'step': 1303, 'epoch': 1} {'type': 'loss', 'content': 0.03374844789505005, 'timestamp': '2025-09-30 22:14:39.931997', 'step': 1304, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:39.971672', 'step': 1304, 'epoch': 1} {'type': 'loss', 'content': 0.03356417641043663, 'timestamp': '2025-09-30 22:14:39.980435', 'step': 1305, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:40.020789', 'step': 1305, 'epoch': 1} {'type': 'loss', 'content': 0.0371815450489521, 'timestamp': '2025-09-30 22:14:40.033381', 'step': 1306, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:40.071666', 'step': 1306, 'epoch': 1} {'type': 'loss', 'content': 0.0254992563277483, 'timestamp': '2025-09-30 22:14:40.084243', 'step': 1307, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:40.124501', 'step': 1307, 'epoch': 1} {'type': 'loss', 'content': 0.023711930960416794, 'timestamp': '2025-09-30 22:14:40.157656', 'step': 1308, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:40.196953', 'step': 1308, 'epoch': 1} {'type': 'loss', 'content': 0.013582438230514526, 'timestamp': '2025-09-30 22:14:40.202118', 'step': 1309, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:40.236947', 'step': 1309, 'epoch': 1} {'type': 'loss', 'content': 0.022634677588939667, 'timestamp': '2025-09-30 22:14:40.249257', 'step': 1310, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:40.283455', 'step': 1310, 'epoch': 1} {'type': 'loss', 'content': 0.02410058304667473, 'timestamp': '2025-09-30 22:14:40.295969', 'step': 1311, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:40.330288', 'step': 1311, 'epoch': 1} {'type': 'loss', 'content': 0.027308452874422073, 'timestamp': '2025-09-30 22:14:40.361513', 'step': 1312, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:40.395542', 'step': 1312, 'epoch': 1} {'type': 'loss', 'content': 0.0207161046564579, 'timestamp': '2025-09-30 22:14:40.403591', 'step': 1313, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:40.439897', 'step': 1313, 'epoch': 1} {'type': 'loss', 'content': 0.023866860195994377, 'timestamp': '2025-09-30 22:14:40.451974', 'step': 1314, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:40.485646', 'step': 1314, 'epoch': 1} {'type': 'loss', 'content': 0.026835141703486443, 'timestamp': '2025-09-30 22:14:40.492867', 'step': 1315, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:40.528042', 'step': 1315, 'epoch': 1} {'type': 'loss', 'content': 0.01742064207792282, 'timestamp': '2025-09-30 22:14:40.556549', 'step': 1316, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:40.592346', 'step': 1316, 'epoch': 1} {'type': 'loss', 'content': 0.01809242181479931, 'timestamp': '2025-09-30 22:14:40.597522', 'step': 1317, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:40.633796', 'step': 1317, 'epoch': 1} {'type': 'loss', 'content': 0.010253416374325752, 'timestamp': '2025-09-30 22:14:40.641422', 'step': 1318, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:40.676059', 'step': 1318, 'epoch': 1} {'type': 'loss', 'content': 0.026114145293831825, 'timestamp': '2025-09-30 22:14:40.683700', 'step': 1319, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:40.720993', 'step': 1319, 'epoch': 1} {'type': 'loss', 'content': 0.031265173107385635, 'timestamp': '2025-09-30 22:14:40.749599', 'step': 1320, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:40.788316', 'step': 1320, 'epoch': 1} {'type': 'loss', 'content': 0.019768333062529564, 'timestamp': '2025-09-30 22:14:40.794019', 'step': 1321, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:40.826280', 'step': 1321, 'epoch': 1} {'type': 'loss', 'content': 0.019529767334461212, 'timestamp': '2025-09-30 22:14:40.836695', 'step': 1322, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:40.869249', 'step': 1322, 'epoch': 1} {'type': 'loss', 'content': 0.015189185738563538, 'timestamp': '2025-09-30 22:14:40.875993', 'step': 1323, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:40.909013', 'step': 1323, 'epoch': 1} {'type': 'loss', 'content': 0.016119806095957756, 'timestamp': '2025-09-30 22:14:40.937565', 'step': 1324, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:40.970539', 'step': 1324, 'epoch': 1} {'type': 'loss', 'content': 0.026351531967520714, 'timestamp': '2025-09-30 22:14:40.975830', 'step': 1325, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:14:41.015553', 'step': 1325, 'epoch': 1} {'type': 'loss', 'content': 0.025196142494678497, 'timestamp': '2025-09-30 22:14:41.019633', 'step': 1326, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:41.052832', 'step': 1326, 'epoch': 1} {'type': 'loss', 'content': 0.028438767418265343, 'timestamp': '2025-09-30 22:14:41.063183', 'step': 1327, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:41.100579', 'step': 1327, 'epoch': 1} {'type': 'loss', 'content': 0.015023917891085148, 'timestamp': '2025-09-30 22:14:41.133774', 'step': 1328, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:41.168955', 'step': 1328, 'epoch': 1} {'type': 'loss', 'content': 0.024687767028808594, 'timestamp': '2025-09-30 22:14:41.182033', 'step': 1329, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:41.215953', 'step': 1329, 'epoch': 1} {'type': 'loss', 'content': 0.025566160678863525, 'timestamp': '2025-09-30 22:14:41.223160', 'step': 1330, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:41.256592', 'step': 1330, 'epoch': 1} {'type': 'loss', 'content': 0.03352648764848709, 'timestamp': '2025-09-30 22:14:41.268716', 'step': 1331, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:41.306170', 'step': 1331, 'epoch': 1} {'type': 'loss', 'content': 0.01492286752909422, 'timestamp': '2025-09-30 22:14:41.340370', 'step': 1332, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:41.374577', 'step': 1332, 'epoch': 1} {'type': 'loss', 'content': 0.023383338004350662, 'timestamp': '2025-09-30 22:14:41.383279', 'step': 1333, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:41.424534', 'step': 1333, 'epoch': 1} {'type': 'loss', 'content': 0.04956074804067612, 'timestamp': '2025-09-30 22:14:41.437901', 'step': 1334, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:41.472351', 'step': 1334, 'epoch': 1} {'type': 'loss', 'content': 0.05456084758043289, 'timestamp': '2025-09-30 22:14:41.484663', 'step': 1335, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:41.518630', 'step': 1335, 'epoch': 1} {'type': 'loss', 'content': 0.020251190289855003, 'timestamp': '2025-09-30 22:14:41.550794', 'step': 1336, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:41.583031', 'step': 1336, 'epoch': 1} {'type': 'loss', 'content': 0.03886295482516289, 'timestamp': '2025-09-30 22:14:41.591081', 'step': 1337, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:41.625815', 'step': 1337, 'epoch': 1} {'type': 'loss', 'content': 0.010892719961702824, 'timestamp': '2025-09-30 22:14:41.638365', 'step': 1338, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:41.678044', 'step': 1338, 'epoch': 1} {'type': 'loss', 'content': 0.011457058601081371, 'timestamp': '2025-09-30 22:14:41.690487', 'step': 1339, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:41.724460', 'step': 1339, 'epoch': 1} {'type': 'loss', 'content': 0.016294119879603386, 'timestamp': '2025-09-30 22:14:41.757673', 'step': 1340, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:41.798916', 'step': 1340, 'epoch': 1} {'type': 'loss', 'content': 0.0155197037383914, 'timestamp': '2025-09-30 22:14:41.807023', 'step': 1341, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:41.852745', 'step': 1341, 'epoch': 1} {'type': 'loss', 'content': 0.024812180548906326, 'timestamp': '2025-09-30 22:14:41.863821', 'step': 1342, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:41.898943', 'step': 1342, 'epoch': 1} {'type': 'loss', 'content': 0.03152640536427498, 'timestamp': '2025-09-30 22:14:41.909395', 'step': 1343, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:41.948050', 'step': 1343, 'epoch': 1} {'type': 'loss', 'content': 0.030339188873767853, 'timestamp': '2025-09-30 22:14:41.979127', 'step': 1344, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:42.015722', 'step': 1344, 'epoch': 1} {'type': 'loss', 'content': 0.022944064810872078, 'timestamp': '2025-09-30 22:14:42.025847', 'step': 1345, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:42.062117', 'step': 1345, 'epoch': 1} {'type': 'loss', 'content': 0.027891971170902252, 'timestamp': '2025-09-30 22:14:42.074425', 'step': 1346, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:42.111761', 'step': 1346, 'epoch': 1} {'type': 'loss', 'content': 0.02328789047896862, 'timestamp': '2025-09-30 22:14:42.122638', 'step': 1347, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:42.163005', 'step': 1347, 'epoch': 1} {'type': 'loss', 'content': 0.03815057501196861, 'timestamp': '2025-09-30 22:14:42.197692', 'step': 1348, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:42.242963', 'step': 1348, 'epoch': 1} {'type': 'loss', 'content': 0.019010033458471298, 'timestamp': '2025-09-30 22:14:42.253478', 'step': 1349, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:42.286346', 'step': 1349, 'epoch': 1} {'type': 'loss', 'content': 0.016303110867738724, 'timestamp': '2025-09-30 22:14:42.294189', 'step': 1350, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:42.326820', 'step': 1350, 'epoch': 1} {'type': 'loss', 'content': 0.019749823957681656, 'timestamp': '2025-09-30 22:14:42.339396', 'step': 1351, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:42.388655', 'step': 1351, 'epoch': 1} {'type': 'loss', 'content': 0.024683495983481407, 'timestamp': '2025-09-30 22:14:42.416958', 'step': 1352, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:42.451188', 'step': 1352, 'epoch': 1} {'type': 'loss', 'content': 0.02669571153819561, 'timestamp': '2025-09-30 22:14:42.459856', 'step': 1353, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:42.500323', 'step': 1353, 'epoch': 1} {'type': 'loss', 'content': 0.02056029625236988, 'timestamp': '2025-09-30 22:14:42.511373', 'step': 1354, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:42.547536', 'step': 1354, 'epoch': 1} {'type': 'loss', 'content': 0.014292556792497635, 'timestamp': '2025-09-30 22:14:42.554701', 'step': 1355, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:42.589863', 'step': 1355, 'epoch': 1} {'type': 'loss', 'content': 0.030081013217568398, 'timestamp': '2025-09-30 22:14:42.618177', 'step': 1356, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:42.652620', 'step': 1356, 'epoch': 1} {'type': 'loss', 'content': 0.02174229733645916, 'timestamp': '2025-09-30 22:14:42.657371', 'step': 1357, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:42.691366', 'step': 1357, 'epoch': 1} {'type': 'loss', 'content': 0.027395816519856453, 'timestamp': '2025-09-30 22:14:42.695884', 'step': 1358, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:42.730620', 'step': 1358, 'epoch': 1} {'type': 'loss', 'content': 0.03220497816801071, 'timestamp': '2025-09-30 22:14:42.737679', 'step': 1359, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:42.770565', 'step': 1359, 'epoch': 1} {'type': 'loss', 'content': 0.024621393531560898, 'timestamp': '2025-09-30 22:14:42.801843', 'step': 1360, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:42.834826', 'step': 1360, 'epoch': 1} {'type': 'loss', 'content': 0.009342104196548462, 'timestamp': '2025-09-30 22:14:42.842796', 'step': 1361, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:42.875495', 'step': 1361, 'epoch': 1} {'type': 'loss', 'content': 0.03075343370437622, 'timestamp': '2025-09-30 22:14:42.888029', 'step': 1362, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:42.927529', 'step': 1362, 'epoch': 1} {'type': 'loss', 'content': 0.02079709991812706, 'timestamp': '2025-09-30 22:14:42.941240', 'step': 1363, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:42.986053', 'step': 1363, 'epoch': 1} {'type': 'loss', 'content': 0.01046017650514841, 'timestamp': '2025-09-30 22:14:43.014860', 'step': 1364, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:43.049961', 'step': 1364, 'epoch': 1} {'type': 'loss', 'content': 0.02314908802509308, 'timestamp': '2025-09-30 22:14:43.065044', 'step': 1365, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:43.107156', 'step': 1365, 'epoch': 1} {'type': 'loss', 'content': 0.018210845068097115, 'timestamp': '2025-09-30 22:14:43.123947', 'step': 1366, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:43.180868', 'step': 1366, 'epoch': 1} {'type': 'loss', 'content': 0.010480647906661034, 'timestamp': '2025-09-30 22:14:43.192049', 'step': 1367, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:43.225069', 'step': 1367, 'epoch': 1} {'type': 'loss', 'content': 0.02072896622121334, 'timestamp': '2025-09-30 22:14:43.253146', 'step': 1368, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:43.289303', 'step': 1368, 'epoch': 1} {'type': 'loss', 'content': 0.017621014267206192, 'timestamp': '2025-09-30 22:14:43.297288', 'step': 1369, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:43.341459', 'step': 1369, 'epoch': 1} {'type': 'loss', 'content': 0.02624642662703991, 'timestamp': '2025-09-30 22:14:43.355294', 'step': 1370, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:43.394559', 'step': 1370, 'epoch': 1} {'type': 'loss', 'content': 0.024825556203722954, 'timestamp': '2025-09-30 22:14:43.402584', 'step': 1371, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:43.437831', 'step': 1371, 'epoch': 1} {'type': 'loss', 'content': 0.026172012090682983, 'timestamp': '2025-09-30 22:14:43.469777', 'step': 1372, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:43.506668', 'step': 1372, 'epoch': 1} {'type': 'loss', 'content': 0.03473340719938278, 'timestamp': '2025-09-30 22:14:43.515356', 'step': 1373, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:43.565492', 'step': 1373, 'epoch': 1} {'type': 'loss', 'content': 0.021380873396992683, 'timestamp': '2025-09-30 22:14:43.578114', 'step': 1374, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:43.617474', 'step': 1374, 'epoch': 1} {'type': 'loss', 'content': 0.01798097975552082, 'timestamp': '2025-09-30 22:14:43.627646', 'step': 1375, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:43.664719', 'step': 1375, 'epoch': 1} {'type': 'loss', 'content': 0.010863942094147205, 'timestamp': '2025-09-30 22:14:43.695929', 'step': 1376, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:43.738635', 'step': 1376, 'epoch': 1} {'type': 'loss', 'content': 0.017268171533942223, 'timestamp': '2025-09-30 22:14:43.749059', 'step': 1377, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:43.788075', 'step': 1377, 'epoch': 1} {'type': 'loss', 'content': 0.013549873605370522, 'timestamp': '2025-09-30 22:14:43.799274', 'step': 1378, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:43.835508', 'step': 1378, 'epoch': 1} {'type': 'loss', 'content': 0.028922399505972862, 'timestamp': '2025-09-30 22:14:43.842921', 'step': 1379, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:43.888352', 'step': 1379, 'epoch': 1} {'type': 'loss', 'content': 0.03547342121601105, 'timestamp': '2025-09-30 22:14:43.920201', 'step': 1380, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:14:46.365049', 'step': 1380, 'epoch': 1} {'type': 'pplx', 'content': 5.286725956468357, 'timestamp': '2025-09-30 22:14:46.367959', 'step': 1380, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:46.404069', 'step': 1380, 'epoch': 1} {'type': 'loss', 'content': 0.03862202540040016, 'timestamp': '2025-09-30 22:14:46.417036', 'step': 1381, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:46.450342', 'step': 1381, 'epoch': 1} {'type': 'loss', 'content': 0.01243166346102953, 'timestamp': '2025-09-30 22:14:46.458144', 'step': 1382, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:46.493939', 'step': 1382, 'epoch': 1} {'type': 'loss', 'content': 0.029087301343679428, 'timestamp': '2025-09-30 22:14:46.507613', 'step': 1383, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:46.544961', 'step': 1383, 'epoch': 1} {'type': 'loss', 'content': 0.027522187680006027, 'timestamp': '2025-09-30 22:14:46.579171', 'step': 1384, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:46.622304', 'step': 1384, 'epoch': 1} {'type': 'loss', 'content': 0.023662393912672997, 'timestamp': '2025-09-30 22:14:46.632103', 'step': 1385, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:46.676339', 'step': 1385, 'epoch': 1} {'type': 'loss', 'content': 0.018998509272933006, 'timestamp': '2025-09-30 22:14:46.689756', 'step': 1386, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:46.729346', 'step': 1386, 'epoch': 1} {'type': 'loss', 'content': 0.012357273139059544, 'timestamp': '2025-09-30 22:14:46.741469', 'step': 1387, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:46.776033', 'step': 1387, 'epoch': 1} {'type': 'loss', 'content': 0.018847810104489326, 'timestamp': '2025-09-30 22:14:46.807929', 'step': 1388, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:46.842264', 'step': 1388, 'epoch': 1} {'type': 'loss', 'content': 0.016082987189292908, 'timestamp': '2025-09-30 22:14:46.847053', 'step': 1389, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:46.885966', 'step': 1389, 'epoch': 1} {'type': 'loss', 'content': 0.013551075011491776, 'timestamp': '2025-09-30 22:14:46.893599', 'step': 1390, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:46.932531', 'step': 1390, 'epoch': 1} {'type': 'loss', 'content': 0.03684048727154732, 'timestamp': '2025-09-30 22:14:46.939847', 'step': 1391, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:46.977409', 'step': 1391, 'epoch': 1} {'type': 'loss', 'content': 0.00784347578883171, 'timestamp': '2025-09-30 22:14:47.005745', 'step': 1392, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:47.040739', 'step': 1392, 'epoch': 1} {'type': 'loss', 'content': 0.012781267985701561, 'timestamp': '2025-09-30 22:14:47.053380', 'step': 1393, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:47.091382', 'step': 1393, 'epoch': 1} {'type': 'loss', 'content': 0.018491802737116814, 'timestamp': '2025-09-30 22:14:47.102395', 'step': 1394, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:47.135934', 'step': 1394, 'epoch': 1} {'type': 'loss', 'content': 0.01759738102555275, 'timestamp': '2025-09-30 22:14:47.147046', 'step': 1395, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:14:47.197583', 'step': 1395, 'epoch': 1} {'type': 'loss', 'content': 0.01280723512172699, 'timestamp': '2025-09-30 22:14:47.232297', 'step': 1396, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:47.273176', 'step': 1396, 'epoch': 1} {'type': 'loss', 'content': 0.021734340116381645, 'timestamp': '2025-09-30 22:14:47.281804', 'step': 1397, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:47.317166', 'step': 1397, 'epoch': 1} {'type': 'loss', 'content': 0.01839459501206875, 'timestamp': '2025-09-30 22:14:47.328093', 'step': 1398, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:47.362740', 'step': 1398, 'epoch': 1} {'type': 'loss', 'content': 0.02381458505988121, 'timestamp': '2025-09-30 22:14:47.373081', 'step': 1399, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:47.416305', 'step': 1399, 'epoch': 1} {'type': 'loss', 'content': 0.015866320580244064, 'timestamp': '2025-09-30 22:14:47.447390', 'step': 1400, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:47.480792', 'step': 1400, 'epoch': 1} {'type': 'loss', 'content': 0.011865449137985706, 'timestamp': '2025-09-30 22:14:47.491301', 'step': 1401, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:47.524867', 'step': 1401, 'epoch': 1} {'type': 'loss', 'content': 0.02674529328942299, 'timestamp': '2025-09-30 22:14:47.531964', 'step': 1402, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:47.570980', 'step': 1402, 'epoch': 1} {'type': 'loss', 'content': 0.016522124409675598, 'timestamp': '2025-09-30 22:14:47.581363', 'step': 1403, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:47.624708', 'step': 1403, 'epoch': 1} {'type': 'loss', 'content': 0.015584531240165234, 'timestamp': '2025-09-30 22:14:47.653479', 'step': 1404, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:47.693659', 'step': 1404, 'epoch': 1} {'type': 'loss', 'content': 0.024292299523949623, 'timestamp': '2025-09-30 22:14:47.706285', 'step': 1405, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:47.744131', 'step': 1405, 'epoch': 1} {'type': 'loss', 'content': 0.01535879261791706, 'timestamp': '2025-09-30 22:14:47.752109', 'step': 1406, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:47.798486', 'step': 1406, 'epoch': 1} {'type': 'loss', 'content': 0.01519247330725193, 'timestamp': '2025-09-30 22:14:47.809629', 'step': 1407, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:47.851854', 'step': 1407, 'epoch': 1} {'type': 'loss', 'content': 0.027948128059506416, 'timestamp': '2025-09-30 22:14:47.885364', 'step': 1408, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:47.926169', 'step': 1408, 'epoch': 1} {'type': 'loss', 'content': 0.04772356152534485, 'timestamp': '2025-09-30 22:14:47.934966', 'step': 1409, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:47.974889', 'step': 1409, 'epoch': 1} {'type': 'loss', 'content': 0.011634491384029388, 'timestamp': '2025-09-30 22:14:47.987128', 'step': 1410, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:48.024939', 'step': 1410, 'epoch': 1} {'type': 'loss', 'content': 0.04621616378426552, 'timestamp': '2025-09-30 22:14:48.033222', 'step': 1411, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:48.074759', 'step': 1411, 'epoch': 1} {'type': 'loss', 'content': 0.028265489265322685, 'timestamp': '2025-09-30 22:14:48.105905', 'step': 1412, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:48.147053', 'step': 1412, 'epoch': 1} {'type': 'loss', 'content': 0.018142448738217354, 'timestamp': '2025-09-30 22:14:48.157535', 'step': 1413, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:48.199585', 'step': 1413, 'epoch': 1} {'type': 'loss', 'content': 0.019383694976568222, 'timestamp': '2025-09-30 22:14:48.211948', 'step': 1414, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:48.249624', 'step': 1414, 'epoch': 1} {'type': 'loss', 'content': 0.016686297953128815, 'timestamp': '2025-09-30 22:14:48.262951', 'step': 1415, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:48.306134', 'step': 1415, 'epoch': 1} {'type': 'loss', 'content': 0.02211880125105381, 'timestamp': '2025-09-30 22:14:48.338033', 'step': 1416, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:48.373783', 'step': 1416, 'epoch': 1} {'type': 'loss', 'content': 0.01303754560649395, 'timestamp': '2025-09-30 22:14:48.383084', 'step': 1417, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:48.420102', 'step': 1417, 'epoch': 1} {'type': 'loss', 'content': 0.010236444883048534, 'timestamp': '2025-09-30 22:14:48.431310', 'step': 1418, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:48.476595', 'step': 1418, 'epoch': 1} {'type': 'loss', 'content': 0.028854433447122574, 'timestamp': '2025-09-30 22:14:48.484170', 'step': 1419, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:48.524885', 'step': 1419, 'epoch': 1} {'type': 'loss', 'content': 0.02037026733160019, 'timestamp': '2025-09-30 22:14:48.556063', 'step': 1420, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:48.594988', 'step': 1420, 'epoch': 1} {'type': 'loss', 'content': 0.0194195918738842, 'timestamp': '2025-09-30 22:14:48.603650', 'step': 1421, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:48.646552', 'step': 1421, 'epoch': 1} {'type': 'loss', 'content': 0.017392108216881752, 'timestamp': '2025-09-30 22:14:48.654318', 'step': 1422, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:48.698651', 'step': 1422, 'epoch': 1} {'type': 'loss', 'content': 0.021164512261748314, 'timestamp': '2025-09-30 22:14:48.709651', 'step': 1423, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:48.751784', 'step': 1423, 'epoch': 1} {'type': 'loss', 'content': 0.022570928558707237, 'timestamp': '2025-09-30 22:14:48.779645', 'step': 1424, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:48.817560', 'step': 1424, 'epoch': 1} {'type': 'loss', 'content': 0.021067731082439423, 'timestamp': '2025-09-30 22:14:48.822464', 'step': 1425, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:48.860132', 'step': 1425, 'epoch': 1} {'type': 'loss', 'content': 0.018480515107512474, 'timestamp': '2025-09-30 22:14:48.867124', 'step': 1426, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:14:48.902698', 'step': 1426, 'epoch': 1} {'type': 'loss', 'content': 0.010004106909036636, 'timestamp': '2025-09-30 22:14:48.912820', 'step': 1427, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:48.950185', 'step': 1427, 'epoch': 1} {'type': 'loss', 'content': 0.011603604070842266, 'timestamp': '2025-09-30 22:14:48.981323', 'step': 1428, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:49.022742', 'step': 1428, 'epoch': 1} {'type': 'loss', 'content': 0.01244189403951168, 'timestamp': '2025-09-30 22:14:49.033237', 'step': 1429, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:49.079495', 'step': 1429, 'epoch': 1} {'type': 'loss', 'content': 0.024397503584623337, 'timestamp': '2025-09-30 22:14:49.090936', 'step': 1430, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:14:49.125591', 'step': 1430, 'epoch': 1} {'type': 'loss', 'content': 0.019764477387070656, 'timestamp': '2025-09-30 22:14:49.136417', 'step': 1431, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:49.176049', 'step': 1431, 'epoch': 1} {'type': 'loss', 'content': 0.01619694009423256, 'timestamp': '2025-09-30 22:14:49.203961', 'step': 1432, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:14:49.239933', 'step': 1432, 'epoch': 1} {'type': 'loss', 'content': 0.024240665137767792, 'timestamp': '2025-09-30 22:14:49.249496', 'step': 1433, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:49.292665', 'step': 1433, 'epoch': 1} {'type': 'loss', 'content': 0.020648330450057983, 'timestamp': '2025-09-30 22:14:49.306019', 'step': 1434, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:49.342856', 'step': 1434, 'epoch': 1} {'type': 'loss', 'content': 0.01836603321135044, 'timestamp': '2025-09-30 22:14:49.358454', 'step': 1435, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:49.394412', 'step': 1435, 'epoch': 1} {'type': 'loss', 'content': 0.019589319825172424, 'timestamp': '2025-09-30 22:14:49.425397', 'step': 1436, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:49.462067', 'step': 1436, 'epoch': 1} {'type': 'loss', 'content': 0.0132905263453722, 'timestamp': '2025-09-30 22:14:49.467811', 'step': 1437, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:49.513656', 'step': 1437, 'epoch': 1} {'type': 'loss', 'content': 0.013705338351428509, 'timestamp': '2025-09-30 22:14:49.526568', 'step': 1438, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:49.561658', 'step': 1438, 'epoch': 1} {'type': 'loss', 'content': 0.007382605224847794, 'timestamp': '2025-09-30 22:14:49.572692', 'step': 1439, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:49.605953', 'step': 1439, 'epoch': 1} {'type': 'loss', 'content': 0.021737150847911835, 'timestamp': '2025-09-30 22:14:49.634868', 'step': 1440, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:49.680452', 'step': 1440, 'epoch': 1} {'type': 'loss', 'content': 0.022566504776477814, 'timestamp': '2025-09-30 22:14:49.685634', 'step': 1441, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:49.731781', 'step': 1441, 'epoch': 1} {'type': 'loss', 'content': 0.020912060514092445, 'timestamp': '2025-09-30 22:14:49.739698', 'step': 1442, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:49.776740', 'step': 1442, 'epoch': 1} {'type': 'loss', 'content': 0.02049718052148819, 'timestamp': '2025-09-30 22:14:49.784392', 'step': 1443, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:49.827083', 'step': 1443, 'epoch': 1} {'type': 'loss', 'content': 0.01551031693816185, 'timestamp': '2025-09-30 22:14:49.860135', 'step': 1444, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:49.896701', 'step': 1444, 'epoch': 1} {'type': 'loss', 'content': 0.026235654950141907, 'timestamp': '2025-09-30 22:14:49.902310', 'step': 1445, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:49.936472', 'step': 1445, 'epoch': 1} {'type': 'loss', 'content': 0.013224468566477299, 'timestamp': '2025-09-30 22:14:49.944278', 'step': 1446, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:49.982896', 'step': 1446, 'epoch': 1} {'type': 'loss', 'content': 0.01962916925549507, 'timestamp': '2025-09-30 22:14:49.996254', 'step': 1447, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:50.036035', 'step': 1447, 'epoch': 1} {'type': 'loss', 'content': 0.022626500576734543, 'timestamp': '2025-09-30 22:14:50.064701', 'step': 1448, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:50.105501', 'step': 1448, 'epoch': 1} {'type': 'loss', 'content': 0.027859408408403397, 'timestamp': '2025-09-30 22:14:50.115635', 'step': 1449, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:50.155662', 'step': 1449, 'epoch': 1} {'type': 'loss', 'content': 0.013107175007462502, 'timestamp': '2025-09-30 22:14:50.168243', 'step': 1450, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:14:50.208708', 'step': 1450, 'epoch': 1} {'type': 'loss', 'content': 0.010612815618515015, 'timestamp': '2025-09-30 22:14:50.222118', 'step': 1451, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:50.261565', 'step': 1451, 'epoch': 1} {'type': 'loss', 'content': 0.017334220930933952, 'timestamp': '2025-09-30 22:14:50.289320', 'step': 1452, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:14:50.324188', 'step': 1452, 'epoch': 1} {'type': 'loss', 'content': 0.0179128535091877, 'timestamp': '2025-09-30 22:14:50.327415', 'step': 1453, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:50.363052', 'step': 1453, 'epoch': 1} {'type': 'loss', 'content': 0.015791935846209526, 'timestamp': '2025-09-30 22:14:50.375569', 'step': 1454, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:50.418426', 'step': 1454, 'epoch': 1} {'type': 'loss', 'content': 0.022962335497140884, 'timestamp': '2025-09-30 22:14:50.430170', 'step': 1455, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:50.475264', 'step': 1455, 'epoch': 1} {'type': 'loss', 'content': 0.01597381755709648, 'timestamp': '2025-09-30 22:14:50.507608', 'step': 1456, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:50.540652', 'step': 1456, 'epoch': 1} {'type': 'loss', 'content': 0.02298254333436489, 'timestamp': '2025-09-30 22:14:50.552374', 'step': 1457, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:50.592361', 'step': 1457, 'epoch': 1} {'type': 'loss', 'content': 0.03316576033830643, 'timestamp': '2025-09-30 22:14:50.605231', 'step': 1458, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:14:50.641681', 'step': 1458, 'epoch': 1} {'type': 'loss', 'content': 0.048388849943876266, 'timestamp': '2025-09-30 22:14:50.653407', 'step': 1459, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:50.686171', 'step': 1459, 'epoch': 1} {'type': 'loss', 'content': 0.028634879738092422, 'timestamp': '2025-09-30 22:14:50.714133', 'step': 1460, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:50.755989', 'step': 1460, 'epoch': 1} {'type': 'loss', 'content': 0.018213100731372833, 'timestamp': '2025-09-30 22:14:50.763160', 'step': 1461, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:50.806069', 'step': 1461, 'epoch': 1} {'type': 'loss', 'content': 0.023431608453392982, 'timestamp': '2025-09-30 22:14:50.814058', 'step': 1462, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:50.851758', 'step': 1462, 'epoch': 1} {'type': 'loss', 'content': 0.014353444799780846, 'timestamp': '2025-09-30 22:14:50.862888', 'step': 1463, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:50.905329', 'step': 1463, 'epoch': 1} {'type': 'loss', 'content': 0.028549939393997192, 'timestamp': '2025-09-30 22:14:50.934180', 'step': 1464, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:50.968627', 'step': 1464, 'epoch': 1} {'type': 'loss', 'content': 0.016568686813116074, 'timestamp': '2025-09-30 22:14:50.981827', 'step': 1465, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:51.018628', 'step': 1465, 'epoch': 1} {'type': 'loss', 'content': 0.02902393415570259, 'timestamp': '2025-09-30 22:14:51.026356', 'step': 1466, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:51.063954', 'step': 1466, 'epoch': 1} {'type': 'loss', 'content': 0.01421070285141468, 'timestamp': '2025-09-30 22:14:51.076566', 'step': 1467, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:51.120548', 'step': 1467, 'epoch': 1} {'type': 'loss', 'content': 0.023018157109618187, 'timestamp': '2025-09-30 22:14:51.153703', 'step': 1468, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:51.190625', 'step': 1468, 'epoch': 1} {'type': 'loss', 'content': 0.03066314198076725, 'timestamp': '2025-09-30 22:14:51.199382', 'step': 1469, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:51.238666', 'step': 1469, 'epoch': 1} {'type': 'loss', 'content': 0.022856958210468292, 'timestamp': '2025-09-30 22:14:51.249772', 'step': 1470, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:51.282580', 'step': 1470, 'epoch': 1} {'type': 'loss', 'content': 0.018278757110238075, 'timestamp': '2025-09-30 22:14:51.290309', 'step': 1471, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:51.332739', 'step': 1471, 'epoch': 1} {'type': 'loss', 'content': 0.015473520383238792, 'timestamp': '2025-09-30 22:14:51.361506', 'step': 1472, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:51.397460', 'step': 1472, 'epoch': 1} {'type': 'loss', 'content': 0.029607480391860008, 'timestamp': '2025-09-30 22:14:51.402376', 'step': 1473, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:51.437797', 'step': 1473, 'epoch': 1} {'type': 'loss', 'content': 0.01510325912386179, 'timestamp': '2025-09-30 22:14:51.448861', 'step': 1474, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:51.487589', 'step': 1474, 'epoch': 1} {'type': 'loss', 'content': 0.02018461562693119, 'timestamp': '2025-09-30 22:14:51.497693', 'step': 1475, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:51.544161', 'step': 1475, 'epoch': 1} {'type': 'loss', 'content': 0.026856016367673874, 'timestamp': '2025-09-30 22:14:51.577294', 'step': 1476, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:51.611748', 'step': 1476, 'epoch': 1} {'type': 'loss', 'content': 0.026579078286886215, 'timestamp': '2025-09-30 22:14:51.619703', 'step': 1477, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:51.654763', 'step': 1477, 'epoch': 1} {'type': 'loss', 'content': 0.032938793301582336, 'timestamp': '2025-09-30 22:14:51.662463', 'step': 1478, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:51.695924', 'step': 1478, 'epoch': 1} {'type': 'loss', 'content': 0.025103628635406494, 'timestamp': '2025-09-30 22:14:51.703825', 'step': 1479, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:51.744172', 'step': 1479, 'epoch': 1} {'type': 'loss', 'content': 0.018662355840206146, 'timestamp': '2025-09-30 22:14:51.775221', 'step': 1480, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:51.815735', 'step': 1480, 'epoch': 1} {'type': 'loss', 'content': 0.028233205899596214, 'timestamp': '2025-09-30 22:14:51.824452', 'step': 1481, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:51.860848', 'step': 1481, 'epoch': 1} {'type': 'loss', 'content': 0.034671224653720856, 'timestamp': '2025-09-30 22:14:51.871956', 'step': 1482, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:51.909759', 'step': 1482, 'epoch': 1} {'type': 'loss', 'content': 0.010450835339725018, 'timestamp': '2025-09-30 22:14:51.920796', 'step': 1483, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:51.960392', 'step': 1483, 'epoch': 1} {'type': 'loss', 'content': 0.03715112432837486, 'timestamp': '2025-09-30 22:14:51.995030', 'step': 1484, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:52.035435', 'step': 1484, 'epoch': 1} {'type': 'loss', 'content': 0.026053503155708313, 'timestamp': '2025-09-30 22:14:52.048462', 'step': 1485, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:14:52.084696', 'step': 1485, 'epoch': 1} {'type': 'loss', 'content': 0.026194270700216293, 'timestamp': '2025-09-30 22:14:52.091998', 'step': 1486, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:14:52.136783', 'step': 1486, 'epoch': 1} {'type': 'loss', 'content': 0.010904263705015182, 'timestamp': '2025-09-30 22:14:52.149360', 'step': 1487, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:14:52.206340', 'step': 1487, 'epoch': 1} {'type': 'loss', 'content': 0.014208262786269188, 'timestamp': '2025-09-30 22:14:52.244887', 'step': 1488, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:52.281853', 'step': 1488, 'epoch': 1} {'type': 'loss', 'content': 0.01684318482875824, 'timestamp': '2025-09-30 22:14:52.291660', 'step': 1489, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:14:52.328868', 'step': 1489, 'epoch': 1} {'type': 'loss', 'content': 0.019845174625515938, 'timestamp': '2025-09-30 22:14:52.341130', 'step': 1490, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:52.381516', 'step': 1490, 'epoch': 1} {'type': 'loss', 'content': 0.018346065655350685, 'timestamp': '2025-09-30 22:14:52.391742', 'step': 1491, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:52.430278', 'step': 1491, 'epoch': 1} {'type': 'loss', 'content': 0.020569315180182457, 'timestamp': '2025-09-30 22:14:52.461495', 'step': 1492, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:14:52.497691', 'step': 1492, 'epoch': 1} {'type': 'loss', 'content': 0.03632209822535515, 'timestamp': '2025-09-30 22:14:52.503283', 'step': 1493, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:14:52.536043', 'step': 1493, 'epoch': 1} {'type': 'loss', 'content': 0.009968671016395092, 'timestamp': '2025-09-30 22:14:52.543035', 'step': 1494, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:52.577073', 'step': 1494, 'epoch': 1} {'type': 'loss', 'content': 0.01671568490564823, 'timestamp': '2025-09-30 22:14:52.588133', 'step': 1495, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:14:55.060162', 'step': 1495, 'epoch': 1} {'type': 'pplx', 'content': 5.379322825697576, 'timestamp': '2025-09-30 22:14:55.062560', 'step': 1495, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:14:55.098604', 'step': 1495, 'epoch': 1} {'type': 'loss', 'content': 0.021652352064847946, 'timestamp': '2025-09-30 22:14:55.128525', 'step': 1496, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:14:55.165493', 'step': 1496, 'epoch': 1} {'type': 'loss', 'content': 0.01600632071495056, 'timestamp': '2025-09-30 22:14:55.178513', 'step': 1497, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:55.214822', 'step': 1497, 'epoch': 1} {'type': 'loss', 'content': 0.023342151194810867, 'timestamp': '2025-09-30 22:14:55.222394', 'step': 1498, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:14:55.256101', 'step': 1498, 'epoch': 1} {'type': 'loss', 'content': 0.025111747905611992, 'timestamp': '2025-09-30 22:14:55.263723', 'step': 1499, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:14:55.312464', 'step': 1499, 'epoch': 1} {'type': 'loss', 'content': 0.015089130960404873, 'timestamp': '2025-09-30 22:14:55.344332', 'step': 1500, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1500', 'timestamp': '2025-09-30 22:15:00.722951', 'step': 1500, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:00.758414', 'step': 1500, 'epoch': 1} {'type': 'loss', 'content': 0.01703370362520218, 'timestamp': '2025-09-30 22:15:00.765556', 'step': 1501, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:00.799722', 'step': 1501, 'epoch': 1} {'type': 'loss', 'content': 0.007595858536660671, 'timestamp': '2025-09-30 22:15:00.812250', 'step': 1502, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:00.852574', 'step': 1502, 'epoch': 1} {'type': 'loss', 'content': 0.010175961069762707, 'timestamp': '2025-09-30 22:15:00.865827', 'step': 1503, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:00.902350', 'step': 1503, 'epoch': 1} {'type': 'loss', 'content': 0.009721358306705952, 'timestamp': '2025-09-30 22:15:00.933344', 'step': 1504, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:00.967035', 'step': 1504, 'epoch': 1} {'type': 'loss', 'content': 0.009681370109319687, 'timestamp': '2025-09-30 22:15:00.975783', 'step': 1505, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:01.011856', 'step': 1505, 'epoch': 1} {'type': 'loss', 'content': 0.011668430641293526, 'timestamp': '2025-09-30 22:15:01.023057', 'step': 1506, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:01.056579', 'step': 1506, 'epoch': 1} {'type': 'loss', 'content': 0.030000098049640656, 'timestamp': '2025-09-30 22:15:01.063787', 'step': 1507, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:01.100682', 'step': 1507, 'epoch': 1} {'type': 'loss', 'content': 0.012646145187318325, 'timestamp': '2025-09-30 22:15:01.129508', 'step': 1508, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:01.164880', 'step': 1508, 'epoch': 1} {'type': 'loss', 'content': 0.006477945484220982, 'timestamp': '2025-09-30 22:15:01.177937', 'step': 1509, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:01.212380', 'step': 1509, 'epoch': 1} {'type': 'loss', 'content': 0.013549396768212318, 'timestamp': '2025-09-30 22:15:01.223509', 'step': 1510, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:01.257809', 'step': 1510, 'epoch': 1} {'type': 'loss', 'content': 0.02432546205818653, 'timestamp': '2025-09-30 22:15:01.268977', 'step': 1511, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:01.306775', 'step': 1511, 'epoch': 1} {'type': 'loss', 'content': 0.017720526084303856, 'timestamp': '2025-09-30 22:15:01.338002', 'step': 1512, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:01.369703', 'step': 1512, 'epoch': 1} {'type': 'loss', 'content': 0.006004958879202604, 'timestamp': '2025-09-30 22:15:01.379666', 'step': 1513, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:01.417717', 'step': 1513, 'epoch': 1} {'type': 'loss', 'content': 0.033136531710624695, 'timestamp': '2025-09-30 22:15:01.430007', 'step': 1514, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:01.464629', 'step': 1514, 'epoch': 1} {'type': 'loss', 'content': 0.028014734387397766, 'timestamp': '2025-09-30 22:15:01.472430', 'step': 1515, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:01.510144', 'step': 1515, 'epoch': 1} {'type': 'loss', 'content': 0.03069695271551609, 'timestamp': '2025-09-30 22:15:01.542134', 'step': 1516, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:01.582810', 'step': 1516, 'epoch': 1} {'type': 'loss', 'content': 0.015695618465542793, 'timestamp': '2025-09-30 22:15:01.590867', 'step': 1517, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:01.622988', 'step': 1517, 'epoch': 1} {'type': 'loss', 'content': 0.040362466126680374, 'timestamp': '2025-09-30 22:15:01.630605', 'step': 1518, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:01.666964', 'step': 1518, 'epoch': 1} {'type': 'loss', 'content': 0.02767367660999298, 'timestamp': '2025-09-30 22:15:01.674185', 'step': 1519, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:01.710346', 'step': 1519, 'epoch': 1} {'type': 'loss', 'content': 0.04135942831635475, 'timestamp': '2025-09-30 22:15:01.739107', 'step': 1520, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:01.776647', 'step': 1520, 'epoch': 1} {'type': 'loss', 'content': 0.026216236874461174, 'timestamp': '2025-09-30 22:15:01.781834', 'step': 1521, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:01.823340', 'step': 1521, 'epoch': 1} {'type': 'loss', 'content': 0.03001749888062477, 'timestamp': '2025-09-30 22:15:01.835705', 'step': 1522, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:01.874484', 'step': 1522, 'epoch': 1} {'type': 'loss', 'content': 0.019269688054919243, 'timestamp': '2025-09-30 22:15:01.887783', 'step': 1523, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:01.928177', 'step': 1523, 'epoch': 1} {'type': 'loss', 'content': 0.02227448858320713, 'timestamp': '2025-09-30 22:15:01.956960', 'step': 1524, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:01.994250', 'step': 1524, 'epoch': 1} {'type': 'loss', 'content': 0.02891368605196476, 'timestamp': '2025-09-30 22:15:01.999631', 'step': 1525, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:02.038187', 'step': 1525, 'epoch': 1} {'type': 'loss', 'content': 0.017416177317500114, 'timestamp': '2025-09-30 22:15:02.045461', 'step': 1526, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:02.077628', 'step': 1526, 'epoch': 1} {'type': 'loss', 'content': 0.02120266482234001, 'timestamp': '2025-09-30 22:15:02.088476', 'step': 1527, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:02.131866', 'step': 1527, 'epoch': 1} {'type': 'loss', 'content': 0.05429844185709953, 'timestamp': '2025-09-30 22:15:02.166163', 'step': 1528, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:02.200272', 'step': 1528, 'epoch': 1} {'type': 'loss', 'content': 0.0243589635938406, 'timestamp': '2025-09-30 22:15:02.208258', 'step': 1529, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:02.241422', 'step': 1529, 'epoch': 1} {'type': 'loss', 'content': 0.02745889127254486, 'timestamp': '2025-09-30 22:15:02.252511', 'step': 1530, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:02.287663', 'step': 1530, 'epoch': 1} {'type': 'loss', 'content': 0.025427240878343582, 'timestamp': '2025-09-30 22:15:02.301380', 'step': 1531, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:02.335046', 'step': 1531, 'epoch': 1} {'type': 'loss', 'content': 0.018605539575219154, 'timestamp': '2025-09-30 22:15:02.368455', 'step': 1532, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:02.406488', 'step': 1532, 'epoch': 1} {'type': 'loss', 'content': 0.01158945169299841, 'timestamp': '2025-09-30 22:15:02.415333', 'step': 1533, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:02.451096', 'step': 1533, 'epoch': 1} {'type': 'loss', 'content': 0.0385240763425827, 'timestamp': '2025-09-30 22:15:02.463587', 'step': 1534, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:02.500080', 'step': 1534, 'epoch': 1} {'type': 'loss', 'content': 0.021807149052619934, 'timestamp': '2025-09-30 22:15:02.511301', 'step': 1535, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:02.543570', 'step': 1535, 'epoch': 1} {'type': 'loss', 'content': 0.013656704686582088, 'timestamp': '2025-09-30 22:15:02.577040', 'step': 1536, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:02.616217', 'step': 1536, 'epoch': 1} {'type': 'loss', 'content': 0.013688210397958755, 'timestamp': '2025-09-30 22:15:02.628896', 'step': 1537, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:02.673505', 'step': 1537, 'epoch': 1} {'type': 'loss', 'content': 0.03932627663016319, 'timestamp': '2025-09-30 22:15:02.685787', 'step': 1538, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:02.723870', 'step': 1538, 'epoch': 1} {'type': 'loss', 'content': 0.016051799058914185, 'timestamp': '2025-09-30 22:15:02.736412', 'step': 1539, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:02.770999', 'step': 1539, 'epoch': 1} {'type': 'loss', 'content': 0.04664245992898941, 'timestamp': '2025-09-30 22:15:02.804290', 'step': 1540, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:02.836058', 'step': 1540, 'epoch': 1} {'type': 'loss', 'content': 0.02219100296497345, 'timestamp': '2025-09-30 22:15:02.844255', 'step': 1541, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:02.890175', 'step': 1541, 'epoch': 1} {'type': 'loss', 'content': 0.00859779305756092, 'timestamp': '2025-09-30 22:15:02.903836', 'step': 1542, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:02.945985', 'step': 1542, 'epoch': 1} {'type': 'loss', 'content': 0.020811183378100395, 'timestamp': '2025-09-30 22:15:02.958366', 'step': 1543, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:02.996540', 'step': 1543, 'epoch': 1} {'type': 'loss', 'content': 0.021285343915224075, 'timestamp': '2025-09-30 22:15:03.029735', 'step': 1544, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:03.066709', 'step': 1544, 'epoch': 1} {'type': 'loss', 'content': 0.007851026952266693, 'timestamp': '2025-09-30 22:15:03.076558', 'step': 1545, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:03.118425', 'step': 1545, 'epoch': 1} {'type': 'loss', 'content': 0.027574323117733, 'timestamp': '2025-09-30 22:15:03.131719', 'step': 1546, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:03.166377', 'step': 1546, 'epoch': 1} {'type': 'loss', 'content': 0.01619553565979004, 'timestamp': '2025-09-30 22:15:03.178657', 'step': 1547, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:03.217768', 'step': 1547, 'epoch': 1} {'type': 'loss', 'content': 0.02190890908241272, 'timestamp': '2025-09-30 22:15:03.250975', 'step': 1548, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:03.294776', 'step': 1548, 'epoch': 1} {'type': 'loss', 'content': 0.007822610437870026, 'timestamp': '2025-09-30 22:15:03.307499', 'step': 1549, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:03.348489', 'step': 1549, 'epoch': 1} {'type': 'loss', 'content': 0.011164495721459389, 'timestamp': '2025-09-30 22:15:03.361161', 'step': 1550, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:03.408154', 'step': 1550, 'epoch': 1} {'type': 'loss', 'content': 0.013010852038860321, 'timestamp': '2025-09-30 22:15:03.427345', 'step': 1551, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:03.469761', 'step': 1551, 'epoch': 1} {'type': 'loss', 'content': 0.022824978455901146, 'timestamp': '2025-09-30 22:15:03.504051', 'step': 1552, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:03.539602', 'step': 1552, 'epoch': 1} {'type': 'loss', 'content': 0.012585001066327095, 'timestamp': '2025-09-30 22:15:03.550501', 'step': 1553, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:03.596933', 'step': 1553, 'epoch': 1} {'type': 'loss', 'content': 0.021539170295000076, 'timestamp': '2025-09-30 22:15:03.608113', 'step': 1554, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:03.648220', 'step': 1554, 'epoch': 1} {'type': 'loss', 'content': 0.03153735771775246, 'timestamp': '2025-09-30 22:15:03.661559', 'step': 1555, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:03.694963', 'step': 1555, 'epoch': 1} {'type': 'loss', 'content': 0.02265406958758831, 'timestamp': '2025-09-30 22:15:03.723833', 'step': 1556, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:03.765045', 'step': 1556, 'epoch': 1} {'type': 'loss', 'content': 0.018682723864912987, 'timestamp': '2025-09-30 22:15:03.773124', 'step': 1557, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:03.811401', 'step': 1557, 'epoch': 1} {'type': 'loss', 'content': 0.01089040283113718, 'timestamp': '2025-09-30 22:15:03.822520', 'step': 1558, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:03.863937', 'step': 1558, 'epoch': 1} {'type': 'loss', 'content': 0.01570407673716545, 'timestamp': '2025-09-30 22:15:03.877649', 'step': 1559, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:03.911488', 'step': 1559, 'epoch': 1} {'type': 'loss', 'content': 0.01560614537447691, 'timestamp': '2025-09-30 22:15:03.940293', 'step': 1560, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:03.975964', 'step': 1560, 'epoch': 1} {'type': 'loss', 'content': 0.022726111114025116, 'timestamp': '2025-09-30 22:15:03.984180', 'step': 1561, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:04.022196', 'step': 1561, 'epoch': 1} {'type': 'loss', 'content': 0.011036788113415241, 'timestamp': '2025-09-30 22:15:04.036241', 'step': 1562, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:04.074640', 'step': 1562, 'epoch': 1} {'type': 'loss', 'content': 0.022821931168437004, 'timestamp': '2025-09-30 22:15:04.086027', 'step': 1563, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:04.117162', 'step': 1563, 'epoch': 1} {'type': 'loss', 'content': 0.028916530311107635, 'timestamp': '2025-09-30 22:15:04.148549', 'step': 1564, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:04.191703', 'step': 1564, 'epoch': 1} {'type': 'loss', 'content': 0.027913669124245644, 'timestamp': '2025-09-30 22:15:04.202150', 'step': 1565, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:04.237009', 'step': 1565, 'epoch': 1} {'type': 'loss', 'content': 0.02377433516085148, 'timestamp': '2025-09-30 22:15:04.249335', 'step': 1566, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:04.286365', 'step': 1566, 'epoch': 1} {'type': 'loss', 'content': 0.016784582287073135, 'timestamp': '2025-09-30 22:15:04.299763', 'step': 1567, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:04.334662', 'step': 1567, 'epoch': 1} {'type': 'loss', 'content': 0.037801824510097504, 'timestamp': '2025-09-30 22:15:04.366069', 'step': 1568, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:04.398070', 'step': 1568, 'epoch': 1} {'type': 'loss', 'content': 0.0208644550293684, 'timestamp': '2025-09-30 22:15:04.406141', 'step': 1569, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:04.442067', 'step': 1569, 'epoch': 1} {'type': 'loss', 'content': 0.012212934903800488, 'timestamp': '2025-09-30 22:15:04.454595', 'step': 1570, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:04.492559', 'step': 1570, 'epoch': 1} {'type': 'loss', 'content': 0.013333462178707123, 'timestamp': '2025-09-30 22:15:04.503625', 'step': 1571, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:04.543265', 'step': 1571, 'epoch': 1} {'type': 'loss', 'content': 0.02915300987660885, 'timestamp': '2025-09-30 22:15:04.576761', 'step': 1572, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:04.612123', 'step': 1572, 'epoch': 1} {'type': 'loss', 'content': 0.023643821477890015, 'timestamp': '2025-09-30 22:15:04.622946', 'step': 1573, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:04.660747', 'step': 1573, 'epoch': 1} {'type': 'loss', 'content': 0.015274593606591225, 'timestamp': '2025-09-30 22:15:04.673126', 'step': 1574, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:04.710632', 'step': 1574, 'epoch': 1} {'type': 'loss', 'content': 0.021839918568730354, 'timestamp': '2025-09-30 22:15:04.722993', 'step': 1575, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:04.763437', 'step': 1575, 'epoch': 1} {'type': 'loss', 'content': 0.007588080130517483, 'timestamp': '2025-09-30 22:15:04.797980', 'step': 1576, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:04.832980', 'step': 1576, 'epoch': 1} {'type': 'loss', 'content': 0.014518532902002335, 'timestamp': '2025-09-30 22:15:04.845610', 'step': 1577, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:04.896414', 'step': 1577, 'epoch': 1} {'type': 'loss', 'content': 0.0101570850238204, 'timestamp': '2025-09-30 22:15:04.912257', 'step': 1578, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:15:04.957017', 'step': 1578, 'epoch': 1} {'type': 'loss', 'content': 0.006863059010356665, 'timestamp': '2025-09-30 22:15:04.972686', 'step': 1579, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:05.015281', 'step': 1579, 'epoch': 1} {'type': 'loss', 'content': 0.03003445453941822, 'timestamp': '2025-09-30 22:15:05.049792', 'step': 1580, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:05.096118', 'step': 1580, 'epoch': 1} {'type': 'loss', 'content': 0.026673344895243645, 'timestamp': '2025-09-30 22:15:05.104815', 'step': 1581, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:05.144333', 'step': 1581, 'epoch': 1} {'type': 'loss', 'content': 0.028324894607067108, 'timestamp': '2025-09-30 22:15:05.155446', 'step': 1582, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:05.192601', 'step': 1582, 'epoch': 1} {'type': 'loss', 'content': 0.01850230246782303, 'timestamp': '2025-09-30 22:15:05.203181', 'step': 1583, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:05.240604', 'step': 1583, 'epoch': 1} {'type': 'loss', 'content': 0.014605790376663208, 'timestamp': '2025-09-30 22:15:05.274038', 'step': 1584, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:05.316147', 'step': 1584, 'epoch': 1} {'type': 'loss', 'content': 0.01580674760043621, 'timestamp': '2025-09-30 22:15:05.324007', 'step': 1585, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:05.358589', 'step': 1585, 'epoch': 1} {'type': 'loss', 'content': 0.015010936185717583, 'timestamp': '2025-09-30 22:15:05.370937', 'step': 1586, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:05.404407', 'step': 1586, 'epoch': 1} {'type': 'loss', 'content': 0.04198862239718437, 'timestamp': '2025-09-30 22:15:05.414748', 'step': 1587, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:05.452183', 'step': 1587, 'epoch': 1} {'type': 'loss', 'content': 0.010779457166790962, 'timestamp': '2025-09-30 22:15:05.485613', 'step': 1588, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:05.524310', 'step': 1588, 'epoch': 1} {'type': 'loss', 'content': 0.037828974425792694, 'timestamp': '2025-09-30 22:15:05.536946', 'step': 1589, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:05.571561', 'step': 1589, 'epoch': 1} {'type': 'loss', 'content': 0.02178335376083851, 'timestamp': '2025-09-30 22:15:05.582544', 'step': 1590, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:05.617260', 'step': 1590, 'epoch': 1} {'type': 'loss', 'content': 0.01142160128802061, 'timestamp': '2025-09-30 22:15:05.624741', 'step': 1591, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:05.658317', 'step': 1591, 'epoch': 1} {'type': 'loss', 'content': 0.018499299883842468, 'timestamp': '2025-09-30 22:15:05.689446', 'step': 1592, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:05.732066', 'step': 1592, 'epoch': 1} {'type': 'loss', 'content': 0.030043240636587143, 'timestamp': '2025-09-30 22:15:05.740822', 'step': 1593, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:05.774048', 'step': 1593, 'epoch': 1} {'type': 'loss', 'content': 0.013827769085764885, 'timestamp': '2025-09-30 22:15:05.786361', 'step': 1594, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:05.825080', 'step': 1594, 'epoch': 1} {'type': 'loss', 'content': 0.021424157544970512, 'timestamp': '2025-09-30 22:15:05.838944', 'step': 1595, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:05.876781', 'step': 1595, 'epoch': 1} {'type': 'loss', 'content': 0.020284688100218773, 'timestamp': '2025-09-30 22:15:05.906188', 'step': 1596, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:05.956963', 'step': 1596, 'epoch': 1} {'type': 'loss', 'content': 0.013254090212285519, 'timestamp': '2025-09-30 22:15:05.966961', 'step': 1597, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:06.016279', 'step': 1597, 'epoch': 1} {'type': 'loss', 'content': 0.01100999303162098, 'timestamp': '2025-09-30 22:15:06.030049', 'step': 1598, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:06.072696', 'step': 1598, 'epoch': 1} {'type': 'loss', 'content': 0.018316803500056267, 'timestamp': '2025-09-30 22:15:06.083007', 'step': 1599, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:06.125635', 'step': 1599, 'epoch': 1} {'type': 'loss', 'content': 0.02024856209754944, 'timestamp': '2025-09-30 22:15:06.158837', 'step': 1600, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:06.202958', 'step': 1600, 'epoch': 1} {'type': 'loss', 'content': 0.01697261445224285, 'timestamp': '2025-09-30 22:15:06.216298', 'step': 1601, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:06.252496', 'step': 1601, 'epoch': 1} {'type': 'loss', 'content': 0.01887323521077633, 'timestamp': '2025-09-30 22:15:06.265066', 'step': 1602, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:06.308840', 'step': 1602, 'epoch': 1} {'type': 'loss', 'content': 0.02012869343161583, 'timestamp': '2025-09-30 22:15:06.322235', 'step': 1603, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:06.362188', 'step': 1603, 'epoch': 1} {'type': 'loss', 'content': 0.01736818440258503, 'timestamp': '2025-09-30 22:15:06.396499', 'step': 1604, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:06.431925', 'step': 1604, 'epoch': 1} {'type': 'loss', 'content': 0.0334581658244133, 'timestamp': '2025-09-30 22:15:06.442444', 'step': 1605, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:06.481912', 'step': 1605, 'epoch': 1} {'type': 'loss', 'content': 0.012905096635222435, 'timestamp': '2025-09-30 22:15:06.495698', 'step': 1606, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:06.544382', 'step': 1606, 'epoch': 1} {'type': 'loss', 'content': 0.01980365253984928, 'timestamp': '2025-09-30 22:15:06.557736', 'step': 1607, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:06.606310', 'step': 1607, 'epoch': 1} {'type': 'loss', 'content': 0.013451996259391308, 'timestamp': '2025-09-30 22:15:06.639373', 'step': 1608, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:06.687176', 'step': 1608, 'epoch': 1} {'type': 'loss', 'content': 0.029307778924703598, 'timestamp': '2025-09-30 22:15:06.692827', 'step': 1609, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:06.746176', 'step': 1609, 'epoch': 1} {'type': 'loss', 'content': 0.027767712250351906, 'timestamp': '2025-09-30 22:15:06.757308', 'step': 1610, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:15:09.347425', 'step': 1610, 'epoch': 1} {'type': 'pplx', 'content': 5.567295370613334, 'timestamp': '2025-09-30 22:15:09.349820', 'step': 1610, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:09.379989', 'step': 1610, 'epoch': 1} {'type': 'loss', 'content': 0.026089852675795555, 'timestamp': '2025-09-30 22:15:09.392442', 'step': 1611, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:15:09.435847', 'step': 1611, 'epoch': 1} {'type': 'loss', 'content': 0.018595224246382713, 'timestamp': '2025-09-30 22:15:09.472876', 'step': 1612, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:09.508124', 'step': 1612, 'epoch': 1} {'type': 'loss', 'content': 0.01098089013248682, 'timestamp': '2025-09-30 22:15:09.520785', 'step': 1613, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:15:09.570952', 'step': 1613, 'epoch': 1} {'type': 'loss', 'content': 0.013269396498799324, 'timestamp': '2025-09-30 22:15:09.586510', 'step': 1614, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:09.629996', 'step': 1614, 'epoch': 1} {'type': 'loss', 'content': 0.015588633716106415, 'timestamp': '2025-09-30 22:15:09.642200', 'step': 1615, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:09.675728', 'step': 1615, 'epoch': 1} {'type': 'loss', 'content': 0.012610826641321182, 'timestamp': '2025-09-30 22:15:09.709174', 'step': 1616, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:09.745424', 'step': 1616, 'epoch': 1} {'type': 'loss', 'content': 0.0474841482937336, 'timestamp': '2025-09-30 22:15:09.753326', 'step': 1617, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:09.793672', 'step': 1617, 'epoch': 1} {'type': 'loss', 'content': 0.02974524535238743, 'timestamp': '2025-09-30 22:15:09.806017', 'step': 1618, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:09.846413', 'step': 1618, 'epoch': 1} {'type': 'loss', 'content': 0.024764923378825188, 'timestamp': '2025-09-30 22:15:09.860135', 'step': 1619, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:15:09.892123', 'step': 1619, 'epoch': 1} {'type': 'loss', 'content': 0.041059788316488266, 'timestamp': '2025-09-30 22:15:09.917495', 'step': 1620, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:15:09.967799', 'step': 1620, 'epoch': 1} {'type': 'loss', 'content': 0.04249605908989906, 'timestamp': '2025-09-30 22:15:09.982914', 'step': 1621, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:15:10.014911', 'step': 1621, 'epoch': 1} {'type': 'loss', 'content': 0.040544331073760986, 'timestamp': '2025-09-30 22:15:10.019407', 'step': 1622, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:15:10.066393', 'step': 1622, 'epoch': 1} {'type': 'loss', 'content': 0.018109586089849472, 'timestamp': '2025-09-30 22:15:10.082479', 'step': 1623, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:10.126180', 'step': 1623, 'epoch': 1} {'type': 'loss', 'content': 0.016633780673146248, 'timestamp': '2025-09-30 22:15:10.153788', 'step': 1624, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:10.188864', 'step': 1624, 'epoch': 1} {'type': 'loss', 'content': 0.018415579572319984, 'timestamp': '2025-09-30 22:15:10.197659', 'step': 1625, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:10.233300', 'step': 1625, 'epoch': 1} {'type': 'loss', 'content': 0.02845713496208191, 'timestamp': '2025-09-30 22:15:10.244430', 'step': 1626, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:10.278719', 'step': 1626, 'epoch': 1} {'type': 'loss', 'content': 0.029067419469356537, 'timestamp': '2025-09-30 22:15:10.289307', 'step': 1627, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:10.332600', 'step': 1627, 'epoch': 1} {'type': 'loss', 'content': 0.018506374210119247, 'timestamp': '2025-09-30 22:15:10.365800', 'step': 1628, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:10.398468', 'step': 1628, 'epoch': 1} {'type': 'loss', 'content': 0.03130076453089714, 'timestamp': '2025-09-30 22:15:10.403394', 'step': 1629, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:10.458144', 'step': 1629, 'epoch': 1} {'type': 'loss', 'content': 0.018053654581308365, 'timestamp': '2025-09-30 22:15:10.470461', 'step': 1630, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:10.521124', 'step': 1630, 'epoch': 1} {'type': 'loss', 'content': 0.013748985715210438, 'timestamp': '2025-09-30 22:15:10.532348', 'step': 1631, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:10.572156', 'step': 1631, 'epoch': 1} {'type': 'loss', 'content': 0.014565156772732735, 'timestamp': '2025-09-30 22:15:10.601102', 'step': 1632, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:15:10.634696', 'step': 1632, 'epoch': 1} {'type': 'loss', 'content': 0.05316678807139397, 'timestamp': '2025-09-30 22:15:10.646155', 'step': 1633, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:10.681703', 'step': 1633, 'epoch': 1} {'type': 'loss', 'content': 0.02041662111878395, 'timestamp': '2025-09-30 22:15:10.694079', 'step': 1634, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:10.742607', 'step': 1634, 'epoch': 1} {'type': 'loss', 'content': 0.017785055562853813, 'timestamp': '2025-09-30 22:15:10.758527', 'step': 1635, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:10.793464', 'step': 1635, 'epoch': 1} {'type': 'loss', 'content': 0.012598762288689613, 'timestamp': '2025-09-30 22:15:10.821246', 'step': 1636, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:10.861028', 'step': 1636, 'epoch': 1} {'type': 'loss', 'content': 0.01525981817394495, 'timestamp': '2025-09-30 22:15:10.871535', 'step': 1637, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:10.903740', 'step': 1637, 'epoch': 1} {'type': 'loss', 'content': 0.03392323479056358, 'timestamp': '2025-09-30 22:15:10.916056', 'step': 1638, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:10.953982', 'step': 1638, 'epoch': 1} {'type': 'loss', 'content': 0.024084903299808502, 'timestamp': '2025-09-30 22:15:10.964510', 'step': 1639, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:11.006336', 'step': 1639, 'epoch': 1} {'type': 'loss', 'content': 0.03673992678523064, 'timestamp': '2025-09-30 22:15:11.040952', 'step': 1640, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:11.105431', 'step': 1640, 'epoch': 1} {'type': 'loss', 'content': 0.027965785935521126, 'timestamp': '2025-09-30 22:15:11.114035', 'step': 1641, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:11.147881', 'step': 1641, 'epoch': 1} {'type': 'loss', 'content': 0.021707849577069283, 'timestamp': '2025-09-30 22:15:11.160273', 'step': 1642, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:11.199425', 'step': 1642, 'epoch': 1} {'type': 'loss', 'content': 0.020364973694086075, 'timestamp': '2025-09-30 22:15:11.211979', 'step': 1643, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:11.246269', 'step': 1643, 'epoch': 1} {'type': 'loss', 'content': 0.012984007596969604, 'timestamp': '2025-09-30 22:15:11.279461', 'step': 1644, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:11.320454', 'step': 1644, 'epoch': 1} {'type': 'loss', 'content': 0.018483446910977364, 'timestamp': '2025-09-30 22:15:11.333494', 'step': 1645, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:11.373288', 'step': 1645, 'epoch': 1} {'type': 'loss', 'content': 0.03060707449913025, 'timestamp': '2025-09-30 22:15:11.386629', 'step': 1646, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:11.430698', 'step': 1646, 'epoch': 1} {'type': 'loss', 'content': 0.013780564069747925, 'timestamp': '2025-09-30 22:15:11.444058', 'step': 1647, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:11.484199', 'step': 1647, 'epoch': 1} {'type': 'loss', 'content': 0.00868815928697586, 'timestamp': '2025-09-30 22:15:11.517443', 'step': 1648, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:11.554542', 'step': 1648, 'epoch': 1} {'type': 'loss', 'content': 0.017985939979553223, 'timestamp': '2025-09-30 22:15:11.564519', 'step': 1649, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:11.597091', 'step': 1649, 'epoch': 1} {'type': 'loss', 'content': 0.008981360122561455, 'timestamp': '2025-09-30 22:15:11.607558', 'step': 1650, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:11.649241', 'step': 1650, 'epoch': 1} {'type': 'loss', 'content': 0.026895396411418915, 'timestamp': '2025-09-30 22:15:11.656780', 'step': 1651, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:11.701964', 'step': 1651, 'epoch': 1} {'type': 'loss', 'content': 0.016994664445519447, 'timestamp': '2025-09-30 22:15:11.733066', 'step': 1652, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:11.768252', 'step': 1652, 'epoch': 1} {'type': 'loss', 'content': 0.0100490041077137, 'timestamp': '2025-09-30 22:15:11.781235', 'step': 1653, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:11.817627', 'step': 1653, 'epoch': 1} {'type': 'loss', 'content': 0.029902230948209763, 'timestamp': '2025-09-30 22:15:11.824879', 'step': 1654, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:11.867331', 'step': 1654, 'epoch': 1} {'type': 'loss', 'content': 0.026958175003528595, 'timestamp': '2025-09-30 22:15:11.881006', 'step': 1655, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:11.915803', 'step': 1655, 'epoch': 1} {'type': 'loss', 'content': 0.02336595207452774, 'timestamp': '2025-09-30 22:15:11.944464', 'step': 1656, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:11.984098', 'step': 1656, 'epoch': 1} {'type': 'loss', 'content': 0.0225741695612669, 'timestamp': '2025-09-30 22:15:11.991907', 'step': 1657, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:12.029238', 'step': 1657, 'epoch': 1} {'type': 'loss', 'content': 0.018445616587996483, 'timestamp': '2025-09-30 22:15:12.041742', 'step': 1658, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:12.080374', 'step': 1658, 'epoch': 1} {'type': 'loss', 'content': 0.01366068609058857, 'timestamp': '2025-09-30 22:15:12.087910', 'step': 1659, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:12.137396', 'step': 1659, 'epoch': 1} {'type': 'loss', 'content': 0.012483044527471066, 'timestamp': '2025-09-30 22:15:12.171635', 'step': 1660, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:12.206344', 'step': 1660, 'epoch': 1} {'type': 'loss', 'content': 0.02146543562412262, 'timestamp': '2025-09-30 22:15:12.215016', 'step': 1661, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:12.254632', 'step': 1661, 'epoch': 1} {'type': 'loss', 'content': 0.023363564163446426, 'timestamp': '2025-09-30 22:15:12.262439', 'step': 1662, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:12.306923', 'step': 1662, 'epoch': 1} {'type': 'loss', 'content': 0.024465836584568024, 'timestamp': '2025-09-30 22:15:12.318997', 'step': 1663, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:12.363484', 'step': 1663, 'epoch': 1} {'type': 'loss', 'content': 0.032095007598400116, 'timestamp': '2025-09-30 22:15:12.410191', 'step': 1664, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:12.471169', 'step': 1664, 'epoch': 1} {'type': 'loss', 'content': 0.016160208731889725, 'timestamp': '2025-09-30 22:15:12.481747', 'step': 1665, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:12.529492', 'step': 1665, 'epoch': 1} {'type': 'loss', 'content': 0.007151294033974409, 'timestamp': '2025-09-30 22:15:12.542852', 'step': 1666, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:12.599241', 'step': 1666, 'epoch': 1} {'type': 'loss', 'content': 0.0183419156819582, 'timestamp': '2025-09-30 22:15:12.620858', 'step': 1667, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:15:12.691914', 'step': 1667, 'epoch': 1} {'type': 'loss', 'content': 0.008691221475601196, 'timestamp': '2025-09-30 22:15:12.728991', 'step': 1668, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:12.775732', 'step': 1668, 'epoch': 1} {'type': 'loss', 'content': 0.013635504990816116, 'timestamp': '2025-09-30 22:15:12.788431', 'step': 1669, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:12.831372', 'step': 1669, 'epoch': 1} {'type': 'loss', 'content': 0.0186114851385355, 'timestamp': '2025-09-30 22:15:12.843908', 'step': 1670, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:12.894916', 'step': 1670, 'epoch': 1} {'type': 'loss', 'content': 0.008745511062443256, 'timestamp': '2025-09-30 22:15:12.919761', 'step': 1671, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:12.966732', 'step': 1671, 'epoch': 1} {'type': 'loss', 'content': 0.00956717412918806, 'timestamp': '2025-09-30 22:15:13.001388', 'step': 1672, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:13.040547', 'step': 1672, 'epoch': 1} {'type': 'loss', 'content': 0.018159685656428337, 'timestamp': '2025-09-30 22:15:13.055612', 'step': 1673, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:13.118496', 'step': 1673, 'epoch': 1} {'type': 'loss', 'content': 0.010431898757815361, 'timestamp': '2025-09-30 22:15:13.134289', 'step': 1674, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:13.181376', 'step': 1674, 'epoch': 1} {'type': 'loss', 'content': 0.01341898925602436, 'timestamp': '2025-09-30 22:15:13.193997', 'step': 1675, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:13.232693', 'step': 1675, 'epoch': 1} {'type': 'loss', 'content': 0.0198319423943758, 'timestamp': '2025-09-30 22:15:13.275123', 'step': 1676, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:13.325554', 'step': 1676, 'epoch': 1} {'type': 'loss', 'content': 0.01101736817508936, 'timestamp': '2025-09-30 22:15:13.348949', 'step': 1677, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:13.404663', 'step': 1677, 'epoch': 1} {'type': 'loss', 'content': 0.05434549227356911, 'timestamp': '2025-09-30 22:15:13.416335', 'step': 1678, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:13.461161', 'step': 1678, 'epoch': 1} {'type': 'loss', 'content': 0.016798565164208412, 'timestamp': '2025-09-30 22:15:13.474310', 'step': 1679, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:13.519923', 'step': 1679, 'epoch': 1} {'type': 'loss', 'content': 0.012180604040622711, 'timestamp': '2025-09-30 22:15:13.550315', 'step': 1680, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:13.634714', 'step': 1680, 'epoch': 1} {'type': 'loss', 'content': 0.013805767521262169, 'timestamp': '2025-09-30 22:15:13.646294', 'step': 1681, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:15:13.696300', 'step': 1681, 'epoch': 1} {'type': 'loss', 'content': 0.009852793999016285, 'timestamp': '2025-09-30 22:15:13.719365', 'step': 1682, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:15:13.770660', 'step': 1682, 'epoch': 1} {'type': 'loss', 'content': 0.01591717265546322, 'timestamp': '2025-09-30 22:15:13.786861', 'step': 1683, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:13.828935', 'step': 1683, 'epoch': 1} {'type': 'loss', 'content': 0.028100574389100075, 'timestamp': '2025-09-30 22:15:13.863250', 'step': 1684, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:13.903144', 'step': 1684, 'epoch': 1} {'type': 'loss', 'content': 0.013175170868635178, 'timestamp': '2025-09-30 22:15:13.912845', 'step': 1685, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:13.949682', 'step': 1685, 'epoch': 1} {'type': 'loss', 'content': 0.02460007183253765, 'timestamp': '2025-09-30 22:15:13.963116', 'step': 1686, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:14.002022', 'step': 1686, 'epoch': 1} {'type': 'loss', 'content': 0.025650952011346817, 'timestamp': '2025-09-30 22:15:14.015339', 'step': 1687, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:14.054111', 'step': 1687, 'epoch': 1} {'type': 'loss', 'content': 0.013419519178569317, 'timestamp': '2025-09-30 22:15:14.088402', 'step': 1688, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:14.125169', 'step': 1688, 'epoch': 1} {'type': 'loss', 'content': 0.01797398552298546, 'timestamp': '2025-09-30 22:15:14.138333', 'step': 1689, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:14.174713', 'step': 1689, 'epoch': 1} {'type': 'loss', 'content': 0.02820388600230217, 'timestamp': '2025-09-30 22:15:14.185214', 'step': 1690, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:14.233387', 'step': 1690, 'epoch': 1} {'type': 'loss', 'content': 0.01589238829910755, 'timestamp': '2025-09-30 22:15:14.247079', 'step': 1691, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:14.280376', 'step': 1691, 'epoch': 1} {'type': 'loss', 'content': 0.01470992062240839, 'timestamp': '2025-09-30 22:15:14.313614', 'step': 1692, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:14.351496', 'step': 1692, 'epoch': 1} {'type': 'loss', 'content': 0.010090518742799759, 'timestamp': '2025-09-30 22:15:14.364584', 'step': 1693, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:14.397685', 'step': 1693, 'epoch': 1} {'type': 'loss', 'content': 0.028140120208263397, 'timestamp': '2025-09-30 22:15:14.410233', 'step': 1694, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:14.445849', 'step': 1694, 'epoch': 1} {'type': 'loss', 'content': 0.036818306893110275, 'timestamp': '2025-09-30 22:15:14.453786', 'step': 1695, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:14.492868', 'step': 1695, 'epoch': 1} {'type': 'loss', 'content': 0.0343332402408123, 'timestamp': '2025-09-30 22:15:14.527084', 'step': 1696, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:15:14.568832', 'step': 1696, 'epoch': 1} {'type': 'loss', 'content': 0.03022080473601818, 'timestamp': '2025-09-30 22:15:14.584033', 'step': 1697, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:14.616168', 'step': 1697, 'epoch': 1} {'type': 'loss', 'content': 0.028232431039214134, 'timestamp': '2025-09-30 22:15:14.628516', 'step': 1698, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:14.670814', 'step': 1698, 'epoch': 1} {'type': 'loss', 'content': 0.027424275875091553, 'timestamp': '2025-09-30 22:15:14.683392', 'step': 1699, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:14.721004', 'step': 1699, 'epoch': 1} {'type': 'loss', 'content': 0.05194878950715065, 'timestamp': '2025-09-30 22:15:14.754163', 'step': 1700, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:14.790900', 'step': 1700, 'epoch': 1} {'type': 'loss', 'content': 0.023143520578742027, 'timestamp': '2025-09-30 22:15:14.799620', 'step': 1701, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:14.835941', 'step': 1701, 'epoch': 1} {'type': 'loss', 'content': 0.021254858002066612, 'timestamp': '2025-09-30 22:15:14.847208', 'step': 1702, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:14.884841', 'step': 1702, 'epoch': 1} {'type': 'loss', 'content': 0.0150804677978158, 'timestamp': '2025-09-30 22:15:14.895272', 'step': 1703, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:14.931348', 'step': 1703, 'epoch': 1} {'type': 'loss', 'content': 0.0160662978887558, 'timestamp': '2025-09-30 22:15:14.962603', 'step': 1704, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:14.996595', 'step': 1704, 'epoch': 1} {'type': 'loss', 'content': 0.018683334812521935, 'timestamp': '2025-09-30 22:15:15.005509', 'step': 1705, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:15.044911', 'step': 1705, 'epoch': 1} {'type': 'loss', 'content': 0.01358802616596222, 'timestamp': '2025-09-30 22:15:15.056036', 'step': 1706, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:15.089359', 'step': 1706, 'epoch': 1} {'type': 'loss', 'content': 0.018810700625181198, 'timestamp': '2025-09-30 22:15:15.096862', 'step': 1707, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:15.144559', 'step': 1707, 'epoch': 1} {'type': 'loss', 'content': 0.026481501758098602, 'timestamp': '2025-09-30 22:15:15.179146', 'step': 1708, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:15.215178', 'step': 1708, 'epoch': 1} {'type': 'loss', 'content': 0.02003110758960247, 'timestamp': '2025-09-30 22:15:15.228157', 'step': 1709, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:15.265597', 'step': 1709, 'epoch': 1} {'type': 'loss', 'content': 0.016395648941397667, 'timestamp': '2025-09-30 22:15:15.278139', 'step': 1710, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:15.309718', 'step': 1710, 'epoch': 1} {'type': 'loss', 'content': 0.020903056487441063, 'timestamp': '2025-09-30 22:15:15.320190', 'step': 1711, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:15.352436', 'step': 1711, 'epoch': 1} {'type': 'loss', 'content': 0.02075912617146969, 'timestamp': '2025-09-30 22:15:15.384441', 'step': 1712, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:15.419582', 'step': 1712, 'epoch': 1} {'type': 'loss', 'content': 0.02496561035513878, 'timestamp': '2025-09-30 22:15:15.435869', 'step': 1713, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:15.482040', 'step': 1713, 'epoch': 1} {'type': 'loss', 'content': 0.025438209995627403, 'timestamp': '2025-09-30 22:15:15.495744', 'step': 1714, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:15:15.539585', 'step': 1714, 'epoch': 1} {'type': 'loss', 'content': 0.02345852367579937, 'timestamp': '2025-09-30 22:15:15.555923', 'step': 1715, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:15.592113', 'step': 1715, 'epoch': 1} {'type': 'loss', 'content': 0.017462914809584618, 'timestamp': '2025-09-30 22:15:15.624105', 'step': 1716, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:15.660417', 'step': 1716, 'epoch': 1} {'type': 'loss', 'content': 0.017815817147493362, 'timestamp': '2025-09-30 22:15:15.670255', 'step': 1717, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:15.704918', 'step': 1717, 'epoch': 1} {'type': 'loss', 'content': 0.010522090829908848, 'timestamp': '2025-09-30 22:15:15.717541', 'step': 1718, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:15.750633', 'step': 1718, 'epoch': 1} {'type': 'loss', 'content': 0.020719345659017563, 'timestamp': '2025-09-30 22:15:15.760908', 'step': 1719, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:15.795165', 'step': 1719, 'epoch': 1} {'type': 'loss', 'content': 0.03029588609933853, 'timestamp': '2025-09-30 22:15:15.828569', 'step': 1720, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:15.866673', 'step': 1720, 'epoch': 1} {'type': 'loss', 'content': 0.03135362267494202, 'timestamp': '2025-09-30 22:15:15.875475', 'step': 1721, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:15.912720', 'step': 1721, 'epoch': 1} {'type': 'loss', 'content': 0.021656924858689308, 'timestamp': '2025-09-30 22:15:15.923760', 'step': 1722, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:15.964457', 'step': 1722, 'epoch': 1} {'type': 'loss', 'content': 0.01336958073079586, 'timestamp': '2025-09-30 22:15:15.977016', 'step': 1723, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:16.009623', 'step': 1723, 'epoch': 1} {'type': 'loss', 'content': 0.04008239880204201, 'timestamp': '2025-09-30 22:15:16.042772', 'step': 1724, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:16.075780', 'step': 1724, 'epoch': 1} {'type': 'loss', 'content': 0.018738439306616783, 'timestamp': '2025-09-30 22:15:16.081369', 'step': 1725, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:15:18.522572', 'step': 1725, 'epoch': 1} {'type': 'pplx', 'content': 5.524427428699606, 'timestamp': '2025-09-30 22:15:18.525301', 'step': 1725, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:18.563053', 'step': 1725, 'epoch': 1} {'type': 'loss', 'content': 0.009471976198256016, 'timestamp': '2025-09-30 22:15:18.576396', 'step': 1726, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:18.609903', 'step': 1726, 'epoch': 1} {'type': 'loss', 'content': 0.034076958894729614, 'timestamp': '2025-09-30 22:15:18.617656', 'step': 1727, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:18.661195', 'step': 1727, 'epoch': 1} {'type': 'loss', 'content': 0.006048847455531359, 'timestamp': '2025-09-30 22:15:18.695827', 'step': 1728, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:18.730077', 'step': 1728, 'epoch': 1} {'type': 'loss', 'content': 0.022094430401921272, 'timestamp': '2025-09-30 22:15:18.738016', 'step': 1729, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:18.777458', 'step': 1729, 'epoch': 1} {'type': 'loss', 'content': 0.022269519045948982, 'timestamp': '2025-09-30 22:15:18.784378', 'step': 1730, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:18.821527', 'step': 1730, 'epoch': 1} {'type': 'loss', 'content': 0.030709749087691307, 'timestamp': '2025-09-30 22:15:18.832475', 'step': 1731, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:18.867738', 'step': 1731, 'epoch': 1} {'type': 'loss', 'content': 0.026105714961886406, 'timestamp': '2025-09-30 22:15:18.899092', 'step': 1732, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:18.942742', 'step': 1732, 'epoch': 1} {'type': 'loss', 'content': 0.023841775953769684, 'timestamp': '2025-09-30 22:15:18.948539', 'step': 1733, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:18.985012', 'step': 1733, 'epoch': 1} {'type': 'loss', 'content': 0.018309002742171288, 'timestamp': '2025-09-30 22:15:18.997177', 'step': 1734, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:19.039996', 'step': 1734, 'epoch': 1} {'type': 'loss', 'content': 0.014661869965493679, 'timestamp': '2025-09-30 22:15:19.052312', 'step': 1735, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:19.098608', 'step': 1735, 'epoch': 1} {'type': 'loss', 'content': 0.029143791645765305, 'timestamp': '2025-09-30 22:15:19.133206', 'step': 1736, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:19.177766', 'step': 1736, 'epoch': 1} {'type': 'loss', 'content': 0.03220280632376671, 'timestamp': '2025-09-30 22:15:19.190883', 'step': 1737, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:19.224088', 'step': 1737, 'epoch': 1} {'type': 'loss', 'content': 0.010954529978334904, 'timestamp': '2025-09-30 22:15:19.233474', 'step': 1738, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:15:19.272941', 'step': 1738, 'epoch': 1} {'type': 'loss', 'content': 0.025374675169587135, 'timestamp': '2025-09-30 22:15:19.277166', 'step': 1739, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:15:19.314726', 'step': 1739, 'epoch': 1} {'type': 'loss', 'content': 0.010610057041049004, 'timestamp': '2025-09-30 22:15:19.339900', 'step': 1740, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:19.383524', 'step': 1740, 'epoch': 1} {'type': 'loss', 'content': 0.01448817364871502, 'timestamp': '2025-09-30 22:15:19.393469', 'step': 1741, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:19.428391', 'step': 1741, 'epoch': 1} {'type': 'loss', 'content': 0.010096672922372818, 'timestamp': '2025-09-30 22:15:19.438880', 'step': 1742, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:19.473712', 'step': 1742, 'epoch': 1} {'type': 'loss', 'content': 0.004360891878604889, 'timestamp': '2025-09-30 22:15:19.484050', 'step': 1743, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:19.527998', 'step': 1743, 'epoch': 1} {'type': 'loss', 'content': 0.021376613527536392, 'timestamp': '2025-09-30 22:15:19.560949', 'step': 1744, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:19.600419', 'step': 1744, 'epoch': 1} {'type': 'loss', 'content': 0.010968276299536228, 'timestamp': '2025-09-30 22:15:19.610294', 'step': 1745, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:19.646393', 'step': 1745, 'epoch': 1} {'type': 'loss', 'content': 0.016172928735613823, 'timestamp': '2025-09-30 22:15:19.655510', 'step': 1746, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:19.698685', 'step': 1746, 'epoch': 1} {'type': 'loss', 'content': 0.008436311967670918, 'timestamp': '2025-09-30 22:15:19.711304', 'step': 1747, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:19.744843', 'step': 1747, 'epoch': 1} {'type': 'loss', 'content': 0.01623164303600788, 'timestamp': '2025-09-30 22:15:19.778335', 'step': 1748, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:19.814602', 'step': 1748, 'epoch': 1} {'type': 'loss', 'content': 0.033938564360141754, 'timestamp': '2025-09-30 22:15:19.823264', 'step': 1749, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:19.867354', 'step': 1749, 'epoch': 1} {'type': 'loss', 'content': 0.01624465547502041, 'timestamp': '2025-09-30 22:15:19.880668', 'step': 1750, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:19.916953', 'step': 1750, 'epoch': 1} {'type': 'loss', 'content': 0.0248996801674366, 'timestamp': '2025-09-30 22:15:19.924824', 'step': 1751, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:19.956200', 'step': 1751, 'epoch': 1} {'type': 'loss', 'content': 0.018772227689623833, 'timestamp': '2025-09-30 22:15:19.987554', 'step': 1752, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:20.020058', 'step': 1752, 'epoch': 1} {'type': 'loss', 'content': 0.01700027659535408, 'timestamp': '2025-09-30 22:15:20.025435', 'step': 1753, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:20.058097', 'step': 1753, 'epoch': 1} {'type': 'loss', 'content': 0.024260636419057846, 'timestamp': '2025-09-30 22:15:20.066066', 'step': 1754, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:20.101018', 'step': 1754, 'epoch': 1} {'type': 'loss', 'content': 0.014491290785372257, 'timestamp': '2025-09-30 22:15:20.114365', 'step': 1755, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:20.147112', 'step': 1755, 'epoch': 1} {'type': 'loss', 'content': 0.017340457066893578, 'timestamp': '2025-09-30 22:15:20.179126', 'step': 1756, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:20.217277', 'step': 1756, 'epoch': 1} {'type': 'loss', 'content': 0.03053688071668148, 'timestamp': '2025-09-30 22:15:20.222894', 'step': 1757, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:20.261376', 'step': 1757, 'epoch': 1} {'type': 'loss', 'content': 0.011630667373538017, 'timestamp': '2025-09-30 22:15:20.274697', 'step': 1758, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-09-30 22:15:20.323682', 'step': 1758, 'epoch': 1} {'type': 'loss', 'content': 0.010422976687550545, 'timestamp': '2025-09-30 22:15:20.341262', 'step': 1759, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:20.381298', 'step': 1759, 'epoch': 1} {'type': 'loss', 'content': 0.00819334015250206, 'timestamp': '2025-09-30 22:15:20.415832', 'step': 1760, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:15:20.453585', 'step': 1760, 'epoch': 1} {'type': 'loss', 'content': 0.012914934195578098, 'timestamp': '2025-09-30 22:15:20.468659', 'step': 1761, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:20.502785', 'step': 1761, 'epoch': 1} {'type': 'loss', 'content': 0.02971467934548855, 'timestamp': '2025-09-30 22:15:20.515354', 'step': 1762, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:20.551770', 'step': 1762, 'epoch': 1} {'type': 'loss', 'content': 0.00758015364408493, 'timestamp': '2025-09-30 22:15:20.564296', 'step': 1763, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:20.603621', 'step': 1763, 'epoch': 1} {'type': 'loss', 'content': 0.010941659100353718, 'timestamp': '2025-09-30 22:15:20.638224', 'step': 1764, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:20.681274', 'step': 1764, 'epoch': 1} {'type': 'loss', 'content': 0.014606650918722153, 'timestamp': '2025-09-30 22:15:20.691790', 'step': 1765, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:20.726870', 'step': 1765, 'epoch': 1} {'type': 'loss', 'content': 0.018656129017472267, 'timestamp': '2025-09-30 22:15:20.739287', 'step': 1766, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:20.778474', 'step': 1766, 'epoch': 1} {'type': 'loss', 'content': 0.01316360104829073, 'timestamp': '2025-09-30 22:15:20.792174', 'step': 1767, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:20.830398', 'step': 1767, 'epoch': 1} {'type': 'loss', 'content': 0.008767510764300823, 'timestamp': '2025-09-30 22:15:20.861530', 'step': 1768, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:20.896665', 'step': 1768, 'epoch': 1} {'type': 'loss', 'content': 0.022422747686505318, 'timestamp': '2025-09-30 22:15:20.906481', 'step': 1769, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:20.960049', 'step': 1769, 'epoch': 1} {'type': 'loss', 'content': 0.013530876487493515, 'timestamp': '2025-09-30 22:15:20.973856', 'step': 1770, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:21.009694', 'step': 1770, 'epoch': 1} {'type': 'loss', 'content': 0.016074175015091896, 'timestamp': '2025-09-30 22:15:21.022218', 'step': 1771, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:21.059845', 'step': 1771, 'epoch': 1} {'type': 'loss', 'content': 0.012821718119084835, 'timestamp': '2025-09-30 22:15:21.091305', 'step': 1772, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:21.126100', 'step': 1772, 'epoch': 1} {'type': 'loss', 'content': 0.01885077729821205, 'timestamp': '2025-09-30 22:15:21.134767', 'step': 1773, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:21.172538', 'step': 1773, 'epoch': 1} {'type': 'loss', 'content': 0.01258346438407898, 'timestamp': '2025-09-30 22:15:21.183845', 'step': 1774, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:21.216032', 'step': 1774, 'epoch': 1} {'type': 'loss', 'content': 0.02077512815594673, 'timestamp': '2025-09-30 22:15:21.223993', 'step': 1775, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:21.263595', 'step': 1775, 'epoch': 1} {'type': 'loss', 'content': 0.004540475085377693, 'timestamp': '2025-09-30 22:15:21.298157', 'step': 1776, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:21.339064', 'step': 1776, 'epoch': 1} {'type': 'loss', 'content': 0.006810082122683525, 'timestamp': '2025-09-30 22:15:21.354483', 'step': 1777, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:21.401475', 'step': 1777, 'epoch': 1} {'type': 'loss', 'content': 0.006215658038854599, 'timestamp': '2025-09-30 22:15:21.415147', 'step': 1778, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:21.461281', 'step': 1778, 'epoch': 1} {'type': 'loss', 'content': 0.005509376525878906, 'timestamp': '2025-09-30 22:15:21.475121', 'step': 1779, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:21.510225', 'step': 1779, 'epoch': 1} {'type': 'loss', 'content': 0.010733134113252163, 'timestamp': '2025-09-30 22:15:21.543598', 'step': 1780, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:21.582176', 'step': 1780, 'epoch': 1} {'type': 'loss', 'content': 0.01937573403120041, 'timestamp': '2025-09-30 22:15:21.592264', 'step': 1781, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:21.641408', 'step': 1781, 'epoch': 1} {'type': 'loss', 'content': 0.007524130865931511, 'timestamp': '2025-09-30 22:15:21.653969', 'step': 1782, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:21.699905', 'step': 1782, 'epoch': 1} {'type': 'loss', 'content': 0.0070258439518511295, 'timestamp': '2025-09-30 22:15:21.715826', 'step': 1783, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:21.750804', 'step': 1783, 'epoch': 1} {'type': 'loss', 'content': 0.0137464739382267, 'timestamp': '2025-09-30 22:15:21.779359', 'step': 1784, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:15:21.823451', 'step': 1784, 'epoch': 1} {'type': 'loss', 'content': 0.017076196148991585, 'timestamp': '2025-09-30 22:15:21.839145', 'step': 1785, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:21.879757', 'step': 1785, 'epoch': 1} {'type': 'loss', 'content': 0.03697328642010689, 'timestamp': '2025-09-30 22:15:21.891180', 'step': 1786, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:21.927460', 'step': 1786, 'epoch': 1} {'type': 'loss', 'content': 0.014486968517303467, 'timestamp': '2025-09-30 22:15:21.938851', 'step': 1787, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:21.978780', 'step': 1787, 'epoch': 1} {'type': 'loss', 'content': 0.008746178820729256, 'timestamp': '2025-09-30 22:15:22.010210', 'step': 1788, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:15:22.066507', 'step': 1788, 'epoch': 1} {'type': 'loss', 'content': 0.008364402689039707, 'timestamp': '2025-09-30 22:15:22.082207', 'step': 1789, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:22.134773', 'step': 1789, 'epoch': 1} {'type': 'loss', 'content': 0.013731135986745358, 'timestamp': '2025-09-30 22:15:22.148609', 'step': 1790, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:22.196051', 'step': 1790, 'epoch': 1} {'type': 'loss', 'content': 0.006996935233473778, 'timestamp': '2025-09-30 22:15:22.209963', 'step': 1791, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:15:22.264298', 'step': 1791, 'epoch': 1} {'type': 'loss', 'content': 0.008566569536924362, 'timestamp': '2025-09-30 22:15:22.302446', 'step': 1792, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:22.340129', 'step': 1792, 'epoch': 1} {'type': 'loss', 'content': 0.014082583598792553, 'timestamp': '2025-09-30 22:15:22.348979', 'step': 1793, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:22.390495', 'step': 1793, 'epoch': 1} {'type': 'loss', 'content': 0.02005489356815815, 'timestamp': '2025-09-30 22:15:22.402838', 'step': 1794, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:22.447617', 'step': 1794, 'epoch': 1} {'type': 'loss', 'content': 0.010589111596345901, 'timestamp': '2025-09-30 22:15:22.461330', 'step': 1795, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:22.502133', 'step': 1795, 'epoch': 1} {'type': 'loss', 'content': 0.004992394242435694, 'timestamp': '2025-09-30 22:15:22.537039', 'step': 1796, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:22.571758', 'step': 1796, 'epoch': 1} {'type': 'loss', 'content': 0.025007154792547226, 'timestamp': '2025-09-30 22:15:22.580431', 'step': 1797, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:22.621276', 'step': 1797, 'epoch': 1} {'type': 'loss', 'content': 0.00899725966155529, 'timestamp': '2025-09-30 22:15:22.634672', 'step': 1798, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:22.688607', 'step': 1798, 'epoch': 1} {'type': 'loss', 'content': 0.013840748928487301, 'timestamp': '2025-09-30 22:15:22.702278', 'step': 1799, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:22.739073', 'step': 1799, 'epoch': 1} {'type': 'loss', 'content': 0.022443510591983795, 'timestamp': '2025-09-30 22:15:22.772309', 'step': 1800, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:22.807669', 'step': 1800, 'epoch': 1} {'type': 'loss', 'content': 0.011950202286243439, 'timestamp': '2025-09-30 22:15:22.817784', 'step': 1801, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:22.853827', 'step': 1801, 'epoch': 1} {'type': 'loss', 'content': 0.01878458447754383, 'timestamp': '2025-09-30 22:15:22.864912', 'step': 1802, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:22.907941', 'step': 1802, 'epoch': 1} {'type': 'loss', 'content': 0.02448650635778904, 'timestamp': '2025-09-30 22:15:22.918359', 'step': 1803, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:22.950555', 'step': 1803, 'epoch': 1} {'type': 'loss', 'content': 0.019893376156687737, 'timestamp': '2025-09-30 22:15:22.982407', 'step': 1804, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:23.014406', 'step': 1804, 'epoch': 1} {'type': 'loss', 'content': 0.010442492552101612, 'timestamp': '2025-09-30 22:15:23.020014', 'step': 1805, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:23.054053', 'step': 1805, 'epoch': 1} {'type': 'loss', 'content': 0.011205971240997314, 'timestamp': '2025-09-30 22:15:23.061030', 'step': 1806, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:23.097700', 'step': 1806, 'epoch': 1} {'type': 'loss', 'content': 0.007506354711949825, 'timestamp': '2025-09-30 22:15:23.111084', 'step': 1807, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:23.149451', 'step': 1807, 'epoch': 1} {'type': 'loss', 'content': 0.027089744806289673, 'timestamp': '2025-09-30 22:15:23.182835', 'step': 1808, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:23.216940', 'step': 1808, 'epoch': 1} {'type': 'loss', 'content': 0.017995381727814674, 'timestamp': '2025-09-30 22:15:23.227348', 'step': 1809, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:23.260626', 'step': 1809, 'epoch': 1} {'type': 'loss', 'content': 0.02709285542368889, 'timestamp': '2025-09-30 22:15:23.273232', 'step': 1810, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:23.312977', 'step': 1810, 'epoch': 1} {'type': 'loss', 'content': 0.03118719719350338, 'timestamp': '2025-09-30 22:15:23.326368', 'step': 1811, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:23.360594', 'step': 1811, 'epoch': 1} {'type': 'loss', 'content': 0.026631765067577362, 'timestamp': '2025-09-30 22:15:23.393810', 'step': 1812, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:23.430423', 'step': 1812, 'epoch': 1} {'type': 'loss', 'content': 0.009016057476401329, 'timestamp': '2025-09-30 22:15:23.443727', 'step': 1813, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:23.480525', 'step': 1813, 'epoch': 1} {'type': 'loss', 'content': 0.009003258310258389, 'timestamp': '2025-09-30 22:15:23.492827', 'step': 1814, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 17560600598464}, 'timestamp': '2025-09-30 22:15:23.544774', 'step': 1814, 'epoch': 1} {'type': 'loss', 'content': 0.005577434320002794, 'timestamp': '2025-09-30 22:15:23.565852', 'step': 1815, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:23.603139', 'step': 1815, 'epoch': 1} {'type': 'loss', 'content': 0.013796670362353325, 'timestamp': '2025-09-30 22:15:23.637356', 'step': 1816, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:23.677517', 'step': 1816, 'epoch': 1} {'type': 'loss', 'content': 0.010871457867324352, 'timestamp': '2025-09-30 22:15:23.683954', 'step': 1817, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:23.717131', 'step': 1817, 'epoch': 1} {'type': 'loss', 'content': 0.006785405334085226, 'timestamp': '2025-09-30 22:15:23.729503', 'step': 1818, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:23.770701', 'step': 1818, 'epoch': 1} {'type': 'loss', 'content': 0.008428267203271389, 'timestamp': '2025-09-30 22:15:23.784159', 'step': 1819, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:23.817435', 'step': 1819, 'epoch': 1} {'type': 'loss', 'content': 0.009874596260488033, 'timestamp': '2025-09-30 22:15:23.850883', 'step': 1820, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:23.885048', 'step': 1820, 'epoch': 1} {'type': 'loss', 'content': 0.02326430380344391, 'timestamp': '2025-09-30 22:15:23.897644', 'step': 1821, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:23.930177', 'step': 1821, 'epoch': 1} {'type': 'loss', 'content': 0.03274618089199066, 'timestamp': '2025-09-30 22:15:23.941399', 'step': 1822, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:23.977828', 'step': 1822, 'epoch': 1} {'type': 'loss', 'content': 0.023377005010843277, 'timestamp': '2025-09-30 22:15:23.990183', 'step': 1823, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:24.027694', 'step': 1823, 'epoch': 1} {'type': 'loss', 'content': 0.02934858575463295, 'timestamp': '2025-09-30 22:15:24.062433', 'step': 1824, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:24.102236', 'step': 1824, 'epoch': 1} {'type': 'loss', 'content': 0.0181177519261837, 'timestamp': '2025-09-30 22:15:24.114869', 'step': 1825, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:24.152510', 'step': 1825, 'epoch': 1} {'type': 'loss', 'content': 0.01756267435848713, 'timestamp': '2025-09-30 22:15:24.165081', 'step': 1826, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:24.202823', 'step': 1826, 'epoch': 1} {'type': 'loss', 'content': 0.03452328220009804, 'timestamp': '2025-09-30 22:15:24.216543', 'step': 1827, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:24.251387', 'step': 1827, 'epoch': 1} {'type': 'loss', 'content': 0.024465546011924744, 'timestamp': '2025-09-30 22:15:24.284646', 'step': 1828, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:24.317717', 'step': 1828, 'epoch': 1} {'type': 'loss', 'content': 0.038967929780483246, 'timestamp': '2025-09-30 22:15:24.327653', 'step': 1829, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:24.362319', 'step': 1829, 'epoch': 1} {'type': 'loss', 'content': 0.026917487382888794, 'timestamp': '2025-09-30 22:15:24.373471', 'step': 1830, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:24.406705', 'step': 1830, 'epoch': 1} {'type': 'loss', 'content': 0.025900596752762794, 'timestamp': '2025-09-30 22:15:24.417218', 'step': 1831, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:24.459304', 'step': 1831, 'epoch': 1} {'type': 'loss', 'content': 0.00981822144240141, 'timestamp': '2025-09-30 22:15:24.494096', 'step': 1832, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:24.537747', 'step': 1832, 'epoch': 1} {'type': 'loss', 'content': 0.023126764222979546, 'timestamp': '2025-09-30 22:15:24.550816', 'step': 1833, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:15:24.595360', 'step': 1833, 'epoch': 1} {'type': 'loss', 'content': 0.013973300345242023, 'timestamp': '2025-09-30 22:15:24.613093', 'step': 1834, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:24.657748', 'step': 1834, 'epoch': 1} {'type': 'loss', 'content': 0.017417822033166885, 'timestamp': '2025-09-30 22:15:24.673657', 'step': 1835, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:24.708575', 'step': 1835, 'epoch': 1} {'type': 'loss', 'content': 0.004481332842260599, 'timestamp': '2025-09-30 22:15:24.742836', 'step': 1836, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:24.777196', 'step': 1836, 'epoch': 1} {'type': 'loss', 'content': 0.009193511679768562, 'timestamp': '2025-09-30 22:15:24.787173', 'step': 1837, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:24.823652', 'step': 1837, 'epoch': 1} {'type': 'loss', 'content': 0.012367206625640392, 'timestamp': '2025-09-30 22:15:24.837056', 'step': 1838, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:24.881330', 'step': 1838, 'epoch': 1} {'type': 'loss', 'content': 0.012220718897879124, 'timestamp': '2025-09-30 22:15:24.894705', 'step': 1839, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:24.936285', 'step': 1839, 'epoch': 1} {'type': 'loss', 'content': 0.012313958257436752, 'timestamp': '2025-09-30 22:15:24.970866', 'step': 1840, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:15:27.455257', 'step': 1840, 'epoch': 1} {'type': 'pplx', 'content': 5.489145934259954, 'timestamp': '2025-09-30 22:15:27.457920', 'step': 1840, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:27.488465', 'step': 1840, 'epoch': 1} {'type': 'loss', 'content': 0.004935309290885925, 'timestamp': '2025-09-30 22:15:27.501114', 'step': 1841, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:27.538849', 'step': 1841, 'epoch': 1} {'type': 'loss', 'content': 0.015119443647563457, 'timestamp': '2025-09-30 22:15:27.551428', 'step': 1842, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:27.594161', 'step': 1842, 'epoch': 1} {'type': 'loss', 'content': 0.011127560399472713, 'timestamp': '2025-09-30 22:15:27.607974', 'step': 1843, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:27.646596', 'step': 1843, 'epoch': 1} {'type': 'loss', 'content': 0.017564384266734123, 'timestamp': '2025-09-30 22:15:27.674867', 'step': 1844, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:27.709911', 'step': 1844, 'epoch': 2} {'type': 'loss', 'content': 0.03666895255446434, 'timestamp': '2025-09-30 22:15:27.715216', 'step': 1845, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:27.754659', 'step': 1845, 'epoch': 2} {'type': 'loss', 'content': 0.008158748969435692, 'timestamp': '2025-09-30 22:15:27.768020', 'step': 1846, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:27.801840', 'step': 1846, 'epoch': 2} {'type': 'loss', 'content': 0.013101806864142418, 'timestamp': '2025-09-30 22:15:27.809376', 'step': 1847, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:27.861537', 'step': 1847, 'epoch': 2} {'type': 'loss', 'content': 0.013875213451683521, 'timestamp': '2025-09-30 22:15:27.893791', 'step': 1848, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:27.934819', 'step': 1848, 'epoch': 2} {'type': 'loss', 'content': 0.010416905395686626, 'timestamp': '2025-09-30 22:15:27.944907', 'step': 1849, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:27.981397', 'step': 1849, 'epoch': 2} {'type': 'loss', 'content': 0.011625699698925018, 'timestamp': '2025-09-30 22:15:27.988937', 'step': 1850, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:28.027331', 'step': 1850, 'epoch': 2} {'type': 'loss', 'content': 0.015321994200348854, 'timestamp': '2025-09-30 22:15:28.040676', 'step': 1851, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:28.074429', 'step': 1851, 'epoch': 2} {'type': 'loss', 'content': 0.010722902603447437, 'timestamp': '2025-09-30 22:15:28.107823', 'step': 1852, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:15:28.151752', 'step': 1852, 'epoch': 2} {'type': 'loss', 'content': 0.010753704234957695, 'timestamp': '2025-09-30 22:15:28.168448', 'step': 1853, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:28.204256', 'step': 1853, 'epoch': 2} {'type': 'loss', 'content': 0.009430734440684319, 'timestamp': '2025-09-30 22:15:28.215559', 'step': 1854, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:28.256649', 'step': 1854, 'epoch': 2} {'type': 'loss', 'content': 0.006714122835546732, 'timestamp': '2025-09-30 22:15:28.270147', 'step': 1855, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:28.308013', 'step': 1855, 'epoch': 2} {'type': 'loss', 'content': 0.012612263672053814, 'timestamp': '2025-09-30 22:15:28.342248', 'step': 1856, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:28.389282', 'step': 1856, 'epoch': 2} {'type': 'loss', 'content': 0.012894387356936932, 'timestamp': '2025-09-30 22:15:28.396669', 'step': 1857, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:28.437005', 'step': 1857, 'epoch': 2} {'type': 'loss', 'content': 0.012102191336452961, 'timestamp': '2025-09-30 22:15:28.445003', 'step': 1858, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:28.483923', 'step': 1858, 'epoch': 2} {'type': 'loss', 'content': 0.013301697559654713, 'timestamp': '2025-09-30 22:15:28.495288', 'step': 1859, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:28.532470', 'step': 1859, 'epoch': 2} {'type': 'loss', 'content': 0.013101636432111263, 'timestamp': '2025-09-30 22:15:28.566735', 'step': 1860, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:28.602883', 'step': 1860, 'epoch': 2} {'type': 'loss', 'content': 0.018009858205914497, 'timestamp': '2025-09-30 22:15:28.611014', 'step': 1861, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:28.653408', 'step': 1861, 'epoch': 2} {'type': 'loss', 'content': 0.017011038959026337, 'timestamp': '2025-09-30 22:15:28.665869', 'step': 1862, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:28.712007', 'step': 1862, 'epoch': 2} {'type': 'loss', 'content': 0.016825055703520775, 'timestamp': '2025-09-30 22:15:28.719248', 'step': 1863, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:28.755095', 'step': 1863, 'epoch': 2} {'type': 'loss', 'content': 0.022498799487948418, 'timestamp': '2025-09-30 22:15:28.783852', 'step': 1864, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:28.823770', 'step': 1864, 'epoch': 2} {'type': 'loss', 'content': 0.015230205841362476, 'timestamp': '2025-09-30 22:15:28.832932', 'step': 1865, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:28.894586', 'step': 1865, 'epoch': 2} {'type': 'loss', 'content': 0.006908940616995096, 'timestamp': '2025-09-30 22:15:28.902454', 'step': 1866, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:28.934193', 'step': 1866, 'epoch': 2} {'type': 'loss', 'content': 0.013190867379307747, 'timestamp': '2025-09-30 22:15:28.941223', 'step': 1867, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:28.976258', 'step': 1867, 'epoch': 2} {'type': 'loss', 'content': 0.01497811358422041, 'timestamp': '2025-09-30 22:15:29.004232', 'step': 1868, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:29.046964', 'step': 1868, 'epoch': 2} {'type': 'loss', 'content': 0.00782166887074709, 'timestamp': '2025-09-30 22:15:29.060054', 'step': 1869, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:29.100544', 'step': 1869, 'epoch': 2} {'type': 'loss', 'content': 0.023333149030804634, 'timestamp': '2025-09-30 22:15:29.108894', 'step': 1870, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:29.144079', 'step': 1870, 'epoch': 2} {'type': 'loss', 'content': 0.011829240247607231, 'timestamp': '2025-09-30 22:15:29.157857', 'step': 1871, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:29.192571', 'step': 1871, 'epoch': 2} {'type': 'loss', 'content': 0.017776671797037125, 'timestamp': '2025-09-30 22:15:29.226031', 'step': 1872, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:29.261788', 'step': 1872, 'epoch': 2} {'type': 'loss', 'content': 0.007805516477674246, 'timestamp': '2025-09-30 22:15:29.272561', 'step': 1873, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:29.322182', 'step': 1873, 'epoch': 2} {'type': 'loss', 'content': 0.009270432405173779, 'timestamp': '2025-09-30 22:15:29.335524', 'step': 1874, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:29.373774', 'step': 1874, 'epoch': 2} {'type': 'loss', 'content': 0.01128674391657114, 'timestamp': '2025-09-30 22:15:29.384947', 'step': 1875, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:29.420416', 'step': 1875, 'epoch': 2} {'type': 'loss', 'content': 0.010258953087031841, 'timestamp': '2025-09-30 22:15:29.449146', 'step': 1876, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:29.488855', 'step': 1876, 'epoch': 2} {'type': 'loss', 'content': 0.008005455136299133, 'timestamp': '2025-09-30 22:15:29.501837', 'step': 1877, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:29.547812', 'step': 1877, 'epoch': 2} {'type': 'loss', 'content': 0.016554296016693115, 'timestamp': '2025-09-30 22:15:29.558946', 'step': 1878, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:29.598181', 'step': 1878, 'epoch': 2} {'type': 'loss', 'content': 0.015631001442670822, 'timestamp': '2025-09-30 22:15:29.611924', 'step': 1879, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:29.647928', 'step': 1879, 'epoch': 2} {'type': 'loss', 'content': 0.015927642583847046, 'timestamp': '2025-09-30 22:15:29.682503', 'step': 1880, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:29.719745', 'step': 1880, 'epoch': 2} {'type': 'loss', 'content': 0.005917059723287821, 'timestamp': '2025-09-30 22:15:29.725380', 'step': 1881, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:29.757829', 'step': 1881, 'epoch': 2} {'type': 'loss', 'content': 0.01006174273788929, 'timestamp': '2025-09-30 22:15:29.768971', 'step': 1882, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:29.806877', 'step': 1882, 'epoch': 2} {'type': 'loss', 'content': 0.02184908092021942, 'timestamp': '2025-09-30 22:15:29.814142', 'step': 1883, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:29.848059', 'step': 1883, 'epoch': 2} {'type': 'loss', 'content': 0.014168507419526577, 'timestamp': '2025-09-30 22:15:29.875806', 'step': 1884, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:29.909335', 'step': 1884, 'epoch': 2} {'type': 'loss', 'content': 0.01987319439649582, 'timestamp': '2025-09-30 22:15:29.915014', 'step': 1885, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:29.948518', 'step': 1885, 'epoch': 2} {'type': 'loss', 'content': 0.011883490718901157, 'timestamp': '2025-09-30 22:15:29.958854', 'step': 1886, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:15:30.007776', 'step': 1886, 'epoch': 2} {'type': 'loss', 'content': 0.010094949044287205, 'timestamp': '2025-09-30 22:15:30.025490', 'step': 1887, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:30.063403', 'step': 1887, 'epoch': 2} {'type': 'loss', 'content': 0.009513383731245995, 'timestamp': '2025-09-30 22:15:30.097906', 'step': 1888, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:30.136806', 'step': 1888, 'epoch': 2} {'type': 'loss', 'content': 0.008315491490066051, 'timestamp': '2025-09-30 22:15:30.152260', 'step': 1889, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:30.184560', 'step': 1889, 'epoch': 2} {'type': 'loss', 'content': 0.014158587902784348, 'timestamp': '2025-09-30 22:15:30.196948', 'step': 1890, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:30.229527', 'step': 1890, 'epoch': 2} {'type': 'loss', 'content': 0.010483119636774063, 'timestamp': '2025-09-30 22:15:30.237505', 'step': 1891, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:30.275983', 'step': 1891, 'epoch': 2} {'type': 'loss', 'content': 0.01435977965593338, 'timestamp': '2025-09-30 22:15:30.308088', 'step': 1892, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:30.342026', 'step': 1892, 'epoch': 2} {'type': 'loss', 'content': 0.014692501164972782, 'timestamp': '2025-09-30 22:15:30.347595', 'step': 1893, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:30.388102', 'step': 1893, 'epoch': 2} {'type': 'loss', 'content': 0.020229607820510864, 'timestamp': '2025-09-30 22:15:30.399129', 'step': 1894, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:30.438020', 'step': 1894, 'epoch': 2} {'type': 'loss', 'content': 0.011172110214829445, 'timestamp': '2025-09-30 22:15:30.445555', 'step': 1895, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:30.485998', 'step': 1895, 'epoch': 2} {'type': 'loss', 'content': 0.009735530242323875, 'timestamp': '2025-09-30 22:15:30.520145', 'step': 1896, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:30.555929', 'step': 1896, 'epoch': 2} {'type': 'loss', 'content': 0.008219888433814049, 'timestamp': '2025-09-30 22:15:30.569007', 'step': 1897, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:30.604067', 'step': 1897, 'epoch': 2} {'type': 'loss', 'content': 0.010688601061701775, 'timestamp': '2025-09-30 22:15:30.615381', 'step': 1898, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:30.647848', 'step': 1898, 'epoch': 2} {'type': 'loss', 'content': 0.01074580755084753, 'timestamp': '2025-09-30 22:15:30.658938', 'step': 1899, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:30.700789', 'step': 1899, 'epoch': 2} {'type': 'loss', 'content': 0.013557717204093933, 'timestamp': '2025-09-30 22:15:30.733906', 'step': 1900, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:30.766861', 'step': 1900, 'epoch': 2} {'type': 'loss', 'content': 0.01604650542140007, 'timestamp': '2025-09-30 22:15:30.777383', 'step': 1901, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:30.809507', 'step': 1901, 'epoch': 2} {'type': 'loss', 'content': 0.009455199353396893, 'timestamp': '2025-09-30 22:15:30.820647', 'step': 1902, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:30.861685', 'step': 1902, 'epoch': 2} {'type': 'loss', 'content': 0.012455038726329803, 'timestamp': '2025-09-30 22:15:30.868948', 'step': 1903, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:30.900470', 'step': 1903, 'epoch': 2} {'type': 'loss', 'content': 0.02439979650080204, 'timestamp': '2025-09-30 22:15:30.928618', 'step': 1904, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:30.966569', 'step': 1904, 'epoch': 2} {'type': 'loss', 'content': 0.007738007232546806, 'timestamp': '2025-09-30 22:15:30.971320', 'step': 1905, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:31.007374', 'step': 1905, 'epoch': 2} {'type': 'loss', 'content': 0.011472243815660477, 'timestamp': '2025-09-30 22:15:31.014754', 'step': 1906, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:31.048064', 'step': 1906, 'epoch': 2} {'type': 'loss', 'content': 0.011051525361835957, 'timestamp': '2025-09-30 22:15:31.060384', 'step': 1907, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:31.101469', 'step': 1907, 'epoch': 2} {'type': 'loss', 'content': 0.0075341323390603065, 'timestamp': '2025-09-30 22:15:31.136321', 'step': 1908, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:31.177066', 'step': 1908, 'epoch': 2} {'type': 'loss', 'content': 0.00750238262116909, 'timestamp': '2025-09-30 22:15:31.190405', 'step': 1909, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:31.225961', 'step': 1909, 'epoch': 2} {'type': 'loss', 'content': 0.01275754626840353, 'timestamp': '2025-09-30 22:15:31.233322', 'step': 1910, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:31.265878', 'step': 1910, 'epoch': 2} {'type': 'loss', 'content': 0.011358684860169888, 'timestamp': '2025-09-30 22:15:31.278196', 'step': 1911, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:31.311841', 'step': 1911, 'epoch': 2} {'type': 'loss', 'content': 0.008774158544838428, 'timestamp': '2025-09-30 22:15:31.345127', 'step': 1912, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:31.387727', 'step': 1912, 'epoch': 2} {'type': 'loss', 'content': 0.011165403760969639, 'timestamp': '2025-09-30 22:15:31.396360', 'step': 1913, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:31.434477', 'step': 1913, 'epoch': 2} {'type': 'loss', 'content': 0.014243747107684612, 'timestamp': '2025-09-30 22:15:31.448307', 'step': 1914, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:31.481873', 'step': 1914, 'epoch': 2} {'type': 'loss', 'content': 0.012929615564644337, 'timestamp': '2025-09-30 22:15:31.489764', 'step': 1915, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:31.523105', 'step': 1915, 'epoch': 2} {'type': 'loss', 'content': 0.007347370497882366, 'timestamp': '2025-09-30 22:15:31.556356', 'step': 1916, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:31.594760', 'step': 1916, 'epoch': 2} {'type': 'loss', 'content': 0.014166549779474735, 'timestamp': '2025-09-30 22:15:31.602674', 'step': 1917, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:31.637824', 'step': 1917, 'epoch': 2} {'type': 'loss', 'content': 0.009590953588485718, 'timestamp': '2025-09-30 22:15:31.650371', 'step': 1918, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:31.685602', 'step': 1918, 'epoch': 2} {'type': 'loss', 'content': 0.009673715569078922, 'timestamp': '2025-09-30 22:15:31.696631', 'step': 1919, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:31.734965', 'step': 1919, 'epoch': 2} {'type': 'loss', 'content': 0.012877479195594788, 'timestamp': '2025-09-30 22:15:31.763382', 'step': 1920, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:31.796360', 'step': 1920, 'epoch': 2} {'type': 'loss', 'content': 0.011299233883619308, 'timestamp': '2025-09-30 22:15:31.806052', 'step': 1921, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:31.838739', 'step': 1921, 'epoch': 2} {'type': 'loss', 'content': 0.011492876335978508, 'timestamp': '2025-09-30 22:15:31.851293', 'step': 1922, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:31.888932', 'step': 1922, 'epoch': 2} {'type': 'loss', 'content': 0.01344871986657381, 'timestamp': '2025-09-30 22:15:31.900198', 'step': 1923, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:31.933676', 'step': 1923, 'epoch': 2} {'type': 'loss', 'content': 0.017432425171136856, 'timestamp': '2025-09-30 22:15:31.961503', 'step': 1924, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:31.994546', 'step': 1924, 'epoch': 2} {'type': 'loss', 'content': 0.01488548144698143, 'timestamp': '2025-09-30 22:15:32.002526', 'step': 1925, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:32.039100', 'step': 1925, 'epoch': 2} {'type': 'loss', 'content': 0.00806745421141386, 'timestamp': '2025-09-30 22:15:32.046853', 'step': 1926, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:32.081888', 'step': 1926, 'epoch': 2} {'type': 'loss', 'content': 0.010180181823670864, 'timestamp': '2025-09-30 22:15:32.095269', 'step': 1927, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:15:32.144012', 'step': 1927, 'epoch': 2} {'type': 'loss', 'content': 0.004572300240397453, 'timestamp': '2025-09-30 22:15:32.181160', 'step': 1928, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:32.217812', 'step': 1928, 'epoch': 2} {'type': 'loss', 'content': 0.017586616799235344, 'timestamp': '2025-09-30 22:15:32.230435', 'step': 1929, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:32.267724', 'step': 1929, 'epoch': 2} {'type': 'loss', 'content': 0.008072668686509132, 'timestamp': '2025-09-30 22:15:32.281696', 'step': 1930, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:32.318774', 'step': 1930, 'epoch': 2} {'type': 'loss', 'content': 0.007447035517543554, 'timestamp': '2025-09-30 22:15:32.332628', 'step': 1931, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:32.375710', 'step': 1931, 'epoch': 2} {'type': 'loss', 'content': 0.006811977364122868, 'timestamp': '2025-09-30 22:15:32.409914', 'step': 1932, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:32.450362', 'step': 1932, 'epoch': 2} {'type': 'loss', 'content': 0.01102065946906805, 'timestamp': '2025-09-30 22:15:32.463483', 'step': 1933, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:32.504920', 'step': 1933, 'epoch': 2} {'type': 'loss', 'content': 0.015133448876440525, 'timestamp': '2025-09-30 22:15:32.511624', 'step': 1934, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:32.551127', 'step': 1934, 'epoch': 2} {'type': 'loss', 'content': 0.007282888051122427, 'timestamp': '2025-09-30 22:15:32.561367', 'step': 1935, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:32.597859', 'step': 1935, 'epoch': 2} {'type': 'loss', 'content': 0.012841354124248028, 'timestamp': '2025-09-30 22:15:32.626608', 'step': 1936, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:32.669019', 'step': 1936, 'epoch': 2} {'type': 'loss', 'content': 0.012392260134220123, 'timestamp': '2025-09-30 22:15:32.676924', 'step': 1937, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:32.718886', 'step': 1937, 'epoch': 2} {'type': 'loss', 'content': 0.011032551527023315, 'timestamp': '2025-09-30 22:15:32.730017', 'step': 1938, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:32.785612', 'step': 1938, 'epoch': 2} {'type': 'loss', 'content': 0.012614678591489792, 'timestamp': '2025-09-30 22:15:32.799365', 'step': 1939, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:32.837506', 'step': 1939, 'epoch': 2} {'type': 'loss', 'content': 0.00897765439003706, 'timestamp': '2025-09-30 22:15:32.870617', 'step': 1940, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:32.906731', 'step': 1940, 'epoch': 2} {'type': 'loss', 'content': 0.023669099435210228, 'timestamp': '2025-09-30 22:15:32.914709', 'step': 1941, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:32.951958', 'step': 1941, 'epoch': 2} {'type': 'loss', 'content': 0.014903804287314415, 'timestamp': '2025-09-30 22:15:32.964526', 'step': 1942, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:33.003799', 'step': 1942, 'epoch': 2} {'type': 'loss', 'content': 0.013701890595257282, 'timestamp': '2025-09-30 22:15:33.017484', 'step': 1943, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:33.054319', 'step': 1943, 'epoch': 2} {'type': 'loss', 'content': 0.01658545434474945, 'timestamp': '2025-09-30 22:15:33.088543', 'step': 1944, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:33.125388', 'step': 1944, 'epoch': 2} {'type': 'loss', 'content': 0.0041729118674993515, 'timestamp': '2025-09-30 22:15:33.138406', 'step': 1945, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:33.172919', 'step': 1945, 'epoch': 2} {'type': 'loss', 'content': 0.008692542091012001, 'timestamp': '2025-09-30 22:15:33.185242', 'step': 1946, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:33.225723', 'step': 1946, 'epoch': 2} {'type': 'loss', 'content': 0.008979223668575287, 'timestamp': '2025-09-30 22:15:33.238150', 'step': 1947, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:33.270928', 'step': 1947, 'epoch': 2} {'type': 'loss', 'content': 0.010823034681379795, 'timestamp': '2025-09-30 22:15:33.299366', 'step': 1948, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:33.336355', 'step': 1948, 'epoch': 2} {'type': 'loss', 'content': 0.006081897299736738, 'timestamp': '2025-09-30 22:15:33.344469', 'step': 1949, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:33.392724', 'step': 1949, 'epoch': 2} {'type': 'loss', 'content': 0.0050694928504526615, 'timestamp': '2025-09-30 22:15:33.406606', 'step': 1950, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:33.441887', 'step': 1950, 'epoch': 2} {'type': 'loss', 'content': 0.0053678094409406185, 'timestamp': '2025-09-30 22:15:33.454471', 'step': 1951, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:33.488182', 'step': 1951, 'epoch': 2} {'type': 'loss', 'content': 0.014238450676202774, 'timestamp': '2025-09-30 22:15:33.519617', 'step': 1952, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:33.552277', 'step': 1952, 'epoch': 2} {'type': 'loss', 'content': 0.00962776318192482, 'timestamp': '2025-09-30 22:15:33.557629', 'step': 1953, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:33.591179', 'step': 1953, 'epoch': 2} {'type': 'loss', 'content': 0.019318213686347008, 'timestamp': '2025-09-30 22:15:33.603697', 'step': 1954, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:15:33.658879', 'step': 1954, 'epoch': 2} {'type': 'loss', 'content': 0.009837604127824306, 'timestamp': '2025-09-30 22:15:33.675030', 'step': 1955, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:15:36.132305', 'step': 1955, 'epoch': 2} {'type': 'pplx', 'content': 5.504560427648865, 'timestamp': '2025-09-30 22:15:36.135356', 'step': 1955, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:36.166512', 'step': 1955, 'epoch': 2} {'type': 'loss', 'content': 0.01425859797745943, 'timestamp': '2025-09-30 22:15:36.199266', 'step': 1956, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:36.240897', 'step': 1956, 'epoch': 2} {'type': 'loss', 'content': 0.008966123685240746, 'timestamp': '2025-09-30 22:15:36.254250', 'step': 1957, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:36.287770', 'step': 1957, 'epoch': 2} {'type': 'loss', 'content': 0.0134968226775527, 'timestamp': '2025-09-30 22:15:36.295764', 'step': 1958, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:36.334086', 'step': 1958, 'epoch': 2} {'type': 'loss', 'content': 0.025526832789182663, 'timestamp': '2025-09-30 22:15:36.340977', 'step': 1959, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:36.390284', 'step': 1959, 'epoch': 2} {'type': 'loss', 'content': 0.01609751395881176, 'timestamp': '2025-09-30 22:15:36.426974', 'step': 1960, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:36.458804', 'step': 1960, 'epoch': 2} {'type': 'loss', 'content': 0.0084404731169343, 'timestamp': '2025-09-30 22:15:36.466841', 'step': 1961, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:36.503477', 'step': 1961, 'epoch': 2} {'type': 'loss', 'content': 0.01829015463590622, 'timestamp': '2025-09-30 22:15:36.515770', 'step': 1962, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:36.548569', 'step': 1962, 'epoch': 2} {'type': 'loss', 'content': 0.019106082618236542, 'timestamp': '2025-09-30 22:15:36.559766', 'step': 1963, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:36.598885', 'step': 1963, 'epoch': 2} {'type': 'loss', 'content': 0.007889645174145699, 'timestamp': '2025-09-30 22:15:36.633629', 'step': 1964, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:36.672751', 'step': 1964, 'epoch': 2} {'type': 'loss', 'content': 0.014179641380906105, 'timestamp': '2025-09-30 22:15:36.685856', 'step': 1965, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:36.725432', 'step': 1965, 'epoch': 2} {'type': 'loss', 'content': 0.010651156306266785, 'timestamp': '2025-09-30 22:15:36.737992', 'step': 1966, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:36.778377', 'step': 1966, 'epoch': 2} {'type': 'loss', 'content': 0.014606158249080181, 'timestamp': '2025-09-30 22:15:36.792238', 'step': 1967, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:36.838165', 'step': 1967, 'epoch': 2} {'type': 'loss', 'content': 0.006826434750109911, 'timestamp': '2025-09-30 22:15:36.874893', 'step': 1968, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:36.919277', 'step': 1968, 'epoch': 2} {'type': 'loss', 'content': 0.006176867987960577, 'timestamp': '2025-09-30 22:15:36.934738', 'step': 1969, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:15:36.979028', 'step': 1969, 'epoch': 2} {'type': 'loss', 'content': 0.007977073080837727, 'timestamp': '2025-09-30 22:15:36.994790', 'step': 1970, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:37.029441', 'step': 1970, 'epoch': 2} {'type': 'loss', 'content': 0.009346827864646912, 'timestamp': '2025-09-30 22:15:37.040201', 'step': 1971, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 16611393146432}, 'timestamp': '2025-09-30 22:15:37.090937', 'step': 1971, 'epoch': 2} {'type': 'loss', 'content': 0.01219925656914711, 'timestamp': '2025-09-30 22:15:37.131111', 'step': 1972, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:15:37.177297', 'step': 1972, 'epoch': 2} {'type': 'loss', 'content': 0.008181479759514332, 'timestamp': '2025-09-30 22:15:37.193935', 'step': 1973, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:37.235947', 'step': 1973, 'epoch': 2} {'type': 'loss', 'content': 0.00793197751045227, 'timestamp': '2025-09-30 22:15:37.250047', 'step': 1974, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:37.285881', 'step': 1974, 'epoch': 2} {'type': 'loss', 'content': 0.011092676781117916, 'timestamp': '2025-09-30 22:15:37.298269', 'step': 1975, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:37.336821', 'step': 1975, 'epoch': 2} {'type': 'loss', 'content': 0.012023183517158031, 'timestamp': '2025-09-30 22:15:37.371036', 'step': 1976, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:15:37.414944', 'step': 1976, 'epoch': 2} {'type': 'loss', 'content': 0.0072458055801689625, 'timestamp': '2025-09-30 22:15:37.431648', 'step': 1977, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-09-30 22:15:37.476108', 'step': 1977, 'epoch': 2} {'type': 'loss', 'content': 0.007181333377957344, 'timestamp': '2025-09-30 22:15:37.493683', 'step': 1978, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:37.529952', 'step': 1978, 'epoch': 2} {'type': 'loss', 'content': 0.01450754888355732, 'timestamp': '2025-09-30 22:15:37.543686', 'step': 1979, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:37.576735', 'step': 1979, 'epoch': 2} {'type': 'loss', 'content': 0.0155011722818017, 'timestamp': '2025-09-30 22:15:37.604843', 'step': 1980, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:37.639143', 'step': 1980, 'epoch': 2} {'type': 'loss', 'content': 0.00863499566912651, 'timestamp': '2025-09-30 22:15:37.646961', 'step': 1981, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:37.681276', 'step': 1981, 'epoch': 2} {'type': 'loss', 'content': 0.011826743371784687, 'timestamp': '2025-09-30 22:15:37.693853', 'step': 1982, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:37.726685', 'step': 1982, 'epoch': 2} {'type': 'loss', 'content': 0.014029378071427345, 'timestamp': '2025-09-30 22:15:37.737793', 'step': 1983, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:37.772111', 'step': 1983, 'epoch': 2} {'type': 'loss', 'content': 0.019402116537094116, 'timestamp': '2025-09-30 22:15:37.803136', 'step': 1984, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:37.835752', 'step': 1984, 'epoch': 2} {'type': 'loss', 'content': 0.009639054536819458, 'timestamp': '2025-09-30 22:15:37.841505', 'step': 1985, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:37.889704', 'step': 1985, 'epoch': 2} {'type': 'loss', 'content': 0.023592282086610794, 'timestamp': '2025-09-30 22:15:37.903073', 'step': 1986, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:15:37.944177', 'step': 1986, 'epoch': 2} {'type': 'loss', 'content': 0.011838329955935478, 'timestamp': '2025-09-30 22:15:37.960263', 'step': 1987, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:37.998669', 'step': 1987, 'epoch': 2} {'type': 'loss', 'content': 0.012354640290141106, 'timestamp': '2025-09-30 22:15:38.027259', 'step': 1988, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:38.066402', 'step': 1988, 'epoch': 2} {'type': 'loss', 'content': 0.01696304976940155, 'timestamp': '2025-09-30 22:15:38.074365', 'step': 1989, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:38.112169', 'step': 1989, 'epoch': 2} {'type': 'loss', 'content': 0.009608439169824123, 'timestamp': '2025-09-30 22:15:38.120173', 'step': 1990, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:38.157074', 'step': 1990, 'epoch': 2} {'type': 'loss', 'content': 0.014672442339360714, 'timestamp': '2025-09-30 22:15:38.164668', 'step': 1991, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:38.199989', 'step': 1991, 'epoch': 2} {'type': 'loss', 'content': 0.019588777795433998, 'timestamp': '2025-09-30 22:15:38.227949', 'step': 1992, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:38.263627', 'step': 1992, 'epoch': 2} {'type': 'loss', 'content': 0.010438477620482445, 'timestamp': '2025-09-30 22:15:38.274166', 'step': 1993, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:38.310768', 'step': 1993, 'epoch': 2} {'type': 'loss', 'content': 0.02435128763318062, 'timestamp': '2025-09-30 22:15:38.322005', 'step': 1994, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:38.358400', 'step': 1994, 'epoch': 2} {'type': 'loss', 'content': 0.010515746660530567, 'timestamp': '2025-09-30 22:15:38.369585', 'step': 1995, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:38.414287', 'step': 1995, 'epoch': 2} {'type': 'loss', 'content': 0.012999563477933407, 'timestamp': '2025-09-30 22:15:38.449136', 'step': 1996, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:38.490885', 'step': 1996, 'epoch': 2} {'type': 'loss', 'content': 0.012872045859694481, 'timestamp': '2025-09-30 22:15:38.501572', 'step': 1997, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:38.540942', 'step': 1997, 'epoch': 2} {'type': 'loss', 'content': 0.007690249476581812, 'timestamp': '2025-09-30 22:15:38.554644', 'step': 1998, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:38.597112', 'step': 1998, 'epoch': 2} {'type': 'loss', 'content': 0.013428461737930775, 'timestamp': '2025-09-30 22:15:38.610777', 'step': 1999, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:15:38.651793', 'step': 1999, 'epoch': 2} {'type': 'loss', 'content': 0.010164592415094376, 'timestamp': '2025-09-30 22:15:38.688301', 'step': 2000, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2000', 'timestamp': '2025-09-30 22:15:43.740044', 'step': 2000, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:43.778379', 'step': 2000, 'epoch': 2} {'type': 'loss', 'content': 0.010619484819471836, 'timestamp': '2025-09-30 22:15:43.786307', 'step': 2001, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:43.828402', 'step': 2001, 'epoch': 2} {'type': 'loss', 'content': 0.006324365269392729, 'timestamp': '2025-09-30 22:15:43.842110', 'step': 2002, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:43.877915', 'step': 2002, 'epoch': 2} {'type': 'loss', 'content': 0.023312492296099663, 'timestamp': '2025-09-30 22:15:43.889963', 'step': 2003, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:43.930637', 'step': 2003, 'epoch': 2} {'type': 'loss', 'content': 0.010721305385231972, 'timestamp': '2025-09-30 22:15:43.965423', 'step': 2004, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:44.000396', 'step': 2004, 'epoch': 2} {'type': 'loss', 'content': 0.011308002285659313, 'timestamp': '2025-09-30 22:15:44.011505', 'step': 2005, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:44.055880', 'step': 2005, 'epoch': 2} {'type': 'loss', 'content': 0.010342972353100777, 'timestamp': '2025-09-30 22:15:44.068438', 'step': 2006, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:44.111007', 'step': 2006, 'epoch': 2} {'type': 'loss', 'content': 0.012661411426961422, 'timestamp': '2025-09-30 22:15:44.122162', 'step': 2007, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:44.158076', 'step': 2007, 'epoch': 2} {'type': 'loss', 'content': 0.010892446152865887, 'timestamp': '2025-09-30 22:15:44.191615', 'step': 2008, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:44.237328', 'step': 2008, 'epoch': 2} {'type': 'loss', 'content': 0.007492191158235073, 'timestamp': '2025-09-30 22:15:44.247882', 'step': 2009, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:44.284504', 'step': 2009, 'epoch': 2} {'type': 'loss', 'content': 0.01097020972520113, 'timestamp': '2025-09-30 22:15:44.294799', 'step': 2010, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:44.334542', 'step': 2010, 'epoch': 2} {'type': 'loss', 'content': 0.008936642669141293, 'timestamp': '2025-09-30 22:15:44.346334', 'step': 2011, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:44.384345', 'step': 2011, 'epoch': 2} {'type': 'loss', 'content': 0.012820220552384853, 'timestamp': '2025-09-30 22:15:44.413018', 'step': 2012, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:44.447682', 'step': 2012, 'epoch': 2} {'type': 'loss', 'content': 0.01265409030020237, 'timestamp': '2025-09-30 22:15:44.453404', 'step': 2013, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:44.500827', 'step': 2013, 'epoch': 2} {'type': 'loss', 'content': 0.010327492840588093, 'timestamp': '2025-09-30 22:15:44.508869', 'step': 2014, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:44.554990', 'step': 2014, 'epoch': 2} {'type': 'loss', 'content': 0.006490873172879219, 'timestamp': '2025-09-30 22:15:44.568660', 'step': 2015, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:44.606746', 'step': 2015, 'epoch': 2} {'type': 'loss', 'content': 0.010052801109850407, 'timestamp': '2025-09-30 22:15:44.635281', 'step': 2016, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:44.674941', 'step': 2016, 'epoch': 2} {'type': 'loss', 'content': 0.011912785470485687, 'timestamp': '2025-09-30 22:15:44.685029', 'step': 2017, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:44.719112', 'step': 2017, 'epoch': 2} {'type': 'loss', 'content': 0.017113978043198586, 'timestamp': '2025-09-30 22:15:44.729692', 'step': 2018, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:44.779921', 'step': 2018, 'epoch': 2} {'type': 'loss', 'content': 0.012415301986038685, 'timestamp': '2025-09-30 22:15:44.789827', 'step': 2019, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:44.830211', 'step': 2019, 'epoch': 2} {'type': 'loss', 'content': 0.008418025448918343, 'timestamp': '2025-09-30 22:15:44.863311', 'step': 2020, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:44.910693', 'step': 2020, 'epoch': 2} {'type': 'loss', 'content': 0.005270734429359436, 'timestamp': '2025-09-30 22:15:44.923916', 'step': 2021, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:44.957388', 'step': 2021, 'epoch': 2} {'type': 'loss', 'content': 0.007560055702924728, 'timestamp': '2025-09-30 22:15:44.965263', 'step': 2022, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:15:45.007256', 'step': 2022, 'epoch': 2} {'type': 'loss', 'content': 0.004281700123101473, 'timestamp': '2025-09-30 22:15:45.023514', 'step': 2023, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:45.065677', 'step': 2023, 'epoch': 2} {'type': 'loss', 'content': 0.011109118349850178, 'timestamp': '2025-09-30 22:15:45.099903', 'step': 2024, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:45.131605', 'step': 2024, 'epoch': 2} {'type': 'loss', 'content': 0.007959590293467045, 'timestamp': '2025-09-30 22:15:45.142139', 'step': 2025, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:45.183590', 'step': 2025, 'epoch': 2} {'type': 'loss', 'content': 0.010825558565557003, 'timestamp': '2025-09-30 22:15:45.194651', 'step': 2026, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:45.233414', 'step': 2026, 'epoch': 2} {'type': 'loss', 'content': 0.016622615978121758, 'timestamp': '2025-09-30 22:15:45.244216', 'step': 2027, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:45.280922', 'step': 2027, 'epoch': 2} {'type': 'loss', 'content': 0.011463593691587448, 'timestamp': '2025-09-30 22:15:45.314309', 'step': 2028, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:15:45.348038', 'step': 2028, 'epoch': 2} {'type': 'loss', 'content': 0.01033572107553482, 'timestamp': '2025-09-30 22:15:45.351397', 'step': 2029, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:45.388485', 'step': 2029, 'epoch': 2} {'type': 'loss', 'content': 0.005977582652121782, 'timestamp': '2025-09-30 22:15:45.398966', 'step': 2030, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:45.444266', 'step': 2030, 'epoch': 2} {'type': 'loss', 'content': 0.014716926030814648, 'timestamp': '2025-09-30 22:15:45.457959', 'step': 2031, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:45.491603', 'step': 2031, 'epoch': 2} {'type': 'loss', 'content': 0.018717095255851746, 'timestamp': '2025-09-30 22:15:45.520145', 'step': 2032, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:45.553014', 'step': 2032, 'epoch': 2} {'type': 'loss', 'content': 0.007365270983427763, 'timestamp': '2025-09-30 22:15:45.565605', 'step': 2033, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:45.600960', 'step': 2033, 'epoch': 2} {'type': 'loss', 'content': 0.007374709937721491, 'timestamp': '2025-09-30 22:15:45.613565', 'step': 2034, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:45.663856', 'step': 2034, 'epoch': 2} {'type': 'loss', 'content': 0.004602862522006035, 'timestamp': '2025-09-30 22:15:45.679679', 'step': 2035, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:45.713682', 'step': 2035, 'epoch': 2} {'type': 'loss', 'content': 0.004512309562414885, 'timestamp': '2025-09-30 22:15:45.747098', 'step': 2036, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:45.779757', 'step': 2036, 'epoch': 2} {'type': 'loss', 'content': 0.011239656247198582, 'timestamp': '2025-09-30 22:15:45.784395', 'step': 2037, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:45.824005', 'step': 2037, 'epoch': 2} {'type': 'loss', 'content': 0.009968086145818233, 'timestamp': '2025-09-30 22:15:45.834472', 'step': 2038, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:45.880507', 'step': 2038, 'epoch': 2} {'type': 'loss', 'content': 0.009855345822870731, 'timestamp': '2025-09-30 22:15:45.896390', 'step': 2039, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:45.941039', 'step': 2039, 'epoch': 2} {'type': 'loss', 'content': 0.009853428229689598, 'timestamp': '2025-09-30 22:15:45.975583', 'step': 2040, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:46.010782', 'step': 2040, 'epoch': 2} {'type': 'loss', 'content': 0.008677588775753975, 'timestamp': '2025-09-30 22:15:46.015533', 'step': 2041, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:46.055165', 'step': 2041, 'epoch': 2} {'type': 'loss', 'content': 0.014559353701770306, 'timestamp': '2025-09-30 22:15:46.065783', 'step': 2042, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:46.097782', 'step': 2042, 'epoch': 2} {'type': 'loss', 'content': 0.016621189191937447, 'timestamp': '2025-09-30 22:15:46.108366', 'step': 2043, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:46.152368', 'step': 2043, 'epoch': 2} {'type': 'loss', 'content': 0.0088790999725461, 'timestamp': '2025-09-30 22:15:46.183643', 'step': 2044, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:46.226022', 'step': 2044, 'epoch': 2} {'type': 'loss', 'content': 0.014385106973350048, 'timestamp': '2025-09-30 22:15:46.231534', 'step': 2045, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:46.269038', 'step': 2045, 'epoch': 2} {'type': 'loss', 'content': 0.006793558597564697, 'timestamp': '2025-09-30 22:15:46.282426', 'step': 2046, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:46.325522', 'step': 2046, 'epoch': 2} {'type': 'loss', 'content': 0.0046113766729831696, 'timestamp': '2025-09-30 22:15:46.339330', 'step': 2047, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:46.384990', 'step': 2047, 'epoch': 2} {'type': 'loss', 'content': 0.0070945583283901215, 'timestamp': '2025-09-30 22:15:46.419659', 'step': 2048, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:46.456068', 'step': 2048, 'epoch': 2} {'type': 'loss', 'content': 0.012187566608190536, 'timestamp': '2025-09-30 22:15:46.468668', 'step': 2049, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:46.502019', 'step': 2049, 'epoch': 2} {'type': 'loss', 'content': 0.017908932641148567, 'timestamp': '2025-09-30 22:15:46.514403', 'step': 2050, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:46.558697', 'step': 2050, 'epoch': 2} {'type': 'loss', 'content': 0.009582750499248505, 'timestamp': '2025-09-30 22:15:46.572094', 'step': 2051, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:46.617163', 'step': 2051, 'epoch': 2} {'type': 'loss', 'content': 0.013554844073951244, 'timestamp': '2025-09-30 22:15:46.651797', 'step': 2052, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:46.692422', 'step': 2052, 'epoch': 2} {'type': 'loss', 'content': 0.011859928257763386, 'timestamp': '2025-09-30 22:15:46.700211', 'step': 2053, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:46.742006', 'step': 2053, 'epoch': 2} {'type': 'loss', 'content': 0.008377709425985813, 'timestamp': '2025-09-30 22:15:46.755680', 'step': 2054, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:46.796919', 'step': 2054, 'epoch': 2} {'type': 'loss', 'content': 0.00923876091837883, 'timestamp': '2025-09-30 22:15:46.804938', 'step': 2055, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:46.851929', 'step': 2055, 'epoch': 2} {'type': 'loss', 'content': 0.011093209497630596, 'timestamp': '2025-09-30 22:15:46.886169', 'step': 2056, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:46.922464', 'step': 2056, 'epoch': 2} {'type': 'loss', 'content': 0.012157139368355274, 'timestamp': '2025-09-30 22:15:46.932261', 'step': 2057, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:46.967189', 'step': 2057, 'epoch': 2} {'type': 'loss', 'content': 0.015122108161449432, 'timestamp': '2025-09-30 22:15:46.978250', 'step': 2058, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:47.012612', 'step': 2058, 'epoch': 2} {'type': 'loss', 'content': 0.007611238397657871, 'timestamp': '2025-09-30 22:15:47.024822', 'step': 2059, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:47.063942', 'step': 2059, 'epoch': 2} {'type': 'loss', 'content': 0.012658152729272842, 'timestamp': '2025-09-30 22:15:47.098526', 'step': 2060, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:47.142781', 'step': 2060, 'epoch': 2} {'type': 'loss', 'content': 0.006565988529473543, 'timestamp': '2025-09-30 22:15:47.155470', 'step': 2061, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:47.201615', 'step': 2061, 'epoch': 2} {'type': 'loss', 'content': 0.007400591857731342, 'timestamp': '2025-09-30 22:15:47.214154', 'step': 2062, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:47.255351', 'step': 2062, 'epoch': 2} {'type': 'loss', 'content': 0.006891261320561171, 'timestamp': '2025-09-30 22:15:47.268723', 'step': 2063, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:47.310324', 'step': 2063, 'epoch': 2} {'type': 'loss', 'content': 0.007434625178575516, 'timestamp': '2025-09-30 22:15:47.343403', 'step': 2064, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:47.378392', 'step': 2064, 'epoch': 2} {'type': 'loss', 'content': 0.00676254415884614, 'timestamp': '2025-09-30 22:15:47.390909', 'step': 2065, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:47.427085', 'step': 2065, 'epoch': 2} {'type': 'loss', 'content': 0.0081375977024436, 'timestamp': '2025-09-30 22:15:47.434375', 'step': 2066, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:47.474283', 'step': 2066, 'epoch': 2} {'type': 'loss', 'content': 0.008514241315424442, 'timestamp': '2025-09-30 22:15:47.486782', 'step': 2067, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:47.522766', 'step': 2067, 'epoch': 2} {'type': 'loss', 'content': 0.009845650754868984, 'timestamp': '2025-09-30 22:15:47.551517', 'step': 2068, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:47.590783', 'step': 2068, 'epoch': 2} {'type': 'loss', 'content': 0.011821952648460865, 'timestamp': '2025-09-30 22:15:47.600939', 'step': 2069, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:47.639504', 'step': 2069, 'epoch': 2} {'type': 'loss', 'content': 0.009292146191000938, 'timestamp': '2025-09-30 22:15:47.650381', 'step': 2070, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:15:50.188319', 'step': 2070, 'epoch': 2} {'type': 'pplx', 'content': 5.5781301359781965, 'timestamp': '2025-09-30 22:15:50.191016', 'step': 2070, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:50.222991', 'step': 2070, 'epoch': 2} {'type': 'loss', 'content': 0.014613412320613861, 'timestamp': '2025-09-30 22:15:50.229575', 'step': 2071, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:15:50.281375', 'step': 2071, 'epoch': 2} {'type': 'loss', 'content': 0.005937773268669844, 'timestamp': '2025-09-30 22:15:50.319265', 'step': 2072, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:50.365232', 'step': 2072, 'epoch': 2} {'type': 'loss', 'content': 0.005999124608933926, 'timestamp': '2025-09-30 22:15:50.377850', 'step': 2073, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:50.411882', 'step': 2073, 'epoch': 2} {'type': 'loss', 'content': 0.01658693514764309, 'timestamp': '2025-09-30 22:15:50.424163', 'step': 2074, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:50.456708', 'step': 2074, 'epoch': 2} {'type': 'loss', 'content': 0.008094480261206627, 'timestamp': '2025-09-30 22:15:50.464652', 'step': 2075, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:50.501347', 'step': 2075, 'epoch': 2} {'type': 'loss', 'content': 0.013593686744570732, 'timestamp': '2025-09-30 22:15:50.534533', 'step': 2076, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:50.569164', 'step': 2076, 'epoch': 2} {'type': 'loss', 'content': 0.01685952953994274, 'timestamp': '2025-09-30 22:15:50.577215', 'step': 2077, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:50.616449', 'step': 2077, 'epoch': 2} {'type': 'loss', 'content': 0.0064816963858902454, 'timestamp': '2025-09-30 22:15:50.629028', 'step': 2078, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:50.664593', 'step': 2078, 'epoch': 2} {'type': 'loss', 'content': 0.007426493801176548, 'timestamp': '2025-09-30 22:15:50.672763', 'step': 2079, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:50.716301', 'step': 2079, 'epoch': 2} {'type': 'loss', 'content': 0.00575529458001256, 'timestamp': '2025-09-30 22:15:50.750505', 'step': 2080, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:50.789204', 'step': 2080, 'epoch': 2} {'type': 'loss', 'content': 0.01651431806385517, 'timestamp': '2025-09-30 22:15:50.797865', 'step': 2081, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:50.834645', 'step': 2081, 'epoch': 2} {'type': 'loss', 'content': 0.014004933647811413, 'timestamp': '2025-09-30 22:15:50.847041', 'step': 2082, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:50.880426', 'step': 2082, 'epoch': 2} {'type': 'loss', 'content': 0.011181175708770752, 'timestamp': '2025-09-30 22:15:50.892776', 'step': 2083, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:50.931865', 'step': 2083, 'epoch': 2} {'type': 'loss', 'content': 0.012345019727945328, 'timestamp': '2025-09-30 22:15:50.966103', 'step': 2084, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:51.012624', 'step': 2084, 'epoch': 2} {'type': 'loss', 'content': 0.013612167909741402, 'timestamp': '2025-09-30 22:15:51.025648', 'step': 2085, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:51.063085', 'step': 2085, 'epoch': 2} {'type': 'loss', 'content': 0.007797682657837868, 'timestamp': '2025-09-30 22:15:51.074215', 'step': 2086, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:51.109535', 'step': 2086, 'epoch': 2} {'type': 'loss', 'content': 0.012879363261163235, 'timestamp': '2025-09-30 22:15:51.120552', 'step': 2087, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:51.167767', 'step': 2087, 'epoch': 2} {'type': 'loss', 'content': 0.00713657820597291, 'timestamp': '2025-09-30 22:15:51.202388', 'step': 2088, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:51.237362', 'step': 2088, 'epoch': 2} {'type': 'loss', 'content': 0.016538361087441444, 'timestamp': '2025-09-30 22:15:51.250340', 'step': 2089, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:15:51.297149', 'step': 2089, 'epoch': 2} {'type': 'loss', 'content': 0.006878055166453123, 'timestamp': '2025-09-30 22:15:51.312795', 'step': 2090, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:51.352172', 'step': 2090, 'epoch': 2} {'type': 'loss', 'content': 0.005907298065721989, 'timestamp': '2025-09-30 22:15:51.365580', 'step': 2091, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:51.399530', 'step': 2091, 'epoch': 2} {'type': 'loss', 'content': 0.011420510709285736, 'timestamp': '2025-09-30 22:15:51.432932', 'step': 2092, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:51.479046', 'step': 2092, 'epoch': 2} {'type': 'loss', 'content': 0.007098886650055647, 'timestamp': '2025-09-30 22:15:51.492351', 'step': 2093, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:51.541119', 'step': 2093, 'epoch': 2} {'type': 'loss', 'content': 0.00935671292245388, 'timestamp': '2025-09-30 22:15:51.557034', 'step': 2094, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:51.595846', 'step': 2094, 'epoch': 2} {'type': 'loss', 'content': 0.007575173396617174, 'timestamp': '2025-09-30 22:15:51.609663', 'step': 2095, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:51.654887', 'step': 2095, 'epoch': 2} {'type': 'loss', 'content': 0.0037557778414338827, 'timestamp': '2025-09-30 22:15:51.691722', 'step': 2096, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:51.731402', 'step': 2096, 'epoch': 2} {'type': 'loss', 'content': 0.008648642338812351, 'timestamp': '2025-09-30 22:15:51.741255', 'step': 2097, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:51.778787', 'step': 2097, 'epoch': 2} {'type': 'loss', 'content': 0.012735256925225258, 'timestamp': '2025-09-30 22:15:51.792154', 'step': 2098, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:51.842223', 'step': 2098, 'epoch': 2} {'type': 'loss', 'content': 0.011567601934075356, 'timestamp': '2025-09-30 22:15:51.856004', 'step': 2099, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:51.892525', 'step': 2099, 'epoch': 2} {'type': 'loss', 'content': 0.013985881581902504, 'timestamp': '2025-09-30 22:15:51.924484', 'step': 2100, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:51.959500', 'step': 2100, 'epoch': 2} {'type': 'loss', 'content': 0.0130574656650424, 'timestamp': '2025-09-30 22:15:51.972646', 'step': 2101, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:52.009601', 'step': 2101, 'epoch': 2} {'type': 'loss', 'content': 0.010273012332618237, 'timestamp': '2025-09-30 22:15:52.021922', 'step': 2102, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:52.056863', 'step': 2102, 'epoch': 2} {'type': 'loss', 'content': 0.014140932820737362, 'timestamp': '2025-09-30 22:15:52.070271', 'step': 2103, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:52.109205', 'step': 2103, 'epoch': 2} {'type': 'loss', 'content': 0.009167030453681946, 'timestamp': '2025-09-30 22:15:52.143894', 'step': 2104, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:15:52.187873', 'step': 2104, 'epoch': 2} {'type': 'loss', 'content': 0.008260817267000675, 'timestamp': '2025-09-30 22:15:52.204884', 'step': 2105, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:52.246966', 'step': 2105, 'epoch': 2} {'type': 'loss', 'content': 0.01203217078000307, 'timestamp': '2025-09-30 22:15:52.260429', 'step': 2106, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:52.295351', 'step': 2106, 'epoch': 2} {'type': 'loss', 'content': 0.01023252122104168, 'timestamp': '2025-09-30 22:15:52.303218', 'step': 2107, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:52.338941', 'step': 2107, 'epoch': 2} {'type': 'loss', 'content': 0.021626289933919907, 'timestamp': '2025-09-30 22:15:52.370083', 'step': 2108, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:52.409825', 'step': 2108, 'epoch': 2} {'type': 'loss', 'content': 0.02148636244237423, 'timestamp': '2025-09-30 22:15:52.415548', 'step': 2109, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:52.454766', 'step': 2109, 'epoch': 2} {'type': 'loss', 'content': 0.009455446153879166, 'timestamp': '2025-09-30 22:15:52.467361', 'step': 2110, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:52.505991', 'step': 2110, 'epoch': 2} {'type': 'loss', 'content': 0.015936443582177162, 'timestamp': '2025-09-30 22:15:52.519372', 'step': 2111, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:52.554404', 'step': 2111, 'epoch': 2} {'type': 'loss', 'content': 0.020320961251854897, 'timestamp': '2025-09-30 22:15:52.587749', 'step': 2112, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:52.625544', 'step': 2112, 'epoch': 2} {'type': 'loss', 'content': 0.009221428073942661, 'timestamp': '2025-09-30 22:15:52.636157', 'step': 2113, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:52.678337', 'step': 2113, 'epoch': 2} {'type': 'loss', 'content': 0.011892154812812805, 'timestamp': '2025-09-30 22:15:52.691651', 'step': 2114, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:15:52.736693', 'step': 2114, 'epoch': 2} {'type': 'loss', 'content': 0.008845707401633263, 'timestamp': '2025-09-30 22:15:52.753048', 'step': 2115, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:52.795751', 'step': 2115, 'epoch': 2} {'type': 'loss', 'content': 0.007404664997011423, 'timestamp': '2025-09-30 22:15:52.832485', 'step': 2116, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:52.872791', 'step': 2116, 'epoch': 2} {'type': 'loss', 'content': 0.007510712370276451, 'timestamp': '2025-09-30 22:15:52.886189', 'step': 2117, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:52.923463', 'step': 2117, 'epoch': 2} {'type': 'loss', 'content': 0.009143121540546417, 'timestamp': '2025-09-30 22:15:52.933925', 'step': 2118, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:15:52.992396', 'step': 2118, 'epoch': 2} {'type': 'loss', 'content': 0.00436469865962863, 'timestamp': '2025-09-30 22:15:53.008592', 'step': 2119, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:53.066340', 'step': 2119, 'epoch': 2} {'type': 'loss', 'content': 0.009797188453376293, 'timestamp': '2025-09-30 22:15:53.100866', 'step': 2120, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:53.142376', 'step': 2120, 'epoch': 2} {'type': 'loss', 'content': 0.010304818861186504, 'timestamp': '2025-09-30 22:15:53.152152', 'step': 2121, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:53.193096', 'step': 2121, 'epoch': 2} {'type': 'loss', 'content': 0.013395457528531551, 'timestamp': '2025-09-30 22:15:53.200720', 'step': 2122, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:53.232561', 'step': 2122, 'epoch': 2} {'type': 'loss', 'content': 0.01645011082291603, 'timestamp': '2025-09-30 22:15:53.242841', 'step': 2123, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:53.279323', 'step': 2123, 'epoch': 2} {'type': 'loss', 'content': 0.004526421893388033, 'timestamp': '2025-09-30 22:15:53.310425', 'step': 2124, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:53.346184', 'step': 2124, 'epoch': 2} {'type': 'loss', 'content': 0.028871094807982445, 'timestamp': '2025-09-30 22:15:53.359303', 'step': 2125, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:53.395171', 'step': 2125, 'epoch': 2} {'type': 'loss', 'content': 0.0040580276399850845, 'timestamp': '2025-09-30 22:15:53.405721', 'step': 2126, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:53.449254', 'step': 2126, 'epoch': 2} {'type': 'loss', 'content': 0.011990678496658802, 'timestamp': '2025-09-30 22:15:53.463220', 'step': 2127, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:53.508104', 'step': 2127, 'epoch': 2} {'type': 'loss', 'content': 0.01219364907592535, 'timestamp': '2025-09-30 22:15:53.541298', 'step': 2128, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:15:53.580794', 'step': 2128, 'epoch': 2} {'type': 'loss', 'content': 0.00998268835246563, 'timestamp': '2025-09-30 22:15:53.595920', 'step': 2129, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:53.637055', 'step': 2129, 'epoch': 2} {'type': 'loss', 'content': 0.020302310585975647, 'timestamp': '2025-09-30 22:15:53.652954', 'step': 2130, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:53.689382', 'step': 2130, 'epoch': 2} {'type': 'loss', 'content': 0.011316817253828049, 'timestamp': '2025-09-30 22:15:53.700462', 'step': 2131, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:53.742819', 'step': 2131, 'epoch': 2} {'type': 'loss', 'content': 0.009340646676719189, 'timestamp': '2025-09-30 22:15:53.774119', 'step': 2132, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:53.812609', 'step': 2132, 'epoch': 2} {'type': 'loss', 'content': 0.013567190617322922, 'timestamp': '2025-09-30 22:15:53.821467', 'step': 2133, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:53.861941', 'step': 2133, 'epoch': 2} {'type': 'loss', 'content': 0.0061887470073997974, 'timestamp': '2025-09-30 22:15:53.875327', 'step': 2134, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:53.911908', 'step': 2134, 'epoch': 2} {'type': 'loss', 'content': 0.008160131052136421, 'timestamp': '2025-09-30 22:15:53.925301', 'step': 2135, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:53.958650', 'step': 2135, 'epoch': 2} {'type': 'loss', 'content': 0.011914225295186043, 'timestamp': '2025-09-30 22:15:53.991886', 'step': 2136, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:54.029081', 'step': 2136, 'epoch': 2} {'type': 'loss', 'content': 0.00802577380090952, 'timestamp': '2025-09-30 22:15:54.037851', 'step': 2137, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:54.072938', 'step': 2137, 'epoch': 2} {'type': 'loss', 'content': 0.008770965039730072, 'timestamp': '2025-09-30 22:15:54.085279', 'step': 2138, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:54.126509', 'step': 2138, 'epoch': 2} {'type': 'loss', 'content': 0.008672679774463177, 'timestamp': '2025-09-30 22:15:54.133766', 'step': 2139, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:54.181090', 'step': 2139, 'epoch': 2} {'type': 'loss', 'content': 0.0069053624756634235, 'timestamp': '2025-09-30 22:15:54.217849', 'step': 2140, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:54.252278', 'step': 2140, 'epoch': 2} {'type': 'loss', 'content': 0.009527964517474174, 'timestamp': '2025-09-30 22:15:54.257088', 'step': 2141, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:54.292415', 'step': 2141, 'epoch': 2} {'type': 'loss', 'content': 0.024030277505517006, 'timestamp': '2025-09-30 22:15:54.299437', 'step': 2142, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:54.341480', 'step': 2142, 'epoch': 2} {'type': 'loss', 'content': 0.005627688951790333, 'timestamp': '2025-09-30 22:15:54.352066', 'step': 2143, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:54.393200', 'step': 2143, 'epoch': 2} {'type': 'loss', 'content': 0.011507662013173103, 'timestamp': '2025-09-30 22:15:54.425159', 'step': 2144, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:54.460471', 'step': 2144, 'epoch': 2} {'type': 'loss', 'content': 0.010537424124777317, 'timestamp': '2025-09-30 22:15:54.473784', 'step': 2145, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:54.510723', 'step': 2145, 'epoch': 2} {'type': 'loss', 'content': 0.01107259001582861, 'timestamp': '2025-09-30 22:15:54.518686', 'step': 2146, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:54.555489', 'step': 2146, 'epoch': 2} {'type': 'loss', 'content': 0.028157521039247513, 'timestamp': '2025-09-30 22:15:54.562649', 'step': 2147, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:54.605357', 'step': 2147, 'epoch': 2} {'type': 'loss', 'content': 0.00928823184221983, 'timestamp': '2025-09-30 22:15:54.637474', 'step': 2148, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:54.671978', 'step': 2148, 'epoch': 2} {'type': 'loss', 'content': 0.008403152227401733, 'timestamp': '2025-09-30 22:15:54.684614', 'step': 2149, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:54.718865', 'step': 2149, 'epoch': 2} {'type': 'loss', 'content': 0.012543005868792534, 'timestamp': '2025-09-30 22:15:54.726705', 'step': 2150, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:54.761107', 'step': 2150, 'epoch': 2} {'type': 'loss', 'content': 0.009016967378556728, 'timestamp': '2025-09-30 22:15:54.769091', 'step': 2151, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:54.805847', 'step': 2151, 'epoch': 2} {'type': 'loss', 'content': 0.010433612391352654, 'timestamp': '2025-09-30 22:15:54.840545', 'step': 2152, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:15:54.883286', 'step': 2152, 'epoch': 2} {'type': 'loss', 'content': 0.00729801319539547, 'timestamp': '2025-09-30 22:15:54.898631', 'step': 2153, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:15:54.944731', 'step': 2153, 'epoch': 2} {'type': 'loss', 'content': 0.010827134363353252, 'timestamp': '2025-09-30 22:15:54.961814', 'step': 2154, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:54.999172', 'step': 2154, 'epoch': 2} {'type': 'loss', 'content': 0.01179803628474474, 'timestamp': '2025-09-30 22:15:55.013013', 'step': 2155, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:55.052053', 'step': 2155, 'epoch': 2} {'type': 'loss', 'content': 0.007486463990062475, 'timestamp': '2025-09-30 22:15:55.086874', 'step': 2156, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:55.124429', 'step': 2156, 'epoch': 2} {'type': 'loss', 'content': 0.010275008156895638, 'timestamp': '2025-09-30 22:15:55.132553', 'step': 2157, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:55.169980', 'step': 2157, 'epoch': 2} {'type': 'loss', 'content': 0.013125157915055752, 'timestamp': '2025-09-30 22:15:55.183777', 'step': 2158, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:15:55.219724', 'step': 2158, 'epoch': 2} {'type': 'loss', 'content': 0.009517863392829895, 'timestamp': '2025-09-30 22:15:55.232099', 'step': 2159, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:55.275332', 'step': 2159, 'epoch': 2} {'type': 'loss', 'content': 0.004936881363391876, 'timestamp': '2025-09-30 22:15:55.309586', 'step': 2160, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:55.351880', 'step': 2160, 'epoch': 2} {'type': 'loss', 'content': 0.005083846859633923, 'timestamp': '2025-09-30 22:15:55.365252', 'step': 2161, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:55.402697', 'step': 2161, 'epoch': 2} {'type': 'loss', 'content': 0.006979496218264103, 'timestamp': '2025-09-30 22:15:55.416123', 'step': 2162, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:55.453992', 'step': 2162, 'epoch': 2} {'type': 'loss', 'content': 0.009978152811527252, 'timestamp': '2025-09-30 22:15:55.466502', 'step': 2163, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:55.507099', 'step': 2163, 'epoch': 2} {'type': 'loss', 'content': 0.006703498773276806, 'timestamp': '2025-09-30 22:15:55.541762', 'step': 2164, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:55.574313', 'step': 2164, 'epoch': 2} {'type': 'loss', 'content': 0.013024583458900452, 'timestamp': '2025-09-30 22:15:55.582574', 'step': 2165, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:55.619787', 'step': 2165, 'epoch': 2} {'type': 'loss', 'content': 0.008418618701398373, 'timestamp': '2025-09-30 22:15:55.632390', 'step': 2166, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:55.668165', 'step': 2166, 'epoch': 2} {'type': 'loss', 'content': 0.012666082940995693, 'timestamp': '2025-09-30 22:15:55.675240', 'step': 2167, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:55.710537', 'step': 2167, 'epoch': 2} {'type': 'loss', 'content': 0.0075226472690701485, 'timestamp': '2025-09-30 22:15:55.738639', 'step': 2168, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:55.780602', 'step': 2168, 'epoch': 2} {'type': 'loss', 'content': 0.005574285984039307, 'timestamp': '2025-09-30 22:15:55.785544', 'step': 2169, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:55.818416', 'step': 2169, 'epoch': 2} {'type': 'loss', 'content': 0.01113008800894022, 'timestamp': '2025-09-30 22:15:55.825793', 'step': 2170, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:15:55.865733', 'step': 2170, 'epoch': 2} {'type': 'loss', 'content': 0.011179173365235329, 'timestamp': '2025-09-30 22:15:55.878298', 'step': 2171, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:15:55.914286', 'step': 2171, 'epoch': 2} {'type': 'loss', 'content': 0.010755863040685654, 'timestamp': '2025-09-30 22:15:55.946096', 'step': 2172, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:55.984802', 'step': 2172, 'epoch': 2} {'type': 'loss', 'content': 0.004485005047172308, 'timestamp': '2025-09-30 22:15:55.997859', 'step': 2173, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:56.031636', 'step': 2173, 'epoch': 2} {'type': 'loss', 'content': 0.008229841478168964, 'timestamp': '2025-09-30 22:15:56.042094', 'step': 2174, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:56.078451', 'step': 2174, 'epoch': 2} {'type': 'loss', 'content': 0.007020510733127594, 'timestamp': '2025-09-30 22:15:56.092315', 'step': 2175, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:15:56.138923', 'step': 2175, 'epoch': 2} {'type': 'loss', 'content': 0.025375016033649445, 'timestamp': '2025-09-30 22:15:56.175499', 'step': 2176, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:56.223252', 'step': 2176, 'epoch': 2} {'type': 'loss', 'content': 0.011891238391399384, 'timestamp': '2025-09-30 22:15:56.237795', 'step': 2177, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:56.293871', 'step': 2177, 'epoch': 2} {'type': 'loss', 'content': 0.011667244136333466, 'timestamp': '2025-09-30 22:15:56.308653', 'step': 2178, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:56.359219', 'step': 2178, 'epoch': 2} {'type': 'loss', 'content': 0.007750329095870256, 'timestamp': '2025-09-30 22:15:56.373232', 'step': 2179, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:56.427206', 'step': 2179, 'epoch': 2} {'type': 'loss', 'content': 0.011612946167588234, 'timestamp': '2025-09-30 22:15:56.461448', 'step': 2180, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:56.496330', 'step': 2180, 'epoch': 2} {'type': 'loss', 'content': 0.014608295634388924, 'timestamp': '2025-09-30 22:15:56.505829', 'step': 2181, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:56.541648', 'step': 2181, 'epoch': 2} {'type': 'loss', 'content': 0.007489521987736225, 'timestamp': '2025-09-30 22:15:56.556018', 'step': 2182, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:56.606206', 'step': 2182, 'epoch': 2} {'type': 'loss', 'content': 0.006555826403200626, 'timestamp': '2025-09-30 22:15:56.613440', 'step': 2183, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:56.659068', 'step': 2183, 'epoch': 2} {'type': 'loss', 'content': 0.010146516375243664, 'timestamp': '2025-09-30 22:15:56.687947', 'step': 2184, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:15:56.735455', 'step': 2184, 'epoch': 2} {'type': 'loss', 'content': 0.028159474954009056, 'timestamp': '2025-09-30 22:15:56.750854', 'step': 2185, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:15:59.216342', 'step': 2185, 'epoch': 2} {'type': 'pplx', 'content': 5.614024903667018, 'timestamp': '2025-09-30 22:15:59.218947', 'step': 2185, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:59.249474', 'step': 2185, 'epoch': 2} {'type': 'loss', 'content': 0.007176598068326712, 'timestamp': '2025-09-30 22:15:59.260149', 'step': 2186, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:15:59.299798', 'step': 2186, 'epoch': 2} {'type': 'loss', 'content': 0.010663502849638462, 'timestamp': '2025-09-30 22:15:59.306654', 'step': 2187, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:15:59.342291', 'step': 2187, 'epoch': 2} {'type': 'loss', 'content': 0.014264430850744247, 'timestamp': '2025-09-30 22:15:59.373436', 'step': 2188, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:59.407340', 'step': 2188, 'epoch': 2} {'type': 'loss', 'content': 0.013394519686698914, 'timestamp': '2025-09-30 22:15:59.412681', 'step': 2189, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:15:59.448096', 'step': 2189, 'epoch': 2} {'type': 'loss', 'content': 0.01116852555423975, 'timestamp': '2025-09-30 22:15:59.455873', 'step': 2190, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:15:59.489528', 'step': 2190, 'epoch': 2} {'type': 'loss', 'content': 0.012576432898640633, 'timestamp': '2025-09-30 22:15:59.497397', 'step': 2191, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:59.531745', 'step': 2191, 'epoch': 2} {'type': 'loss', 'content': 0.009694776497781277, 'timestamp': '2025-09-30 22:15:59.563663', 'step': 2192, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:15:59.598912', 'step': 2192, 'epoch': 2} {'type': 'loss', 'content': 0.009965450502932072, 'timestamp': '2025-09-30 22:15:59.612058', 'step': 2193, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:59.646110', 'step': 2193, 'epoch': 2} {'type': 'loss', 'content': 0.010894293896853924, 'timestamp': '2025-09-30 22:15:59.657536', 'step': 2194, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:15:59.697841', 'step': 2194, 'epoch': 2} {'type': 'loss', 'content': 0.00821804627776146, 'timestamp': '2025-09-30 22:15:59.709023', 'step': 2195, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:59.743982', 'step': 2195, 'epoch': 2} {'type': 'loss', 'content': 0.01242805365473032, 'timestamp': '2025-09-30 22:15:59.778206', 'step': 2196, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:59.814437', 'step': 2196, 'epoch': 2} {'type': 'loss', 'content': 0.012450500391423702, 'timestamp': '2025-09-30 22:15:59.827121', 'step': 2197, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:15:59.882887', 'step': 2197, 'epoch': 2} {'type': 'loss', 'content': 0.007693701423704624, 'timestamp': '2025-09-30 22:15:59.896598', 'step': 2198, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:15:59.936345', 'step': 2198, 'epoch': 2} {'type': 'loss', 'content': 0.014463446103036404, 'timestamp': '2025-09-30 22:15:59.950386', 'step': 2199, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:15:59.991831', 'step': 2199, 'epoch': 2} {'type': 'loss', 'content': 0.01318387035280466, 'timestamp': '2025-09-30 22:16:00.026108', 'step': 2200, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:00.063593', 'step': 2200, 'epoch': 2} {'type': 'loss', 'content': 0.004912909120321274, 'timestamp': '2025-09-30 22:16:00.073659', 'step': 2201, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:00.109365', 'step': 2201, 'epoch': 2} {'type': 'loss', 'content': 0.011479430831968784, 'timestamp': '2025-09-30 22:16:00.121792', 'step': 2202, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:00.164778', 'step': 2202, 'epoch': 2} {'type': 'loss', 'content': 0.011321412399411201, 'timestamp': '2025-09-30 22:16:00.172152', 'step': 2203, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:16:00.215177', 'step': 2203, 'epoch': 2} {'type': 'loss', 'content': 0.009902720339596272, 'timestamp': '2025-09-30 22:16:00.251864', 'step': 2204, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:00.292852', 'step': 2204, 'epoch': 2} {'type': 'loss', 'content': 0.00927629042416811, 'timestamp': '2025-09-30 22:16:00.305966', 'step': 2205, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:00.345430', 'step': 2205, 'epoch': 2} {'type': 'loss', 'content': 0.010016247630119324, 'timestamp': '2025-09-30 22:16:00.353378', 'step': 2206, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:16:00.405618', 'step': 2206, 'epoch': 2} {'type': 'loss', 'content': 0.01247417088598013, 'timestamp': '2025-09-30 22:16:00.421534', 'step': 2207, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:16:00.475589', 'step': 2207, 'epoch': 2} {'type': 'loss', 'content': 0.007204344030469656, 'timestamp': '2025-09-30 22:16:00.513797', 'step': 2208, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:00.548636', 'step': 2208, 'epoch': 2} {'type': 'loss', 'content': 0.017199793830513954, 'timestamp': '2025-09-30 22:16:00.558805', 'step': 2209, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:00.601264', 'step': 2209, 'epoch': 2} {'type': 'loss', 'content': 0.012835008092224598, 'timestamp': '2025-09-30 22:16:00.613794', 'step': 2210, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:00.646429', 'step': 2210, 'epoch': 2} {'type': 'loss', 'content': 0.009112820960581303, 'timestamp': '2025-09-30 22:16:00.654077', 'step': 2211, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:00.689320', 'step': 2211, 'epoch': 2} {'type': 'loss', 'content': 0.012906182557344437, 'timestamp': '2025-09-30 22:16:00.722455', 'step': 2212, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:00.755148', 'step': 2212, 'epoch': 2} {'type': 'loss', 'content': 0.005865730345249176, 'timestamp': '2025-09-30 22:16:00.760934', 'step': 2213, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:00.804911', 'step': 2213, 'epoch': 2} {'type': 'loss', 'content': 0.01065493281930685, 'timestamp': '2025-09-30 22:16:00.812676', 'step': 2214, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:00.852881', 'step': 2214, 'epoch': 2} {'type': 'loss', 'content': 0.009175293147563934, 'timestamp': '2025-09-30 22:16:00.860727', 'step': 2215, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:00.893745', 'step': 2215, 'epoch': 2} {'type': 'loss', 'content': 0.009614244103431702, 'timestamp': '2025-09-30 22:16:00.925066', 'step': 2216, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:00.964192', 'step': 2216, 'epoch': 2} {'type': 'loss', 'content': 0.011298530735075474, 'timestamp': '2025-09-30 22:16:00.974105', 'step': 2217, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:01.025615', 'step': 2217, 'epoch': 2} {'type': 'loss', 'content': 0.009481500834226608, 'timestamp': '2025-09-30 22:16:01.038209', 'step': 2218, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:01.080015', 'step': 2218, 'epoch': 2} {'type': 'loss', 'content': 0.008077259175479412, 'timestamp': '2025-09-30 22:16:01.093869', 'step': 2219, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:01.142383', 'step': 2219, 'epoch': 2} {'type': 'loss', 'content': 0.006438512355089188, 'timestamp': '2025-09-30 22:16:01.171207', 'step': 2220, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:01.206086', 'step': 2220, 'epoch': 2} {'type': 'loss', 'content': 0.010353588499128819, 'timestamp': '2025-09-30 22:16:01.211726', 'step': 2221, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:01.245007', 'step': 2221, 'epoch': 2} {'type': 'loss', 'content': 0.006344547960907221, 'timestamp': '2025-09-30 22:16:01.257593', 'step': 2222, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:01.293379', 'step': 2222, 'epoch': 2} {'type': 'loss', 'content': 0.010450613684952259, 'timestamp': '2025-09-30 22:16:01.301101', 'step': 2223, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:01.333219', 'step': 2223, 'epoch': 2} {'type': 'loss', 'content': 0.00932193361222744, 'timestamp': '2025-09-30 22:16:01.366335', 'step': 2224, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:01.399437', 'step': 2224, 'epoch': 2} {'type': 'loss', 'content': 0.005790230818092823, 'timestamp': '2025-09-30 22:16:01.404384', 'step': 2225, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:01.447074', 'step': 2225, 'epoch': 2} {'type': 'loss', 'content': 0.012799671851098537, 'timestamp': '2025-09-30 22:16:01.459410', 'step': 2226, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:01.493176', 'step': 2226, 'epoch': 2} {'type': 'loss', 'content': 0.004914826713502407, 'timestamp': '2025-09-30 22:16:01.505746', 'step': 2227, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:01.544020', 'step': 2227, 'epoch': 2} {'type': 'loss', 'content': 0.006473730318248272, 'timestamp': '2025-09-30 22:16:01.578275', 'step': 2228, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:01.610405', 'step': 2228, 'epoch': 2} {'type': 'loss', 'content': 0.0077035753056406975, 'timestamp': '2025-09-30 22:16:01.619345', 'step': 2229, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:01.652221', 'step': 2229, 'epoch': 2} {'type': 'loss', 'content': 0.009425118565559387, 'timestamp': '2025-09-30 22:16:01.659881', 'step': 2230, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:01.695532', 'step': 2230, 'epoch': 2} {'type': 'loss', 'content': 0.011323267593979836, 'timestamp': '2025-09-30 22:16:01.709230', 'step': 2231, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:01.745294', 'step': 2231, 'epoch': 2} {'type': 'loss', 'content': 0.005302715580910444, 'timestamp': '2025-09-30 22:16:01.773301', 'step': 2232, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:01.810200', 'step': 2232, 'epoch': 2} {'type': 'loss', 'content': 0.010681116953492165, 'timestamp': '2025-09-30 22:16:01.815859', 'step': 2233, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:01.859037', 'step': 2233, 'epoch': 2} {'type': 'loss', 'content': 0.005477715749293566, 'timestamp': '2025-09-30 22:16:01.871327', 'step': 2234, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:01.904481', 'step': 2234, 'epoch': 2} {'type': 'loss', 'content': 0.00989870447665453, 'timestamp': '2025-09-30 22:16:01.914974', 'step': 2235, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:01.955710', 'step': 2235, 'epoch': 2} {'type': 'loss', 'content': 0.004034838639199734, 'timestamp': '2025-09-30 22:16:01.989103', 'step': 2236, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:02.039332', 'step': 2236, 'epoch': 2} {'type': 'loss', 'content': 0.0105787618085742, 'timestamp': '2025-09-30 22:16:02.051951', 'step': 2237, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:02.092289', 'step': 2237, 'epoch': 2} {'type': 'loss', 'content': 0.00780834536999464, 'timestamp': '2025-09-30 22:16:02.104784', 'step': 2238, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:02.137504', 'step': 2238, 'epoch': 2} {'type': 'loss', 'content': 0.012646148912608624, 'timestamp': '2025-09-30 22:16:02.145332', 'step': 2239, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:02.183046', 'step': 2239, 'epoch': 2} {'type': 'loss', 'content': 0.01194890309125185, 'timestamp': '2025-09-30 22:16:02.217713', 'step': 2240, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:02.262607', 'step': 2240, 'epoch': 2} {'type': 'loss', 'content': 0.007028935011476278, 'timestamp': '2025-09-30 22:16:02.272466', 'step': 2241, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:02.313166', 'step': 2241, 'epoch': 2} {'type': 'loss', 'content': 0.00852537713944912, 'timestamp': '2025-09-30 22:16:02.326563', 'step': 2242, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:02.364833', 'step': 2242, 'epoch': 2} {'type': 'loss', 'content': 0.007157263811677694, 'timestamp': '2025-09-30 22:16:02.378859', 'step': 2243, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:02.413584', 'step': 2243, 'epoch': 2} {'type': 'loss', 'content': 0.008009511046111584, 'timestamp': '2025-09-30 22:16:02.446974', 'step': 2244, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:02.481843', 'step': 2244, 'epoch': 2} {'type': 'loss', 'content': 0.00914499070495367, 'timestamp': '2025-09-30 22:16:02.494465', 'step': 2245, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:02.528219', 'step': 2245, 'epoch': 2} {'type': 'loss', 'content': 0.010988660156726837, 'timestamp': '2025-09-30 22:16:02.538717', 'step': 2246, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:02.573023', 'step': 2246, 'epoch': 2} {'type': 'loss', 'content': 0.01401505433022976, 'timestamp': '2025-09-30 22:16:02.580168', 'step': 2247, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:02.615683', 'step': 2247, 'epoch': 2} {'type': 'loss', 'content': 0.011652003973722458, 'timestamp': '2025-09-30 22:16:02.643782', 'step': 2248, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:02.688947', 'step': 2248, 'epoch': 2} {'type': 'loss', 'content': 0.012605683878064156, 'timestamp': '2025-09-30 22:16:02.694489', 'step': 2249, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:02.731806', 'step': 2249, 'epoch': 2} {'type': 'loss', 'content': 0.00848052091896534, 'timestamp': '2025-09-30 22:16:02.739506', 'step': 2250, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:16:02.775989', 'step': 2250, 'epoch': 2} {'type': 'loss', 'content': 0.021888215094804764, 'timestamp': '2025-09-30 22:16:02.779975', 'step': 2251, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:02.812840', 'step': 2251, 'epoch': 2} {'type': 'loss', 'content': 0.01496994961053133, 'timestamp': '2025-09-30 22:16:02.842758', 'step': 2252, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:16:02.886105', 'step': 2252, 'epoch': 2} {'type': 'loss', 'content': 0.0088545773178339, 'timestamp': '2025-09-30 22:16:02.903469', 'step': 2253, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:02.940176', 'step': 2253, 'epoch': 2} {'type': 'loss', 'content': 0.007000849116593599, 'timestamp': '2025-09-30 22:16:02.953897', 'step': 2254, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:02.986709', 'step': 2254, 'epoch': 2} {'type': 'loss', 'content': 0.012423422187566757, 'timestamp': '2025-09-30 22:16:02.991155', 'step': 2255, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:03.024781', 'step': 2255, 'epoch': 2} {'type': 'loss', 'content': 0.013098802417516708, 'timestamp': '2025-09-30 22:16:03.056567', 'step': 2256, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:03.092064', 'step': 2256, 'epoch': 2} {'type': 'loss', 'content': 0.010979630053043365, 'timestamp': '2025-09-30 22:16:03.097146', 'step': 2257, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:03.133860', 'step': 2257, 'epoch': 2} {'type': 'loss', 'content': 0.016253529116511345, 'timestamp': '2025-09-30 22:16:03.141728', 'step': 2258, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-09-30 22:16:03.186756', 'step': 2258, 'epoch': 2} {'type': 'loss', 'content': 0.007928948849439621, 'timestamp': '2025-09-30 22:16:03.204484', 'step': 2259, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:03.243592', 'step': 2259, 'epoch': 2} {'type': 'loss', 'content': 0.009120190516114235, 'timestamp': '2025-09-30 22:16:03.272204', 'step': 2260, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:03.305722', 'step': 2260, 'epoch': 2} {'type': 'loss', 'content': 0.010486864484846592, 'timestamp': '2025-09-30 22:16:03.311318', 'step': 2261, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:03.351028', 'step': 2261, 'epoch': 2} {'type': 'loss', 'content': 0.01721068099141121, 'timestamp': '2025-09-30 22:16:03.358965', 'step': 2262, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:03.392734', 'step': 2262, 'epoch': 2} {'type': 'loss', 'content': 0.00959155336022377, 'timestamp': '2025-09-30 22:16:03.400660', 'step': 2263, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:03.440369', 'step': 2263, 'epoch': 2} {'type': 'loss', 'content': 0.011247663758695126, 'timestamp': '2025-09-30 22:16:03.474966', 'step': 2264, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:03.512298', 'step': 2264, 'epoch': 2} {'type': 'loss', 'content': 0.012087054550647736, 'timestamp': '2025-09-30 22:16:03.517598', 'step': 2265, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:03.558753', 'step': 2265, 'epoch': 2} {'type': 'loss', 'content': 0.005484334193170071, 'timestamp': '2025-09-30 22:16:03.572482', 'step': 2266, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:03.613016', 'step': 2266, 'epoch': 2} {'type': 'loss', 'content': 0.007422391790896654, 'timestamp': '2025-09-30 22:16:03.626383', 'step': 2267, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:03.663095', 'step': 2267, 'epoch': 2} {'type': 'loss', 'content': 0.006608553696423769, 'timestamp': '2025-09-30 22:16:03.697659', 'step': 2268, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:16:03.739280', 'step': 2268, 'epoch': 2} {'type': 'loss', 'content': 0.00455620139837265, 'timestamp': '2025-09-30 22:16:03.755973', 'step': 2269, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:03.789195', 'step': 2269, 'epoch': 2} {'type': 'loss', 'content': 0.009005401283502579, 'timestamp': '2025-09-30 22:16:03.799620', 'step': 2270, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:03.840153', 'step': 2270, 'epoch': 2} {'type': 'loss', 'content': 0.03305266425013542, 'timestamp': '2025-09-30 22:16:03.848093', 'step': 2271, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:03.884066', 'step': 2271, 'epoch': 2} {'type': 'loss', 'content': 0.011565910652279854, 'timestamp': '2025-09-30 22:16:03.915975', 'step': 2272, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:03.949919', 'step': 2272, 'epoch': 2} {'type': 'loss', 'content': 0.008708333596587181, 'timestamp': '2025-09-30 22:16:03.955269', 'step': 2273, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:16:03.997173', 'step': 2273, 'epoch': 2} {'type': 'loss', 'content': 0.00512832123786211, 'timestamp': '2025-09-30 22:16:04.013036', 'step': 2274, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:04.059062', 'step': 2274, 'epoch': 2} {'type': 'loss', 'content': 0.010472118854522705, 'timestamp': '2025-09-30 22:16:04.073066', 'step': 2275, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:04.128173', 'step': 2275, 'epoch': 2} {'type': 'loss', 'content': 0.009419584646821022, 'timestamp': '2025-09-30 22:16:04.162719', 'step': 2276, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:04.196702', 'step': 2276, 'epoch': 2} {'type': 'loss', 'content': 0.005723748356103897, 'timestamp': '2025-09-30 22:16:04.205403', 'step': 2277, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:04.237453', 'step': 2277, 'epoch': 2} {'type': 'loss', 'content': 0.008166803047060966, 'timestamp': '2025-09-30 22:16:04.242077', 'step': 2278, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:04.281982', 'step': 2278, 'epoch': 2} {'type': 'loss', 'content': 0.01263397466391325, 'timestamp': '2025-09-30 22:16:04.294286', 'step': 2279, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:04.328337', 'step': 2279, 'epoch': 2} {'type': 'loss', 'content': 0.010637044906616211, 'timestamp': '2025-09-30 22:16:04.360112', 'step': 2280, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:04.400038', 'step': 2280, 'epoch': 2} {'type': 'loss', 'content': 0.013092324137687683, 'timestamp': '2025-09-30 22:16:04.412677', 'step': 2281, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:04.455526', 'step': 2281, 'epoch': 2} {'type': 'loss', 'content': 0.009603707119822502, 'timestamp': '2025-09-30 22:16:04.463172', 'step': 2282, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:04.499443', 'step': 2282, 'epoch': 2} {'type': 'loss', 'content': 0.013961341232061386, 'timestamp': '2025-09-30 22:16:04.510495', 'step': 2283, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:04.546979', 'step': 2283, 'epoch': 2} {'type': 'loss', 'content': 0.00484241358935833, 'timestamp': '2025-09-30 22:16:04.578965', 'step': 2284, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:04.617418', 'step': 2284, 'epoch': 2} {'type': 'loss', 'content': 0.014262514188885689, 'timestamp': '2025-09-30 22:16:04.627949', 'step': 2285, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:04.665288', 'step': 2285, 'epoch': 2} {'type': 'loss', 'content': 0.022015521302819252, 'timestamp': '2025-09-30 22:16:04.673285', 'step': 2286, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:04.710185', 'step': 2286, 'epoch': 2} {'type': 'loss', 'content': 0.011776491068303585, 'timestamp': '2025-09-30 22:16:04.722682', 'step': 2287, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:04.762171', 'step': 2287, 'epoch': 2} {'type': 'loss', 'content': 0.010544958524405956, 'timestamp': '2025-09-30 22:16:04.796898', 'step': 2288, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:04.841237', 'step': 2288, 'epoch': 2} {'type': 'loss', 'content': 0.003932147286832333, 'timestamp': '2025-09-30 22:16:04.854444', 'step': 2289, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:04.892166', 'step': 2289, 'epoch': 2} {'type': 'loss', 'content': 0.011136190965771675, 'timestamp': '2025-09-30 22:16:04.906039', 'step': 2290, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:16:04.949567', 'step': 2290, 'epoch': 2} {'type': 'loss', 'content': 0.015524694696068764, 'timestamp': '2025-09-30 22:16:04.965867', 'step': 2291, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:05.006660', 'step': 2291, 'epoch': 2} {'type': 'loss', 'content': 0.0072592394426465034, 'timestamp': '2025-09-30 22:16:05.040033', 'step': 2292, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:05.085470', 'step': 2292, 'epoch': 2} {'type': 'loss', 'content': 0.009627988561987877, 'timestamp': '2025-09-30 22:16:05.098109', 'step': 2293, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:16:05.140393', 'step': 2293, 'epoch': 2} {'type': 'loss', 'content': 0.007930876687169075, 'timestamp': '2025-09-30 22:16:05.156283', 'step': 2294, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:05.192704', 'step': 2294, 'epoch': 2} {'type': 'loss', 'content': 0.016360390931367874, 'timestamp': '2025-09-30 22:16:05.203121', 'step': 2295, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:05.241402', 'step': 2295, 'epoch': 2} {'type': 'loss', 'content': 0.005407324526458979, 'timestamp': '2025-09-30 22:16:05.273448', 'step': 2296, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:05.305388', 'step': 2296, 'epoch': 2} {'type': 'loss', 'content': 0.01181397121399641, 'timestamp': '2025-09-30 22:16:05.313278', 'step': 2297, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:05.349354', 'step': 2297, 'epoch': 2} {'type': 'loss', 'content': 0.005481473170220852, 'timestamp': '2025-09-30 22:16:05.362681', 'step': 2298, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:05.401286', 'step': 2298, 'epoch': 2} {'type': 'loss', 'content': 0.006777736358344555, 'timestamp': '2025-09-30 22:16:05.415292', 'step': 2299, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:05.455667', 'step': 2299, 'epoch': 2} {'type': 'loss', 'content': 0.010990391485393047, 'timestamp': '2025-09-30 22:16:05.489098', 'step': 2300, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:16:07.986643', 'step': 2300, 'epoch': 2} {'type': 'pplx', 'content': 5.654332293541132, 'timestamp': '2025-09-30 22:16:07.993683', 'step': 2300, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:08.027455', 'step': 2300, 'epoch': 2} {'type': 'loss', 'content': 0.01496883388608694, 'timestamp': '2025-09-30 22:16:08.037225', 'step': 2301, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:08.084041', 'step': 2301, 'epoch': 2} {'type': 'loss', 'content': 0.011960986070334911, 'timestamp': '2025-09-30 22:16:08.096687', 'step': 2302, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:08.137462', 'step': 2302, 'epoch': 2} {'type': 'loss', 'content': 0.00994796585291624, 'timestamp': '2025-09-30 22:16:08.145376', 'step': 2303, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:16:08.186949', 'step': 2303, 'epoch': 2} {'type': 'loss', 'content': 0.010986410081386566, 'timestamp': '2025-09-30 22:16:08.223487', 'step': 2304, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:16:08.263976', 'step': 2304, 'epoch': 2} {'type': 'loss', 'content': 0.00654952647164464, 'timestamp': '2025-09-30 22:16:08.279375', 'step': 2305, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:08.329375', 'step': 2305, 'epoch': 2} {'type': 'loss', 'content': 0.009799250401556492, 'timestamp': '2025-09-30 22:16:08.343208', 'step': 2306, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:08.379722', 'step': 2306, 'epoch': 2} {'type': 'loss', 'content': 0.014504171907901764, 'timestamp': '2025-09-30 22:16:08.392324', 'step': 2307, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:08.431664', 'step': 2307, 'epoch': 2} {'type': 'loss', 'content': 0.007755675353109837, 'timestamp': '2025-09-30 22:16:08.466561', 'step': 2308, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:08.502141', 'step': 2308, 'epoch': 2} {'type': 'loss', 'content': 0.00974066648632288, 'timestamp': '2025-09-30 22:16:08.507056', 'step': 2309, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:08.552495', 'step': 2309, 'epoch': 2} {'type': 'loss', 'content': 0.00655874889343977, 'timestamp': '2025-09-30 22:16:08.566227', 'step': 2310, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:08.609713', 'step': 2310, 'epoch': 2} {'type': 'loss', 'content': 0.01651756651699543, 'timestamp': '2025-09-30 22:16:08.617716', 'step': 2311, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:08.653081', 'step': 2311, 'epoch': 2} {'type': 'loss', 'content': 0.011428349651396275, 'timestamp': '2025-09-30 22:16:08.681551', 'step': 2312, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:16:08.731610', 'step': 2312, 'epoch': 2} {'type': 'loss', 'content': 0.004849064163863659, 'timestamp': '2025-09-30 22:16:08.748632', 'step': 2313, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:08.786046', 'step': 2313, 'epoch': 2} {'type': 'loss', 'content': 0.026103388518095016, 'timestamp': '2025-09-30 22:16:08.794220', 'step': 2314, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:08.838816', 'step': 2314, 'epoch': 2} {'type': 'loss', 'content': 0.011875770054757595, 'timestamp': '2025-09-30 22:16:08.846363', 'step': 2315, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:08.883435', 'step': 2315, 'epoch': 2} {'type': 'loss', 'content': 0.009886275045573711, 'timestamp': '2025-09-30 22:16:08.914639', 'step': 2316, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:08.949276', 'step': 2316, 'epoch': 2} {'type': 'loss', 'content': 0.009622431360185146, 'timestamp': '2025-09-30 22:16:08.959874', 'step': 2317, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:16:09.015050', 'step': 2317, 'epoch': 2} {'type': 'loss', 'content': 0.007912329398095608, 'timestamp': '2025-09-30 22:16:09.031208', 'step': 2318, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:09.082762', 'step': 2318, 'epoch': 2} {'type': 'loss', 'content': 0.012053360231220722, 'timestamp': '2025-09-30 22:16:09.096340', 'step': 2319, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:09.135134', 'step': 2319, 'epoch': 2} {'type': 'loss', 'content': 0.011790354736149311, 'timestamp': '2025-09-30 22:16:09.168329', 'step': 2320, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:09.210180', 'step': 2320, 'epoch': 2} {'type': 'loss', 'content': 0.013455498032271862, 'timestamp': '2025-09-30 22:16:09.218294', 'step': 2321, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:09.254726', 'step': 2321, 'epoch': 2} {'type': 'loss', 'content': 0.007671833038330078, 'timestamp': '2025-09-30 22:16:09.266004', 'step': 2322, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:09.299708', 'step': 2322, 'epoch': 2} {'type': 'loss', 'content': 0.008541908115148544, 'timestamp': '2025-09-30 22:16:09.307351', 'step': 2323, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:09.343828', 'step': 2323, 'epoch': 2} {'type': 'loss', 'content': 0.010388360358774662, 'timestamp': '2025-09-30 22:16:09.377070', 'step': 2324, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:09.419433', 'step': 2324, 'epoch': 2} {'type': 'loss', 'content': 0.004493786953389645, 'timestamp': '2025-09-30 22:16:09.432079', 'step': 2325, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:09.488396', 'step': 2325, 'epoch': 2} {'type': 'loss', 'content': 0.01047692820429802, 'timestamp': '2025-09-30 22:16:09.500894', 'step': 2326, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:09.538722', 'step': 2326, 'epoch': 2} {'type': 'loss', 'content': 0.008095546625554562, 'timestamp': '2025-09-30 22:16:09.552465', 'step': 2327, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:09.591892', 'step': 2327, 'epoch': 2} {'type': 'loss', 'content': 0.006904910784214735, 'timestamp': '2025-09-30 22:16:09.623791', 'step': 2328, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:09.658613', 'step': 2328, 'epoch': 2} {'type': 'loss', 'content': 0.0041915723122656345, 'timestamp': '2025-09-30 22:16:09.667322', 'step': 2329, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:09.705640', 'step': 2329, 'epoch': 2} {'type': 'loss', 'content': 0.013046212494373322, 'timestamp': '2025-09-30 22:16:09.716740', 'step': 2330, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:09.750844', 'step': 2330, 'epoch': 2} {'type': 'loss', 'content': 0.009823589585721493, 'timestamp': '2025-09-30 22:16:09.761201', 'step': 2331, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:16:09.806873', 'step': 2331, 'epoch': 2} {'type': 'loss', 'content': 0.00981562864035368, 'timestamp': '2025-09-30 22:16:09.843358', 'step': 2332, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:09.885675', 'step': 2332, 'epoch': 2} {'type': 'loss', 'content': 0.012619983404874802, 'timestamp': '2025-09-30 22:16:09.898279', 'step': 2333, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:09.945767', 'step': 2333, 'epoch': 2} {'type': 'loss', 'content': 0.007371451240032911, 'timestamp': '2025-09-30 22:16:09.959477', 'step': 2334, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:09.994393', 'step': 2334, 'epoch': 2} {'type': 'loss', 'content': 0.012231948785483837, 'timestamp': '2025-09-30 22:16:10.002444', 'step': 2335, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:10.035472', 'step': 2335, 'epoch': 2} {'type': 'loss', 'content': 0.007391262799501419, 'timestamp': '2025-09-30 22:16:10.066705', 'step': 2336, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:10.100360', 'step': 2336, 'epoch': 2} {'type': 'loss', 'content': 0.007522133179008961, 'timestamp': '2025-09-30 22:16:10.111067', 'step': 2337, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:10.146506', 'step': 2337, 'epoch': 2} {'type': 'loss', 'content': 0.0057350401766598225, 'timestamp': '2025-09-30 22:16:10.160252', 'step': 2338, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:10.205516', 'step': 2338, 'epoch': 2} {'type': 'loss', 'content': 0.005650083534419537, 'timestamp': '2025-09-30 22:16:10.219254', 'step': 2339, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:10.257407', 'step': 2339, 'epoch': 2} {'type': 'loss', 'content': 0.003318364266306162, 'timestamp': '2025-09-30 22:16:10.291869', 'step': 2340, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:10.328847', 'step': 2340, 'epoch': 2} {'type': 'loss', 'content': 0.009202542714774609, 'timestamp': '2025-09-30 22:16:10.342033', 'step': 2341, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:10.378956', 'step': 2341, 'epoch': 2} {'type': 'loss', 'content': 0.009690223261713982, 'timestamp': '2025-09-30 22:16:10.391289', 'step': 2342, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:10.425727', 'step': 2342, 'epoch': 2} {'type': 'loss', 'content': 0.009548195637762547, 'timestamp': '2025-09-30 22:16:10.433693', 'step': 2343, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:10.476039', 'step': 2343, 'epoch': 2} {'type': 'loss', 'content': 0.012893835082650185, 'timestamp': '2025-09-30 22:16:10.509297', 'step': 2344, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:10.545556', 'step': 2344, 'epoch': 2} {'type': 'loss', 'content': 0.008734777569770813, 'timestamp': '2025-09-30 22:16:10.554333', 'step': 2345, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:10.588437', 'step': 2345, 'epoch': 2} {'type': 'loss', 'content': 0.011990435421466827, 'timestamp': '2025-09-30 22:16:10.596350', 'step': 2346, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:16:10.641052', 'step': 2346, 'epoch': 2} {'type': 'loss', 'content': 0.008576703257858753, 'timestamp': '2025-09-30 22:16:10.656700', 'step': 2347, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:10.694763', 'step': 2347, 'epoch': 2} {'type': 'loss', 'content': 0.008816730231046677, 'timestamp': '2025-09-30 22:16:10.723549', 'step': 2348, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:10.761325', 'step': 2348, 'epoch': 2} {'type': 'loss', 'content': 0.008760577067732811, 'timestamp': '2025-09-30 22:16:10.774006', 'step': 2349, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:10.812614', 'step': 2349, 'epoch': 2} {'type': 'loss', 'content': 0.008204920217394829, 'timestamp': '2025-09-30 22:16:10.823675', 'step': 2350, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:10.868150', 'step': 2350, 'epoch': 2} {'type': 'loss', 'content': 0.004204670432955027, 'timestamp': '2025-09-30 22:16:10.881888', 'step': 2351, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:10.920760', 'step': 2351, 'epoch': 2} {'type': 'loss', 'content': 0.0049379183910787106, 'timestamp': '2025-09-30 22:16:10.954940', 'step': 2352, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:10.994008', 'step': 2352, 'epoch': 2} {'type': 'loss', 'content': 0.00872363243252039, 'timestamp': '2025-09-30 22:16:11.004536', 'step': 2353, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:11.036420', 'step': 2353, 'epoch': 2} {'type': 'loss', 'content': 0.00507052568718791, 'timestamp': '2025-09-30 22:16:11.048576', 'step': 2354, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:16:11.092561', 'step': 2354, 'epoch': 2} {'type': 'loss', 'content': 0.0075722322799265385, 'timestamp': '2025-09-30 22:16:11.108474', 'step': 2355, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:11.150002', 'step': 2355, 'epoch': 2} {'type': 'loss', 'content': 0.006875215098261833, 'timestamp': '2025-09-30 22:16:11.184801', 'step': 2356, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:11.222281', 'step': 2356, 'epoch': 2} {'type': 'loss', 'content': 0.005606391932815313, 'timestamp': '2025-09-30 22:16:11.230096', 'step': 2357, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:11.272905', 'step': 2357, 'epoch': 2} {'type': 'loss', 'content': 0.005990367848426104, 'timestamp': '2025-09-30 22:16:11.286616', 'step': 2358, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:11.321998', 'step': 2358, 'epoch': 2} {'type': 'loss', 'content': 0.0067135924473404884, 'timestamp': '2025-09-30 22:16:11.334311', 'step': 2359, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:11.370658', 'step': 2359, 'epoch': 2} {'type': 'loss', 'content': 0.008387668989598751, 'timestamp': '2025-09-30 22:16:11.402586', 'step': 2360, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:11.441575', 'step': 2360, 'epoch': 2} {'type': 'loss', 'content': 0.014556759037077427, 'timestamp': '2025-09-30 22:16:11.452505', 'step': 2361, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:11.485920', 'step': 2361, 'epoch': 2} {'type': 'loss', 'content': 0.012365092523396015, 'timestamp': '2025-09-30 22:16:11.496208', 'step': 2362, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:11.533541', 'step': 2362, 'epoch': 2} {'type': 'loss', 'content': 0.0046156723983585835, 'timestamp': '2025-09-30 22:16:11.544790', 'step': 2363, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:11.587643', 'step': 2363, 'epoch': 2} {'type': 'loss', 'content': 0.005602897610515356, 'timestamp': '2025-09-30 22:16:11.615795', 'step': 2364, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:11.650319', 'step': 2364, 'epoch': 2} {'type': 'loss', 'content': 0.012667978182435036, 'timestamp': '2025-09-30 22:16:11.655979', 'step': 2365, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:11.695462', 'step': 2365, 'epoch': 2} {'type': 'loss', 'content': 0.012357855215668678, 'timestamp': '2025-09-30 22:16:11.702762', 'step': 2366, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:11.746803', 'step': 2366, 'epoch': 2} {'type': 'loss', 'content': 0.009272625669836998, 'timestamp': '2025-09-30 22:16:11.760584', 'step': 2367, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:11.801286', 'step': 2367, 'epoch': 2} {'type': 'loss', 'content': 0.006464353296905756, 'timestamp': '2025-09-30 22:16:11.830158', 'step': 2368, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:11.867584', 'step': 2368, 'epoch': 2} {'type': 'loss', 'content': 0.006658497266471386, 'timestamp': '2025-09-30 22:16:11.880867', 'step': 2369, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:11.920448', 'step': 2369, 'epoch': 2} {'type': 'loss', 'content': 0.00856840517371893, 'timestamp': '2025-09-30 22:16:11.933019', 'step': 2370, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:11.966895', 'step': 2370, 'epoch': 2} {'type': 'loss', 'content': 0.010510051622986794, 'timestamp': '2025-09-30 22:16:11.974909', 'step': 2371, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:12.010697', 'step': 2371, 'epoch': 2} {'type': 'loss', 'content': 0.014684943482279778, 'timestamp': '2025-09-30 22:16:12.043669', 'step': 2372, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:12.084295', 'step': 2372, 'epoch': 2} {'type': 'loss', 'content': 0.015482643619179726, 'timestamp': '2025-09-30 22:16:12.097542', 'step': 2373, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:12.136340', 'step': 2373, 'epoch': 2} {'type': 'loss', 'content': 0.0045043122954666615, 'timestamp': '2025-09-30 22:16:12.148871', 'step': 2374, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:12.191886', 'step': 2374, 'epoch': 2} {'type': 'loss', 'content': 0.0032745180651545525, 'timestamp': '2025-09-30 22:16:12.205656', 'step': 2375, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:12.249738', 'step': 2375, 'epoch': 2} {'type': 'loss', 'content': 0.01642913930118084, 'timestamp': '2025-09-30 22:16:12.277673', 'step': 2376, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:12.315312', 'step': 2376, 'epoch': 2} {'type': 'loss', 'content': 0.01349701825529337, 'timestamp': '2025-09-30 22:16:12.320074', 'step': 2377, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:12.369416', 'step': 2377, 'epoch': 2} {'type': 'loss', 'content': 0.009943741373717785, 'timestamp': '2025-09-30 22:16:12.377247', 'step': 2378, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:12.412531', 'step': 2378, 'epoch': 2} {'type': 'loss', 'content': 0.006427043117582798, 'timestamp': '2025-09-30 22:16:12.425852', 'step': 2379, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:12.461702', 'step': 2379, 'epoch': 2} {'type': 'loss', 'content': 0.00699341855943203, 'timestamp': '2025-09-30 22:16:12.489860', 'step': 2380, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:12.528088', 'step': 2380, 'epoch': 2} {'type': 'loss', 'content': 0.01376278419047594, 'timestamp': '2025-09-30 22:16:12.533124', 'step': 2381, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:12.567016', 'step': 2381, 'epoch': 2} {'type': 'loss', 'content': 0.009683472104370594, 'timestamp': '2025-09-30 22:16:12.574638', 'step': 2382, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:12.614887', 'step': 2382, 'epoch': 2} {'type': 'loss', 'content': 0.0067740269005298615, 'timestamp': '2025-09-30 22:16:12.628244', 'step': 2383, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:12.663627', 'step': 2383, 'epoch': 2} {'type': 'loss', 'content': 0.011330408044159412, 'timestamp': '2025-09-30 22:16:12.692387', 'step': 2384, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:12.733478', 'step': 2384, 'epoch': 2} {'type': 'loss', 'content': 0.007617818657308817, 'timestamp': '2025-09-30 22:16:12.746519', 'step': 2385, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:12.784572', 'step': 2385, 'epoch': 2} {'type': 'loss', 'content': 0.009767604991793633, 'timestamp': '2025-09-30 22:16:12.798242', 'step': 2386, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:12.839657', 'step': 2386, 'epoch': 2} {'type': 'loss', 'content': 0.01219944842159748, 'timestamp': '2025-09-30 22:16:12.853019', 'step': 2387, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:12.891201', 'step': 2387, 'epoch': 2} {'type': 'loss', 'content': 0.009838934056460857, 'timestamp': '2025-09-30 22:16:12.925962', 'step': 2388, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:12.960997', 'step': 2388, 'epoch': 2} {'type': 'loss', 'content': 0.006134702358394861, 'timestamp': '2025-09-30 22:16:12.970896', 'step': 2389, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:13.006589', 'step': 2389, 'epoch': 2} {'type': 'loss', 'content': 0.011561652645468712, 'timestamp': '2025-09-30 22:16:13.017371', 'step': 2390, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:13.061650', 'step': 2390, 'epoch': 2} {'type': 'loss', 'content': 0.009399686008691788, 'timestamp': '2025-09-30 22:16:13.074252', 'step': 2391, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:13.109015', 'step': 2391, 'epoch': 2} {'type': 'loss', 'content': 0.00535060977563262, 'timestamp': '2025-09-30 22:16:13.142394', 'step': 2392, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:13.181106', 'step': 2392, 'epoch': 2} {'type': 'loss', 'content': 0.01159277930855751, 'timestamp': '2025-09-30 22:16:13.189000', 'step': 2393, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:13.239405', 'step': 2393, 'epoch': 2} {'type': 'loss', 'content': 0.011592809110879898, 'timestamp': '2025-09-30 22:16:13.246638', 'step': 2394, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:13.279490', 'step': 2394, 'epoch': 2} {'type': 'loss', 'content': 0.010551626794040203, 'timestamp': '2025-09-30 22:16:13.286589', 'step': 2395, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:13.321047', 'step': 2395, 'epoch': 2} {'type': 'loss', 'content': 0.009111042134463787, 'timestamp': '2025-09-30 22:16:13.348677', 'step': 2396, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:13.380692', 'step': 2396, 'epoch': 2} {'type': 'loss', 'content': 0.009802721440792084, 'timestamp': '2025-09-30 22:16:13.386543', 'step': 2397, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:13.418794', 'step': 2397, 'epoch': 2} {'type': 'loss', 'content': 0.01750762201845646, 'timestamp': '2025-09-30 22:16:13.425641', 'step': 2398, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:16:13.476173', 'step': 2398, 'epoch': 2} {'type': 'loss', 'content': 0.008682657033205032, 'timestamp': '2025-09-30 22:16:13.491744', 'step': 2399, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:13.532182', 'step': 2399, 'epoch': 2} {'type': 'loss', 'content': 0.007306818384677172, 'timestamp': '2025-09-30 22:16:13.566773', 'step': 2400, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:13.602351', 'step': 2400, 'epoch': 2} {'type': 'loss', 'content': 0.006612339522689581, 'timestamp': '2025-09-30 22:16:13.607719', 'step': 2401, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:16:13.647546', 'step': 2401, 'epoch': 2} {'type': 'loss', 'content': 0.01855228841304779, 'timestamp': '2025-09-30 22:16:13.651843', 'step': 2402, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:13.684274', 'step': 2402, 'epoch': 2} {'type': 'loss', 'content': 0.009422802366316319, 'timestamp': '2025-09-30 22:16:13.691819', 'step': 2403, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:13.735767', 'step': 2403, 'epoch': 2} {'type': 'loss', 'content': 0.006999382749199867, 'timestamp': '2025-09-30 22:16:13.769084', 'step': 2404, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:13.806366', 'step': 2404, 'epoch': 2} {'type': 'loss', 'content': 0.01017130259424448, 'timestamp': '2025-09-30 22:16:13.811600', 'step': 2405, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:13.846575', 'step': 2405, 'epoch': 2} {'type': 'loss', 'content': 0.006849296856671572, 'timestamp': '2025-09-30 22:16:13.858943', 'step': 2406, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:13.898835', 'step': 2406, 'epoch': 2} {'type': 'loss', 'content': 0.0065921517089009285, 'timestamp': '2025-09-30 22:16:13.906770', 'step': 2407, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:13.939852', 'step': 2407, 'epoch': 2} {'type': 'loss', 'content': 0.010219089686870575, 'timestamp': '2025-09-30 22:16:13.968598', 'step': 2408, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:14.020548', 'step': 2408, 'epoch': 2} {'type': 'loss', 'content': 0.012665183283388615, 'timestamp': '2025-09-30 22:16:14.026229', 'step': 2409, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:14.072604', 'step': 2409, 'epoch': 2} {'type': 'loss', 'content': 0.00628651725128293, 'timestamp': '2025-09-30 22:16:14.082962', 'step': 2410, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:14.127701', 'step': 2410, 'epoch': 2} {'type': 'loss', 'content': 0.006621459033340216, 'timestamp': '2025-09-30 22:16:14.141400', 'step': 2411, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:16:14.180853', 'step': 2411, 'epoch': 2} {'type': 'loss', 'content': 0.005125768482685089, 'timestamp': '2025-09-30 22:16:14.206457', 'step': 2412, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:14.248047', 'step': 2412, 'epoch': 2} {'type': 'loss', 'content': 0.0072546289302408695, 'timestamp': '2025-09-30 22:16:14.255996', 'step': 2413, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:14.301296', 'step': 2413, 'epoch': 2} {'type': 'loss', 'content': 0.01086366269737482, 'timestamp': '2025-09-30 22:16:14.308500', 'step': 2414, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:14.344791', 'step': 2414, 'epoch': 2} {'type': 'loss', 'content': 0.007983213290572166, 'timestamp': '2025-09-30 22:16:14.358434', 'step': 2415, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:16:16.915306', 'step': 2415, 'epoch': 2} {'type': 'pplx', 'content': 5.737540649916071, 'timestamp': '2025-09-30 22:16:16.919280', 'step': 2415, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:16.955181', 'step': 2415, 'epoch': 2} {'type': 'loss', 'content': 0.006403180770576, 'timestamp': '2025-09-30 22:16:16.989677', 'step': 2416, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:17.022799', 'step': 2416, 'epoch': 2} {'type': 'loss', 'content': 0.00468442402780056, 'timestamp': '2025-09-30 22:16:17.035425', 'step': 2417, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:17.082390', 'step': 2417, 'epoch': 2} {'type': 'loss', 'content': 0.010039767250418663, 'timestamp': '2025-09-30 22:16:17.090187', 'step': 2418, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:17.121965', 'step': 2418, 'epoch': 2} {'type': 'loss', 'content': 0.006315943319350481, 'timestamp': '2025-09-30 22:16:17.129959', 'step': 2419, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:17.166227', 'step': 2419, 'epoch': 2} {'type': 'loss', 'content': 0.007796914782375097, 'timestamp': '2025-09-30 22:16:17.197906', 'step': 2420, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:16:17.243771', 'step': 2420, 'epoch': 2} {'type': 'loss', 'content': 0.008052662014961243, 'timestamp': '2025-09-30 22:16:17.259645', 'step': 2421, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:17.291402', 'step': 2421, 'epoch': 2} {'type': 'loss', 'content': 0.015868842601776123, 'timestamp': '2025-09-30 22:16:17.302606', 'step': 2422, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:17.335296', 'step': 2422, 'epoch': 2} {'type': 'loss', 'content': 0.009983470663428307, 'timestamp': '2025-09-30 22:16:17.347470', 'step': 2423, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:17.382111', 'step': 2423, 'epoch': 2} {'type': 'loss', 'content': 0.00888828095048666, 'timestamp': '2025-09-30 22:16:17.410939', 'step': 2424, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 17085996872448}, 'timestamp': '2025-09-30 22:16:17.463049', 'step': 2424, 'epoch': 2} {'type': 'loss', 'content': 0.008122816681861877, 'timestamp': '2025-09-30 22:16:17.482311', 'step': 2425, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:17.520721', 'step': 2425, 'epoch': 2} {'type': 'loss', 'content': 0.004961279686540365, 'timestamp': '2025-09-30 22:16:17.534678', 'step': 2426, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:17.574067', 'step': 2426, 'epoch': 2} {'type': 'loss', 'content': 0.01577206328511238, 'timestamp': '2025-09-30 22:16:17.584434', 'step': 2427, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:17.619610', 'step': 2427, 'epoch': 2} {'type': 'loss', 'content': 0.012107166461646557, 'timestamp': '2025-09-30 22:16:17.650878', 'step': 2428, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:17.684781', 'step': 2428, 'epoch': 2} {'type': 'loss', 'content': 0.012150055728852749, 'timestamp': '2025-09-30 22:16:17.695378', 'step': 2429, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:17.733725', 'step': 2429, 'epoch': 2} {'type': 'loss', 'content': 0.0029978309758007526, 'timestamp': '2025-09-30 22:16:17.747736', 'step': 2430, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:17.785934', 'step': 2430, 'epoch': 2} {'type': 'loss', 'content': 0.005970521830022335, 'timestamp': '2025-09-30 22:16:17.799720', 'step': 2431, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:17.836452', 'step': 2431, 'epoch': 2} {'type': 'loss', 'content': 0.014105760492384434, 'timestamp': '2025-09-30 22:16:17.864695', 'step': 2432, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:17.902411', 'step': 2432, 'epoch': 2} {'type': 'loss', 'content': 0.008667535148561, 'timestamp': '2025-09-30 22:16:17.911159', 'step': 2433, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:17.948623', 'step': 2433, 'epoch': 2} {'type': 'loss', 'content': 0.009693967178463936, 'timestamp': '2025-09-30 22:16:17.959544', 'step': 2434, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:17.995178', 'step': 2434, 'epoch': 2} {'type': 'loss', 'content': 0.011925452388823032, 'timestamp': '2025-09-30 22:16:18.006317', 'step': 2435, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:18.043425', 'step': 2435, 'epoch': 2} {'type': 'loss', 'content': 0.006588513497263193, 'timestamp': '2025-09-30 22:16:18.074604', 'step': 2436, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:16:18.127126', 'step': 2436, 'epoch': 2} {'type': 'loss', 'content': 0.005089726764708757, 'timestamp': '2025-09-30 22:16:18.143914', 'step': 2437, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:18.177605', 'step': 2437, 'epoch': 2} {'type': 'loss', 'content': 0.006040696520358324, 'timestamp': '2025-09-30 22:16:18.184623', 'step': 2438, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:18.220581', 'step': 2438, 'epoch': 2} {'type': 'loss', 'content': 0.00973731279373169, 'timestamp': '2025-09-30 22:16:18.230888', 'step': 2439, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:18.264961', 'step': 2439, 'epoch': 2} {'type': 'loss', 'content': 0.015074980445206165, 'timestamp': '2025-09-30 22:16:18.293759', 'step': 2440, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:18.329568', 'step': 2440, 'epoch': 2} {'type': 'loss', 'content': 0.0043279207311570644, 'timestamp': '2025-09-30 22:16:18.336144', 'step': 2441, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:18.376995', 'step': 2441, 'epoch': 2} {'type': 'loss', 'content': 0.010119735263288021, 'timestamp': '2025-09-30 22:16:18.388059', 'step': 2442, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:18.429741', 'step': 2442, 'epoch': 2} {'type': 'loss', 'content': 0.008410328067839146, 'timestamp': '2025-09-30 22:16:18.443218', 'step': 2443, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:18.480669', 'step': 2443, 'epoch': 2} {'type': 'loss', 'content': 0.009942535310983658, 'timestamp': '2025-09-30 22:16:18.512748', 'step': 2444, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:18.549460', 'step': 2444, 'epoch': 2} {'type': 'loss', 'content': 0.01868273876607418, 'timestamp': '2025-09-30 22:16:18.559537', 'step': 2445, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:18.597734', 'step': 2445, 'epoch': 2} {'type': 'loss', 'content': 0.012855646200478077, 'timestamp': '2025-09-30 22:16:18.605047', 'step': 2446, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:18.648395', 'step': 2446, 'epoch': 2} {'type': 'loss', 'content': 0.009789690375328064, 'timestamp': '2025-09-30 22:16:18.656230', 'step': 2447, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:18.691108', 'step': 2447, 'epoch': 2} {'type': 'loss', 'content': 0.008820690214633942, 'timestamp': '2025-09-30 22:16:18.723033', 'step': 2448, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:18.755344', 'step': 2448, 'epoch': 2} {'type': 'loss', 'content': 0.008334523998200893, 'timestamp': '2025-09-30 22:16:18.760621', 'step': 2449, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:18.803603', 'step': 2449, 'epoch': 2} {'type': 'loss', 'content': 0.007933998480439186, 'timestamp': '2025-09-30 22:16:18.814007', 'step': 2450, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:18.854635', 'step': 2450, 'epoch': 2} {'type': 'loss', 'content': 0.006995997857302427, 'timestamp': '2025-09-30 22:16:18.868573', 'step': 2451, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:18.907642', 'step': 2451, 'epoch': 2} {'type': 'loss', 'content': 0.01130268257111311, 'timestamp': '2025-09-30 22:16:18.935684', 'step': 2452, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:18.969416', 'step': 2452, 'epoch': 2} {'type': 'loss', 'content': 0.011828478425741196, 'timestamp': '2025-09-30 22:16:18.978069', 'step': 2453, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:19.018470', 'step': 2453, 'epoch': 2} {'type': 'loss', 'content': 0.015309597365558147, 'timestamp': '2025-09-30 22:16:19.028841', 'step': 2454, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:19.071713', 'step': 2454, 'epoch': 2} {'type': 'loss', 'content': 0.007825467735528946, 'timestamp': '2025-09-30 22:16:19.082826', 'step': 2455, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:19.117512', 'step': 2455, 'epoch': 2} {'type': 'loss', 'content': 0.01192080695182085, 'timestamp': '2025-09-30 22:16:19.149548', 'step': 2456, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:19.188800', 'step': 2456, 'epoch': 2} {'type': 'loss', 'content': 0.006193151697516441, 'timestamp': '2025-09-30 22:16:19.196221', 'step': 2457, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:19.236831', 'step': 2457, 'epoch': 2} {'type': 'loss', 'content': 0.013730679638683796, 'timestamp': '2025-09-30 22:16:19.247849', 'step': 2458, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:19.281047', 'step': 2458, 'epoch': 2} {'type': 'loss', 'content': 0.003911986015737057, 'timestamp': '2025-09-30 22:16:19.292332', 'step': 2459, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:19.330889', 'step': 2459, 'epoch': 2} {'type': 'loss', 'content': 0.008487740531563759, 'timestamp': '2025-09-30 22:16:19.364271', 'step': 2460, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:19.400229', 'step': 2460, 'epoch': 2} {'type': 'loss', 'content': 0.006258985493332148, 'timestamp': '2025-09-30 22:16:19.405799', 'step': 2461, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:19.442597', 'step': 2461, 'epoch': 2} {'type': 'loss', 'content': 0.01047790702432394, 'timestamp': '2025-09-30 22:16:19.452943', 'step': 2462, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:19.485640', 'step': 2462, 'epoch': 2} {'type': 'loss', 'content': 0.008732033893465996, 'timestamp': '2025-09-30 22:16:19.496708', 'step': 2463, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:19.529525', 'step': 2463, 'epoch': 2} {'type': 'loss', 'content': 0.012635108083486557, 'timestamp': '2025-09-30 22:16:19.559028', 'step': 2464, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:19.595619', 'step': 2464, 'epoch': 2} {'type': 'loss', 'content': 0.00821733009070158, 'timestamp': '2025-09-30 22:16:19.608629', 'step': 2465, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:16:19.651985', 'step': 2465, 'epoch': 2} {'type': 'loss', 'content': 0.011071404442191124, 'timestamp': '2025-09-30 22:16:19.669233', 'step': 2466, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:19.707719', 'step': 2466, 'epoch': 2} {'type': 'loss', 'content': 0.00789080373942852, 'timestamp': '2025-09-30 22:16:19.721745', 'step': 2467, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:19.760665', 'step': 2467, 'epoch': 2} {'type': 'loss', 'content': 0.010599637404084206, 'timestamp': '2025-09-30 22:16:19.795171', 'step': 2468, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:19.828950', 'step': 2468, 'epoch': 2} {'type': 'loss', 'content': 0.010458008386194706, 'timestamp': '2025-09-30 22:16:19.835183', 'step': 2469, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:19.873203', 'step': 2469, 'epoch': 2} {'type': 'loss', 'content': 0.008729826658964157, 'timestamp': '2025-09-30 22:16:19.886927', 'step': 2470, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:16:19.932519', 'step': 2470, 'epoch': 2} {'type': 'loss', 'content': 0.0057611926458776, 'timestamp': '2025-09-30 22:16:19.949570', 'step': 2471, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:19.998869', 'step': 2471, 'epoch': 2} {'type': 'loss', 'content': 0.00834943912923336, 'timestamp': '2025-09-30 22:16:20.033453', 'step': 2472, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:20.076278', 'step': 2472, 'epoch': 2} {'type': 'loss', 'content': 0.00887401681393385, 'timestamp': '2025-09-30 22:16:20.089600', 'step': 2473, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:16:20.137178', 'step': 2473, 'epoch': 2} {'type': 'loss', 'content': 0.007680539041757584, 'timestamp': '2025-09-30 22:16:20.153494', 'step': 2474, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:16:20.197366', 'step': 2474, 'epoch': 2} {'type': 'loss', 'content': 0.004287196788936853, 'timestamp': '2025-09-30 22:16:20.214669', 'step': 2475, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:20.252421', 'step': 2475, 'epoch': 2} {'type': 'loss', 'content': 0.007217390462756157, 'timestamp': '2025-09-30 22:16:20.287328', 'step': 2476, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:16:20.335581', 'step': 2476, 'epoch': 2} {'type': 'loss', 'content': 0.0035889248829334974, 'timestamp': '2025-09-30 22:16:20.351437', 'step': 2477, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:20.384709', 'step': 2477, 'epoch': 2} {'type': 'loss', 'content': 0.02392013557255268, 'timestamp': '2025-09-30 22:16:20.392412', 'step': 2478, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:20.426764', 'step': 2478, 'epoch': 2} {'type': 'loss', 'content': 0.003831626381725073, 'timestamp': '2025-09-30 22:16:20.441049', 'step': 2479, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:20.478795', 'step': 2479, 'epoch': 2} {'type': 'loss', 'content': 0.011907713487744331, 'timestamp': '2025-09-30 22:16:20.511857', 'step': 2480, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:20.548947', 'step': 2480, 'epoch': 2} {'type': 'loss', 'content': 0.008370657451450825, 'timestamp': '2025-09-30 22:16:20.558592', 'step': 2481, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:20.591530', 'step': 2481, 'epoch': 2} {'type': 'loss', 'content': 0.005929028615355492, 'timestamp': '2025-09-30 22:16:20.603878', 'step': 2482, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:20.637823', 'step': 2482, 'epoch': 2} {'type': 'loss', 'content': 0.007656347006559372, 'timestamp': '2025-09-30 22:16:20.650373', 'step': 2483, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:20.683612', 'step': 2483, 'epoch': 2} {'type': 'loss', 'content': 0.0049138241447508335, 'timestamp': '2025-09-30 22:16:20.714766', 'step': 2484, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:20.750051', 'step': 2484, 'epoch': 2} {'type': 'loss', 'content': 0.005719912238419056, 'timestamp': '2025-09-30 22:16:20.762751', 'step': 2485, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:20.799487', 'step': 2485, 'epoch': 2} {'type': 'loss', 'content': 0.011933359317481518, 'timestamp': '2025-09-30 22:16:20.812828', 'step': 2486, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:20.846310', 'step': 2486, 'epoch': 2} {'type': 'loss', 'content': 0.018113916739821434, 'timestamp': '2025-09-30 22:16:20.853880', 'step': 2487, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:20.890287', 'step': 2487, 'epoch': 2} {'type': 'loss', 'content': 0.012025549076497555, 'timestamp': '2025-09-30 22:16:20.918608', 'step': 2488, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:20.953535', 'step': 2488, 'epoch': 2} {'type': 'loss', 'content': 0.007998720742762089, 'timestamp': '2025-09-30 22:16:20.961927', 'step': 2489, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:20.997830', 'step': 2489, 'epoch': 2} {'type': 'loss', 'content': 0.005165472161024809, 'timestamp': '2025-09-30 22:16:21.010097', 'step': 2490, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:21.049700', 'step': 2490, 'epoch': 2} {'type': 'loss', 'content': 0.007589935790747404, 'timestamp': '2025-09-30 22:16:21.056953', 'step': 2491, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:21.092870', 'step': 2491, 'epoch': 2} {'type': 'loss', 'content': 0.0076933749951422215, 'timestamp': '2025-09-30 22:16:21.126056', 'step': 2492, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:21.160510', 'step': 2492, 'epoch': 2} {'type': 'loss', 'content': 0.0104972580447793, 'timestamp': '2025-09-30 22:16:21.166457', 'step': 2493, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:21.211090', 'step': 2493, 'epoch': 2} {'type': 'loss', 'content': 0.013154943473637104, 'timestamp': '2025-09-30 22:16:21.221173', 'step': 2494, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:21.255937', 'step': 2494, 'epoch': 2} {'type': 'loss', 'content': 0.011266938410699368, 'timestamp': '2025-09-30 22:16:21.262921', 'step': 2495, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:21.297006', 'step': 2495, 'epoch': 2} {'type': 'loss', 'content': 0.011694843880832195, 'timestamp': '2025-09-30 22:16:21.328117', 'step': 2496, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:21.364606', 'step': 2496, 'epoch': 2} {'type': 'loss', 'content': 0.006242073141038418, 'timestamp': '2025-09-30 22:16:21.372335', 'step': 2497, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:21.408912', 'step': 2497, 'epoch': 2} {'type': 'loss', 'content': 0.014575202949345112, 'timestamp': '2025-09-30 22:16:21.421445', 'step': 2498, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:21.457475', 'step': 2498, 'epoch': 2} {'type': 'loss', 'content': 0.006570762023329735, 'timestamp': '2025-09-30 22:16:21.467671', 'step': 2499, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:21.501720', 'step': 2499, 'epoch': 2} {'type': 'loss', 'content': 0.0061673750169575214, 'timestamp': '2025-09-30 22:16:21.530445', 'step': 2500, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2500', 'timestamp': '2025-09-30 22:16:26.366956', 'step': 2500, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:26.406129', 'step': 2500, 'epoch': 2} {'type': 'loss', 'content': 0.010745136067271233, 'timestamp': '2025-09-30 22:16:26.410270', 'step': 2501, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:26.442760', 'step': 2501, 'epoch': 2} {'type': 'loss', 'content': 0.00644065672531724, 'timestamp': '2025-09-30 22:16:26.449422', 'step': 2502, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:26.481597', 'step': 2502, 'epoch': 2} {'type': 'loss', 'content': 0.016653813421726227, 'timestamp': '2025-09-30 22:16:26.489325', 'step': 2503, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:26.527490', 'step': 2503, 'epoch': 2} {'type': 'loss', 'content': 0.011729870922863483, 'timestamp': '2025-09-30 22:16:26.556100', 'step': 2504, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:26.589616', 'step': 2504, 'epoch': 2} {'type': 'loss', 'content': 0.006627806928008795, 'timestamp': '2025-09-30 22:16:26.597553', 'step': 2505, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:26.638820', 'step': 2505, 'epoch': 2} {'type': 'loss', 'content': 0.005113981198519468, 'timestamp': '2025-09-30 22:16:26.651402', 'step': 2506, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:26.683533', 'step': 2506, 'epoch': 2} {'type': 'loss', 'content': 0.004428889602422714, 'timestamp': '2025-09-30 22:16:26.688899', 'step': 2507, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:26.726642', 'step': 2507, 'epoch': 2} {'type': 'loss', 'content': 0.025327321141958237, 'timestamp': '2025-09-30 22:16:26.754278', 'step': 2508, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:26.790840', 'step': 2508, 'epoch': 2} {'type': 'loss', 'content': 0.01675552688539028, 'timestamp': '2025-09-30 22:16:26.800592', 'step': 2509, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:26.835688', 'step': 2509, 'epoch': 2} {'type': 'loss', 'content': 0.0040430487133562565, 'timestamp': '2025-09-30 22:16:26.846124', 'step': 2510, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:26.886527', 'step': 2510, 'epoch': 2} {'type': 'loss', 'content': 0.007266952656209469, 'timestamp': '2025-09-30 22:16:26.899859', 'step': 2511, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:26.937660', 'step': 2511, 'epoch': 2} {'type': 'loss', 'content': 0.01092456839978695, 'timestamp': '2025-09-30 22:16:26.966426', 'step': 2512, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:26.998657', 'step': 2512, 'epoch': 2} {'type': 'loss', 'content': 0.00776576017960906, 'timestamp': '2025-09-30 22:16:27.006758', 'step': 2513, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:16:27.047301', 'step': 2513, 'epoch': 2} {'type': 'loss', 'content': 0.009044959209859371, 'timestamp': '2025-09-30 22:16:27.063161', 'step': 2514, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:27.102847', 'step': 2514, 'epoch': 2} {'type': 'loss', 'content': 0.005568441469222307, 'timestamp': '2025-09-30 22:16:27.116192', 'step': 2515, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:27.156933', 'step': 2515, 'epoch': 2} {'type': 'loss', 'content': 0.009347072802484035, 'timestamp': '2025-09-30 22:16:27.191520', 'step': 2516, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:27.233377', 'step': 2516, 'epoch': 2} {'type': 'loss', 'content': 0.008022931404411793, 'timestamp': '2025-09-30 22:16:27.246390', 'step': 2517, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:27.285188', 'step': 2517, 'epoch': 2} {'type': 'loss', 'content': 0.008891950361430645, 'timestamp': '2025-09-30 22:16:27.297429', 'step': 2518, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:27.330607', 'step': 2518, 'epoch': 2} {'type': 'loss', 'content': 0.00859333761036396, 'timestamp': '2025-09-30 22:16:27.338533', 'step': 2519, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:27.375467', 'step': 2519, 'epoch': 2} {'type': 'loss', 'content': 0.005585793871432543, 'timestamp': '2025-09-30 22:16:27.409989', 'step': 2520, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:27.449868', 'step': 2520, 'epoch': 2} {'type': 'loss', 'content': 0.010565678589046001, 'timestamp': '2025-09-30 22:16:27.462581', 'step': 2521, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:27.507750', 'step': 2521, 'epoch': 2} {'type': 'loss', 'content': 0.01040729507803917, 'timestamp': '2025-09-30 22:16:27.521433', 'step': 2522, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:27.559216', 'step': 2522, 'epoch': 2} {'type': 'loss', 'content': 0.006701262202113867, 'timestamp': '2025-09-30 22:16:27.565955', 'step': 2523, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:27.601023', 'step': 2523, 'epoch': 2} {'type': 'loss', 'content': 0.005230138543993235, 'timestamp': '2025-09-30 22:16:27.629372', 'step': 2524, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:16:27.667404', 'step': 2524, 'epoch': 2} {'type': 'loss', 'content': 0.010266603901982307, 'timestamp': '2025-09-30 22:16:27.682462', 'step': 2525, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:27.717250', 'step': 2525, 'epoch': 2} {'type': 'loss', 'content': 0.0036118924617767334, 'timestamp': '2025-09-30 22:16:27.727497', 'step': 2526, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:27.763559', 'step': 2526, 'epoch': 2} {'type': 'loss', 'content': 0.007853069342672825, 'timestamp': '2025-09-30 22:16:27.770406', 'step': 2527, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:27.808173', 'step': 2527, 'epoch': 2} {'type': 'loss', 'content': 0.00653937179595232, 'timestamp': '2025-09-30 22:16:27.839559', 'step': 2528, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:27.873332', 'step': 2528, 'epoch': 2} {'type': 'loss', 'content': 0.010696524754166603, 'timestamp': '2025-09-30 22:16:27.878801', 'step': 2529, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:27.919553', 'step': 2529, 'epoch': 2} {'type': 'loss', 'content': 0.014034731313586235, 'timestamp': '2025-09-30 22:16:27.931896', 'step': 2530, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:16:30.424563', 'step': 2530, 'epoch': 2} {'type': 'pplx', 'content': 5.654377316625182, 'timestamp': '2025-09-30 22:16:30.428687', 'step': 2530, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:30.464281', 'step': 2530, 'epoch': 2} {'type': 'loss', 'content': 0.00823940522968769, 'timestamp': '2025-09-30 22:16:30.471104', 'step': 2531, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:30.522552', 'step': 2531, 'epoch': 2} {'type': 'loss', 'content': 0.015429342165589333, 'timestamp': '2025-09-30 22:16:30.555856', 'step': 2532, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:30.592964', 'step': 2532, 'epoch': 2} {'type': 'loss', 'content': 0.008980813436210155, 'timestamp': '2025-09-30 22:16:30.605934', 'step': 2533, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:30.639244', 'step': 2533, 'epoch': 2} {'type': 'loss', 'content': 0.007303296122699976, 'timestamp': '2025-09-30 22:16:30.649415', 'step': 2534, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:30.685297', 'step': 2534, 'epoch': 2} {'type': 'loss', 'content': 0.013391010463237762, 'timestamp': '2025-09-30 22:16:30.695787', 'step': 2535, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:30.738871', 'step': 2535, 'epoch': 2} {'type': 'loss', 'content': 0.007490305695682764, 'timestamp': '2025-09-30 22:16:30.770795', 'step': 2536, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:30.810229', 'step': 2536, 'epoch': 2} {'type': 'loss', 'content': 0.0018074701074510813, 'timestamp': '2025-09-30 22:16:30.817979', 'step': 2537, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:30.852843', 'step': 2537, 'epoch': 2} {'type': 'loss', 'content': 0.012867008335888386, 'timestamp': '2025-09-30 22:16:30.859920', 'step': 2538, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:30.893288', 'step': 2538, 'epoch': 2} {'type': 'loss', 'content': 0.00832181703299284, 'timestamp': '2025-09-30 22:16:30.904228', 'step': 2539, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:16:30.945095', 'step': 2539, 'epoch': 2} {'type': 'loss', 'content': 0.00538244005292654, 'timestamp': '2025-09-30 22:16:30.982099', 'step': 2540, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:31.018686', 'step': 2540, 'epoch': 2} {'type': 'loss', 'content': 0.00587612995877862, 'timestamp': '2025-09-30 22:16:31.022409', 'step': 2541, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:31.062304', 'step': 2541, 'epoch': 2} {'type': 'loss', 'content': 0.016657264903187752, 'timestamp': '2025-09-30 22:16:31.069281', 'step': 2542, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:31.108102', 'step': 2542, 'epoch': 2} {'type': 'loss', 'content': 0.011934111826121807, 'timestamp': '2025-09-30 22:16:31.115634', 'step': 2543, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:31.147905', 'step': 2543, 'epoch': 2} {'type': 'loss', 'content': 0.005036134272813797, 'timestamp': '2025-09-30 22:16:31.179686', 'step': 2544, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:31.216022', 'step': 2544, 'epoch': 2} {'type': 'loss', 'content': 0.010882146656513214, 'timestamp': '2025-09-30 22:16:31.221624', 'step': 2545, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:31.253776', 'step': 2545, 'epoch': 2} {'type': 'loss', 'content': 0.01084787119179964, 'timestamp': '2025-09-30 22:16:31.264877', 'step': 2546, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:31.299924', 'step': 2546, 'epoch': 2} {'type': 'loss', 'content': 0.01012638583779335, 'timestamp': '2025-09-30 22:16:31.311991', 'step': 2547, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:31.345813', 'step': 2547, 'epoch': 2} {'type': 'loss', 'content': 0.006890283897519112, 'timestamp': '2025-09-30 22:16:31.373863', 'step': 2548, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:31.408098', 'step': 2548, 'epoch': 2} {'type': 'loss', 'content': 0.010032973252236843, 'timestamp': '2025-09-30 22:16:31.416516', 'step': 2549, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:31.449055', 'step': 2549, 'epoch': 2} {'type': 'loss', 'content': 0.011477479711174965, 'timestamp': '2025-09-30 22:16:31.455962', 'step': 2550, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:31.488570', 'step': 2550, 'epoch': 2} {'type': 'loss', 'content': 0.007900014519691467, 'timestamp': '2025-09-30 22:16:31.495300', 'step': 2551, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:31.532558', 'step': 2551, 'epoch': 2} {'type': 'loss', 'content': 0.015551622956991196, 'timestamp': '2025-09-30 22:16:31.560961', 'step': 2552, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:31.594401', 'step': 2552, 'epoch': 2} {'type': 'loss', 'content': 0.009452381171286106, 'timestamp': '2025-09-30 22:16:31.599608', 'step': 2553, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:31.644040', 'step': 2553, 'epoch': 2} {'type': 'loss', 'content': 0.004166701342910528, 'timestamp': '2025-09-30 22:16:31.651274', 'step': 2554, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:31.701101', 'step': 2554, 'epoch': 2} {'type': 'loss', 'content': 0.015731127932667732, 'timestamp': '2025-09-30 22:16:31.709088', 'step': 2555, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:31.745287', 'step': 2555, 'epoch': 2} {'type': 'loss', 'content': 0.004945465829223394, 'timestamp': '2025-09-30 22:16:31.776352', 'step': 2556, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:31.811329', 'step': 2556, 'epoch': 2} {'type': 'loss', 'content': 0.007961086928844452, 'timestamp': '2025-09-30 22:16:31.816408', 'step': 2557, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:31.853086', 'step': 2557, 'epoch': 2} {'type': 'loss', 'content': 0.008037255145609379, 'timestamp': '2025-09-30 22:16:31.863402', 'step': 2558, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:16:31.906718', 'step': 2558, 'epoch': 2} {'type': 'loss', 'content': 0.004936587065458298, 'timestamp': '2025-09-30 22:16:31.911001', 'step': 2559, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:31.949467', 'step': 2559, 'epoch': 2} {'type': 'loss', 'content': 0.015803245827555656, 'timestamp': '2025-09-30 22:16:31.974932', 'step': 2560, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:32.017608', 'step': 2560, 'epoch': 2} {'type': 'loss', 'content': 0.00856830459088087, 'timestamp': '2025-09-30 22:16:32.022313', 'step': 2561, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:32.079728', 'step': 2561, 'epoch': 2} {'type': 'loss', 'content': 0.02954977937042713, 'timestamp': '2025-09-30 22:16:32.086470', 'step': 2562, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:32.127790', 'step': 2562, 'epoch': 2} {'type': 'loss', 'content': 0.010794623754918575, 'timestamp': '2025-09-30 22:16:32.134859', 'step': 2563, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:32.167878', 'step': 2563, 'epoch': 2} {'type': 'loss', 'content': 0.0016124057583510876, 'timestamp': '2025-09-30 22:16:32.195796', 'step': 2564, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:32.233460', 'step': 2564, 'epoch': 2} {'type': 'loss', 'content': 0.0062332903034985065, 'timestamp': '2025-09-30 22:16:32.238652', 'step': 2565, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:32.272115', 'step': 2565, 'epoch': 2} {'type': 'loss', 'content': 0.002981910016387701, 'timestamp': '2025-09-30 22:16:32.279156', 'step': 2566, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:32.314210', 'step': 2566, 'epoch': 2} {'type': 'loss', 'content': 0.008241718634963036, 'timestamp': '2025-09-30 22:16:32.321503', 'step': 2567, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:16:32.362712', 'step': 2567, 'epoch': 2} {'type': 'loss', 'content': 0.007429973687976599, 'timestamp': '2025-09-30 22:16:32.399485', 'step': 2568, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:32.433979', 'step': 2568, 'epoch': 2} {'type': 'loss', 'content': 0.011728125624358654, 'timestamp': '2025-09-30 22:16:32.439461', 'step': 2569, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:32.475116', 'step': 2569, 'epoch': 2} {'type': 'loss', 'content': 0.008637667633593082, 'timestamp': '2025-09-30 22:16:32.482225', 'step': 2570, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:32.518689', 'step': 2570, 'epoch': 2} {'type': 'loss', 'content': 0.009170379489660263, 'timestamp': '2025-09-30 22:16:32.529468', 'step': 2571, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:32.570697', 'step': 2571, 'epoch': 2} {'type': 'loss', 'content': 0.005199885461479425, 'timestamp': '2025-09-30 22:16:32.605392', 'step': 2572, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:32.641528', 'step': 2572, 'epoch': 2} {'type': 'loss', 'content': 0.020000383257865906, 'timestamp': '2025-09-30 22:16:32.650019', 'step': 2573, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:32.686321', 'step': 2573, 'epoch': 2} {'type': 'loss', 'content': 0.008462299592792988, 'timestamp': '2025-09-30 22:16:32.693378', 'step': 2574, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:32.728326', 'step': 2574, 'epoch': 2} {'type': 'loss', 'content': 0.004085686057806015, 'timestamp': '2025-09-30 22:16:32.735849', 'step': 2575, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:32.772599', 'step': 2575, 'epoch': 2} {'type': 'loss', 'content': 0.013277919963002205, 'timestamp': '2025-09-30 22:16:32.803629', 'step': 2576, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:32.844427', 'step': 2576, 'epoch': 2} {'type': 'loss', 'content': 0.005645753815770149, 'timestamp': '2025-09-30 22:16:32.850083', 'step': 2577, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:32.888253', 'step': 2577, 'epoch': 2} {'type': 'loss', 'content': 0.0074727097526192665, 'timestamp': '2025-09-30 22:16:32.900712', 'step': 2578, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:32.941864', 'step': 2578, 'epoch': 2} {'type': 'loss', 'content': 0.007461579516530037, 'timestamp': '2025-09-30 22:16:32.952756', 'step': 2579, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:32.987664', 'step': 2579, 'epoch': 2} {'type': 'loss', 'content': 0.012325400486588478, 'timestamp': '2025-09-30 22:16:33.015601', 'step': 2580, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:16:33.054659', 'step': 2580, 'epoch': 2} {'type': 'loss', 'content': 0.00944701861590147, 'timestamp': '2025-09-30 22:16:33.069755', 'step': 2581, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:33.105362', 'step': 2581, 'epoch': 2} {'type': 'loss', 'content': 0.012234282679855824, 'timestamp': '2025-09-30 22:16:33.113181', 'step': 2582, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:33.149167', 'step': 2582, 'epoch': 2} {'type': 'loss', 'content': 0.0048253824934363365, 'timestamp': '2025-09-30 22:16:33.159244', 'step': 2583, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:33.194481', 'step': 2583, 'epoch': 2} {'type': 'loss', 'content': 0.013428851962089539, 'timestamp': '2025-09-30 22:16:33.225744', 'step': 2584, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:33.267188', 'step': 2584, 'epoch': 2} {'type': 'loss', 'content': 0.011841162107884884, 'timestamp': '2025-09-30 22:16:33.272662', 'step': 2585, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:16:33.317346', 'step': 2585, 'epoch': 2} {'type': 'loss', 'content': 0.010258554480969906, 'timestamp': '2025-09-30 22:16:33.334686', 'step': 2586, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:33.370633', 'step': 2586, 'epoch': 2} {'type': 'loss', 'content': 0.005433372687548399, 'timestamp': '2025-09-30 22:16:33.382688', 'step': 2587, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:33.418001', 'step': 2587, 'epoch': 2} {'type': 'loss', 'content': 0.017100220546126366, 'timestamp': '2025-09-30 22:16:33.446271', 'step': 2588, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:33.480058', 'step': 2588, 'epoch': 2} {'type': 'loss', 'content': 0.009693666361272335, 'timestamp': '2025-09-30 22:16:33.490248', 'step': 2589, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:33.524849', 'step': 2589, 'epoch': 2} {'type': 'loss', 'content': 0.01575091853737831, 'timestamp': '2025-09-30 22:16:33.532749', 'step': 2590, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:33.566359', 'step': 2590, 'epoch': 2} {'type': 'loss', 'content': 0.013555164448916912, 'timestamp': '2025-09-30 22:16:33.573939', 'step': 2591, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:33.620980', 'step': 2591, 'epoch': 2} {'type': 'loss', 'content': 0.013175432570278645, 'timestamp': '2025-09-30 22:16:33.649725', 'step': 2592, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:33.685842', 'step': 2592, 'epoch': 2} {'type': 'loss', 'content': 0.007773387245833874, 'timestamp': '2025-09-30 22:16:33.694664', 'step': 2593, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:33.730637', 'step': 2593, 'epoch': 2} {'type': 'loss', 'content': 0.008757934905588627, 'timestamp': '2025-09-30 22:16:33.741739', 'step': 2594, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:33.777947', 'step': 2594, 'epoch': 2} {'type': 'loss', 'content': 0.005879280623048544, 'timestamp': '2025-09-30 22:16:33.788979', 'step': 2595, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:33.829584', 'step': 2595, 'epoch': 2} {'type': 'loss', 'content': 0.007139183115214109, 'timestamp': '2025-09-30 22:16:33.864212', 'step': 2596, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:33.897771', 'step': 2596, 'epoch': 2} {'type': 'loss', 'content': 0.008979469537734985, 'timestamp': '2025-09-30 22:16:33.906208', 'step': 2597, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:33.949431', 'step': 2597, 'epoch': 2} {'type': 'loss', 'content': 0.010954844765365124, 'timestamp': '2025-09-30 22:16:33.960462', 'step': 2598, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:33.995326', 'step': 2598, 'epoch': 2} {'type': 'loss', 'content': 0.008132541552186012, 'timestamp': '2025-09-30 22:16:34.006472', 'step': 2599, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:34.047075', 'step': 2599, 'epoch': 2} {'type': 'loss', 'content': 0.008435402065515518, 'timestamp': '2025-09-30 22:16:34.080552', 'step': 2600, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:34.123059', 'step': 2600, 'epoch': 2} {'type': 'loss', 'content': 0.019805217161774635, 'timestamp': '2025-09-30 22:16:34.131148', 'step': 2601, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:34.174337', 'step': 2601, 'epoch': 2} {'type': 'loss', 'content': 0.008815511129796505, 'timestamp': '2025-09-30 22:16:34.185605', 'step': 2602, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:34.224405', 'step': 2602, 'epoch': 2} {'type': 'loss', 'content': 0.007207936607301235, 'timestamp': '2025-09-30 22:16:34.238341', 'step': 2603, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:34.275919', 'step': 2603, 'epoch': 2} {'type': 'loss', 'content': 0.017533591017127037, 'timestamp': '2025-09-30 22:16:34.310609', 'step': 2604, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:34.348011', 'step': 2604, 'epoch': 2} {'type': 'loss', 'content': 0.008848981000483036, 'timestamp': '2025-09-30 22:16:34.361129', 'step': 2605, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:34.394621', 'step': 2605, 'epoch': 2} {'type': 'loss', 'content': 0.006609481293708086, 'timestamp': '2025-09-30 22:16:34.404896', 'step': 2606, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:34.441784', 'step': 2606, 'epoch': 2} {'type': 'loss', 'content': 0.005682497750967741, 'timestamp': '2025-09-30 22:16:34.455550', 'step': 2607, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:34.493287', 'step': 2607, 'epoch': 2} {'type': 'loss', 'content': 0.010640106163918972, 'timestamp': '2025-09-30 22:16:34.525326', 'step': 2608, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:34.566598', 'step': 2608, 'epoch': 2} {'type': 'loss', 'content': 0.011010797694325447, 'timestamp': '2025-09-30 22:16:34.576558', 'step': 2609, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 18984411776512}, 'timestamp': '2025-09-30 22:16:34.641726', 'step': 2609, 'epoch': 2} {'type': 'loss', 'content': 0.004885141737759113, 'timestamp': '2025-09-30 22:16:34.663503', 'step': 2610, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:34.700772', 'step': 2610, 'epoch': 2} {'type': 'loss', 'content': 0.0041885981336236, 'timestamp': '2025-09-30 22:16:34.711163', 'step': 2611, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:34.746051', 'step': 2611, 'epoch': 2} {'type': 'loss', 'content': 0.007177658379077911, 'timestamp': '2025-09-30 22:16:34.771359', 'step': 2612, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:34.819069', 'step': 2612, 'epoch': 2} {'type': 'loss', 'content': 0.02318485453724861, 'timestamp': '2025-09-30 22:16:34.823663', 'step': 2613, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:34.869241', 'step': 2613, 'epoch': 2} {'type': 'loss', 'content': 0.003985985182225704, 'timestamp': '2025-09-30 22:16:34.876722', 'step': 2614, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:34.915102', 'step': 2614, 'epoch': 2} {'type': 'loss', 'content': 0.012808909639716148, 'timestamp': '2025-09-30 22:16:34.922383', 'step': 2615, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:34.965188', 'step': 2615, 'epoch': 2} {'type': 'loss', 'content': 0.0033121935557574034, 'timestamp': '2025-09-30 22:16:34.999702', 'step': 2616, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:35.051946', 'step': 2616, 'epoch': 2} {'type': 'loss', 'content': 0.008870789781212807, 'timestamp': '2025-09-30 22:16:35.060216', 'step': 2617, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:35.099110', 'step': 2617, 'epoch': 2} {'type': 'loss', 'content': 0.007540592923760414, 'timestamp': '2025-09-30 22:16:35.112465', 'step': 2618, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:35.158616', 'step': 2618, 'epoch': 2} {'type': 'loss', 'content': 0.006059629376977682, 'timestamp': '2025-09-30 22:16:35.172077', 'step': 2619, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:35.213335', 'step': 2619, 'epoch': 2} {'type': 'loss', 'content': 0.007945427671074867, 'timestamp': '2025-09-30 22:16:35.248076', 'step': 2620, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:35.284637', 'step': 2620, 'epoch': 2} {'type': 'loss', 'content': 0.010467395186424255, 'timestamp': '2025-09-30 22:16:35.297938', 'step': 2621, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:35.336130', 'step': 2621, 'epoch': 2} {'type': 'loss', 'content': 0.013638225384056568, 'timestamp': '2025-09-30 22:16:35.344127', 'step': 2622, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:35.382241', 'step': 2622, 'epoch': 2} {'type': 'loss', 'content': 0.011790476739406586, 'timestamp': '2025-09-30 22:16:35.389735', 'step': 2623, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:35.429339', 'step': 2623, 'epoch': 2} {'type': 'loss', 'content': 0.010595177300274372, 'timestamp': '2025-09-30 22:16:35.462690', 'step': 2624, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:16:35.504914', 'step': 2624, 'epoch': 2} {'type': 'loss', 'content': 0.004159559495747089, 'timestamp': '2025-09-30 22:16:35.520372', 'step': 2625, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:35.564123', 'step': 2625, 'epoch': 2} {'type': 'loss', 'content': 0.0123995216563344, 'timestamp': '2025-09-30 22:16:35.572155', 'step': 2626, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:35.611513', 'step': 2626, 'epoch': 2} {'type': 'loss', 'content': 0.007546386681497097, 'timestamp': '2025-09-30 22:16:35.624905', 'step': 2627, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:35.664135', 'step': 2627, 'epoch': 2} {'type': 'loss', 'content': 0.02181871049106121, 'timestamp': '2025-09-30 22:16:35.697610', 'step': 2628, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:35.738846', 'step': 2628, 'epoch': 2} {'type': 'loss', 'content': 0.008894718252122402, 'timestamp': '2025-09-30 22:16:35.746893', 'step': 2629, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:35.789294', 'step': 2629, 'epoch': 2} {'type': 'loss', 'content': 0.011142316274344921, 'timestamp': '2025-09-30 22:16:35.796547', 'step': 2630, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-09-30 22:16:35.847596', 'step': 2630, 'epoch': 2} {'type': 'loss', 'content': 0.005113878753036261, 'timestamp': '2025-09-30 22:16:35.866635', 'step': 2631, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:35.906628', 'step': 2631, 'epoch': 2} {'type': 'loss', 'content': 0.011992032639682293, 'timestamp': '2025-09-30 22:16:35.934679', 'step': 2632, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:35.970105', 'step': 2632, 'epoch': 2} {'type': 'loss', 'content': 0.006162740755826235, 'timestamp': '2025-09-30 22:16:35.975656', 'step': 2633, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:36.013098', 'step': 2633, 'epoch': 2} {'type': 'loss', 'content': 0.008410664275288582, 'timestamp': '2025-09-30 22:16:36.026853', 'step': 2634, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:36.065476', 'step': 2634, 'epoch': 2} {'type': 'loss', 'content': 0.01313704438507557, 'timestamp': '2025-09-30 22:16:36.073020', 'step': 2635, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:36.106648', 'step': 2635, 'epoch': 2} {'type': 'loss', 'content': 0.008390288800001144, 'timestamp': '2025-09-30 22:16:36.135544', 'step': 2636, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:36.169330', 'step': 2636, 'epoch': 2} {'type': 'loss', 'content': 0.007198525592684746, 'timestamp': '2025-09-30 22:16:36.179917', 'step': 2637, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:36.217112', 'step': 2637, 'epoch': 2} {'type': 'loss', 'content': 0.008321291767060757, 'timestamp': '2025-09-30 22:16:36.227512', 'step': 2638, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:36.266192', 'step': 2638, 'epoch': 2} {'type': 'loss', 'content': 0.012173098511993885, 'timestamp': '2025-09-30 22:16:36.274101', 'step': 2639, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:36.309567', 'step': 2639, 'epoch': 2} {'type': 'loss', 'content': 0.00847768783569336, 'timestamp': '2025-09-30 22:16:36.340857', 'step': 2640, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:36.375885', 'step': 2640, 'epoch': 2} {'type': 'loss', 'content': 0.012957733124494553, 'timestamp': '2025-09-30 22:16:36.386338', 'step': 2641, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:36.426287', 'step': 2641, 'epoch': 2} {'type': 'loss', 'content': 0.010323571972548962, 'timestamp': '2025-09-30 22:16:36.433281', 'step': 2642, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:36.467722', 'step': 2642, 'epoch': 2} {'type': 'loss', 'content': 0.0029670840594917536, 'timestamp': '2025-09-30 22:16:36.475774', 'step': 2643, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:36.512189', 'step': 2643, 'epoch': 2} {'type': 'loss', 'content': 0.004911190830171108, 'timestamp': '2025-09-30 22:16:36.540758', 'step': 2644, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:36.583105', 'step': 2644, 'epoch': 2} {'type': 'loss', 'content': 0.008394652977585793, 'timestamp': '2025-09-30 22:16:36.591675', 'step': 2645, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:16:39.149327', 'step': 2645, 'epoch': 2} {'type': 'pplx', 'content': 5.612307348274852, 'timestamp': '2025-09-30 22:16:39.152502', 'step': 2645, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:39.188349', 'step': 2645, 'epoch': 2} {'type': 'loss', 'content': 0.005680200643837452, 'timestamp': '2025-09-30 22:16:39.195372', 'step': 2646, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:39.230170', 'step': 2646, 'epoch': 2} {'type': 'loss', 'content': 0.010132947005331516, 'timestamp': '2025-09-30 22:16:39.242355', 'step': 2647, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:39.283548', 'step': 2647, 'epoch': 2} {'type': 'loss', 'content': 0.0077790007926523685, 'timestamp': '2025-09-30 22:16:39.314936', 'step': 2648, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:39.348484', 'step': 2648, 'epoch': 2} {'type': 'loss', 'content': 0.012910939753055573, 'timestamp': '2025-09-30 22:16:39.359645', 'step': 2649, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:39.393021', 'step': 2649, 'epoch': 2} {'type': 'loss', 'content': 0.00460228743031621, 'timestamp': '2025-09-30 22:16:39.400694', 'step': 2650, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:16:39.452819', 'step': 2650, 'epoch': 2} {'type': 'loss', 'content': 0.008654727600514889, 'timestamp': '2025-09-30 22:16:39.468445', 'step': 2651, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:39.512822', 'step': 2651, 'epoch': 2} {'type': 'loss', 'content': 0.010901113040745258, 'timestamp': '2025-09-30 22:16:39.540963', 'step': 2652, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:39.583514', 'step': 2652, 'epoch': 2} {'type': 'loss', 'content': 0.006359513849020004, 'timestamp': '2025-09-30 22:16:39.596781', 'step': 2653, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:16:39.646496', 'step': 2653, 'epoch': 2} {'type': 'loss', 'content': 0.004073025193065405, 'timestamp': '2025-09-30 22:16:39.663926', 'step': 2654, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:39.709793', 'step': 2654, 'epoch': 2} {'type': 'loss', 'content': 0.0075466991402208805, 'timestamp': '2025-09-30 22:16:39.717915', 'step': 2655, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:39.765792', 'step': 2655, 'epoch': 2} {'type': 'loss', 'content': 0.005887266248464584, 'timestamp': '2025-09-30 22:16:39.796961', 'step': 2656, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:39.847171', 'step': 2656, 'epoch': 2} {'type': 'loss', 'content': 0.005093955434858799, 'timestamp': '2025-09-30 22:16:39.860311', 'step': 2657, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:39.901962', 'step': 2657, 'epoch': 2} {'type': 'loss', 'content': 0.008431058377027512, 'timestamp': '2025-09-30 22:16:39.913189', 'step': 2658, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:39.950678', 'step': 2658, 'epoch': 2} {'type': 'loss', 'content': 0.015701932832598686, 'timestamp': '2025-09-30 22:16:39.958670', 'step': 2659, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:39.999911', 'step': 2659, 'epoch': 2} {'type': 'loss', 'content': 0.008542469702661037, 'timestamp': '2025-09-30 22:16:40.034803', 'step': 2660, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:40.069636', 'step': 2660, 'epoch': 2} {'type': 'loss', 'content': 0.00910536665469408, 'timestamp': '2025-09-30 22:16:40.080488', 'step': 2661, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:40.127745', 'step': 2661, 'epoch': 2} {'type': 'loss', 'content': 0.008821639232337475, 'timestamp': '2025-09-30 22:16:40.138748', 'step': 2662, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:40.186129', 'step': 2662, 'epoch': 2} {'type': 'loss', 'content': 0.010496851988136768, 'timestamp': '2025-09-30 22:16:40.200138', 'step': 2663, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:40.243407', 'step': 2663, 'epoch': 2} {'type': 'loss', 'content': 0.012230618856847286, 'timestamp': '2025-09-30 22:16:40.274225', 'step': 2664, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:40.325452', 'step': 2664, 'epoch': 2} {'type': 'loss', 'content': 0.010552698746323586, 'timestamp': '2025-09-30 22:16:40.336039', 'step': 2665, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:40.392450', 'step': 2665, 'epoch': 2} {'type': 'loss', 'content': 0.008086116053164005, 'timestamp': '2025-09-30 22:16:40.400064', 'step': 2666, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:40.437674', 'step': 2666, 'epoch': 2} {'type': 'loss', 'content': 0.011528640985488892, 'timestamp': '2025-09-30 22:16:40.445368', 'step': 2667, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:40.484549', 'step': 2667, 'epoch': 2} {'type': 'loss', 'content': 0.005334731191396713, 'timestamp': '2025-09-30 22:16:40.515958', 'step': 2668, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:40.555628', 'step': 2668, 'epoch': 2} {'type': 'loss', 'content': 0.015003271400928497, 'timestamp': '2025-09-30 22:16:40.561228', 'step': 2669, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:40.602401', 'step': 2669, 'epoch': 2} {'type': 'loss', 'content': 0.004248283337801695, 'timestamp': '2025-09-30 22:16:40.614950', 'step': 2670, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:40.658511', 'step': 2670, 'epoch': 2} {'type': 'loss', 'content': 0.006140046752989292, 'timestamp': '2025-09-30 22:16:40.671079', 'step': 2671, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:40.707833', 'step': 2671, 'epoch': 2} {'type': 'loss', 'content': 0.006364536006003618, 'timestamp': '2025-09-30 22:16:40.739813', 'step': 2672, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:40.776091', 'step': 2672, 'epoch': 2} {'type': 'loss', 'content': 0.008775352500379086, 'timestamp': '2025-09-30 22:16:40.781275', 'step': 2673, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:40.827701', 'step': 2673, 'epoch': 2} {'type': 'loss', 'content': 0.006313290912657976, 'timestamp': '2025-09-30 22:16:40.838836', 'step': 2674, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:40.882400', 'step': 2674, 'epoch': 2} {'type': 'loss', 'content': 0.008130986243486404, 'timestamp': '2025-09-30 22:16:40.896139', 'step': 2675, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:40.933361', 'step': 2675, 'epoch': 2} {'type': 'loss', 'content': 0.009295792318880558, 'timestamp': '2025-09-30 22:16:40.964773', 'step': 2676, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:41.001906', 'step': 2676, 'epoch': 2} {'type': 'loss', 'content': 0.015373509377241135, 'timestamp': '2025-09-30 22:16:41.007655', 'step': 2677, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:41.046913', 'step': 2677, 'epoch': 2} {'type': 'loss', 'content': 0.005237262696027756, 'timestamp': '2025-09-30 22:16:41.058163', 'step': 2678, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:41.098783', 'step': 2678, 'epoch': 2} {'type': 'loss', 'content': 0.02086806111037731, 'timestamp': '2025-09-30 22:16:41.109197', 'step': 2679, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:41.146735', 'step': 2679, 'epoch': 2} {'type': 'loss', 'content': 0.006491804029792547, 'timestamp': '2025-09-30 22:16:41.179913', 'step': 2680, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:41.224429', 'step': 2680, 'epoch': 2} {'type': 'loss', 'content': 0.008275783620774746, 'timestamp': '2025-09-30 22:16:41.233398', 'step': 2681, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:41.272106', 'step': 2681, 'epoch': 2} {'type': 'loss', 'content': 0.006674888078123331, 'timestamp': '2025-09-30 22:16:41.280478', 'step': 2682, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:41.319284', 'step': 2682, 'epoch': 2} {'type': 'loss', 'content': 0.00760753033682704, 'timestamp': '2025-09-30 22:16:41.326584', 'step': 2683, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:41.362576', 'step': 2683, 'epoch': 2} {'type': 'loss', 'content': 0.013524006120860577, 'timestamp': '2025-09-30 22:16:41.392952', 'step': 2684, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:41.429735', 'step': 2684, 'epoch': 2} {'type': 'loss', 'content': 0.011065523140132427, 'timestamp': '2025-09-30 22:16:41.446644', 'step': 2685, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:41.483438', 'step': 2685, 'epoch': 2} {'type': 'loss', 'content': 0.009008095599710941, 'timestamp': '2025-09-30 22:16:41.496004', 'step': 2686, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:41.538101', 'step': 2686, 'epoch': 2} {'type': 'loss', 'content': 0.00455522770062089, 'timestamp': '2025-09-30 22:16:41.551830', 'step': 2687, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:41.586169', 'step': 2687, 'epoch': 2} {'type': 'loss', 'content': 0.008027766831219196, 'timestamp': '2025-09-30 22:16:41.614628', 'step': 2688, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:41.657018', 'step': 2688, 'epoch': 2} {'type': 'loss', 'content': 0.008173450827598572, 'timestamp': '2025-09-30 22:16:41.664118', 'step': 2689, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:41.701175', 'step': 2689, 'epoch': 2} {'type': 'loss', 'content': 0.008348836563527584, 'timestamp': '2025-09-30 22:16:41.711587', 'step': 2690, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:41.746174', 'step': 2690, 'epoch': 2} {'type': 'loss', 'content': 0.0071425591595470905, 'timestamp': '2025-09-30 22:16:41.757208', 'step': 2691, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:41.788668', 'step': 2691, 'epoch': 2} {'type': 'loss', 'content': 0.009231861680746078, 'timestamp': '2025-09-30 22:16:41.817686', 'step': 2692, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 15662185694400}, 'timestamp': '2025-09-30 22:16:41.864045', 'step': 2692, 'epoch': 2} {'type': 'loss', 'content': 0.006422586273401976, 'timestamp': '2025-09-30 22:16:41.883276', 'step': 2693, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:41.918044', 'step': 2693, 'epoch': 2} {'type': 'loss', 'content': 0.006415804382413626, 'timestamp': '2025-09-30 22:16:41.930613', 'step': 2694, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:41.975285', 'step': 2694, 'epoch': 2} {'type': 'loss', 'content': 0.011909106746315956, 'timestamp': '2025-09-30 22:16:41.989060', 'step': 2695, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:42.026533', 'step': 2695, 'epoch': 2} {'type': 'loss', 'content': 0.004883012734353542, 'timestamp': '2025-09-30 22:16:42.060805', 'step': 2696, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:42.096110', 'step': 2696, 'epoch': 2} {'type': 'loss', 'content': 0.00944979302585125, 'timestamp': '2025-09-30 22:16:42.105923', 'step': 2697, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:42.147367', 'step': 2697, 'epoch': 2} {'type': 'loss', 'content': 0.008571948856115341, 'timestamp': '2025-09-30 22:16:42.161074', 'step': 2698, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:42.196069', 'step': 2698, 'epoch': 2} {'type': 'loss', 'content': 0.009616751223802567, 'timestamp': '2025-09-30 22:16:42.208376', 'step': 2699, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:42.246236', 'step': 2699, 'epoch': 2} {'type': 'loss', 'content': 0.0077133565209805965, 'timestamp': '2025-09-30 22:16:42.279415', 'step': 2700, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:42.317520', 'step': 2700, 'epoch': 2} {'type': 'loss', 'content': 0.018032224848866463, 'timestamp': '2025-09-30 22:16:42.327538', 'step': 2701, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:42.369361', 'step': 2701, 'epoch': 2} {'type': 'loss', 'content': 0.0063260579481720924, 'timestamp': '2025-09-30 22:16:42.381698', 'step': 2702, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:42.422675', 'step': 2702, 'epoch': 2} {'type': 'loss', 'content': 0.013565266504883766, 'timestamp': '2025-09-30 22:16:42.436662', 'step': 2703, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:42.473166', 'step': 2703, 'epoch': 2} {'type': 'loss', 'content': 0.004449347034096718, 'timestamp': '2025-09-30 22:16:42.502072', 'step': 2704, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:42.548352', 'step': 2704, 'epoch': 2} {'type': 'loss', 'content': 0.005007654894143343, 'timestamp': '2025-09-30 22:16:42.561721', 'step': 2705, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:42.602682', 'step': 2705, 'epoch': 2} {'type': 'loss', 'content': 0.007996181957423687, 'timestamp': '2025-09-30 22:16:42.615244', 'step': 2706, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:42.653937', 'step': 2706, 'epoch': 2} {'type': 'loss', 'content': 0.008707849308848381, 'timestamp': '2025-09-30 22:16:42.661435', 'step': 2707, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:42.700833', 'step': 2707, 'epoch': 2} {'type': 'loss', 'content': 0.010890424251556396, 'timestamp': '2025-09-30 22:16:42.736095', 'step': 2708, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:42.786650', 'step': 2708, 'epoch': 2} {'type': 'loss', 'content': 0.006541731301695108, 'timestamp': '2025-09-30 22:16:42.797260', 'step': 2709, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:42.834658', 'step': 2709, 'epoch': 2} {'type': 'loss', 'content': 0.006175168789923191, 'timestamp': '2025-09-30 22:16:42.848477', 'step': 2710, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:42.884532', 'step': 2710, 'epoch': 2} {'type': 'loss', 'content': 0.008247343823313713, 'timestamp': '2025-09-30 22:16:42.896901', 'step': 2711, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:42.930147', 'step': 2711, 'epoch': 2} {'type': 'loss', 'content': 0.0034400750882923603, 'timestamp': '2025-09-30 22:16:42.963365', 'step': 2712, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:42.999823', 'step': 2712, 'epoch': 2} {'type': 'loss', 'content': 0.009791983291506767, 'timestamp': '2025-09-30 22:16:43.009942', 'step': 2713, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:43.044919', 'step': 2713, 'epoch': 2} {'type': 'loss', 'content': 0.009476681239902973, 'timestamp': '2025-09-30 22:16:43.057276', 'step': 2714, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:43.100374', 'step': 2714, 'epoch': 2} {'type': 'loss', 'content': 0.010845120064914227, 'timestamp': '2025-09-30 22:16:43.111488', 'step': 2715, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:43.146675', 'step': 2715, 'epoch': 2} {'type': 'loss', 'content': 0.010124566033482552, 'timestamp': '2025-09-30 22:16:43.175100', 'step': 2716, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:43.211733', 'step': 2716, 'epoch': 2} {'type': 'loss', 'content': 0.0077261775732040405, 'timestamp': '2025-09-30 22:16:43.225104', 'step': 2717, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:43.261376', 'step': 2717, 'epoch': 2} {'type': 'loss', 'content': 0.0072675361298024654, 'timestamp': '2025-09-30 22:16:43.275145', 'step': 2718, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:43.307553', 'step': 2718, 'epoch': 2} {'type': 'loss', 'content': 0.006441792938858271, 'timestamp': '2025-09-30 22:16:43.320131', 'step': 2719, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:43.367749', 'step': 2719, 'epoch': 2} {'type': 'loss', 'content': 0.010015995241701603, 'timestamp': '2025-09-30 22:16:43.401932', 'step': 2720, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:43.437689', 'step': 2720, 'epoch': 2} {'type': 'loss', 'content': 0.005246052052825689, 'timestamp': '2025-09-30 22:16:43.447698', 'step': 2721, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:43.488488', 'step': 2721, 'epoch': 2} {'type': 'loss', 'content': 0.007146949879825115, 'timestamp': '2025-09-30 22:16:43.498983', 'step': 2722, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:43.530778', 'step': 2722, 'epoch': 2} {'type': 'loss', 'content': 0.02003488689661026, 'timestamp': '2025-09-30 22:16:43.537963', 'step': 2723, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:43.578736', 'step': 2723, 'epoch': 2} {'type': 'loss', 'content': 0.0034312934149056673, 'timestamp': '2025-09-30 22:16:43.613536', 'step': 2724, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:43.660497', 'step': 2724, 'epoch': 2} {'type': 'loss', 'content': 0.02004372887313366, 'timestamp': '2025-09-30 22:16:43.670668', 'step': 2725, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:43.708211', 'step': 2725, 'epoch': 2} {'type': 'loss', 'content': 0.010580234229564667, 'timestamp': '2025-09-30 22:16:43.721930', 'step': 2726, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:43.761115', 'step': 2726, 'epoch': 2} {'type': 'loss', 'content': 0.008645176887512207, 'timestamp': '2025-09-30 22:16:43.773466', 'step': 2727, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:43.809014', 'step': 2727, 'epoch': 2} {'type': 'loss', 'content': 0.008177714422345161, 'timestamp': '2025-09-30 22:16:43.841055', 'step': 2728, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:43.884528', 'step': 2728, 'epoch': 2} {'type': 'loss', 'content': 0.011197710409760475, 'timestamp': '2025-09-30 22:16:43.890073', 'step': 2729, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:43.935397', 'step': 2729, 'epoch': 2} {'type': 'loss', 'content': 0.009342173114418983, 'timestamp': '2025-09-30 22:16:43.949069', 'step': 2730, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:43.988455', 'step': 2730, 'epoch': 2} {'type': 'loss', 'content': 0.006484350189566612, 'timestamp': '2025-09-30 22:16:44.000833', 'step': 2731, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:16:44.046982', 'step': 2731, 'epoch': 2} {'type': 'loss', 'content': 0.00473699951544404, 'timestamp': '2025-09-30 22:16:44.083426', 'step': 2732, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:44.135962', 'step': 2732, 'epoch': 2} {'type': 'loss', 'content': 0.00753026595339179, 'timestamp': '2025-09-30 22:16:44.146676', 'step': 2733, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:44.181040', 'step': 2733, 'epoch': 2} {'type': 'loss', 'content': 0.009313041344285011, 'timestamp': '2025-09-30 22:16:44.193360', 'step': 2734, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:44.227806', 'step': 2734, 'epoch': 2} {'type': 'loss', 'content': 0.007867696695029736, 'timestamp': '2025-09-30 22:16:44.234981', 'step': 2735, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:44.277342', 'step': 2735, 'epoch': 2} {'type': 'loss', 'content': 0.007439819164574146, 'timestamp': '2025-09-30 22:16:44.305482', 'step': 2736, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:44.339209', 'step': 2736, 'epoch': 2} {'type': 'loss', 'content': 0.0045057861134409904, 'timestamp': '2025-09-30 22:16:44.343976', 'step': 2737, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:44.378691', 'step': 2737, 'epoch': 2} {'type': 'loss', 'content': 0.002972115995362401, 'timestamp': '2025-09-30 22:16:44.386613', 'step': 2738, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:44.419412', 'step': 2738, 'epoch': 2} {'type': 'loss', 'content': 0.004463300108909607, 'timestamp': '2025-09-30 22:16:44.429801', 'step': 2739, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:44.468385', 'step': 2739, 'epoch': 2} {'type': 'loss', 'content': 0.006214289925992489, 'timestamp': '2025-09-30 22:16:44.500414', 'step': 2740, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:44.542639', 'step': 2740, 'epoch': 2} {'type': 'loss', 'content': 0.004895984660834074, 'timestamp': '2025-09-30 22:16:44.551585', 'step': 2741, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:44.600086', 'step': 2741, 'epoch': 2} {'type': 'loss', 'content': 0.006714960560202599, 'timestamp': '2025-09-30 22:16:44.613504', 'step': 2742, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:44.653997', 'step': 2742, 'epoch': 2} {'type': 'loss', 'content': 0.00872341264039278, 'timestamp': '2025-09-30 22:16:44.665210', 'step': 2743, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:44.699864', 'step': 2743, 'epoch': 2} {'type': 'loss', 'content': 0.002013713587075472, 'timestamp': '2025-09-30 22:16:44.727754', 'step': 2744, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:44.765587', 'step': 2744, 'epoch': 2} {'type': 'loss', 'content': 0.014771471731364727, 'timestamp': '2025-09-30 22:16:44.774245', 'step': 2745, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:44.817125', 'step': 2745, 'epoch': 2} {'type': 'loss', 'content': 0.011147224344313145, 'timestamp': '2025-09-30 22:16:44.828414', 'step': 2746, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:44.863442', 'step': 2746, 'epoch': 2} {'type': 'loss', 'content': 0.006471259519457817, 'timestamp': '2025-09-30 22:16:44.873917', 'step': 2747, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:44.908340', 'step': 2747, 'epoch': 2} {'type': 'loss', 'content': 0.004266827832907438, 'timestamp': '2025-09-30 22:16:44.936098', 'step': 2748, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:44.981796', 'step': 2748, 'epoch': 2} {'type': 'loss', 'content': 0.004098384641110897, 'timestamp': '2025-09-30 22:16:44.991855', 'step': 2749, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:45.024391', 'step': 2749, 'epoch': 2} {'type': 'loss', 'content': 0.0010082995286211371, 'timestamp': '2025-09-30 22:16:45.028924', 'step': 2750, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:45.066802', 'step': 2750, 'epoch': 2} {'type': 'loss', 'content': 0.006948740687221289, 'timestamp': '2025-09-30 22:16:45.074810', 'step': 2751, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:45.111703', 'step': 2751, 'epoch': 2} {'type': 'loss', 'content': 0.011320582590997219, 'timestamp': '2025-09-30 22:16:45.137127', 'step': 2752, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:45.171102', 'step': 2752, 'epoch': 2} {'type': 'loss', 'content': 0.014880691654980183, 'timestamp': '2025-09-30 22:16:45.183070', 'step': 2753, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:45.224525', 'step': 2753, 'epoch': 2} {'type': 'loss', 'content': 0.0175726730376482, 'timestamp': '2025-09-30 22:16:45.232156', 'step': 2754, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:45.265789', 'step': 2754, 'epoch': 2} {'type': 'loss', 'content': 0.010096393525600433, 'timestamp': '2025-09-30 22:16:45.273879', 'step': 2755, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:45.306603', 'step': 2755, 'epoch': 2} {'type': 'loss', 'content': 0.009255954064428806, 'timestamp': '2025-09-30 22:16:45.337975', 'step': 2756, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:45.377046', 'step': 2756, 'epoch': 2} {'type': 'loss', 'content': 0.010803707875311375, 'timestamp': '2025-09-30 22:16:45.385053', 'step': 2757, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:45.428407', 'step': 2757, 'epoch': 2} {'type': 'loss', 'content': 0.00232686847448349, 'timestamp': '2025-09-30 22:16:45.436196', 'step': 2758, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:16:45.483115', 'step': 2758, 'epoch': 2} {'type': 'loss', 'content': 0.007045588456094265, 'timestamp': '2025-09-30 22:16:45.500864', 'step': 2759, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:45.535977', 'step': 2759, 'epoch': 2} {'type': 'loss', 'content': 0.014438306912779808, 'timestamp': '2025-09-30 22:16:45.569350', 'step': 2760, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:16:48.138642', 'step': 2760, 'epoch': 2} {'type': 'pplx', 'content': 5.673683070746086, 'timestamp': '2025-09-30 22:16:48.141952', 'step': 2760, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:48.181029', 'step': 2760, 'epoch': 2} {'type': 'loss', 'content': 0.010565848089754581, 'timestamp': '2025-09-30 22:16:48.193572', 'step': 2761, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:48.228837', 'step': 2761, 'epoch': 2} {'type': 'loss', 'content': 0.004915672354400158, 'timestamp': '2025-09-30 22:16:48.238782', 'step': 2762, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:48.280729', 'step': 2762, 'epoch': 2} {'type': 'loss', 'content': 0.006179687101393938, 'timestamp': '2025-09-30 22:16:48.290965', 'step': 2763, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:48.328515', 'step': 2763, 'epoch': 2} {'type': 'loss', 'content': 0.005078598856925964, 'timestamp': '2025-09-30 22:16:48.357280', 'step': 2764, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:48.395005', 'step': 2764, 'epoch': 2} {'type': 'loss', 'content': 0.009708116762340069, 'timestamp': '2025-09-30 22:16:48.402867', 'step': 2765, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:48.438841', 'step': 2765, 'epoch': 2} {'type': 'loss', 'content': 0.0027474777307361364, 'timestamp': '2025-09-30 22:16:48.451094', 'step': 2766, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:16:48.500859', 'step': 2766, 'epoch': 2} {'type': 'loss', 'content': 0.003014597110450268, 'timestamp': '2025-09-30 22:16:48.517972', 'step': 2767, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:48.559087', 'step': 2767, 'epoch': 2} {'type': 'loss', 'content': 0.003560777520760894, 'timestamp': '2025-09-30 22:16:48.590302', 'step': 2768, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:48.629071', 'step': 2768, 'epoch': 2} {'type': 'loss', 'content': 0.0011912431800737977, 'timestamp': '2025-09-30 22:16:48.634146', 'step': 2769, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:48.678507', 'step': 2769, 'epoch': 2} {'type': 'loss', 'content': 0.0011664634803310037, 'timestamp': '2025-09-30 22:16:48.685533', 'step': 2770, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:48.719682', 'step': 2770, 'epoch': 2} {'type': 'loss', 'content': 0.004645258653908968, 'timestamp': '2025-09-30 22:16:48.729838', 'step': 2771, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:48.762868', 'step': 2771, 'epoch': 2} {'type': 'loss', 'content': 0.001425820984877646, 'timestamp': '2025-09-30 22:16:48.789089', 'step': 2772, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:48.833344', 'step': 2772, 'epoch': 2} {'type': 'loss', 'content': 0.0033654433209449053, 'timestamp': '2025-09-30 22:16:48.843132', 'step': 2773, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:48.880928', 'step': 2773, 'epoch': 2} {'type': 'loss', 'content': 0.010003729723393917, 'timestamp': '2025-09-30 22:16:48.894800', 'step': 2774, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:48.937793', 'step': 2774, 'epoch': 2} {'type': 'loss', 'content': 0.0008842459646984935, 'timestamp': '2025-09-30 22:16:48.949233', 'step': 2775, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:48.993302', 'step': 2775, 'epoch': 2} {'type': 'loss', 'content': 0.0053793699480593204, 'timestamp': '2025-09-30 22:16:49.029923', 'step': 2776, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:49.072616', 'step': 2776, 'epoch': 2} {'type': 'loss', 'content': 0.006544886156916618, 'timestamp': '2025-09-30 22:16:49.083218', 'step': 2777, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:49.124103', 'step': 2777, 'epoch': 2} {'type': 'loss', 'content': 0.01111549325287342, 'timestamp': '2025-09-30 22:16:49.128581', 'step': 2778, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:49.161644', 'step': 2778, 'epoch': 2} {'type': 'loss', 'content': 0.004963804967701435, 'timestamp': '2025-09-30 22:16:49.168635', 'step': 2779, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:49.215963', 'step': 2779, 'epoch': 2} {'type': 'loss', 'content': 0.005580283235758543, 'timestamp': '2025-09-30 22:16:49.250160', 'step': 2780, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:16:49.292602', 'step': 2780, 'epoch': 2} {'type': 'loss', 'content': 0.006849126424640417, 'timestamp': '2025-09-30 22:16:49.307978', 'step': 2781, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:49.344740', 'step': 2781, 'epoch': 2} {'type': 'loss', 'content': 0.005620031151920557, 'timestamp': '2025-09-30 22:16:49.355233', 'step': 2782, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:49.398813', 'step': 2782, 'epoch': 2} {'type': 'loss', 'content': 0.015408042818307877, 'timestamp': '2025-09-30 22:16:49.412561', 'step': 2783, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:16:49.467169', 'step': 2783, 'epoch': 2} {'type': 'loss', 'content': 0.006662317551672459, 'timestamp': '2025-09-30 22:16:49.505160', 'step': 2784, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:49.552554', 'step': 2784, 'epoch': 2} {'type': 'loss', 'content': 0.003204222535714507, 'timestamp': '2025-09-30 22:16:49.562519', 'step': 2785, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:49.597206', 'step': 2785, 'epoch': 2} {'type': 'loss', 'content': 0.007308874279260635, 'timestamp': '2025-09-30 22:16:49.607631', 'step': 2786, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:49.647068', 'step': 2786, 'epoch': 2} {'type': 'loss', 'content': 0.007479496765881777, 'timestamp': '2025-09-30 22:16:49.660798', 'step': 2787, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:49.702343', 'step': 2787, 'epoch': 2} {'type': 'loss', 'content': 0.008679402060806751, 'timestamp': '2025-09-30 22:16:49.733617', 'step': 2788, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:49.774691', 'step': 2788, 'epoch': 2} {'type': 'loss', 'content': 0.007373732049018145, 'timestamp': '2025-09-30 22:16:49.782813', 'step': 2789, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:16:49.830177', 'step': 2789, 'epoch': 2} {'type': 'loss', 'content': 0.006688355002552271, 'timestamp': '2025-09-30 22:16:49.845837', 'step': 2790, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:49.879203', 'step': 2790, 'epoch': 2} {'type': 'loss', 'content': 0.006614117417484522, 'timestamp': '2025-09-30 22:16:49.891803', 'step': 2791, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:49.928934', 'step': 2791, 'epoch': 2} {'type': 'loss', 'content': 0.009344483725726604, 'timestamp': '2025-09-30 22:16:49.963487', 'step': 2792, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:49.996062', 'step': 2792, 'epoch': 2} {'type': 'loss', 'content': 0.00403931550681591, 'timestamp': '2025-09-30 22:16:50.005918', 'step': 2793, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:50.042762', 'step': 2793, 'epoch': 2} {'type': 'loss', 'content': 0.005787648260593414, 'timestamp': '2025-09-30 22:16:50.050357', 'step': 2794, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:50.100953', 'step': 2794, 'epoch': 2} {'type': 'loss', 'content': 0.006187926512211561, 'timestamp': '2025-09-30 22:16:50.108506', 'step': 2795, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:50.147094', 'step': 2795, 'epoch': 2} {'type': 'loss', 'content': 0.011713352054357529, 'timestamp': '2025-09-30 22:16:50.174792', 'step': 2796, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:50.207787', 'step': 2796, 'epoch': 2} {'type': 'loss', 'content': 0.007518185302615166, 'timestamp': '2025-09-30 22:16:50.213552', 'step': 2797, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:50.252277', 'step': 2797, 'epoch': 2} {'type': 'loss', 'content': 0.008915386162698269, 'timestamp': '2025-09-30 22:16:50.263468', 'step': 2798, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:50.304363', 'step': 2798, 'epoch': 2} {'type': 'loss', 'content': 0.008104098960757256, 'timestamp': '2025-09-30 22:16:50.312359', 'step': 2799, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:50.345595', 'step': 2799, 'epoch': 2} {'type': 'loss', 'content': 0.006924333982169628, 'timestamp': '2025-09-30 22:16:50.377643', 'step': 2800, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:50.410768', 'step': 2800, 'epoch': 2} {'type': 'loss', 'content': 0.003365806769579649, 'timestamp': '2025-09-30 22:16:50.421134', 'step': 2801, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:50.454439', 'step': 2801, 'epoch': 2} {'type': 'loss', 'content': 0.0023598482366651297, 'timestamp': '2025-09-30 22:16:50.465344', 'step': 2802, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:50.508284', 'step': 2802, 'epoch': 2} {'type': 'loss', 'content': 0.013317740522325039, 'timestamp': '2025-09-30 22:16:50.520802', 'step': 2803, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:50.562655', 'step': 2803, 'epoch': 2} {'type': 'loss', 'content': 0.008947531692683697, 'timestamp': '2025-09-30 22:16:50.591680', 'step': 2804, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:50.630296', 'step': 2804, 'epoch': 2} {'type': 'loss', 'content': 0.01369408518075943, 'timestamp': '2025-09-30 22:16:50.638901', 'step': 2805, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:50.673020', 'step': 2805, 'epoch': 2} {'type': 'loss', 'content': 0.004378673620522022, 'timestamp': '2025-09-30 22:16:50.680265', 'step': 2806, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:50.713023', 'step': 2806, 'epoch': 2} {'type': 'loss', 'content': 0.007849207147955894, 'timestamp': '2025-09-30 22:16:50.723981', 'step': 2807, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:50.765841', 'step': 2807, 'epoch': 2} {'type': 'loss', 'content': 0.0012026155600324273, 'timestamp': '2025-09-30 22:16:50.793758', 'step': 2808, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:50.854320', 'step': 2808, 'epoch': 2} {'type': 'loss', 'content': 0.006514945533126593, 'timestamp': '2025-09-30 22:16:50.867724', 'step': 2809, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:50.906035', 'step': 2809, 'epoch': 2} {'type': 'loss', 'content': 0.01418620627373457, 'timestamp': '2025-09-30 22:16:50.914031', 'step': 2810, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:50.960182', 'step': 2810, 'epoch': 2} {'type': 'loss', 'content': 0.006249403115361929, 'timestamp': '2025-09-30 22:16:50.966965', 'step': 2811, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:51.006078', 'step': 2811, 'epoch': 2} {'type': 'loss', 'content': 0.004195347428321838, 'timestamp': '2025-09-30 22:16:51.040709', 'step': 2812, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:51.076931', 'step': 2812, 'epoch': 2} {'type': 'loss', 'content': 0.005581292789429426, 'timestamp': '2025-09-30 22:16:51.089483', 'step': 2813, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:51.123525', 'step': 2813, 'epoch': 2} {'type': 'loss', 'content': 0.009607982821762562, 'timestamp': '2025-09-30 22:16:51.135795', 'step': 2814, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:51.173791', 'step': 2814, 'epoch': 2} {'type': 'loss', 'content': 0.009186118841171265, 'timestamp': '2025-09-30 22:16:51.186316', 'step': 2815, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:51.239936', 'step': 2815, 'epoch': 2} {'type': 'loss', 'content': 0.012692922726273537, 'timestamp': '2025-09-30 22:16:51.274571', 'step': 2816, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 15662185694400}, 'timestamp': '2025-09-30 22:16:51.328584', 'step': 2816, 'epoch': 2} {'type': 'loss', 'content': 0.00466757919639349, 'timestamp': '2025-09-30 22:16:51.347843', 'step': 2817, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:16:51.409137', 'step': 2817, 'epoch': 2} {'type': 'loss', 'content': 0.01156692486256361, 'timestamp': '2025-09-30 22:16:51.424766', 'step': 2818, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:51.480145', 'step': 2818, 'epoch': 2} {'type': 'loss', 'content': 0.013968187384307384, 'timestamp': '2025-09-30 22:16:51.491311', 'step': 2819, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:51.532773', 'step': 2819, 'epoch': 2} {'type': 'loss', 'content': 0.009603479877114296, 'timestamp': '2025-09-30 22:16:51.567375', 'step': 2820, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:51.603373', 'step': 2820, 'epoch': 2} {'type': 'loss', 'content': 0.007417512126266956, 'timestamp': '2025-09-30 22:16:51.609078', 'step': 2821, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:51.644646', 'step': 2821, 'epoch': 2} {'type': 'loss', 'content': 0.014802886173129082, 'timestamp': '2025-09-30 22:16:51.657202', 'step': 2822, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:16:51.696743', 'step': 2822, 'epoch': 2} {'type': 'loss', 'content': 0.0025944069493561983, 'timestamp': '2025-09-30 22:16:51.712350', 'step': 2823, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:51.750045', 'step': 2823, 'epoch': 2} {'type': 'loss', 'content': 0.00613754615187645, 'timestamp': '2025-09-30 22:16:51.784776', 'step': 2824, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:51.830355', 'step': 2824, 'epoch': 2} {'type': 'loss', 'content': 0.007381127215921879, 'timestamp': '2025-09-30 22:16:51.838998', 'step': 2825, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:51.884570', 'step': 2825, 'epoch': 2} {'type': 'loss', 'content': 0.0035370741970837116, 'timestamp': '2025-09-30 22:16:51.897885', 'step': 2826, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:51.934683', 'step': 2826, 'epoch': 2} {'type': 'loss', 'content': 0.002247849479317665, 'timestamp': '2025-09-30 22:16:51.941913', 'step': 2827, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:51.977097', 'step': 2827, 'epoch': 2} {'type': 'loss', 'content': 0.002631678944453597, 'timestamp': '2025-09-30 22:16:52.008354', 'step': 2828, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:52.045211', 'step': 2828, 'epoch': 2} {'type': 'loss', 'content': 0.005599735304713249, 'timestamp': '2025-09-30 22:16:52.055560', 'step': 2829, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:52.089528', 'step': 2829, 'epoch': 2} {'type': 'loss', 'content': 0.0031780179124325514, 'timestamp': '2025-09-30 22:16:52.096676', 'step': 2830, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:52.134997', 'step': 2830, 'epoch': 2} {'type': 'loss', 'content': 0.008361829444766045, 'timestamp': '2025-09-30 22:16:52.145976', 'step': 2831, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-09-30 22:16:52.203847', 'step': 2831, 'epoch': 2} {'type': 'loss', 'content': 0.00743240537121892, 'timestamp': '2025-09-30 22:16:52.243651', 'step': 2832, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:52.278318', 'step': 2832, 'epoch': 2} {'type': 'loss', 'content': 0.007705148309469223, 'timestamp': '2025-09-30 22:16:52.287065', 'step': 2833, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:52.325827', 'step': 2833, 'epoch': 2} {'type': 'loss', 'content': 0.00037418119609355927, 'timestamp': '2025-09-30 22:16:52.332946', 'step': 2834, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:16:52.380514', 'step': 2834, 'epoch': 2} {'type': 'loss', 'content': 0.004442401695996523, 'timestamp': '2025-09-30 22:16:52.396807', 'step': 2835, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:52.431209', 'step': 2835, 'epoch': 2} {'type': 'loss', 'content': 0.007556099444627762, 'timestamp': '2025-09-30 22:16:52.460105', 'step': 2836, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:52.495274', 'step': 2836, 'epoch': 2} {'type': 'loss', 'content': 0.013456666842103004, 'timestamp': '2025-09-30 22:16:52.505104', 'step': 2837, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:52.539341', 'step': 2837, 'epoch': 2} {'type': 'loss', 'content': 0.002586257178336382, 'timestamp': '2025-09-30 22:16:52.547266', 'step': 2838, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:52.582757', 'step': 2838, 'epoch': 2} {'type': 'loss', 'content': 0.005766110494732857, 'timestamp': '2025-09-30 22:16:52.590688', 'step': 2839, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:52.631907', 'step': 2839, 'epoch': 2} {'type': 'loss', 'content': 0.008680349215865135, 'timestamp': '2025-09-30 22:16:52.666132', 'step': 2840, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:52.703529', 'step': 2840, 'epoch': 2} {'type': 'loss', 'content': 0.0029739767778664827, 'timestamp': '2025-09-30 22:16:52.708715', 'step': 2841, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:52.743850', 'step': 2841, 'epoch': 2} {'type': 'loss', 'content': 0.006776679772883654, 'timestamp': '2025-09-30 22:16:52.751084', 'step': 2842, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:52.790356', 'step': 2842, 'epoch': 2} {'type': 'loss', 'content': 0.008047245442867279, 'timestamp': '2025-09-30 22:16:52.797905', 'step': 2843, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:52.833168', 'step': 2843, 'epoch': 2} {'type': 'loss', 'content': 0.006306602619588375, 'timestamp': '2025-09-30 22:16:52.865094', 'step': 2844, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:52.904289', 'step': 2844, 'epoch': 2} {'type': 'loss', 'content': 0.006988802924752235, 'timestamp': '2025-09-30 22:16:52.909335', 'step': 2845, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:52.941392', 'step': 2845, 'epoch': 2} {'type': 'loss', 'content': 0.004913115408271551, 'timestamp': '2025-09-30 22:16:52.952363', 'step': 2846, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:16:52.985841', 'step': 2846, 'epoch': 2} {'type': 'loss', 'content': 0.009375456720590591, 'timestamp': '2025-09-30 22:16:52.990031', 'step': 2847, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:53.026553', 'step': 2847, 'epoch': 2} {'type': 'loss', 'content': 0.005251064896583557, 'timestamp': '2025-09-30 22:16:53.058448', 'step': 2848, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:53.097816', 'step': 2848, 'epoch': 2} {'type': 'loss', 'content': 0.0016738343983888626, 'timestamp': '2025-09-30 22:16:53.107436', 'step': 2849, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:53.148951', 'step': 2849, 'epoch': 2} {'type': 'loss', 'content': 0.004856899380683899, 'timestamp': '2025-09-30 22:16:53.153531', 'step': 2850, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:53.195365', 'step': 2850, 'epoch': 2} {'type': 'loss', 'content': 0.008596754632890224, 'timestamp': '2025-09-30 22:16:53.205735', 'step': 2851, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:53.249826', 'step': 2851, 'epoch': 2} {'type': 'loss', 'content': 0.004325315356254578, 'timestamp': '2025-09-30 22:16:53.284568', 'step': 2852, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:53.324072', 'step': 2852, 'epoch': 2} {'type': 'loss', 'content': 0.006976179778575897, 'timestamp': '2025-09-30 22:16:53.337215', 'step': 2853, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:53.378544', 'step': 2853, 'epoch': 2} {'type': 'loss', 'content': 0.009582379832863808, 'timestamp': '2025-09-30 22:16:53.385685', 'step': 2854, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:53.423852', 'step': 2854, 'epoch': 2} {'type': 'loss', 'content': 0.011023230850696564, 'timestamp': '2025-09-30 22:16:53.431326', 'step': 2855, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:53.468771', 'step': 2855, 'epoch': 2} {'type': 'loss', 'content': 0.011062287725508213, 'timestamp': '2025-09-30 22:16:53.500658', 'step': 2856, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:53.539061', 'step': 2856, 'epoch': 2} {'type': 'loss', 'content': 0.003815129864960909, 'timestamp': '2025-09-30 22:16:53.549008', 'step': 2857, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:53.586736', 'step': 2857, 'epoch': 2} {'type': 'loss', 'content': 0.01672511361539364, 'timestamp': '2025-09-30 22:16:53.597937', 'step': 2858, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:53.644944', 'step': 2858, 'epoch': 2} {'type': 'loss', 'content': 0.006538551300764084, 'timestamp': '2025-09-30 22:16:53.658603', 'step': 2859, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:53.695658', 'step': 2859, 'epoch': 2} {'type': 'loss', 'content': 0.011332187801599503, 'timestamp': '2025-09-30 22:16:53.727648', 'step': 2860, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:53.764561', 'step': 2860, 'epoch': 2} {'type': 'loss', 'content': 0.008668245747685432, 'timestamp': '2025-09-30 22:16:53.769259', 'step': 2861, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:53.807005', 'step': 2861, 'epoch': 2} {'type': 'loss', 'content': 0.003763214685022831, 'timestamp': '2025-09-30 22:16:53.814724', 'step': 2862, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:53.859753', 'step': 2862, 'epoch': 2} {'type': 'loss', 'content': 0.006051691249012947, 'timestamp': '2025-09-30 22:16:53.873773', 'step': 2863, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 17560600598464}, 'timestamp': '2025-09-30 22:16:53.930189', 'step': 2863, 'epoch': 2} {'type': 'loss', 'content': 0.004667391534894705, 'timestamp': '2025-09-30 22:16:53.972181', 'step': 2864, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:54.012962', 'step': 2864, 'epoch': 2} {'type': 'loss', 'content': 0.006028117146342993, 'timestamp': '2025-09-30 22:16:54.020413', 'step': 2865, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:54.062104', 'step': 2865, 'epoch': 2} {'type': 'loss', 'content': 0.002721439115703106, 'timestamp': '2025-09-30 22:16:54.075795', 'step': 2866, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:54.116836', 'step': 2866, 'epoch': 2} {'type': 'loss', 'content': 0.004800109192728996, 'timestamp': '2025-09-30 22:16:54.124498', 'step': 2867, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:54.168196', 'step': 2867, 'epoch': 2} {'type': 'loss', 'content': 0.0033718144986778498, 'timestamp': '2025-09-30 22:16:54.199282', 'step': 2868, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:54.237596', 'step': 2868, 'epoch': 2} {'type': 'loss', 'content': 0.004756898153573275, 'timestamp': '2025-09-30 22:16:54.246365', 'step': 2869, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:54.286563', 'step': 2869, 'epoch': 2} {'type': 'loss', 'content': 0.0047842999920248985, 'timestamp': '2025-09-30 22:16:54.294314', 'step': 2870, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:16:54.332759', 'step': 2870, 'epoch': 2} {'type': 'loss', 'content': 0.00872610229998827, 'timestamp': '2025-09-30 22:16:54.339988', 'step': 2871, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:54.388513', 'step': 2871, 'epoch': 2} {'type': 'loss', 'content': 0.016180802136659622, 'timestamp': '2025-09-30 22:16:54.421870', 'step': 2872, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:54.457339', 'step': 2872, 'epoch': 2} {'type': 'loss', 'content': 0.001919434405863285, 'timestamp': '2025-09-30 22:16:54.465984', 'step': 2873, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:54.515398', 'step': 2873, 'epoch': 2} {'type': 'loss', 'content': 0.002931988565251231, 'timestamp': '2025-09-30 22:16:54.527958', 'step': 2874, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:54.584633', 'step': 2874, 'epoch': 2} {'type': 'loss', 'content': 0.008017763495445251, 'timestamp': '2025-09-30 22:16:54.592343', 'step': 2875, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:16:57.286752', 'step': 2875, 'epoch': 2} {'type': 'pplx', 'content': 5.7056885871454455, 'timestamp': '2025-09-30 22:16:57.291891', 'step': 2875, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:57.338353', 'step': 2875, 'epoch': 2} {'type': 'loss', 'content': 0.015178192406892776, 'timestamp': '2025-09-30 22:16:57.368316', 'step': 2876, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:16:57.410586', 'step': 2876, 'epoch': 2} {'type': 'loss', 'content': 0.006266669370234013, 'timestamp': '2025-09-30 22:16:57.415436', 'step': 2877, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:57.452630', 'step': 2877, 'epoch': 2} {'type': 'loss', 'content': 0.005694805644452572, 'timestamp': '2025-09-30 22:16:57.460218', 'step': 2878, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:57.512211', 'step': 2878, 'epoch': 2} {'type': 'loss', 'content': 0.005040735471993685, 'timestamp': '2025-09-30 22:16:57.525576', 'step': 2879, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:57.561645', 'step': 2879, 'epoch': 2} {'type': 'loss', 'content': 0.010561853647232056, 'timestamp': '2025-09-30 22:16:57.590981', 'step': 2880, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:57.639185', 'step': 2880, 'epoch': 2} {'type': 'loss', 'content': 0.008320540189743042, 'timestamp': '2025-09-30 22:16:57.647173', 'step': 2881, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:57.682937', 'step': 2881, 'epoch': 2} {'type': 'loss', 'content': 0.009163618087768555, 'timestamp': '2025-09-30 22:16:57.691033', 'step': 2882, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:16:57.728859', 'step': 2882, 'epoch': 2} {'type': 'loss', 'content': 0.007036111783236265, 'timestamp': '2025-09-30 22:16:57.740992', 'step': 2883, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:57.789547', 'step': 2883, 'epoch': 2} {'type': 'loss', 'content': 0.008511088788509369, 'timestamp': '2025-09-30 22:16:57.823742', 'step': 2884, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:57.863258', 'step': 2884, 'epoch': 2} {'type': 'loss', 'content': 0.0076420544646680355, 'timestamp': '2025-09-30 22:16:57.868726', 'step': 2885, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:57.921045', 'step': 2885, 'epoch': 2} {'type': 'loss', 'content': 0.0072248405776917934, 'timestamp': '2025-09-30 22:16:57.929032', 'step': 2886, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:57.980423', 'step': 2886, 'epoch': 2} {'type': 'loss', 'content': 0.00268104812130332, 'timestamp': '2025-09-30 22:16:57.988332', 'step': 2887, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:58.023808', 'step': 2887, 'epoch': 2} {'type': 'loss', 'content': 0.005942836403846741, 'timestamp': '2025-09-30 22:16:58.057197', 'step': 2888, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:16:58.103611', 'step': 2888, 'epoch': 2} {'type': 'loss', 'content': 0.006358643528074026, 'timestamp': '2025-09-30 22:16:58.122152', 'step': 2889, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:58.172186', 'step': 2889, 'epoch': 2} {'type': 'loss', 'content': 0.009358215145766735, 'timestamp': '2025-09-30 22:16:58.185865', 'step': 2890, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:58.233833', 'step': 2890, 'epoch': 2} {'type': 'loss', 'content': 0.01006359700113535, 'timestamp': '2025-09-30 22:16:58.240747', 'step': 2891, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:16:58.292537', 'step': 2891, 'epoch': 2} {'type': 'loss', 'content': 0.00435235258191824, 'timestamp': '2025-09-30 22:16:58.329666', 'step': 2892, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:58.375789', 'step': 2892, 'epoch': 2} {'type': 'loss', 'content': 0.005403323099017143, 'timestamp': '2025-09-30 22:16:58.388410', 'step': 2893, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:16:58.434296', 'step': 2893, 'epoch': 2} {'type': 'loss', 'content': 0.0034346147440373898, 'timestamp': '2025-09-30 22:16:58.448048', 'step': 2894, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:58.507041', 'step': 2894, 'epoch': 2} {'type': 'loss', 'content': 0.009157082065939903, 'timestamp': '2025-09-30 22:16:58.515607', 'step': 2895, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:16:58.571706', 'step': 2895, 'epoch': 2} {'type': 'loss', 'content': 0.007458867039531469, 'timestamp': '2025-09-30 22:16:58.606492', 'step': 2896, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:58.642866', 'step': 2896, 'epoch': 2} {'type': 'loss', 'content': 0.005464588291943073, 'timestamp': '2025-09-30 22:16:58.648022', 'step': 2897, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:58.710283', 'step': 2897, 'epoch': 2} {'type': 'loss', 'content': 0.0011974646477028728, 'timestamp': '2025-09-30 22:16:58.720033', 'step': 2898, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:58.761397', 'step': 2898, 'epoch': 2} {'type': 'loss', 'content': 0.008007212541997433, 'timestamp': '2025-09-30 22:16:58.771689', 'step': 2899, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:58.825698', 'step': 2899, 'epoch': 2} {'type': 'loss', 'content': 0.0035000897478312254, 'timestamp': '2025-09-30 22:16:58.859083', 'step': 2900, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:16:58.917655', 'step': 2900, 'epoch': 2} {'type': 'loss', 'content': 0.004464174620807171, 'timestamp': '2025-09-30 22:16:58.933548', 'step': 2901, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:58.971567', 'step': 2901, 'epoch': 2} {'type': 'loss', 'content': 0.007408112287521362, 'timestamp': '2025-09-30 22:16:58.982043', 'step': 2902, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:59.031842', 'step': 2902, 'epoch': 2} {'type': 'loss', 'content': 0.006896634586155415, 'timestamp': '2025-09-30 22:16:59.039975', 'step': 2903, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:59.077582', 'step': 2903, 'epoch': 2} {'type': 'loss', 'content': 0.0008722843485884368, 'timestamp': '2025-09-30 22:16:59.108384', 'step': 2904, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:16:59.150699', 'step': 2904, 'epoch': 2} {'type': 'loss', 'content': 0.007032486144453287, 'timestamp': '2025-09-30 22:16:59.163845', 'step': 2905, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:59.212817', 'step': 2905, 'epoch': 2} {'type': 'loss', 'content': 0.003517316887155175, 'timestamp': '2025-09-30 22:16:59.219875', 'step': 2906, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:59.271295', 'step': 2906, 'epoch': 2} {'type': 'loss', 'content': 0.013583201915025711, 'timestamp': '2025-09-30 22:16:59.280244', 'step': 2907, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:16:59.327283', 'step': 2907, 'epoch': 2} {'type': 'loss', 'content': 0.0067641520872712135, 'timestamp': '2025-09-30 22:16:59.359169', 'step': 2908, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:16:59.393355', 'step': 2908, 'epoch': 2} {'type': 'loss', 'content': 0.004752178210765123, 'timestamp': '2025-09-30 22:16:59.398978', 'step': 2909, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:59.452003', 'step': 2909, 'epoch': 2} {'type': 'loss', 'content': 0.014300595037639141, 'timestamp': '2025-09-30 22:16:59.462179', 'step': 2910, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:59.503956', 'step': 2910, 'epoch': 2} {'type': 'loss', 'content': 0.012174049392342567, 'timestamp': '2025-09-30 22:16:59.508402', 'step': 2911, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:16:59.559516', 'step': 2911, 'epoch': 2} {'type': 'loss', 'content': 0.004016537219285965, 'timestamp': '2025-09-30 22:16:59.593760', 'step': 2912, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:59.627343', 'step': 2912, 'epoch': 2} {'type': 'loss', 'content': 0.009380542673170567, 'timestamp': '2025-09-30 22:16:59.635240', 'step': 2913, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:16:59.668597', 'step': 2913, 'epoch': 2} {'type': 'loss', 'content': 0.003296236041933298, 'timestamp': '2025-09-30 22:16:59.676313', 'step': 2914, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:59.712346', 'step': 2914, 'epoch': 2} {'type': 'loss', 'content': 0.003966023214161396, 'timestamp': '2025-09-30 22:16:59.722721', 'step': 2915, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:16:59.767432', 'step': 2915, 'epoch': 2} {'type': 'loss', 'content': 0.006834966130554676, 'timestamp': '2025-09-30 22:16:59.798655', 'step': 2916, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:16:59.835360', 'step': 2916, 'epoch': 2} {'type': 'loss', 'content': 0.010554447770118713, 'timestamp': '2025-09-30 22:16:59.846419', 'step': 2917, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:16:59.892493', 'step': 2917, 'epoch': 2} {'type': 'loss', 'content': 0.008746454492211342, 'timestamp': '2025-09-30 22:16:59.899471', 'step': 2918, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:16:59.942401', 'step': 2918, 'epoch': 2} {'type': 'loss', 'content': 0.005904481280595064, 'timestamp': '2025-09-30 22:16:59.951949', 'step': 2919, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:16:59.993798', 'step': 2919, 'epoch': 2} {'type': 'loss', 'content': 0.005135437939316034, 'timestamp': '2025-09-30 22:17:00.027248', 'step': 2920, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:17:00.069343', 'step': 2920, 'epoch': 2} {'type': 'loss', 'content': 0.007645525969564915, 'timestamp': '2025-09-30 22:17:00.080444', 'step': 2921, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:17:00.124087', 'step': 2921, 'epoch': 2} {'type': 'loss', 'content': 0.005756744183599949, 'timestamp': '2025-09-30 22:17:00.128638', 'step': 2922, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:17:00.188589', 'step': 2922, 'epoch': 2} {'type': 'loss', 'content': 0.0007258382975123823, 'timestamp': '2025-09-30 22:17:00.193107', 'step': 2923, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:00.239867', 'step': 2923, 'epoch': 2} {'type': 'loss', 'content': 0.00860053114593029, 'timestamp': '2025-09-30 22:17:00.268643', 'step': 2924, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:00.330462', 'step': 2924, 'epoch': 2} {'type': 'loss', 'content': 0.0024887947365641594, 'timestamp': '2025-09-30 22:17:00.339254', 'step': 2925, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:00.406045', 'step': 2925, 'epoch': 2} {'type': 'loss', 'content': 0.009777331724762917, 'timestamp': '2025-09-30 22:17:00.415168', 'step': 2926, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:00.459996', 'step': 2926, 'epoch': 2} {'type': 'loss', 'content': 0.0025563391391187906, 'timestamp': '2025-09-30 22:17:00.467594', 'step': 2927, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:00.501644', 'step': 2927, 'epoch': 2} {'type': 'loss', 'content': 0.009383581578731537, 'timestamp': '2025-09-30 22:17:00.529621', 'step': 2928, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:00.564929', 'step': 2928, 'epoch': 2} {'type': 'loss', 'content': 0.007304985076189041, 'timestamp': '2025-09-30 22:17:00.569548', 'step': 2929, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:00.612284', 'step': 2929, 'epoch': 2} {'type': 'loss', 'content': 0.004353808239102364, 'timestamp': '2025-09-30 22:17:00.623358', 'step': 2930, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:00.658178', 'step': 2930, 'epoch': 2} {'type': 'loss', 'content': 0.008152646012604237, 'timestamp': '2025-09-30 22:17:00.670293', 'step': 2931, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:17:00.715731', 'step': 2931, 'epoch': 2} {'type': 'loss', 'content': 0.012269679456949234, 'timestamp': '2025-09-30 22:17:00.750604', 'step': 2932, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:00.800482', 'step': 2932, 'epoch': 2} {'type': 'loss', 'content': 0.004335030913352966, 'timestamp': '2025-09-30 22:17:00.812364', 'step': 2933, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:00.858823', 'step': 2933, 'epoch': 2} {'type': 'loss', 'content': 0.007889092899858952, 'timestamp': '2025-09-30 22:17:00.869720', 'step': 2934, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:00.913045', 'step': 2934, 'epoch': 2} {'type': 'loss', 'content': 0.005649187136441469, 'timestamp': '2025-09-30 22:17:00.923362', 'step': 2935, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:00.968089', 'step': 2935, 'epoch': 2} {'type': 'loss', 'content': 0.0028243919368833303, 'timestamp': '2025-09-30 22:17:00.996142', 'step': 2936, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:17:01.052426', 'step': 2936, 'epoch': 2} {'type': 'loss', 'content': 0.00842214748263359, 'timestamp': '2025-09-30 22:17:01.067569', 'step': 2937, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:01.122236', 'step': 2937, 'epoch': 2} {'type': 'loss', 'content': 0.007733777165412903, 'timestamp': '2025-09-30 22:17:01.136017', 'step': 2938, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:01.187347', 'step': 2938, 'epoch': 2} {'type': 'loss', 'content': 0.015714704990386963, 'timestamp': '2025-09-30 22:17:01.197545', 'step': 2939, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:01.242699', 'step': 2939, 'epoch': 2} {'type': 'loss', 'content': 0.011648855172097683, 'timestamp': '2025-09-30 22:17:01.272540', 'step': 2940, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:01.312034', 'step': 2940, 'epoch': 2} {'type': 'loss', 'content': 0.00478572491556406, 'timestamp': '2025-09-30 22:17:01.321000', 'step': 2941, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:01.359405', 'step': 2941, 'epoch': 2} {'type': 'loss', 'content': 0.00669105863198638, 'timestamp': '2025-09-30 22:17:01.366990', 'step': 2942, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:17:01.417359', 'step': 2942, 'epoch': 2} {'type': 'loss', 'content': 0.004550726152956486, 'timestamp': '2025-09-30 22:17:01.431375', 'step': 2943, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:01.472127', 'step': 2943, 'epoch': 2} {'type': 'loss', 'content': 0.005055215209722519, 'timestamp': '2025-09-30 22:17:01.503002', 'step': 2944, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:17:01.561723', 'step': 2944, 'epoch': 2} {'type': 'loss', 'content': 0.008799183182418346, 'timestamp': '2025-09-30 22:17:01.578688', 'step': 2945, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:01.625798', 'step': 2945, 'epoch': 2} {'type': 'loss', 'content': 0.005056922324001789, 'timestamp': '2025-09-30 22:17:01.633718', 'step': 2946, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:01.671199', 'step': 2946, 'epoch': 2} {'type': 'loss', 'content': 0.007015272043645382, 'timestamp': '2025-09-30 22:17:01.685046', 'step': 2947, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:01.730298', 'step': 2947, 'epoch': 2} {'type': 'loss', 'content': 0.008410302922129631, 'timestamp': '2025-09-30 22:17:01.760127', 'step': 2948, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:01.806250', 'step': 2948, 'epoch': 2} {'type': 'loss', 'content': 0.013081254437565804, 'timestamp': '2025-09-30 22:17:01.811840', 'step': 2949, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:01.853331', 'step': 2949, 'epoch': 2} {'type': 'loss', 'content': 0.005661000497639179, 'timestamp': '2025-09-30 22:17:01.860560', 'step': 2950, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:01.909311', 'step': 2950, 'epoch': 2} {'type': 'loss', 'content': 0.009392441250383854, 'timestamp': '2025-09-30 22:17:01.921473', 'step': 2951, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:01.959774', 'step': 2951, 'epoch': 2} {'type': 'loss', 'content': 0.002402786398306489, 'timestamp': '2025-09-30 22:17:01.987503', 'step': 2952, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:02.021233', 'step': 2952, 'epoch': 2} {'type': 'loss', 'content': 0.005488426424562931, 'timestamp': '2025-09-30 22:17:02.026486', 'step': 2953, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:02.060254', 'step': 2953, 'epoch': 2} {'type': 'loss', 'content': 0.012102757580578327, 'timestamp': '2025-09-30 22:17:02.072856', 'step': 2954, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:02.118740', 'step': 2954, 'epoch': 2} {'type': 'loss', 'content': 0.008857803419232368, 'timestamp': '2025-09-30 22:17:02.129007', 'step': 2955, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:02.184826', 'step': 2955, 'epoch': 2} {'type': 'loss', 'content': 0.003531576367095113, 'timestamp': '2025-09-30 22:17:02.216484', 'step': 2956, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:02.252525', 'step': 2956, 'epoch': 2} {'type': 'loss', 'content': 0.008055913262069225, 'timestamp': '2025-09-30 22:17:02.265497', 'step': 2957, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:02.300040', 'step': 2957, 'epoch': 2} {'type': 'loss', 'content': 0.003228359157219529, 'timestamp': '2025-09-30 22:17:02.306955', 'step': 2958, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:02.354609', 'step': 2958, 'epoch': 2} {'type': 'loss', 'content': 0.006139842327684164, 'timestamp': '2025-09-30 22:17:02.364888', 'step': 2959, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:02.416576', 'step': 2959, 'epoch': 2} {'type': 'loss', 'content': 0.002982628531754017, 'timestamp': '2025-09-30 22:17:02.444204', 'step': 2960, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:17:02.493361', 'step': 2960, 'epoch': 2} {'type': 'loss', 'content': 0.0012173228897154331, 'timestamp': '2025-09-30 22:17:02.509146', 'step': 2961, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:02.549634', 'step': 2961, 'epoch': 2} {'type': 'loss', 'content': 0.008141208440065384, 'timestamp': '2025-09-30 22:17:02.556546', 'step': 2962, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:02.611191', 'step': 2962, 'epoch': 2} {'type': 'loss', 'content': 0.0023765189107507467, 'timestamp': '2025-09-30 22:17:02.623789', 'step': 2963, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:02.662087', 'step': 2963, 'epoch': 2} {'type': 'loss', 'content': 0.010212529450654984, 'timestamp': '2025-09-30 22:17:02.693422', 'step': 2964, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:17:02.749492', 'step': 2964, 'epoch': 2} {'type': 'loss', 'content': 0.012079106643795967, 'timestamp': '2025-09-30 22:17:02.762882', 'step': 2965, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:17:02.821060', 'step': 2965, 'epoch': 2} {'type': 'loss', 'content': 0.005635153967887163, 'timestamp': '2025-09-30 22:17:02.835079', 'step': 2966, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:02.905200', 'step': 2966, 'epoch': 2} {'type': 'loss', 'content': 0.004532633814960718, 'timestamp': '2025-09-30 22:17:02.917459', 'step': 2967, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:02.964290', 'step': 2967, 'epoch': 2} {'type': 'loss', 'content': 0.01131241861730814, 'timestamp': '2025-09-30 22:17:02.992834', 'step': 2968, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:17:03.034600', 'step': 2968, 'epoch': 2} {'type': 'loss', 'content': 0.010123826563358307, 'timestamp': '2025-09-30 22:17:03.047975', 'step': 2969, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:03.100745', 'step': 2969, 'epoch': 2} {'type': 'loss', 'content': 0.005481799598783255, 'timestamp': '2025-09-30 22:17:03.113334', 'step': 2970, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:17:03.163057', 'step': 2970, 'epoch': 2} {'type': 'loss', 'content': 0.012069096788764, 'timestamp': '2025-09-30 22:17:03.178639', 'step': 2971, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:03.219919', 'step': 2971, 'epoch': 2} {'type': 'loss', 'content': 0.00575965316966176, 'timestamp': '2025-09-30 22:17:03.248683', 'step': 2972, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:03.291592', 'step': 2972, 'epoch': 2} {'type': 'loss', 'content': 0.007919232361018658, 'timestamp': '2025-09-30 22:17:03.304614', 'step': 2973, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:03.354851', 'step': 2973, 'epoch': 2} {'type': 'loss', 'content': 0.0066139488480985165, 'timestamp': '2025-09-30 22:17:03.368670', 'step': 2974, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:03.414453', 'step': 2974, 'epoch': 2} {'type': 'loss', 'content': 0.009215443395078182, 'timestamp': '2025-09-30 22:17:03.424011', 'step': 2975, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:03.486425', 'step': 2975, 'epoch': 2} {'type': 'loss', 'content': 0.007280724588781595, 'timestamp': '2025-09-30 22:17:03.520643', 'step': 2976, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:03.577036', 'step': 2976, 'epoch': 2} {'type': 'loss', 'content': 0.007378325331956148, 'timestamp': '2025-09-30 22:17:03.587569', 'step': 2977, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:03.638200', 'step': 2977, 'epoch': 2} {'type': 'loss', 'content': 0.002158799208700657, 'timestamp': '2025-09-30 22:17:03.650402', 'step': 2978, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:03.704487', 'step': 2978, 'epoch': 2} {'type': 'loss', 'content': 0.008832129649817944, 'timestamp': '2025-09-30 22:17:03.715384', 'step': 2979, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:03.767928', 'step': 2979, 'epoch': 2} {'type': 'loss', 'content': 0.009373163804411888, 'timestamp': '2025-09-30 22:17:03.806012', 'step': 2980, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:03.851225', 'step': 2980, 'epoch': 2} {'type': 'loss', 'content': 0.0014124346198514104, 'timestamp': '2025-09-30 22:17:03.856676', 'step': 2981, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:03.896582', 'step': 2981, 'epoch': 2} {'type': 'loss', 'content': 0.005799147766083479, 'timestamp': '2025-09-30 22:17:03.908800', 'step': 2982, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:03.955107', 'step': 2982, 'epoch': 2} {'type': 'loss', 'content': 0.013330061919987202, 'timestamp': '2025-09-30 22:17:03.966136', 'step': 2983, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:04.002303', 'step': 2983, 'epoch': 2} {'type': 'loss', 'content': 0.0024350215680897236, 'timestamp': '2025-09-30 22:17:04.034166', 'step': 2984, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:04.082567', 'step': 2984, 'epoch': 2} {'type': 'loss', 'content': 0.010918851010501385, 'timestamp': '2025-09-30 22:17:04.095552', 'step': 2985, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:04.138505', 'step': 2985, 'epoch': 2} {'type': 'loss', 'content': 0.005125465802848339, 'timestamp': '2025-09-30 22:17:04.145930', 'step': 2986, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:04.186421', 'step': 2986, 'epoch': 2} {'type': 'loss', 'content': 0.00508470693603158, 'timestamp': '2025-09-30 22:17:04.198561', 'step': 2987, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:04.233943', 'step': 2987, 'epoch': 2} {'type': 'loss', 'content': 0.008678026497364044, 'timestamp': '2025-09-30 22:17:04.266043', 'step': 2988, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:04.311350', 'step': 2988, 'epoch': 2} {'type': 'loss', 'content': 0.004566236399114132, 'timestamp': '2025-09-30 22:17:04.316838', 'step': 2989, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:04.363898', 'step': 2989, 'epoch': 2} {'type': 'loss', 'content': 0.0023392995353788137, 'timestamp': '2025-09-30 22:17:04.375928', 'step': 2990, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:17:07.096095', 'step': 2990, 'epoch': 2} {'type': 'pplx', 'content': 5.928878182497796, 'timestamp': '2025-09-30 22:17:07.099519', 'step': 2990, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:07.132110', 'step': 2990, 'epoch': 2} {'type': 'loss', 'content': 0.014219870790839195, 'timestamp': '2025-09-30 22:17:07.143294', 'step': 2991, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:07.179397', 'step': 2991, 'epoch': 2} {'type': 'loss', 'content': 0.002887843642383814, 'timestamp': '2025-09-30 22:17:07.212539', 'step': 2992, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:07.253723', 'step': 2992, 'epoch': 2} {'type': 'loss', 'content': 0.004083904437720776, 'timestamp': '2025-09-30 22:17:07.262348', 'step': 2993, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:07.310734', 'step': 2993, 'epoch': 2} {'type': 'loss', 'content': 0.001674058148637414, 'timestamp': '2025-09-30 22:17:07.323144', 'step': 2994, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:07.370714', 'step': 2994, 'epoch': 2} {'type': 'loss', 'content': 0.011439098045229912, 'timestamp': '2025-09-30 22:17:07.381724', 'step': 2995, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:07.427584', 'step': 2995, 'epoch': 2} {'type': 'loss', 'content': 0.0066447388380765915, 'timestamp': '2025-09-30 22:17:07.460416', 'step': 2996, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:17:07.514410', 'step': 2996, 'epoch': 2} {'type': 'loss', 'content': 0.0034933576826006174, 'timestamp': '2025-09-30 22:17:07.531183', 'step': 2997, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:07.571953', 'step': 2997, 'epoch': 2} {'type': 'loss', 'content': 0.005900776479393244, 'timestamp': '2025-09-30 22:17:07.585309', 'step': 2998, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:07.627921', 'step': 2998, 'epoch': 2} {'type': 'loss', 'content': 0.022136129438877106, 'timestamp': '2025-09-30 22:17:07.641615', 'step': 2999, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:07.680784', 'step': 2999, 'epoch': 2} {'type': 'loss', 'content': 0.009631790220737457, 'timestamp': '2025-09-30 22:17:07.715002', 'step': 3000, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 3000', 'timestamp': '2025-09-30 22:17:12.635303', 'step': 3000, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:12.677170', 'step': 3000, 'epoch': 2} {'type': 'loss', 'content': 0.00503592099994421, 'timestamp': '2025-09-30 22:17:12.689933', 'step': 3001, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:12.731922', 'step': 3001, 'epoch': 2} {'type': 'loss', 'content': 0.005273853428661823, 'timestamp': '2025-09-30 22:17:12.744425', 'step': 3002, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:12.777227', 'step': 3002, 'epoch': 2} {'type': 'loss', 'content': 0.005171041004359722, 'timestamp': '2025-09-30 22:17:12.789445', 'step': 3003, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:12.837720', 'step': 3003, 'epoch': 2} {'type': 'loss', 'content': 0.0026260169688612223, 'timestamp': '2025-09-30 22:17:12.872298', 'step': 3004, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:17:12.916880', 'step': 3004, 'epoch': 2} {'type': 'loss', 'content': 0.003942606970667839, 'timestamp': '2025-09-30 22:17:12.932532', 'step': 3005, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:12.966468', 'step': 3005, 'epoch': 2} {'type': 'loss', 'content': 0.011952736414968967, 'timestamp': '2025-09-30 22:17:12.977621', 'step': 3006, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:13.017809', 'step': 3006, 'epoch': 2} {'type': 'loss', 'content': 0.007050004787743092, 'timestamp': '2025-09-30 22:17:13.030395', 'step': 3007, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:13.068980', 'step': 3007, 'epoch': 2} {'type': 'loss', 'content': 0.007081000600010157, 'timestamp': '2025-09-30 22:17:13.103728', 'step': 3008, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:13.137718', 'step': 3008, 'epoch': 2} {'type': 'loss', 'content': 0.01373226847499609, 'timestamp': '2025-09-30 22:17:13.148311', 'step': 3009, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:13.189368', 'step': 3009, 'epoch': 2} {'type': 'loss', 'content': 0.008119028992950916, 'timestamp': '2025-09-30 22:17:13.202012', 'step': 3010, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:13.255521', 'step': 3010, 'epoch': 2} {'type': 'loss', 'content': 0.009641955606639385, 'timestamp': '2025-09-30 22:17:13.268078', 'step': 3011, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:13.305539', 'step': 3011, 'epoch': 2} {'type': 'loss', 'content': 0.0064568473026156425, 'timestamp': '2025-09-30 22:17:13.338958', 'step': 3012, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:13.388763', 'step': 3012, 'epoch': 2} {'type': 'loss', 'content': 0.00892971083521843, 'timestamp': '2025-09-30 22:17:13.397890', 'step': 3013, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:13.436294', 'step': 3013, 'epoch': 2} {'type': 'loss', 'content': 0.007372962776571512, 'timestamp': '2025-09-30 22:17:13.450108', 'step': 3014, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:17:13.499163', 'step': 3014, 'epoch': 2} {'type': 'loss', 'content': 0.011199056170880795, 'timestamp': '2025-09-30 22:17:13.514769', 'step': 3015, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:13.564847', 'step': 3015, 'epoch': 2} {'type': 'loss', 'content': 0.009116713888943195, 'timestamp': '2025-09-30 22:17:13.599438', 'step': 3016, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:17:13.650464', 'step': 3016, 'epoch': 2} {'type': 'loss', 'content': 0.005207949783653021, 'timestamp': '2025-09-30 22:17:13.667823', 'step': 3017, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:13.703555', 'step': 3017, 'epoch': 2} {'type': 'loss', 'content': 0.017830152064561844, 'timestamp': '2025-09-30 22:17:13.716896', 'step': 3018, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:13.752480', 'step': 3018, 'epoch': 2} {'type': 'loss', 'content': 0.0038366373628377914, 'timestamp': '2025-09-30 22:17:13.763265', 'step': 3019, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:13.802976', 'step': 3019, 'epoch': 2} {'type': 'loss', 'content': 0.008974471129477024, 'timestamp': '2025-09-30 22:17:13.835760', 'step': 3020, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:13.867898', 'step': 3020, 'epoch': 2} {'type': 'loss', 'content': 0.009324179030954838, 'timestamp': '2025-09-30 22:17:13.880624', 'step': 3021, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:13.928714', 'step': 3021, 'epoch': 2} {'type': 'loss', 'content': 0.007255516946315765, 'timestamp': '2025-09-30 22:17:13.942526', 'step': 3022, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:13.979826', 'step': 3022, 'epoch': 2} {'type': 'loss', 'content': 0.0052978722378611565, 'timestamp': '2025-09-30 22:17:13.993558', 'step': 3023, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:14.036132', 'step': 3023, 'epoch': 2} {'type': 'loss', 'content': 0.0037230406887829304, 'timestamp': '2025-09-30 22:17:14.070334', 'step': 3024, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:14.111207', 'step': 3024, 'epoch': 2} {'type': 'loss', 'content': 0.010426033288240433, 'timestamp': '2025-09-30 22:17:14.121072', 'step': 3025, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:14.157374', 'step': 3025, 'epoch': 2} {'type': 'loss', 'content': 0.005306093487888575, 'timestamp': '2025-09-30 22:17:14.168555', 'step': 3026, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:14.206589', 'step': 3026, 'epoch': 2} {'type': 'loss', 'content': 0.009547659195959568, 'timestamp': '2025-09-30 22:17:14.217710', 'step': 3027, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:14.262170', 'step': 3027, 'epoch': 2} {'type': 'loss', 'content': 0.010315535590052605, 'timestamp': '2025-09-30 22:17:14.295465', 'step': 3028, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:14.350705', 'step': 3028, 'epoch': 2} {'type': 'loss', 'content': 0.007825941778719425, 'timestamp': '2025-09-30 22:17:14.355549', 'step': 3029, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:14.388480', 'step': 3029, 'epoch': 2} {'type': 'loss', 'content': 0.004918771330267191, 'timestamp': '2025-09-30 22:17:14.399693', 'step': 3030, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:17:14.450381', 'step': 3030, 'epoch': 2} {'type': 'loss', 'content': 0.01733410358428955, 'timestamp': '2025-09-30 22:17:14.467464', 'step': 3031, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:14.513902', 'step': 3031, 'epoch': 2} {'type': 'loss', 'content': 0.0072146011516451836, 'timestamp': '2025-09-30 22:17:14.547090', 'step': 3032, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:14.581736', 'step': 3032, 'epoch': 2} {'type': 'loss', 'content': 0.007138307671993971, 'timestamp': '2025-09-30 22:17:14.594363', 'step': 3033, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:17:14.652478', 'step': 3033, 'epoch': 2} {'type': 'loss', 'content': 0.005867151077836752, 'timestamp': '2025-09-30 22:17:14.668181', 'step': 3034, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-09-30 22:17:14.718086', 'step': 3034, 'epoch': 2} {'type': 'loss', 'content': 0.0055921124294400215, 'timestamp': '2025-09-30 22:17:14.737164', 'step': 3035, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:17:14.794496', 'step': 3035, 'epoch': 2} {'type': 'loss', 'content': 0.006952265743166208, 'timestamp': '2025-09-30 22:17:14.829284', 'step': 3036, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:14.861744', 'step': 3036, 'epoch': 2} {'type': 'loss', 'content': 0.026587001979351044, 'timestamp': '2025-09-30 22:17:14.871655', 'step': 3037, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:14.904413', 'step': 3037, 'epoch': 2} {'type': 'loss', 'content': 0.00618098396807909, 'timestamp': '2025-09-30 22:17:14.916735', 'step': 3038, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:14.956335', 'step': 3038, 'epoch': 2} {'type': 'loss', 'content': 0.005510224495083094, 'timestamp': '2025-09-30 22:17:14.970109', 'step': 3039, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:15.008843', 'step': 3039, 'epoch': 2} {'type': 'loss', 'content': 0.007888545282185078, 'timestamp': '2025-09-30 22:17:15.043521', 'step': 3040, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:17:15.085047', 'step': 3040, 'epoch': 2} {'type': 'loss', 'content': 0.00570820365101099, 'timestamp': '2025-09-30 22:17:15.100470', 'step': 3041, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:15.154341', 'step': 3041, 'epoch': 2} {'type': 'loss', 'content': 0.004262340720742941, 'timestamp': '2025-09-30 22:17:15.168069', 'step': 3042, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:15.218947', 'step': 3042, 'epoch': 2} {'type': 'loss', 'content': 0.004007878713309765, 'timestamp': '2025-09-30 22:17:15.230104', 'step': 3043, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:15.267381', 'step': 3043, 'epoch': 2} {'type': 'loss', 'content': 0.0045262956991791725, 'timestamp': '2025-09-30 22:17:15.300735', 'step': 3044, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:17:15.351259', 'step': 3044, 'epoch': 2} {'type': 'loss', 'content': 0.009178610518574715, 'timestamp': '2025-09-30 22:17:15.367106', 'step': 3045, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:15.420548', 'step': 3045, 'epoch': 2} {'type': 'loss', 'content': 0.013096172362565994, 'timestamp': '2025-09-30 22:17:15.436932', 'step': 3046, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:15.478593', 'step': 3046, 'epoch': 2} {'type': 'loss', 'content': 0.007595235947519541, 'timestamp': '2025-09-30 22:17:15.491999', 'step': 3047, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:17:15.541654', 'step': 3047, 'epoch': 2} {'type': 'loss', 'content': 0.0028729906771332026, 'timestamp': '2025-09-30 22:17:15.578709', 'step': 3048, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:15.619597', 'step': 3048, 'epoch': 2} {'type': 'loss', 'content': 0.008607570081949234, 'timestamp': '2025-09-30 22:17:15.632706', 'step': 3049, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:15.680521', 'step': 3049, 'epoch': 2} {'type': 'loss', 'content': 0.004809021484106779, 'timestamp': '2025-09-30 22:17:15.693132', 'step': 3050, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:17:15.752766', 'step': 3050, 'epoch': 2} {'type': 'loss', 'content': 0.008124749176204205, 'timestamp': '2025-09-30 22:17:15.770135', 'step': 3051, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:17:15.830341', 'step': 3051, 'epoch': 2} {'type': 'loss', 'content': 0.006094157230108976, 'timestamp': '2025-09-30 22:17:15.865191', 'step': 3052, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:15.905289', 'step': 3052, 'epoch': 2} {'type': 'loss', 'content': 0.008468708023428917, 'timestamp': '2025-09-30 22:17:15.915435', 'step': 3053, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:15.966213', 'step': 3053, 'epoch': 2} {'type': 'loss', 'content': 0.006911612581461668, 'timestamp': '2025-09-30 22:17:15.973124', 'step': 3054, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:16.019021', 'step': 3054, 'epoch': 2} {'type': 'loss', 'content': 0.008364547975361347, 'timestamp': '2025-09-30 22:17:16.027050', 'step': 3055, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:16.076435', 'step': 3055, 'epoch': 2} {'type': 'loss', 'content': 0.010809467174112797, 'timestamp': '2025-09-30 22:17:16.105159', 'step': 3056, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:16.151796', 'step': 3056, 'epoch': 2} {'type': 'loss', 'content': 0.005176931619644165, 'timestamp': '2025-09-30 22:17:16.157398', 'step': 3057, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:16.207281', 'step': 3057, 'epoch': 2} {'type': 'loss', 'content': 0.006025994196534157, 'timestamp': '2025-09-30 22:17:16.215231', 'step': 3058, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:16.270242', 'step': 3058, 'epoch': 2} {'type': 'loss', 'content': 0.007871462032198906, 'timestamp': '2025-09-30 22:17:16.277671', 'step': 3059, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:16.328676', 'step': 3059, 'epoch': 2} {'type': 'loss', 'content': 0.005231105722486973, 'timestamp': '2025-09-30 22:17:16.359756', 'step': 3060, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:16.408175', 'step': 3060, 'epoch': 2} {'type': 'loss', 'content': 0.007062830962240696, 'timestamp': '2025-09-30 22:17:16.412934', 'step': 3061, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:16.461743', 'step': 3061, 'epoch': 2} {'type': 'loss', 'content': 0.004921720828860998, 'timestamp': '2025-09-30 22:17:16.472250', 'step': 3062, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:16.518336', 'step': 3062, 'epoch': 2} {'type': 'loss', 'content': 0.005504702217876911, 'timestamp': '2025-09-30 22:17:16.530847', 'step': 3063, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:16.577868', 'step': 3063, 'epoch': 2} {'type': 'loss', 'content': 0.01199677586555481, 'timestamp': '2025-09-30 22:17:16.608408', 'step': 3064, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:16.662637', 'step': 3064, 'epoch': 2} {'type': 'loss', 'content': 0.0182404275983572, 'timestamp': '2025-09-30 22:17:16.671114', 'step': 3065, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:16.726724', 'step': 3065, 'epoch': 2} {'type': 'loss', 'content': 0.0021776268258690834, 'timestamp': '2025-09-30 22:17:16.737079', 'step': 3066, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:16.776425', 'step': 3066, 'epoch': 2} {'type': 'loss', 'content': 0.003288572421297431, 'timestamp': '2025-09-30 22:17:16.783756', 'step': 3067, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:16.831347', 'step': 3067, 'epoch': 2} {'type': 'loss', 'content': 0.008751352317631245, 'timestamp': '2025-09-30 22:17:16.859204', 'step': 3068, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:17:16.921116', 'step': 3068, 'epoch': 2} {'type': 'loss', 'content': 0.002015740144997835, 'timestamp': '2025-09-30 22:17:16.924352', 'step': 3069, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:16.962523', 'step': 3069, 'epoch': 2} {'type': 'loss', 'content': 0.006834395695477724, 'timestamp': '2025-09-30 22:17:16.970227', 'step': 3070, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:17.009631', 'step': 3070, 'epoch': 2} {'type': 'loss', 'content': 0.005788995418697596, 'timestamp': '2025-09-30 22:17:17.021895', 'step': 3071, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:17.061097', 'step': 3071, 'epoch': 2} {'type': 'loss', 'content': 0.010872102342545986, 'timestamp': '2025-09-30 22:17:17.089880', 'step': 3072, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:17.142862', 'step': 3072, 'epoch': 2} {'type': 'loss', 'content': 0.0029374524019658566, 'timestamp': '2025-09-30 22:17:17.148127', 'step': 3073, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:17.202343', 'step': 3073, 'epoch': 2} {'type': 'loss', 'content': 0.0030281918589025736, 'timestamp': '2025-09-30 22:17:17.214664', 'step': 3074, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:17.269285', 'step': 3074, 'epoch': 2} {'type': 'loss', 'content': 0.005195050500333309, 'timestamp': '2025-09-30 22:17:17.282642', 'step': 3075, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:17.321975', 'step': 3075, 'epoch': 2} {'type': 'loss', 'content': 0.008502720855176449, 'timestamp': '2025-09-30 22:17:17.349858', 'step': 3076, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:17.391182', 'step': 3076, 'epoch': 2} {'type': 'loss', 'content': 0.004310964606702328, 'timestamp': '2025-09-30 22:17:17.396917', 'step': 3077, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:17.436861', 'step': 3077, 'epoch': 2} {'type': 'loss', 'content': 0.004775337874889374, 'timestamp': '2025-09-30 22:17:17.444796', 'step': 3078, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:17.490689', 'step': 3078, 'epoch': 2} {'type': 'loss', 'content': 0.012922474183142185, 'timestamp': '2025-09-30 22:17:17.501260', 'step': 3079, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:17.559947', 'step': 3079, 'epoch': 2} {'type': 'loss', 'content': 0.003970756195485592, 'timestamp': '2025-09-30 22:17:17.590605', 'step': 3080, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:17.633165', 'step': 3080, 'epoch': 2} {'type': 'loss', 'content': 0.020425280556082726, 'timestamp': '2025-09-30 22:17:17.638063', 'step': 3081, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:17.678431', 'step': 3081, 'epoch': 2} {'type': 'loss', 'content': 0.004362175241112709, 'timestamp': '2025-09-30 22:17:17.685666', 'step': 3082, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:17.722235', 'step': 3082, 'epoch': 2} {'type': 'loss', 'content': 0.007760615553706884, 'timestamp': '2025-09-30 22:17:17.729463', 'step': 3083, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:17.785058', 'step': 3083, 'epoch': 2} {'type': 'loss', 'content': 0.0030244977679103613, 'timestamp': '2025-09-30 22:17:17.818428', 'step': 3084, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:17.863019', 'step': 3084, 'epoch': 2} {'type': 'loss', 'content': 0.0019816835410892963, 'timestamp': '2025-09-30 22:17:17.870777', 'step': 3085, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:17.932984', 'step': 3085, 'epoch': 2} {'type': 'loss', 'content': 0.008955265395343304, 'timestamp': '2025-09-30 22:17:17.940671', 'step': 3086, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:17.988177', 'step': 3086, 'epoch': 2} {'type': 'loss', 'content': 0.006537090055644512, 'timestamp': '2025-09-30 22:17:17.995765', 'step': 3087, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:18.043815', 'step': 3087, 'epoch': 2} {'type': 'loss', 'content': 0.0045349858701229095, 'timestamp': '2025-09-30 22:17:18.072638', 'step': 3088, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:18.109857', 'step': 3088, 'epoch': 2} {'type': 'loss', 'content': 0.0019489992409944534, 'timestamp': '2025-09-30 22:17:18.114704', 'step': 3089, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:18.162190', 'step': 3089, 'epoch': 2} {'type': 'loss', 'content': 0.007369097787886858, 'timestamp': '2025-09-30 22:17:18.170125', 'step': 3090, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:18.208164', 'step': 3090, 'epoch': 2} {'type': 'loss', 'content': 0.007728835102170706, 'timestamp': '2025-09-30 22:17:18.219190', 'step': 3091, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:18.261980', 'step': 3091, 'epoch': 2} {'type': 'loss', 'content': 0.006376664619892836, 'timestamp': '2025-09-30 22:17:18.293163', 'step': 3092, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:18.340763', 'step': 3092, 'epoch': 2} {'type': 'loss', 'content': 0.011707457713782787, 'timestamp': '2025-09-30 22:17:18.349447', 'step': 3093, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:18.391062', 'step': 3093, 'epoch': 2} {'type': 'loss', 'content': 0.0010542640229687095, 'timestamp': '2025-09-30 22:17:18.398847', 'step': 3094, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:18.446450', 'step': 3094, 'epoch': 2} {'type': 'loss', 'content': 0.006500933784991503, 'timestamp': '2025-09-30 22:17:18.454134', 'step': 3095, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:18.495974', 'step': 3095, 'epoch': 2} {'type': 'loss', 'content': 0.004050260875374079, 'timestamp': '2025-09-30 22:17:18.524724', 'step': 3096, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:18.561262', 'step': 3096, 'epoch': 2} {'type': 'loss', 'content': 0.009400052949786186, 'timestamp': '2025-09-30 22:17:18.569814', 'step': 3097, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:18.609096', 'step': 3097, 'epoch': 2} {'type': 'loss', 'content': 0.003274328075349331, 'timestamp': '2025-09-30 22:17:18.619339', 'step': 3098, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:18.654643', 'step': 3098, 'epoch': 2} {'type': 'loss', 'content': 0.0031144127715379, 'timestamp': '2025-09-30 22:17:18.664875', 'step': 3099, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:18.728374', 'step': 3099, 'epoch': 2} {'type': 'loss', 'content': 0.001828193198889494, 'timestamp': '2025-09-30 22:17:18.760104', 'step': 3100, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:18.801727', 'step': 3100, 'epoch': 2} {'type': 'loss', 'content': 0.004855462349951267, 'timestamp': '2025-09-30 22:17:18.809767', 'step': 3101, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:18.856681', 'step': 3101, 'epoch': 2} {'type': 'loss', 'content': 0.007850201800465584, 'timestamp': '2025-09-30 22:17:18.867744', 'step': 3102, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:18.914726', 'step': 3102, 'epoch': 2} {'type': 'loss', 'content': 0.0016034794971346855, 'timestamp': '2025-09-30 22:17:18.922722', 'step': 3103, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:18.957495', 'step': 3103, 'epoch': 2} {'type': 'loss', 'content': 0.050340570509433746, 'timestamp': '2025-09-30 22:17:18.989455', 'step': 3104, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:19.023702', 'step': 3104, 'epoch': 2} {'type': 'loss', 'content': 0.006957797799259424, 'timestamp': '2025-09-30 22:17:19.032287', 'step': 3105, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:17:21.694363', 'step': 3105, 'epoch': 2} {'type': 'pplx', 'content': 5.915286211797802, 'timestamp': '2025-09-30 22:17:21.697768', 'step': 3105, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:21.738087', 'step': 3105, 'epoch': 2} {'type': 'loss', 'content': 0.014353587292134762, 'timestamp': '2025-09-30 22:17:21.749638', 'step': 3106, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:21.794936', 'step': 3106, 'epoch': 2} {'type': 'loss', 'content': 0.0015971955144777894, 'timestamp': '2025-09-30 22:17:21.807534', 'step': 3107, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:21.860590', 'step': 3107, 'epoch': 2} {'type': 'loss', 'content': 0.0026139733381569386, 'timestamp': '2025-09-30 22:17:21.892341', 'step': 3108, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:21.928180', 'step': 3108, 'epoch': 2} {'type': 'loss', 'content': 0.003356874454766512, 'timestamp': '2025-09-30 22:17:21.936980', 'step': 3109, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:21.974838', 'step': 3109, 'epoch': 2} {'type': 'loss', 'content': 0.005268130451440811, 'timestamp': '2025-09-30 22:17:21.985946', 'step': 3110, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:22.026853', 'step': 3110, 'epoch': 2} {'type': 'loss', 'content': 0.005021201446652412, 'timestamp': '2025-09-30 22:17:22.039073', 'step': 3111, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:22.093171', 'step': 3111, 'epoch': 2} {'type': 'loss', 'content': 0.0069105904549360275, 'timestamp': '2025-09-30 22:17:22.121972', 'step': 3112, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:22.163915', 'step': 3112, 'epoch': 2} {'type': 'loss', 'content': 0.011179746128618717, 'timestamp': '2025-09-30 22:17:22.181149', 'step': 3113, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:22.228091', 'step': 3113, 'epoch': 2} {'type': 'loss', 'content': 0.0006776591180823743, 'timestamp': '2025-09-30 22:17:22.239160', 'step': 3114, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:22.277261', 'step': 3114, 'epoch': 2} {'type': 'loss', 'content': 0.006834322586655617, 'timestamp': '2025-09-30 22:17:22.289877', 'step': 3115, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:22.334046', 'step': 3115, 'epoch': 2} {'type': 'loss', 'content': 0.012681066989898682, 'timestamp': '2025-09-30 22:17:22.368288', 'step': 3116, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:22.406542', 'step': 3116, 'epoch': 2} {'type': 'loss', 'content': 0.0020665479823946953, 'timestamp': '2025-09-30 22:17:22.414412', 'step': 3117, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:22.463655', 'step': 3117, 'epoch': 2} {'type': 'loss', 'content': 0.0028975976165384054, 'timestamp': '2025-09-30 22:17:22.476255', 'step': 3118, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:22.524353', 'step': 3118, 'epoch': 2} {'type': 'loss', 'content': 0.0044283573515713215, 'timestamp': '2025-09-30 22:17:22.536688', 'step': 3119, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:22.587305', 'step': 3119, 'epoch': 2} {'type': 'loss', 'content': 0.016512639820575714, 'timestamp': '2025-09-30 22:17:22.620759', 'step': 3120, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:22.657410', 'step': 3120, 'epoch': 2} {'type': 'loss', 'content': 0.007381323724985123, 'timestamp': '2025-09-30 22:17:22.670017', 'step': 3121, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:22.704965', 'step': 3121, 'epoch': 2} {'type': 'loss', 'content': 0.0029324996285140514, 'timestamp': '2025-09-30 22:17:22.715970', 'step': 3122, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:22.757110', 'step': 3122, 'epoch': 2} {'type': 'loss', 'content': 0.01893054135143757, 'timestamp': '2025-09-30 22:17:22.768242', 'step': 3123, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:22.811161', 'step': 3123, 'epoch': 2} {'type': 'loss', 'content': 0.006030916702002287, 'timestamp': '2025-09-30 22:17:22.844359', 'step': 3124, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:22.878923', 'step': 3124, 'epoch': 2} {'type': 'loss', 'content': 0.0008163400925695896, 'timestamp': '2025-09-30 22:17:22.887818', 'step': 3125, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:22.940676', 'step': 3125, 'epoch': 2} {'type': 'loss', 'content': 0.0037008821964263916, 'timestamp': '2025-09-30 22:17:22.951828', 'step': 3126, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:22.993521', 'step': 3126, 'epoch': 2} {'type': 'loss', 'content': 0.0012607695534825325, 'timestamp': '2025-09-30 22:17:23.006085', 'step': 3127, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:23.040621', 'step': 3127, 'epoch': 2} {'type': 'loss', 'content': 0.002586707007139921, 'timestamp': '2025-09-30 22:17:23.072745', 'step': 3128, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:23.119279', 'step': 3128, 'epoch': 2} {'type': 'loss', 'content': 0.005804563872516155, 'timestamp': '2025-09-30 22:17:23.129848', 'step': 3129, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:23.165734', 'step': 3129, 'epoch': 2} {'type': 'loss', 'content': 0.004052955657243729, 'timestamp': '2025-09-30 22:17:23.176208', 'step': 3130, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:23.219929', 'step': 3130, 'epoch': 2} {'type': 'loss', 'content': 0.013816587626934052, 'timestamp': '2025-09-30 22:17:23.227494', 'step': 3131, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:23.262091', 'step': 3131, 'epoch': 2} {'type': 'loss', 'content': 0.000520117289852351, 'timestamp': '2025-09-30 22:17:23.295307', 'step': 3132, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:23.331290', 'step': 3132, 'epoch': 2} {'type': 'loss', 'content': 0.006719049997627735, 'timestamp': '2025-09-30 22:17:23.342063', 'step': 3133, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:23.393863', 'step': 3133, 'epoch': 2} {'type': 'loss', 'content': 0.0056020780466496944, 'timestamp': '2025-09-30 22:17:23.402006', 'step': 3134, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:23.441780', 'step': 3134, 'epoch': 2} {'type': 'loss', 'content': 0.008619586937129498, 'timestamp': '2025-09-30 22:17:23.454105', 'step': 3135, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:23.502541', 'step': 3135, 'epoch': 2} {'type': 'loss', 'content': 0.0013584374682977796, 'timestamp': '2025-09-30 22:17:23.531297', 'step': 3136, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:23.566069', 'step': 3136, 'epoch': 2} {'type': 'loss', 'content': 0.002616981277242303, 'timestamp': '2025-09-30 22:17:23.576477', 'step': 3137, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:23.614507', 'step': 3137, 'epoch': 2} {'type': 'loss', 'content': 0.0022322519216686487, 'timestamp': '2025-09-30 22:17:23.626751', 'step': 3138, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:23.668329', 'step': 3138, 'epoch': 2} {'type': 'loss', 'content': 0.013231469318270683, 'timestamp': '2025-09-30 22:17:23.680715', 'step': 3139, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:23.717910', 'step': 3139, 'epoch': 2} {'type': 'loss', 'content': 0.008670350536704063, 'timestamp': '2025-09-30 22:17:23.751378', 'step': 3140, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:23.787895', 'step': 3140, 'epoch': 2} {'type': 'loss', 'content': 0.0005706910160370171, 'timestamp': '2025-09-30 22:17:23.800483', 'step': 3141, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:23.845725', 'step': 3141, 'epoch': 2} {'type': 'loss', 'content': 0.00035360330366529524, 'timestamp': '2025-09-30 22:17:23.858316', 'step': 3142, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:23.908820', 'step': 3142, 'epoch': 2} {'type': 'loss', 'content': 0.0022373131942003965, 'timestamp': '2025-09-30 22:17:23.919925', 'step': 3143, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:23.955211', 'step': 3143, 'epoch': 2} {'type': 'loss', 'content': 0.001455026096664369, 'timestamp': '2025-09-30 22:17:23.988421', 'step': 3144, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:24.024154', 'step': 3144, 'epoch': 2} {'type': 'loss', 'content': 0.0032580397091805935, 'timestamp': '2025-09-30 22:17:24.036804', 'step': 3145, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:24.087963', 'step': 3145, 'epoch': 2} {'type': 'loss', 'content': 0.02811695635318756, 'timestamp': '2025-09-30 22:17:24.101681', 'step': 3146, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:24.152152', 'step': 3146, 'epoch': 2} {'type': 'loss', 'content': 0.002767069498077035, 'timestamp': '2025-09-30 22:17:24.164697', 'step': 3147, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:24.218777', 'step': 3147, 'epoch': 2} {'type': 'loss', 'content': 0.0006574054714292288, 'timestamp': '2025-09-30 22:17:24.252182', 'step': 3148, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:24.299821', 'step': 3148, 'epoch': 2} {'type': 'loss', 'content': 0.007816782221198082, 'timestamp': '2025-09-30 22:17:24.308503', 'step': 3149, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:24.355695', 'step': 3149, 'epoch': 2} {'type': 'loss', 'content': 0.004145421087741852, 'timestamp': '2025-09-30 22:17:24.368298', 'step': 3150, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:24.412544', 'step': 3150, 'epoch': 2} {'type': 'loss', 'content': 0.0063446518033742905, 'timestamp': '2025-09-30 22:17:24.425116', 'step': 3151, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:24.484197', 'step': 3151, 'epoch': 2} {'type': 'loss', 'content': 0.013051629066467285, 'timestamp': '2025-09-30 22:17:24.517442', 'step': 3152, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:24.557427', 'step': 3152, 'epoch': 2} {'type': 'loss', 'content': 0.004559283144772053, 'timestamp': '2025-09-30 22:17:24.562797', 'step': 3153, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:24.605492', 'step': 3153, 'epoch': 2} {'type': 'loss', 'content': 0.021444272249937057, 'timestamp': '2025-09-30 22:17:24.617776', 'step': 3154, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:24.662113', 'step': 3154, 'epoch': 2} {'type': 'loss', 'content': 0.008376072160899639, 'timestamp': '2025-09-30 22:17:24.674610', 'step': 3155, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:24.728068', 'step': 3155, 'epoch': 2} {'type': 'loss', 'content': 0.012633971869945526, 'timestamp': '2025-09-30 22:17:24.759338', 'step': 3156, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:24.793521', 'step': 3156, 'epoch': 2} {'type': 'loss', 'content': 0.007506215944886208, 'timestamp': '2025-09-30 22:17:24.801387', 'step': 3157, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:24.861407', 'step': 3157, 'epoch': 2} {'type': 'loss', 'content': 0.014985278248786926, 'timestamp': '2025-09-30 22:17:24.871691', 'step': 3158, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:24.904779', 'step': 3158, 'epoch': 2} {'type': 'loss', 'content': 0.00818850938230753, 'timestamp': '2025-09-30 22:17:24.911849', 'step': 3159, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:24.949071', 'step': 3159, 'epoch': 2} {'type': 'loss', 'content': 0.0067369369789958, 'timestamp': '2025-09-30 22:17:24.977423', 'step': 3160, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:25.012730', 'step': 3160, 'epoch': 2} {'type': 'loss', 'content': 0.008044580928981304, 'timestamp': '2025-09-30 22:17:25.018023', 'step': 3161, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:25.067848', 'step': 3161, 'epoch': 2} {'type': 'loss', 'content': 0.0038618387188762426, 'timestamp': '2025-09-30 22:17:25.075516', 'step': 3162, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:25.110873', 'step': 3162, 'epoch': 2} {'type': 'loss', 'content': 0.005019427742809057, 'timestamp': '2025-09-30 22:17:25.118541', 'step': 3163, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:25.152865', 'step': 3163, 'epoch': 2} {'type': 'loss', 'content': 0.0045925346203148365, 'timestamp': '2025-09-30 22:17:25.181335', 'step': 3164, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:25.223982', 'step': 3164, 'epoch': 2} {'type': 'loss', 'content': 0.016353843733668327, 'timestamp': '2025-09-30 22:17:25.232460', 'step': 3165, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:25.271158', 'step': 3165, 'epoch': 2} {'type': 'loss', 'content': 0.012923413887619972, 'timestamp': '2025-09-30 22:17:25.281450', 'step': 3166, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:25.329213', 'step': 3166, 'epoch': 2} {'type': 'loss', 'content': 0.0025689145550131798, 'timestamp': '2025-09-30 22:17:25.337431', 'step': 3167, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:25.383799', 'step': 3167, 'epoch': 2} {'type': 'loss', 'content': 0.009323082864284515, 'timestamp': '2025-09-30 22:17:25.412374', 'step': 3168, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:25.467545', 'step': 3168, 'epoch': 2} {'type': 'loss', 'content': 0.010412830859422684, 'timestamp': '2025-09-30 22:17:25.472682', 'step': 3169, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:17:25.507212', 'step': 3169, 'epoch': 2} {'type': 'loss', 'content': 0.007618281990289688, 'timestamp': '2025-09-30 22:17:25.511411', 'step': 3170, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:25.545617', 'step': 3170, 'epoch': 2} {'type': 'loss', 'content': 0.0035928383003920317, 'timestamp': '2025-09-30 22:17:25.555996', 'step': 3171, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:25.590404', 'step': 3171, 'epoch': 2} {'type': 'loss', 'content': 0.01205290574580431, 'timestamp': '2025-09-30 22:17:25.623582', 'step': 3172, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:25.675880', 'step': 3172, 'epoch': 2} {'type': 'loss', 'content': 0.00397237204015255, 'timestamp': '2025-09-30 22:17:25.688931', 'step': 3173, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:25.733974', 'step': 3173, 'epoch': 2} {'type': 'loss', 'content': 0.00367568526417017, 'timestamp': '2025-09-30 22:17:25.741364', 'step': 3174, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:25.799702', 'step': 3174, 'epoch': 2} {'type': 'loss', 'content': 0.0020569413900375366, 'timestamp': '2025-09-30 22:17:25.811930', 'step': 3175, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:25.855138', 'step': 3175, 'epoch': 2} {'type': 'loss', 'content': 0.008882638067007065, 'timestamp': '2025-09-30 22:17:25.889345', 'step': 3176, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:25.926983', 'step': 3176, 'epoch': 2} {'type': 'loss', 'content': 0.007198006846010685, 'timestamp': '2025-09-30 22:17:25.935640', 'step': 3177, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:25.982980', 'step': 3177, 'epoch': 2} {'type': 'loss', 'content': 0.006660681217908859, 'timestamp': '2025-09-30 22:17:25.996379', 'step': 3178, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:26.037650', 'step': 3178, 'epoch': 2} {'type': 'loss', 'content': 0.047912854701280594, 'timestamp': '2025-09-30 22:17:26.050013', 'step': 3179, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:26.088712', 'step': 3179, 'epoch': 2} {'type': 'loss', 'content': 0.004959320183843374, 'timestamp': '2025-09-30 22:17:26.120478', 'step': 3180, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:26.165091', 'step': 3180, 'epoch': 2} {'type': 'loss', 'content': 0.006877145264297724, 'timestamp': '2025-09-30 22:17:26.172951', 'step': 3181, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:26.214096', 'step': 3181, 'epoch': 2} {'type': 'loss', 'content': 0.002868048846721649, 'timestamp': '2025-09-30 22:17:26.226652', 'step': 3182, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:26.274881', 'step': 3182, 'epoch': 2} {'type': 'loss', 'content': 0.0076978690922260284, 'timestamp': '2025-09-30 22:17:26.287070', 'step': 3183, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:26.328298', 'step': 3183, 'epoch': 2} {'type': 'loss', 'content': 0.012668581679463387, 'timestamp': '2025-09-30 22:17:26.361439', 'step': 3184, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:26.396502', 'step': 3184, 'epoch': 2} {'type': 'loss', 'content': 0.010230972431600094, 'timestamp': '2025-09-30 22:17:26.404439', 'step': 3185, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:26.439228', 'step': 3185, 'epoch': 2} {'type': 'loss', 'content': 0.011350931599736214, 'timestamp': '2025-09-30 22:17:26.450371', 'step': 3186, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:26.490598', 'step': 3186, 'epoch': 2} {'type': 'loss', 'content': 0.007605713326483965, 'timestamp': '2025-09-30 22:17:26.501057', 'step': 3187, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:26.535283', 'step': 3187, 'epoch': 2} {'type': 'loss', 'content': 0.009302949532866478, 'timestamp': '2025-09-30 22:17:26.566550', 'step': 3188, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:26.609061', 'step': 3188, 'epoch': 2} {'type': 'loss', 'content': 0.006530994549393654, 'timestamp': '2025-09-30 22:17:26.619059', 'step': 3189, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:26.663734', 'step': 3189, 'epoch': 2} {'type': 'loss', 'content': 0.01011677272617817, 'timestamp': '2025-09-30 22:17:26.676135', 'step': 3190, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:26.710463', 'step': 3190, 'epoch': 2} {'type': 'loss', 'content': 0.007625059224665165, 'timestamp': '2025-09-30 22:17:26.721483', 'step': 3191, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:26.759797', 'step': 3191, 'epoch': 2} {'type': 'loss', 'content': 0.017014721408486366, 'timestamp': '2025-09-30 22:17:26.794495', 'step': 3192, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:26.832516', 'step': 3192, 'epoch': 2} {'type': 'loss', 'content': 0.006755263078957796, 'timestamp': '2025-09-30 22:17:26.843346', 'step': 3193, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:26.890062', 'step': 3193, 'epoch': 2} {'type': 'loss', 'content': 0.007832643575966358, 'timestamp': '2025-09-30 22:17:26.898058', 'step': 3194, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:26.932531', 'step': 3194, 'epoch': 2} {'type': 'loss', 'content': 0.010002926923334599, 'timestamp': '2025-09-30 22:17:26.945069', 'step': 3195, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:26.978892', 'step': 3195, 'epoch': 2} {'type': 'loss', 'content': 0.01744220219552517, 'timestamp': '2025-09-30 22:17:27.007215', 'step': 3196, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:27.043984', 'step': 3196, 'epoch': 2} {'type': 'loss', 'content': 0.007402005139738321, 'timestamp': '2025-09-30 22:17:27.052765', 'step': 3197, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:27.089681', 'step': 3197, 'epoch': 2} {'type': 'loss', 'content': 0.008514383807778358, 'timestamp': '2025-09-30 22:17:27.100784', 'step': 3198, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:27.134732', 'step': 3198, 'epoch': 2} {'type': 'loss', 'content': 0.0062779998406767845, 'timestamp': '2025-09-30 22:17:27.141876', 'step': 3199, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:27.183349', 'step': 3199, 'epoch': 2} {'type': 'loss', 'content': 0.011944870464503765, 'timestamp': '2025-09-30 22:17:27.211784', 'step': 3200, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:27.254086', 'step': 3200, 'epoch': 2} {'type': 'loss', 'content': 0.007012359332293272, 'timestamp': '2025-09-30 22:17:27.258677', 'step': 3201, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:17:27.298105', 'step': 3201, 'epoch': 2} {'type': 'loss', 'content': 0.009369988925755024, 'timestamp': '2025-09-30 22:17:27.302521', 'step': 3202, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:27.357373', 'step': 3202, 'epoch': 2} {'type': 'loss', 'content': 0.0048002698458731174, 'timestamp': '2025-09-30 22:17:27.364401', 'step': 3203, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:27.400172', 'step': 3203, 'epoch': 2} {'type': 'loss', 'content': 0.008321247063577175, 'timestamp': '2025-09-30 22:17:27.431280', 'step': 3204, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:27.466594', 'step': 3204, 'epoch': 2} {'type': 'loss', 'content': 0.00541403004899621, 'timestamp': '2025-09-30 22:17:27.474584', 'step': 3205, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:27.513087', 'step': 3205, 'epoch': 2} {'type': 'loss', 'content': 0.015202262438833714, 'timestamp': '2025-09-30 22:17:27.525629', 'step': 3206, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:27.581054', 'step': 3206, 'epoch': 2} {'type': 'loss', 'content': 0.013311178423464298, 'timestamp': '2025-09-30 22:17:27.594792', 'step': 3207, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:27.654637', 'step': 3207, 'epoch': 2} {'type': 'loss', 'content': 0.0028243535198271275, 'timestamp': '2025-09-30 22:17:27.683283', 'step': 3208, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:27.717994', 'step': 3208, 'epoch': 2} {'type': 'loss', 'content': 0.0033892698120325804, 'timestamp': '2025-09-30 22:17:27.723271', 'step': 3209, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:27.759010', 'step': 3209, 'epoch': 2} {'type': 'loss', 'content': 0.010153659619390965, 'timestamp': '2025-09-30 22:17:27.766708', 'step': 3210, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:27.809288', 'step': 3210, 'epoch': 2} {'type': 'loss', 'content': 0.012320290319621563, 'timestamp': '2025-09-30 22:17:27.820286', 'step': 3211, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:27.863111', 'step': 3211, 'epoch': 2} {'type': 'loss', 'content': 0.007215319201350212, 'timestamp': '2025-09-30 22:17:27.891293', 'step': 3212, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:27.936336', 'step': 3212, 'epoch': 2} {'type': 'loss', 'content': 0.015331793576478958, 'timestamp': '2025-09-30 22:17:27.944240', 'step': 3213, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:27.982474', 'step': 3213, 'epoch': 2} {'type': 'loss', 'content': 0.009551141411066055, 'timestamp': '2025-09-30 22:17:27.996280', 'step': 3214, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:28.035491', 'step': 3214, 'epoch': 2} {'type': 'loss', 'content': 0.005144950468093157, 'timestamp': '2025-09-30 22:17:28.045610', 'step': 3215, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:28.084333', 'step': 3215, 'epoch': 2} {'type': 'loss', 'content': 0.006618922110646963, 'timestamp': '2025-09-30 22:17:28.116243', 'step': 3216, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:28.151348', 'step': 3216, 'epoch': 2} {'type': 'loss', 'content': 0.006120654754340649, 'timestamp': '2025-09-30 22:17:28.159966', 'step': 3217, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:28.195262', 'step': 3217, 'epoch': 2} {'type': 'loss', 'content': 0.00691972067579627, 'timestamp': '2025-09-30 22:17:28.207788', 'step': 3218, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:28.249274', 'step': 3218, 'epoch': 2} {'type': 'loss', 'content': 0.0032606294844299555, 'timestamp': '2025-09-30 22:17:28.259644', 'step': 3219, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:28.294929', 'step': 3219, 'epoch': 2} {'type': 'loss', 'content': 0.007216659840196371, 'timestamp': '2025-09-30 22:17:28.326023', 'step': 3220, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:17:31.178026', 'step': 3220, 'epoch': 2} {'type': 'pplx', 'content': 5.789587159789891, 'timestamp': '2025-09-30 22:17:31.180389', 'step': 3220, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:31.224306', 'step': 3220, 'epoch': 2} {'type': 'loss', 'content': 0.005082657095044851, 'timestamp': '2025-09-30 22:17:31.233090', 'step': 3221, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:31.280156', 'step': 3221, 'epoch': 2} {'type': 'loss', 'content': 0.008041946217417717, 'timestamp': '2025-09-30 22:17:31.291004', 'step': 3222, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:31.338897', 'step': 3222, 'epoch': 2} {'type': 'loss', 'content': 0.002013694727793336, 'timestamp': '2025-09-30 22:17:31.346168', 'step': 3223, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:31.394610', 'step': 3223, 'epoch': 2} {'type': 'loss', 'content': 0.008580495603382587, 'timestamp': '2025-09-30 22:17:31.426231', 'step': 3224, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:31.462942', 'step': 3224, 'epoch': 2} {'type': 'loss', 'content': 0.006691499147564173, 'timestamp': '2025-09-30 22:17:31.476016', 'step': 3225, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:31.524494', 'step': 3225, 'epoch': 2} {'type': 'loss', 'content': 0.011357761919498444, 'timestamp': '2025-09-30 22:17:31.532316', 'step': 3226, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:31.584656', 'step': 3226, 'epoch': 2} {'type': 'loss', 'content': 0.017691265791654587, 'timestamp': '2025-09-30 22:17:31.598351', 'step': 3227, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:31.638250', 'step': 3227, 'epoch': 2} {'type': 'loss', 'content': 0.003643968142569065, 'timestamp': '2025-09-30 22:17:31.672341', 'step': 3228, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:31.719198', 'step': 3228, 'epoch': 2} {'type': 'loss', 'content': 0.00515143945813179, 'timestamp': '2025-09-30 22:17:31.728992', 'step': 3229, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:31.788540', 'step': 3229, 'epoch': 2} {'type': 'loss', 'content': 0.012103057466447353, 'timestamp': '2025-09-30 22:17:31.801964', 'step': 3230, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:31.847874', 'step': 3230, 'epoch': 2} {'type': 'loss', 'content': 0.004100447986274958, 'timestamp': '2025-09-30 22:17:31.858011', 'step': 3231, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:31.893545', 'step': 3231, 'epoch': 2} {'type': 'loss', 'content': 0.012488999404013157, 'timestamp': '2025-09-30 22:17:31.925254', 'step': 3232, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:31.960712', 'step': 3232, 'epoch': 2} {'type': 'loss', 'content': 0.011300384066998959, 'timestamp': '2025-09-30 22:17:31.965460', 'step': 3233, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:32.001360', 'step': 3233, 'epoch': 2} {'type': 'loss', 'content': 0.01623716950416565, 'timestamp': '2025-09-30 22:17:32.008978', 'step': 3234, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:32.042234', 'step': 3234, 'epoch': 2} {'type': 'loss', 'content': 0.004639583174139261, 'timestamp': '2025-09-30 22:17:32.049858', 'step': 3235, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:32.086210', 'step': 3235, 'epoch': 2} {'type': 'loss', 'content': 0.0062539177015423775, 'timestamp': '2025-09-30 22:17:32.114714', 'step': 3236, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:32.150941', 'step': 3236, 'epoch': 2} {'type': 'loss', 'content': 0.0031055191066116095, 'timestamp': '2025-09-30 22:17:32.163505', 'step': 3237, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:32.204379', 'step': 3237, 'epoch': 2} {'type': 'loss', 'content': 0.004861655179411173, 'timestamp': '2025-09-30 22:17:32.215314', 'step': 3238, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:32.274324', 'step': 3238, 'epoch': 2} {'type': 'loss', 'content': 0.005641878582537174, 'timestamp': '2025-09-30 22:17:32.285251', 'step': 3239, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:32.346365', 'step': 3239, 'epoch': 2} {'type': 'loss', 'content': 0.007063304539769888, 'timestamp': '2025-09-30 22:17:32.381043', 'step': 3240, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:32.415296', 'step': 3240, 'epoch': 2} {'type': 'loss', 'content': 0.008499303832650185, 'timestamp': '2025-09-30 22:17:32.423942', 'step': 3241, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:32.462734', 'step': 3241, 'epoch': 2} {'type': 'loss', 'content': 0.006319647654891014, 'timestamp': '2025-09-30 22:17:32.473748', 'step': 3242, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:32.527219', 'step': 3242, 'epoch': 2} {'type': 'loss', 'content': 0.006031288765370846, 'timestamp': '2025-09-30 22:17:32.537411', 'step': 3243, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:32.587161', 'step': 3243, 'epoch': 2} {'type': 'loss', 'content': 0.00663584191352129, 'timestamp': '2025-09-30 22:17:32.618294', 'step': 3244, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:32.668909', 'step': 3244, 'epoch': 2} {'type': 'loss', 'content': 0.003447463968768716, 'timestamp': '2025-09-30 22:17:32.676773', 'step': 3245, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:32.721416', 'step': 3245, 'epoch': 2} {'type': 'loss', 'content': 0.002404881175607443, 'timestamp': '2025-09-30 22:17:32.728481', 'step': 3246, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:32.766877', 'step': 3246, 'epoch': 2} {'type': 'loss', 'content': 0.001494319294579327, 'timestamp': '2025-09-30 22:17:32.777325', 'step': 3247, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:32.814898', 'step': 3247, 'epoch': 2} {'type': 'loss', 'content': 0.0030386210419237614, 'timestamp': '2025-09-30 22:17:32.843751', 'step': 3248, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:32.880359', 'step': 3248, 'epoch': 2} {'type': 'loss', 'content': 0.003679205197840929, 'timestamp': '2025-09-30 22:17:32.892985', 'step': 3249, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:32.929262', 'step': 3249, 'epoch': 2} {'type': 'loss', 'content': 0.012156999669969082, 'timestamp': '2025-09-30 22:17:32.937238', 'step': 3250, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:32.985666', 'step': 3250, 'epoch': 2} {'type': 'loss', 'content': 0.00451328419148922, 'timestamp': '2025-09-30 22:17:32.996745', 'step': 3251, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:33.059606', 'step': 3251, 'epoch': 2} {'type': 'loss', 'content': 0.010425938293337822, 'timestamp': '2025-09-30 22:17:33.093081', 'step': 3252, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:33.129960', 'step': 3252, 'epoch': 2} {'type': 'loss', 'content': 0.009341493248939514, 'timestamp': '2025-09-30 22:17:33.138700', 'step': 3253, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:33.175675', 'step': 3253, 'epoch': 2} {'type': 'loss', 'content': 0.014587215147912502, 'timestamp': '2025-09-30 22:17:33.188089', 'step': 3254, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:33.233720', 'step': 3254, 'epoch': 2} {'type': 'loss', 'content': 0.02171679027378559, 'timestamp': '2025-09-30 22:17:33.241555', 'step': 3255, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:33.290111', 'step': 3255, 'epoch': 2} {'type': 'loss', 'content': 0.024895792827010155, 'timestamp': '2025-09-30 22:17:33.321677', 'step': 3256, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:33.364219', 'step': 3256, 'epoch': 2} {'type': 'loss', 'content': 0.004205517005175352, 'timestamp': '2025-09-30 22:17:33.374857', 'step': 3257, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:33.423616', 'step': 3257, 'epoch': 2} {'type': 'loss', 'content': 0.005508000962436199, 'timestamp': '2025-09-30 22:17:33.435985', 'step': 3258, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:33.495350', 'step': 3258, 'epoch': 2} {'type': 'loss', 'content': 0.0037440420128405094, 'timestamp': '2025-09-30 22:17:33.508684', 'step': 3259, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:33.555723', 'step': 3259, 'epoch': 2} {'type': 'loss', 'content': 0.010127577930688858, 'timestamp': '2025-09-30 22:17:33.587730', 'step': 3260, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:33.639550', 'step': 3260, 'epoch': 2} {'type': 'loss', 'content': 0.003603320801630616, 'timestamp': '2025-09-30 22:17:33.644305', 'step': 3261, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:33.685371', 'step': 3261, 'epoch': 2} {'type': 'loss', 'content': 0.008757252246141434, 'timestamp': '2025-09-30 22:17:33.696481', 'step': 3262, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:33.740643', 'step': 3262, 'epoch': 2} {'type': 'loss', 'content': 0.002329410519450903, 'timestamp': '2025-09-30 22:17:33.749796', 'step': 3263, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:33.795534', 'step': 3263, 'epoch': 2} {'type': 'loss', 'content': 0.007676136679947376, 'timestamp': '2025-09-30 22:17:33.826621', 'step': 3264, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:33.880600', 'step': 3264, 'epoch': 2} {'type': 'loss', 'content': 0.008814936503767967, 'timestamp': '2025-09-30 22:17:33.889485', 'step': 3265, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:33.958757', 'step': 3265, 'epoch': 2} {'type': 'loss', 'content': 0.005114632658660412, 'timestamp': '2025-09-30 22:17:33.966716', 'step': 3266, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:34.004810', 'step': 3266, 'epoch': 2} {'type': 'loss', 'content': 0.008615276776254177, 'timestamp': '2025-09-30 22:17:34.015924', 'step': 3267, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:34.067863', 'step': 3267, 'epoch': 2} {'type': 'loss', 'content': 0.004307322669774294, 'timestamp': '2025-09-30 22:17:34.095939', 'step': 3268, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:34.146349', 'step': 3268, 'epoch': 2} {'type': 'loss', 'content': 0.007383343297988176, 'timestamp': '2025-09-30 22:17:34.151313', 'step': 3269, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:34.185988', 'step': 3269, 'epoch': 2} {'type': 'loss', 'content': 0.0019633248448371887, 'timestamp': '2025-09-30 22:17:34.192853', 'step': 3270, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:17:34.237473', 'step': 3270, 'epoch': 2} {'type': 'loss', 'content': 0.002463744254782796, 'timestamp': '2025-09-30 22:17:34.246953', 'step': 3271, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:34.288958', 'step': 3271, 'epoch': 2} {'type': 'loss', 'content': 0.00518502201884985, 'timestamp': '2025-09-30 22:17:34.320141', 'step': 3272, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:34.362128', 'step': 3272, 'epoch': 2} {'type': 'loss', 'content': 0.011479921638965607, 'timestamp': '2025-09-30 22:17:34.372316', 'step': 3273, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:34.418658', 'step': 3273, 'epoch': 2} {'type': 'loss', 'content': 0.0314481146633625, 'timestamp': '2025-09-30 22:17:34.426482', 'step': 3274, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:17:34.466740', 'step': 3274, 'epoch': 2} {'type': 'loss', 'content': 0.005712313577532768, 'timestamp': '2025-09-30 22:17:34.470856', 'step': 3275, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:34.512323', 'step': 3275, 'epoch': 2} {'type': 'loss', 'content': 0.00974821113049984, 'timestamp': '2025-09-30 22:17:34.540050', 'step': 3276, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:17:34.574917', 'step': 3276, 'epoch': 2} {'type': 'loss', 'content': 0.001180569757707417, 'timestamp': '2025-09-30 22:17:34.578067', 'step': 3277, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:34.618444', 'step': 3277, 'epoch': 2} {'type': 'loss', 'content': 0.0017637682612985373, 'timestamp': '2025-09-30 22:17:34.625588', 'step': 3278, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:34.663061', 'step': 3278, 'epoch': 2} {'type': 'loss', 'content': 0.010607711970806122, 'timestamp': '2025-09-30 22:17:34.671010', 'step': 3279, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:34.708105', 'step': 3279, 'epoch': 2} {'type': 'loss', 'content': 0.007635840680450201, 'timestamp': '2025-09-30 22:17:34.739223', 'step': 3280, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:34.785410', 'step': 3280, 'epoch': 2} {'type': 'loss', 'content': 0.004081249237060547, 'timestamp': '2025-09-30 22:17:34.791094', 'step': 3281, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:34.829227', 'step': 3281, 'epoch': 2} {'type': 'loss', 'content': 0.0059111518785357475, 'timestamp': '2025-09-30 22:17:34.839676', 'step': 3282, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:34.901873', 'step': 3282, 'epoch': 2} {'type': 'loss', 'content': 0.0024872845970094204, 'timestamp': '2025-09-30 22:17:34.912871', 'step': 3283, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:34.950458', 'step': 3283, 'epoch': 2} {'type': 'loss', 'content': 0.004989683162420988, 'timestamp': '2025-09-30 22:17:34.979509', 'step': 3284, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:35.034244', 'step': 3284, 'epoch': 2} {'type': 'loss', 'content': 0.009897114709019661, 'timestamp': '2025-09-30 22:17:35.039439', 'step': 3285, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:35.075416', 'step': 3285, 'epoch': 2} {'type': 'loss', 'content': 0.005108351353555918, 'timestamp': '2025-09-30 22:17:35.083242', 'step': 3286, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:35.128331', 'step': 3286, 'epoch': 2} {'type': 'loss', 'content': 0.005318902898579836, 'timestamp': '2025-09-30 22:17:35.136013', 'step': 3287, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:35.170538', 'step': 3287, 'epoch': 2} {'type': 'loss', 'content': 0.012161512859165668, 'timestamp': '2025-09-30 22:17:35.203664', 'step': 3288, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:35.265403', 'step': 3288, 'epoch': 2} {'type': 'loss', 'content': 0.015478991903364658, 'timestamp': '2025-09-30 22:17:35.270991', 'step': 3289, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:35.304251', 'step': 3289, 'epoch': 2} {'type': 'loss', 'content': 0.0025228005833923817, 'timestamp': '2025-09-30 22:17:35.312193', 'step': 3290, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:35.349958', 'step': 3290, 'epoch': 2} {'type': 'loss', 'content': 0.0016414644196629524, 'timestamp': '2025-09-30 22:17:35.363258', 'step': 3291, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:35.397765', 'step': 3291, 'epoch': 2} {'type': 'loss', 'content': 0.005938539747148752, 'timestamp': '2025-09-30 22:17:35.426600', 'step': 3292, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:35.462537', 'step': 3292, 'epoch': 2} {'type': 'loss', 'content': 0.007894609123468399, 'timestamp': '2025-09-30 22:17:35.471291', 'step': 3293, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:35.516774', 'step': 3293, 'epoch': 2} {'type': 'loss', 'content': 0.0023811052087694407, 'timestamp': '2025-09-30 22:17:35.529340', 'step': 3294, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:35.566487', 'step': 3294, 'epoch': 2} {'type': 'loss', 'content': 0.008082598447799683, 'timestamp': '2025-09-30 22:17:35.579860', 'step': 3295, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:35.626022', 'step': 3295, 'epoch': 2} {'type': 'loss', 'content': 0.005375884938985109, 'timestamp': '2025-09-30 22:17:35.653966', 'step': 3296, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:17:35.696371', 'step': 3296, 'epoch': 2} {'type': 'loss', 'content': 0.00774806085973978, 'timestamp': '2025-09-30 22:17:35.698592', 'step': 3297, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:35.736937', 'step': 3297, 'epoch': 2} {'type': 'loss', 'content': 0.005233556963503361, 'timestamp': '2025-09-30 22:17:35.746878', 'step': 3298, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:35.785245', 'step': 3298, 'epoch': 2} {'type': 'loss', 'content': 0.006909824907779694, 'timestamp': '2025-09-30 22:17:35.793074', 'step': 3299, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:35.829508', 'step': 3299, 'epoch': 2} {'type': 'loss', 'content': 0.01188234519213438, 'timestamp': '2025-09-30 22:17:35.857519', 'step': 3300, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:35.892184', 'step': 3300, 'epoch': 2} {'type': 'loss', 'content': 0.013236339204013348, 'timestamp': '2025-09-30 22:17:35.899979', 'step': 3301, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:35.939161', 'step': 3301, 'epoch': 2} {'type': 'loss', 'content': 0.015004783868789673, 'timestamp': '2025-09-30 22:17:35.949480', 'step': 3302, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:17:35.992587', 'step': 3302, 'epoch': 2} {'type': 'loss', 'content': 0.009418581612408161, 'timestamp': '2025-09-30 22:17:35.995227', 'step': 3303, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:36.054667', 'step': 3303, 'epoch': 2} {'type': 'loss', 'content': 0.001688284333795309, 'timestamp': '2025-09-30 22:17:36.082750', 'step': 3304, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:36.137861', 'step': 3304, 'epoch': 2} {'type': 'loss', 'content': 0.005806570872664452, 'timestamp': '2025-09-30 22:17:36.143385', 'step': 3305, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:36.179188', 'step': 3305, 'epoch': 2} {'type': 'loss', 'content': 0.004986909683793783, 'timestamp': '2025-09-30 22:17:36.186991', 'step': 3306, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:36.223613', 'step': 3306, 'epoch': 2} {'type': 'loss', 'content': 0.010555509477853775, 'timestamp': '2025-09-30 22:17:36.234679', 'step': 3307, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:36.274583', 'step': 3307, 'epoch': 2} {'type': 'loss', 'content': 0.013262015767395496, 'timestamp': '2025-09-30 22:17:36.303415', 'step': 3308, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:36.353681', 'step': 3308, 'epoch': 2} {'type': 'loss', 'content': 0.01061546616256237, 'timestamp': '2025-09-30 22:17:36.363422', 'step': 3309, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:36.403844', 'step': 3309, 'epoch': 2} {'type': 'loss', 'content': 0.0010201644618064165, 'timestamp': '2025-09-30 22:17:36.411451', 'step': 3310, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:36.447811', 'step': 3310, 'epoch': 2} {'type': 'loss', 'content': 0.00881708599627018, 'timestamp': '2025-09-30 22:17:36.460370', 'step': 3311, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:36.495146', 'step': 3311, 'epoch': 2} {'type': 'loss', 'content': 0.010540482588112354, 'timestamp': '2025-09-30 22:17:36.528305', 'step': 3312, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:36.571754', 'step': 3312, 'epoch': 2} {'type': 'loss', 'content': 0.01337836030870676, 'timestamp': '2025-09-30 22:17:36.580569', 'step': 3313, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:36.626113', 'step': 3313, 'epoch': 2} {'type': 'loss', 'content': 0.004954076837748289, 'timestamp': '2025-09-30 22:17:36.637169', 'step': 3314, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:36.673315', 'step': 3314, 'epoch': 2} {'type': 'loss', 'content': 0.00841221772134304, 'timestamp': '2025-09-30 22:17:36.685101', 'step': 3315, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:36.730308', 'step': 3315, 'epoch': 2} {'type': 'loss', 'content': 0.003916066139936447, 'timestamp': '2025-09-30 22:17:36.758607', 'step': 3316, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:36.808811', 'step': 3316, 'epoch': 2} {'type': 'loss', 'content': 0.01567855477333069, 'timestamp': '2025-09-30 22:17:36.813652', 'step': 3317, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:36.858349', 'step': 3317, 'epoch': 2} {'type': 'loss', 'content': 0.004972139373421669, 'timestamp': '2025-09-30 22:17:36.869366', 'step': 3318, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:36.916073', 'step': 3318, 'epoch': 2} {'type': 'loss', 'content': 0.007721779402345419, 'timestamp': '2025-09-30 22:17:36.926473', 'step': 3319, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:36.973344', 'step': 3319, 'epoch': 2} {'type': 'loss', 'content': 0.005826642271131277, 'timestamp': '2025-09-30 22:17:37.006318', 'step': 3320, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:37.050434', 'step': 3320, 'epoch': 2} {'type': 'loss', 'content': 0.004990190267562866, 'timestamp': '2025-09-30 22:17:37.058523', 'step': 3321, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:37.105387', 'step': 3321, 'epoch': 2} {'type': 'loss', 'content': 0.018866149708628654, 'timestamp': '2025-09-30 22:17:37.112991', 'step': 3322, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:37.161559', 'step': 3322, 'epoch': 2} {'type': 'loss', 'content': 0.005869357846677303, 'timestamp': '2025-09-30 22:17:37.169419', 'step': 3323, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:37.225011', 'step': 3323, 'epoch': 2} {'type': 'loss', 'content': 0.003536415984854102, 'timestamp': '2025-09-30 22:17:37.256051', 'step': 3324, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:37.292119', 'step': 3324, 'epoch': 2} {'type': 'loss', 'content': 0.026185350492596626, 'timestamp': '2025-09-30 22:17:37.300786', 'step': 3325, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:37.352448', 'step': 3325, 'epoch': 2} {'type': 'loss', 'content': 0.0053767370991408825, 'timestamp': '2025-09-30 22:17:37.363515', 'step': 3326, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:37.406920', 'step': 3326, 'epoch': 2} {'type': 'loss', 'content': 0.002467031590640545, 'timestamp': '2025-09-30 22:17:37.418091', 'step': 3327, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:37.477631', 'step': 3327, 'epoch': 2} {'type': 'loss', 'content': 0.005745346192270517, 'timestamp': '2025-09-30 22:17:37.512227', 'step': 3328, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:37.562587', 'step': 3328, 'epoch': 2} {'type': 'loss', 'content': 0.015376402996480465, 'timestamp': '2025-09-30 22:17:37.575631', 'step': 3329, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:37.612460', 'step': 3329, 'epoch': 2} {'type': 'loss', 'content': 0.010871312581002712, 'timestamp': '2025-09-30 22:17:37.619553', 'step': 3330, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:37.655803', 'step': 3330, 'epoch': 2} {'type': 'loss', 'content': 0.00525900861248374, 'timestamp': '2025-09-30 22:17:37.668303', 'step': 3331, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:17:37.726102', 'step': 3331, 'epoch': 2} {'type': 'loss', 'content': 0.004298144951462746, 'timestamp': '2025-09-30 22:17:37.764628', 'step': 3332, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:37.801609', 'step': 3332, 'epoch': 2} {'type': 'loss', 'content': 0.01475045271217823, 'timestamp': '2025-09-30 22:17:37.811527', 'step': 3333, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:37.851595', 'step': 3333, 'epoch': 2} {'type': 'loss', 'content': 0.0043676551431417465, 'timestamp': '2025-09-30 22:17:37.863844', 'step': 3334, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:37.909794', 'step': 3334, 'epoch': 2} {'type': 'loss', 'content': 0.006549290381371975, 'timestamp': '2025-09-30 22:17:37.920038', 'step': 3335, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:17:40.628791', 'step': 3335, 'epoch': 2} {'type': 'pplx', 'content': 5.908082136822788, 'timestamp': '2025-09-30 22:17:40.633117', 'step': 3335, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:40.667904', 'step': 3335, 'epoch': 2} {'type': 'loss', 'content': 0.008388176560401917, 'timestamp': '2025-09-30 22:17:40.697972', 'step': 3336, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:40.736012', 'step': 3336, 'epoch': 2} {'type': 'loss', 'content': 0.015180687420070171, 'timestamp': '2025-09-30 22:17:40.742334', 'step': 3337, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:40.780621', 'step': 3337, 'epoch': 2} {'type': 'loss', 'content': 0.008127564564347267, 'timestamp': '2025-09-30 22:17:40.788041', 'step': 3338, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:40.827745', 'step': 3338, 'epoch': 2} {'type': 'loss', 'content': 0.004592224024236202, 'timestamp': '2025-09-30 22:17:40.838664', 'step': 3339, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:40.876597', 'step': 3339, 'epoch': 2} {'type': 'loss', 'content': 0.012107564136385918, 'timestamp': '2025-09-30 22:17:40.907880', 'step': 3340, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:40.944489', 'step': 3340, 'epoch': 2} {'type': 'loss', 'content': 0.0023953975178301334, 'timestamp': '2025-09-30 22:17:40.957634', 'step': 3341, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:40.996945', 'step': 3341, 'epoch': 2} {'type': 'loss', 'content': 0.007857408374547958, 'timestamp': '2025-09-30 22:17:41.004601', 'step': 3342, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:41.042145', 'step': 3342, 'epoch': 2} {'type': 'loss', 'content': 0.014286278747022152, 'timestamp': '2025-09-30 22:17:41.049848', 'step': 3343, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:41.085224', 'step': 3343, 'epoch': 2} {'type': 'loss', 'content': 0.008866170421242714, 'timestamp': '2025-09-30 22:17:41.117114', 'step': 3344, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:41.156705', 'step': 3344, 'epoch': 2} {'type': 'loss', 'content': 0.011578583158552647, 'timestamp': '2025-09-30 22:17:41.165467', 'step': 3345, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:41.205114', 'step': 3345, 'epoch': 2} {'type': 'loss', 'content': 0.002213649218901992, 'timestamp': '2025-09-30 22:17:41.217691', 'step': 3346, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:41.263687', 'step': 3346, 'epoch': 2} {'type': 'loss', 'content': 0.00874285213649273, 'timestamp': '2025-09-30 22:17:41.277019', 'step': 3347, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:41.320364', 'step': 3347, 'epoch': 2} {'type': 'loss', 'content': 0.005169060546904802, 'timestamp': '2025-09-30 22:17:41.351437', 'step': 3348, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:41.391340', 'step': 3348, 'epoch': 2} {'type': 'loss', 'content': 0.0063797226175665855, 'timestamp': '2025-09-30 22:17:41.399993', 'step': 3349, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:41.437305', 'step': 3349, 'epoch': 2} {'type': 'loss', 'content': 0.0072136567905545235, 'timestamp': '2025-09-30 22:17:41.448354', 'step': 3350, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:41.480755', 'step': 3350, 'epoch': 2} {'type': 'loss', 'content': 0.0035704094916582108, 'timestamp': '2025-09-30 22:17:41.487804', 'step': 3351, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:41.524211', 'step': 3351, 'epoch': 2} {'type': 'loss', 'content': 0.0043285614810884, 'timestamp': '2025-09-30 22:17:41.552832', 'step': 3352, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:41.601607', 'step': 3352, 'epoch': 2} {'type': 'loss', 'content': 0.0018978390144184232, 'timestamp': '2025-09-30 22:17:41.614676', 'step': 3353, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:41.647926', 'step': 3353, 'epoch': 2} {'type': 'loss', 'content': 0.009518872015178204, 'timestamp': '2025-09-30 22:17:41.659218', 'step': 3354, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:41.694646', 'step': 3354, 'epoch': 2} {'type': 'loss', 'content': 0.003425286151468754, 'timestamp': '2025-09-30 22:17:41.705715', 'step': 3355, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:41.738857', 'step': 3355, 'epoch': 2} {'type': 'loss', 'content': 0.005201328080147505, 'timestamp': '2025-09-30 22:17:41.770107', 'step': 3356, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:41.804820', 'step': 3356, 'epoch': 2} {'type': 'loss', 'content': 0.003430295269936323, 'timestamp': '2025-09-30 22:17:41.814749', 'step': 3357, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:41.861476', 'step': 3357, 'epoch': 2} {'type': 'loss', 'content': 0.003906135680153966, 'timestamp': '2025-09-30 22:17:41.873690', 'step': 3358, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:41.912285', 'step': 3358, 'epoch': 2} {'type': 'loss', 'content': 0.0035449049901217222, 'timestamp': '2025-09-30 22:17:41.920913', 'step': 3359, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:41.957649', 'step': 3359, 'epoch': 2} {'type': 'loss', 'content': 0.009680942632257938, 'timestamp': '2025-09-30 22:17:41.989504', 'step': 3360, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:42.026858', 'step': 3360, 'epoch': 2} {'type': 'loss', 'content': 0.0037081630434840918, 'timestamp': '2025-09-30 22:17:42.034827', 'step': 3361, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:42.073550', 'step': 3361, 'epoch': 2} {'type': 'loss', 'content': 0.0030928482301533222, 'timestamp': '2025-09-30 22:17:42.081169', 'step': 3362, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:42.114832', 'step': 3362, 'epoch': 2} {'type': 'loss', 'content': 0.0008025756105780602, 'timestamp': '2025-09-30 22:17:42.121925', 'step': 3363, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:42.161439', 'step': 3363, 'epoch': 2} {'type': 'loss', 'content': 0.020035002380609512, 'timestamp': '2025-09-30 22:17:42.190234', 'step': 3364, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:42.225423', 'step': 3364, 'epoch': 2} {'type': 'loss', 'content': 0.0031864179763942957, 'timestamp': '2025-09-30 22:17:42.231052', 'step': 3365, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:42.276076', 'step': 3365, 'epoch': 2} {'type': 'loss', 'content': 0.004433062393218279, 'timestamp': '2025-09-30 22:17:42.288380', 'step': 3366, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:42.329253', 'step': 3366, 'epoch': 2} {'type': 'loss', 'content': 0.0015743272379040718, 'timestamp': '2025-09-30 22:17:42.342599', 'step': 3367, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:42.386007', 'step': 3367, 'epoch': 2} {'type': 'loss', 'content': 0.014348004944622517, 'timestamp': '2025-09-30 22:17:42.414997', 'step': 3368, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:42.458692', 'step': 3368, 'epoch': 2} {'type': 'loss', 'content': 0.018920985981822014, 'timestamp': '2025-09-30 22:17:42.471067', 'step': 3369, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:42.509253', 'step': 3369, 'epoch': 2} {'type': 'loss', 'content': 0.010625313967466354, 'timestamp': '2025-09-30 22:17:42.516494', 'step': 3370, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:42.559117', 'step': 3370, 'epoch': 2} {'type': 'loss', 'content': 0.004841940477490425, 'timestamp': '2025-09-30 22:17:42.570219', 'step': 3371, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:42.618317', 'step': 3371, 'epoch': 2} {'type': 'loss', 'content': 0.011649803258478642, 'timestamp': '2025-09-30 22:17:42.652487', 'step': 3372, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:42.686308', 'step': 3372, 'epoch': 2} {'type': 'loss', 'content': 0.006236497312784195, 'timestamp': '2025-09-30 22:17:42.694305', 'step': 3373, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:42.741176', 'step': 3373, 'epoch': 2} {'type': 'loss', 'content': 0.004768477752804756, 'timestamp': '2025-09-30 22:17:42.752463', 'step': 3374, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:42.790754', 'step': 3374, 'epoch': 2} {'type': 'loss', 'content': 0.0019870330579578876, 'timestamp': '2025-09-30 22:17:42.804464', 'step': 3375, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:42.843290', 'step': 3375, 'epoch': 2} {'type': 'loss', 'content': 0.003379875561222434, 'timestamp': '2025-09-30 22:17:42.876765', 'step': 3376, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:42.916730', 'step': 3376, 'epoch': 2} {'type': 'loss', 'content': 0.005048112478107214, 'timestamp': '2025-09-30 22:17:42.927546', 'step': 3377, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:42.970004', 'step': 3377, 'epoch': 2} {'type': 'loss', 'content': 0.006450078450143337, 'timestamp': '2025-09-30 22:17:42.982568', 'step': 3378, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:43.020617', 'step': 3378, 'epoch': 2} {'type': 'loss', 'content': 0.011793630197644234, 'timestamp': '2025-09-30 22:17:43.031898', 'step': 3379, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:43.065366', 'step': 3379, 'epoch': 2} {'type': 'loss', 'content': 0.0065678758546710014, 'timestamp': '2025-09-30 22:17:43.098885', 'step': 3380, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:43.152071', 'step': 3380, 'epoch': 2} {'type': 'loss', 'content': 0.0019863827619701624, 'timestamp': '2025-09-30 22:17:43.164739', 'step': 3381, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:43.201711', 'step': 3381, 'epoch': 2} {'type': 'loss', 'content': 0.006069038063287735, 'timestamp': '2025-09-30 22:17:43.214045', 'step': 3382, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:43.251080', 'step': 3382, 'epoch': 2} {'type': 'loss', 'content': 0.00993566308170557, 'timestamp': '2025-09-30 22:17:43.263549', 'step': 3383, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:43.304789', 'step': 3383, 'epoch': 2} {'type': 'loss', 'content': 0.004957782104611397, 'timestamp': '2025-09-30 22:17:43.338001', 'step': 3384, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:43.379473', 'step': 3384, 'epoch': 2} {'type': 'loss', 'content': 0.0007455699960701168, 'timestamp': '2025-09-30 22:17:43.387422', 'step': 3385, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:43.427876', 'step': 3385, 'epoch': 2} {'type': 'loss', 'content': 0.0004897300386801362, 'timestamp': '2025-09-30 22:17:43.441570', 'step': 3386, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:43.476274', 'step': 3386, 'epoch': 2} {'type': 'loss', 'content': 0.005194354802370071, 'timestamp': '2025-09-30 22:17:43.488632', 'step': 3387, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:43.530174', 'step': 3387, 'epoch': 2} {'type': 'loss', 'content': 0.0049303011037409306, 'timestamp': '2025-09-30 22:17:43.563369', 'step': 3388, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:43.607448', 'step': 3388, 'epoch': 2} {'type': 'loss', 'content': 0.0001996582723222673, 'timestamp': '2025-09-30 22:17:43.617338', 'step': 3389, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:43.655086', 'step': 3389, 'epoch': 2} {'type': 'loss', 'content': 0.0012007271870970726, 'timestamp': '2025-09-30 22:17:43.668425', 'step': 3390, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:43.712980', 'step': 3390, 'epoch': 2} {'type': 'loss', 'content': 0.011003163643181324, 'timestamp': '2025-09-30 22:17:43.725312', 'step': 3391, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:43.758097', 'step': 3391, 'epoch': 2} {'type': 'loss', 'content': 0.0011313065188005567, 'timestamp': '2025-09-30 22:17:43.791422', 'step': 3392, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:43.826884', 'step': 3392, 'epoch': 2} {'type': 'loss', 'content': 0.00030222817440517247, 'timestamp': '2025-09-30 22:17:43.839562', 'step': 3393, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:43.879451', 'step': 3393, 'epoch': 2} {'type': 'loss', 'content': 0.015398084186017513, 'timestamp': '2025-09-30 22:17:43.892044', 'step': 3394, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:43.930176', 'step': 3394, 'epoch': 2} {'type': 'loss', 'content': 0.0030468441545963287, 'timestamp': '2025-09-30 22:17:43.941370', 'step': 3395, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:43.992633', 'step': 3395, 'epoch': 2} {'type': 'loss', 'content': 0.013181681744754314, 'timestamp': '2025-09-30 22:17:44.026950', 'step': 3396, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:44.061030', 'step': 3396, 'epoch': 2} {'type': 'loss', 'content': 0.0019466944504529238, 'timestamp': '2025-09-30 22:17:44.071857', 'step': 3397, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:44.114840', 'step': 3397, 'epoch': 2} {'type': 'loss', 'content': 0.014047914184629917, 'timestamp': '2025-09-30 22:17:44.125973', 'step': 3398, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:44.168860', 'step': 3398, 'epoch': 2} {'type': 'loss', 'content': 0.00035534953349269927, 'timestamp': '2025-09-30 22:17:44.182279', 'step': 3399, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:44.220083', 'step': 3399, 'epoch': 2} {'type': 'loss', 'content': 0.000862656335812062, 'timestamp': '2025-09-30 22:17:44.249110', 'step': 3400, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:44.297001', 'step': 3400, 'epoch': 2} {'type': 'loss', 'content': 0.0008597745327278972, 'timestamp': '2025-09-30 22:17:44.305138', 'step': 3401, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:44.340173', 'step': 3401, 'epoch': 2} {'type': 'loss', 'content': 0.0005968649056740105, 'timestamp': '2025-09-30 22:17:44.351254', 'step': 3402, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:44.397077', 'step': 3402, 'epoch': 2} {'type': 'loss', 'content': 0.00504825497046113, 'timestamp': '2025-09-30 22:17:44.410822', 'step': 3403, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:44.445798', 'step': 3403, 'epoch': 2} {'type': 'loss', 'content': 0.000195363987586461, 'timestamp': '2025-09-30 22:17:44.474648', 'step': 3404, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:44.506608', 'step': 3404, 'epoch': 2} {'type': 'loss', 'content': 0.0005043658311478794, 'timestamp': '2025-09-30 22:17:44.514840', 'step': 3405, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:17:44.552532', 'step': 3405, 'epoch': 2} {'type': 'loss', 'content': 0.0029634914826601744, 'timestamp': '2025-09-30 22:17:44.566580', 'step': 3406, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:44.611334', 'step': 3406, 'epoch': 2} {'type': 'loss', 'content': 0.002190728671848774, 'timestamp': '2025-09-30 22:17:44.622488', 'step': 3407, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:44.665511', 'step': 3407, 'epoch': 2} {'type': 'loss', 'content': 0.000769341888371855, 'timestamp': '2025-09-30 22:17:44.696718', 'step': 3408, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:44.733529', 'step': 3408, 'epoch': 2} {'type': 'loss', 'content': 0.0019631364848464727, 'timestamp': '2025-09-30 22:17:44.742439', 'step': 3409, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:44.780362', 'step': 3409, 'epoch': 2} {'type': 'loss', 'content': 0.012203425168991089, 'timestamp': '2025-09-30 22:17:44.792794', 'step': 3410, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:44.841594', 'step': 3410, 'epoch': 2} {'type': 'loss', 'content': 0.0056638531386852264, 'timestamp': '2025-09-30 22:17:44.855016', 'step': 3411, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:44.895881', 'step': 3411, 'epoch': 2} {'type': 'loss', 'content': 0.014068983495235443, 'timestamp': '2025-09-30 22:17:44.927391', 'step': 3412, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:44.960622', 'step': 3412, 'epoch': 2} {'type': 'loss', 'content': 0.0052124448120594025, 'timestamp': '2025-09-30 22:17:44.968566', 'step': 3413, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:45.007118', 'step': 3413, 'epoch': 2} {'type': 'loss', 'content': 0.003837403142824769, 'timestamp': '2025-09-30 22:17:45.019580', 'step': 3414, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:45.058590', 'step': 3414, 'epoch': 2} {'type': 'loss', 'content': 0.006270041223615408, 'timestamp': '2025-09-30 22:17:45.069847', 'step': 3415, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:45.103550', 'step': 3415, 'epoch': 2} {'type': 'loss', 'content': 0.027436494827270508, 'timestamp': '2025-09-30 22:17:45.136973', 'step': 3416, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:45.171601', 'step': 3416, 'epoch': 2} {'type': 'loss', 'content': 0.0063796150498092175, 'timestamp': '2025-09-30 22:17:45.182102', 'step': 3417, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:45.221814', 'step': 3417, 'epoch': 2} {'type': 'loss', 'content': 0.002977534895762801, 'timestamp': '2025-09-30 22:17:45.234186', 'step': 3418, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:45.276203', 'step': 3418, 'epoch': 2} {'type': 'loss', 'content': 0.008533847518265247, 'timestamp': '2025-09-30 22:17:45.288575', 'step': 3419, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:45.333244', 'step': 3419, 'epoch': 2} {'type': 'loss', 'content': 0.003514854470267892, 'timestamp': '2025-09-30 22:17:45.367791', 'step': 3420, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:45.404767', 'step': 3420, 'epoch': 2} {'type': 'loss', 'content': 0.0006031044758856297, 'timestamp': '2025-09-30 22:17:45.417435', 'step': 3421, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:17:45.475120', 'step': 3421, 'epoch': 2} {'type': 'loss', 'content': 0.0008695040596649051, 'timestamp': '2025-09-30 22:17:45.491020', 'step': 3422, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:17:45.539413', 'step': 3422, 'epoch': 2} {'type': 'loss', 'content': 0.0012744531268253922, 'timestamp': '2025-09-30 22:17:45.555012', 'step': 3423, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:45.592006', 'step': 3423, 'epoch': 2} {'type': 'loss', 'content': 0.0037018214352428913, 'timestamp': '2025-09-30 22:17:45.626561', 'step': 3424, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:45.669164', 'step': 3424, 'epoch': 2} {'type': 'loss', 'content': 0.01598265767097473, 'timestamp': '2025-09-30 22:17:45.678089', 'step': 3425, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:45.711075', 'step': 3425, 'epoch': 2} {'type': 'loss', 'content': 0.010136940516531467, 'timestamp': '2025-09-30 22:17:45.722389', 'step': 3426, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:45.762026', 'step': 3426, 'epoch': 2} {'type': 'loss', 'content': 0.01466772984713316, 'timestamp': '2025-09-30 22:17:45.772482', 'step': 3427, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:45.814834', 'step': 3427, 'epoch': 2} {'type': 'loss', 'content': 0.008609617128968239, 'timestamp': '2025-09-30 22:17:45.848261', 'step': 3428, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:45.888198', 'step': 3428, 'epoch': 2} {'type': 'loss', 'content': 0.010983388870954514, 'timestamp': '2025-09-30 22:17:45.897285', 'step': 3429, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:45.932513', 'step': 3429, 'epoch': 2} {'type': 'loss', 'content': 0.0038160153198987246, 'timestamp': '2025-09-30 22:17:45.945009', 'step': 3430, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:45.979710', 'step': 3430, 'epoch': 2} {'type': 'loss', 'content': 0.014309203252196312, 'timestamp': '2025-09-30 22:17:45.990345', 'step': 3431, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:46.028259', 'step': 3431, 'epoch': 2} {'type': 'loss', 'content': 0.003925409633666277, 'timestamp': '2025-09-30 22:17:46.061632', 'step': 3432, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:46.102008', 'step': 3432, 'epoch': 2} {'type': 'loss', 'content': 0.005317303352057934, 'timestamp': '2025-09-30 22:17:46.114646', 'step': 3433, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:46.150606', 'step': 3433, 'epoch': 2} {'type': 'loss', 'content': 0.011380461975932121, 'timestamp': '2025-09-30 22:17:46.161786', 'step': 3434, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:46.196231', 'step': 3434, 'epoch': 2} {'type': 'loss', 'content': 0.002438656520098448, 'timestamp': '2025-09-30 22:17:46.203912', 'step': 3435, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:46.244673', 'step': 3435, 'epoch': 2} {'type': 'loss', 'content': 0.002902389271184802, 'timestamp': '2025-09-30 22:17:46.275794', 'step': 3436, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:46.308790', 'step': 3436, 'epoch': 2} {'type': 'loss', 'content': 0.005086651537567377, 'timestamp': '2025-09-30 22:17:46.317639', 'step': 3437, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:46.356635', 'step': 3437, 'epoch': 2} {'type': 'loss', 'content': 0.0034991842694580555, 'timestamp': '2025-09-30 22:17:46.369137', 'step': 3438, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:46.416681', 'step': 3438, 'epoch': 2} {'type': 'loss', 'content': 0.008994112722575665, 'timestamp': '2025-09-30 22:17:46.430526', 'step': 3439, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:46.469464', 'step': 3439, 'epoch': 2} {'type': 'loss', 'content': 0.0012463306775316596, 'timestamp': '2025-09-30 22:17:46.498898', 'step': 3440, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:46.533581', 'step': 3440, 'epoch': 2} {'type': 'loss', 'content': 0.002525878604501486, 'timestamp': '2025-09-30 22:17:46.543583', 'step': 3441, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:46.587531', 'step': 3441, 'epoch': 2} {'type': 'loss', 'content': 0.003564007580280304, 'timestamp': '2025-09-30 22:17:46.601407', 'step': 3442, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:46.638808', 'step': 3442, 'epoch': 2} {'type': 'loss', 'content': 0.005786378402262926, 'timestamp': '2025-09-30 22:17:46.649344', 'step': 3443, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:46.685036', 'step': 3443, 'epoch': 2} {'type': 'loss', 'content': 0.005785984918475151, 'timestamp': '2025-09-30 22:17:46.718258', 'step': 3444, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:17:46.765835', 'step': 3444, 'epoch': 2} {'type': 'loss', 'content': 0.0027834847569465637, 'timestamp': '2025-09-30 22:17:46.779153', 'step': 3445, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:46.818020', 'step': 3445, 'epoch': 2} {'type': 'loss', 'content': 0.01306731253862381, 'timestamp': '2025-09-30 22:17:46.830596', 'step': 3446, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:46.868195', 'step': 3446, 'epoch': 2} {'type': 'loss', 'content': 0.0030403793789446354, 'timestamp': '2025-09-30 22:17:46.881634', 'step': 3447, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:46.916669', 'step': 3447, 'epoch': 2} {'type': 'loss', 'content': 0.004225566517561674, 'timestamp': '2025-09-30 22:17:46.950929', 'step': 3448, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:46.990716', 'step': 3448, 'epoch': 2} {'type': 'loss', 'content': 0.013057212345302105, 'timestamp': '2025-09-30 22:17:47.001354', 'step': 3449, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:47.040337', 'step': 3449, 'epoch': 2} {'type': 'loss', 'content': 0.009223368018865585, 'timestamp': '2025-09-30 22:17:47.054046', 'step': 3450, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:17:49.542905', 'step': 3450, 'epoch': 2} {'type': 'pplx', 'content': 6.117527749085999, 'timestamp': '2025-09-30 22:17:49.547521', 'step': 3450, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:49.583827', 'step': 3450, 'epoch': 2} {'type': 'loss', 'content': 0.019778696820139885, 'timestamp': '2025-09-30 22:17:49.597174', 'step': 3451, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:49.631448', 'step': 3451, 'epoch': 2} {'type': 'loss', 'content': 0.00811647716909647, 'timestamp': '2025-09-30 22:17:49.664475', 'step': 3452, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:49.699916', 'step': 3452, 'epoch': 2} {'type': 'loss', 'content': 0.008224857039749622, 'timestamp': '2025-09-30 22:17:49.707212', 'step': 3453, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:49.750293', 'step': 3453, 'epoch': 2} {'type': 'loss', 'content': 0.013419202528893948, 'timestamp': '2025-09-30 22:17:49.761291', 'step': 3454, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:49.793686', 'step': 3454, 'epoch': 2} {'type': 'loss', 'content': 0.010387551970779896, 'timestamp': '2025-09-30 22:17:49.806297', 'step': 3455, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:17:49.849767', 'step': 3455, 'epoch': 2} {'type': 'loss', 'content': 0.004591148346662521, 'timestamp': '2025-09-30 22:17:49.886823', 'step': 3456, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:49.933266', 'step': 3456, 'epoch': 2} {'type': 'loss', 'content': 0.00521214259788394, 'timestamp': '2025-09-30 22:17:49.945955', 'step': 3457, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:17:49.992345', 'step': 3457, 'epoch': 2} {'type': 'loss', 'content': 0.0060057747177779675, 'timestamp': '2025-09-30 22:17:50.008046', 'step': 3458, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:50.067097', 'step': 3458, 'epoch': 2} {'type': 'loss', 'content': 0.006628590170294046, 'timestamp': '2025-09-30 22:17:50.079340', 'step': 3459, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:50.113658', 'step': 3459, 'epoch': 2} {'type': 'loss', 'content': 0.006081595551222563, 'timestamp': '2025-09-30 22:17:50.147100', 'step': 3460, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:50.184360', 'step': 3460, 'epoch': 2} {'type': 'loss', 'content': 0.006910016760230064, 'timestamp': '2025-09-30 22:17:50.196159', 'step': 3461, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:50.241555', 'step': 3461, 'epoch': 2} {'type': 'loss', 'content': 0.013972374610602856, 'timestamp': '2025-09-30 22:17:50.253865', 'step': 3462, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:50.294472', 'step': 3462, 'epoch': 2} {'type': 'loss', 'content': 0.003051967127248645, 'timestamp': '2025-09-30 22:17:50.308247', 'step': 3463, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:17:50.340592', 'step': 3463, 'epoch': 2} {'type': 'loss', 'content': 0.003830001689493656, 'timestamp': '2025-09-30 22:17:50.366939', 'step': 3464, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:17:50.419176', 'step': 3464, 'epoch': 2} {'type': 'loss', 'content': 0.014183532446622849, 'timestamp': '2025-09-30 22:17:50.434285', 'step': 3465, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:17:50.484990', 'step': 3465, 'epoch': 2} {'type': 'loss', 'content': 0.01136015821248293, 'timestamp': '2025-09-30 22:17:50.489566', 'step': 3466, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:17:50.532623', 'step': 3466, 'epoch': 2} {'type': 'loss', 'content': 0.00248980475589633, 'timestamp': '2025-09-30 22:17:50.548728', 'step': 3467, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:50.590292', 'step': 3467, 'epoch': 2} {'type': 'loss', 'content': 0.006607360672205687, 'timestamp': '2025-09-30 22:17:50.621798', 'step': 3468, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:50.661099', 'step': 3468, 'epoch': 2} {'type': 'loss', 'content': 0.0017212983220815659, 'timestamp': '2025-09-30 22:17:50.669748', 'step': 3469, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:50.715904', 'step': 3469, 'epoch': 2} {'type': 'loss', 'content': 0.011916225776076317, 'timestamp': '2025-09-30 22:17:50.726974', 'step': 3470, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:50.761985', 'step': 3470, 'epoch': 2} {'type': 'loss', 'content': 0.007234348449856043, 'timestamp': '2025-09-30 22:17:50.772521', 'step': 3471, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:50.812777', 'step': 3471, 'epoch': 2} {'type': 'loss', 'content': 0.006829807534813881, 'timestamp': '2025-09-30 22:17:50.845975', 'step': 3472, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:50.882281', 'step': 3472, 'epoch': 2} {'type': 'loss', 'content': 0.005056047346442938, 'timestamp': '2025-09-30 22:17:50.887237', 'step': 3473, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:50.920750', 'step': 3473, 'epoch': 2} {'type': 'loss', 'content': 0.0025959117338061333, 'timestamp': '2025-09-30 22:17:50.932979', 'step': 3474, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:50.974300', 'step': 3474, 'epoch': 2} {'type': 'loss', 'content': 0.001975255785509944, 'timestamp': '2025-09-30 22:17:50.985368', 'step': 3475, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:51.025023', 'step': 3475, 'epoch': 2} {'type': 'loss', 'content': 0.01245441660284996, 'timestamp': '2025-09-30 22:17:51.054026', 'step': 3476, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:17:51.089583', 'step': 3476, 'epoch': 2} {'type': 'loss', 'content': 0.029889047145843506, 'timestamp': '2025-09-30 22:17:51.098786', 'step': 3477, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:51.134470', 'step': 3477, 'epoch': 2} {'type': 'loss', 'content': 0.007431174162775278, 'timestamp': '2025-09-30 22:17:51.146807', 'step': 3478, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:17:51.195084', 'step': 3478, 'epoch': 2} {'type': 'loss', 'content': 0.005341086536645889, 'timestamp': '2025-09-30 22:17:51.211051', 'step': 3479, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:17:51.254072', 'step': 3479, 'epoch': 2} {'type': 'loss', 'content': 0.004321379121392965, 'timestamp': '2025-09-30 22:17:51.282018', 'step': 3480, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:51.315880', 'step': 3480, 'epoch': 2} {'type': 'loss', 'content': 0.002522587776184082, 'timestamp': '2025-09-30 22:17:51.326216', 'step': 3481, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:51.362293', 'step': 3481, 'epoch': 2} {'type': 'loss', 'content': 0.016913603991270065, 'timestamp': '2025-09-30 22:17:51.374628', 'step': 3482, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:51.408980', 'step': 3482, 'epoch': 2} {'type': 'loss', 'content': 0.015241632238030434, 'timestamp': '2025-09-30 22:17:51.419367', 'step': 3483, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:51.467287', 'step': 3483, 'epoch': 2} {'type': 'loss', 'content': 0.0023043362889438868, 'timestamp': '2025-09-30 22:17:51.502006', 'step': 3484, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:51.536552', 'step': 3484, 'epoch': 2} {'type': 'loss', 'content': 0.014905478805303574, 'timestamp': '2025-09-30 22:17:51.545160', 'step': 3485, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:51.579022', 'step': 3485, 'epoch': 2} {'type': 'loss', 'content': 0.004559694789350033, 'timestamp': '2025-09-30 22:17:51.591393', 'step': 3486, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:51.625234', 'step': 3486, 'epoch': 2} {'type': 'loss', 'content': 0.023312093690037727, 'timestamp': '2025-09-30 22:17:51.637802', 'step': 3487, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:51.680058', 'step': 3487, 'epoch': 2} {'type': 'loss', 'content': 0.00656821159645915, 'timestamp': '2025-09-30 22:17:51.713274', 'step': 3488, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:51.748771', 'step': 3488, 'epoch': 2} {'type': 'loss', 'content': 0.00758517486974597, 'timestamp': '2025-09-30 22:17:51.761837', 'step': 3489, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:51.800343', 'step': 3489, 'epoch': 2} {'type': 'loss', 'content': 0.004730802029371262, 'timestamp': '2025-09-30 22:17:51.813728', 'step': 3490, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:51.850002', 'step': 3490, 'epoch': 2} {'type': 'loss', 'content': 0.00482148164883256, 'timestamp': '2025-09-30 22:17:51.863406', 'step': 3491, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:51.900098', 'step': 3491, 'epoch': 2} {'type': 'loss', 'content': 0.007809313479810953, 'timestamp': '2025-09-30 22:17:51.933344', 'step': 3492, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:51.968297', 'step': 3492, 'epoch': 2} {'type': 'loss', 'content': 0.0036748633719980717, 'timestamp': '2025-09-30 22:17:51.978398', 'step': 3493, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:52.011884', 'step': 3493, 'epoch': 2} {'type': 'loss', 'content': 0.008129787631332874, 'timestamp': '2025-09-30 22:17:52.022186', 'step': 3494, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:52.059863', 'step': 3494, 'epoch': 2} {'type': 'loss', 'content': 0.007218873593956232, 'timestamp': '2025-09-30 22:17:52.067457', 'step': 3495, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:52.105686', 'step': 3495, 'epoch': 2} {'type': 'loss', 'content': 0.012301747687160969, 'timestamp': '2025-09-30 22:17:52.136866', 'step': 3496, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:52.171717', 'step': 3496, 'epoch': 2} {'type': 'loss', 'content': 0.003263867227360606, 'timestamp': '2025-09-30 22:17:52.184775', 'step': 3497, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:52.218457', 'step': 3497, 'epoch': 2} {'type': 'loss', 'content': 0.012883387506008148, 'timestamp': '2025-09-30 22:17:52.225621', 'step': 3498, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:52.263542', 'step': 3498, 'epoch': 2} {'type': 'loss', 'content': 0.011431973427534103, 'timestamp': '2025-09-30 22:17:52.277252', 'step': 3499, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:52.310154', 'step': 3499, 'epoch': 2} {'type': 'loss', 'content': 0.006922499742358923, 'timestamp': '2025-09-30 22:17:52.338898', 'step': 3500, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 3500', 'timestamp': '2025-09-30 22:17:57.308118', 'step': 3500, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:57.353016', 'step': 3500, 'epoch': 2} {'type': 'loss', 'content': 0.0016732515068724751, 'timestamp': '2025-09-30 22:17:57.359525', 'step': 3501, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:57.391776', 'step': 3501, 'epoch': 2} {'type': 'loss', 'content': 0.0008775260648690164, 'timestamp': '2025-09-30 22:17:57.404037', 'step': 3502, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:57.438395', 'step': 3502, 'epoch': 2} {'type': 'loss', 'content': 0.0034057104494422674, 'timestamp': '2025-09-30 22:17:57.445586', 'step': 3503, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:57.483387', 'step': 3503, 'epoch': 2} {'type': 'loss', 'content': 0.0009829505579546094, 'timestamp': '2025-09-30 22:17:57.517367', 'step': 3504, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:57.562341', 'step': 3504, 'epoch': 2} {'type': 'loss', 'content': 0.014472831040620804, 'timestamp': '2025-09-30 22:17:57.571036', 'step': 3505, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:57.603373', 'step': 3505, 'epoch': 2} {'type': 'loss', 'content': 0.006887474562972784, 'timestamp': '2025-09-30 22:17:57.611385', 'step': 3506, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:57.645486', 'step': 3506, 'epoch': 2} {'type': 'loss', 'content': 0.006549729034304619, 'timestamp': '2025-09-30 22:17:57.655734', 'step': 3507, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:57.695999', 'step': 3507, 'epoch': 2} {'type': 'loss', 'content': 0.02756788209080696, 'timestamp': '2025-09-30 22:17:57.724837', 'step': 3508, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:57.762996', 'step': 3508, 'epoch': 2} {'type': 'loss', 'content': 0.010234571993350983, 'timestamp': '2025-09-30 22:17:57.773464', 'step': 3509, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:57.818115', 'step': 3509, 'epoch': 2} {'type': 'loss', 'content': 0.002638920210301876, 'timestamp': '2025-09-30 22:17:57.831508', 'step': 3510, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:57.874789', 'step': 3510, 'epoch': 2} {'type': 'loss', 'content': 0.006166706793010235, 'timestamp': '2025-09-30 22:17:57.888566', 'step': 3511, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:17:57.937779', 'step': 3511, 'epoch': 2} {'type': 'loss', 'content': 0.006264224648475647, 'timestamp': '2025-09-30 22:17:57.974840', 'step': 3512, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:58.017635', 'step': 3512, 'epoch': 2} {'type': 'loss', 'content': 0.008886733092367649, 'timestamp': '2025-09-30 22:17:58.030306', 'step': 3513, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:58.068493', 'step': 3513, 'epoch': 2} {'type': 'loss', 'content': 0.012475165538489819, 'timestamp': '2025-09-30 22:17:58.081059', 'step': 3514, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:58.121910', 'step': 3514, 'epoch': 2} {'type': 'loss', 'content': 0.004459563177078962, 'timestamp': '2025-09-30 22:17:58.132995', 'step': 3515, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:58.188590', 'step': 3515, 'epoch': 2} {'type': 'loss', 'content': 0.005173417739570141, 'timestamp': '2025-09-30 22:17:58.223207', 'step': 3516, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:17:58.260455', 'step': 3516, 'epoch': 2} {'type': 'loss', 'content': 0.011100078001618385, 'timestamp': '2025-09-30 22:17:58.273770', 'step': 3517, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:58.316037', 'step': 3517, 'epoch': 2} {'type': 'loss', 'content': 0.0042218053713440895, 'timestamp': '2025-09-30 22:17:58.324028', 'step': 3518, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:58.361455', 'step': 3518, 'epoch': 2} {'type': 'loss', 'content': 0.002848744625225663, 'timestamp': '2025-09-30 22:17:58.374025', 'step': 3519, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:58.411963', 'step': 3519, 'epoch': 2} {'type': 'loss', 'content': 0.005674897227436304, 'timestamp': '2025-09-30 22:17:58.440795', 'step': 3520, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:17:58.475140', 'step': 3520, 'epoch': 2} {'type': 'loss', 'content': 0.0039841290563344955, 'timestamp': '2025-09-30 22:17:58.480054', 'step': 3521, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:58.512964', 'step': 3521, 'epoch': 2} {'type': 'loss', 'content': 0.006728755310177803, 'timestamp': '2025-09-30 22:17:58.526478', 'step': 3522, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:58.566534', 'step': 3522, 'epoch': 2} {'type': 'loss', 'content': 0.008942547254264355, 'timestamp': '2025-09-30 22:17:58.576948', 'step': 3523, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:17:58.609868', 'step': 3523, 'epoch': 2} {'type': 'loss', 'content': 0.006240403279662132, 'timestamp': '2025-09-30 22:17:58.638372', 'step': 3524, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:58.671331', 'step': 3524, 'epoch': 2} {'type': 'loss', 'content': 0.004982698708772659, 'timestamp': '2025-09-30 22:17:58.682350', 'step': 3525, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:17:58.735511', 'step': 3525, 'epoch': 2} {'type': 'loss', 'content': 0.005394322331994772, 'timestamp': '2025-09-30 22:17:58.752678', 'step': 3526, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:17:58.797192', 'step': 3526, 'epoch': 2} {'type': 'loss', 'content': 0.012957288883626461, 'timestamp': '2025-09-30 22:17:58.813316', 'step': 3527, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:58.849195', 'step': 3527, 'epoch': 2} {'type': 'loss', 'content': 0.015166162513196468, 'timestamp': '2025-09-30 22:17:58.883382', 'step': 3528, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:58.921823', 'step': 3528, 'epoch': 2} {'type': 'loss', 'content': 0.00798399280756712, 'timestamp': '2025-09-30 22:17:58.931785', 'step': 3529, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:58.978640', 'step': 3529, 'epoch': 2} {'type': 'loss', 'content': 0.012991075403988361, 'timestamp': '2025-09-30 22:17:58.992061', 'step': 3530, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:59.030718', 'step': 3530, 'epoch': 2} {'type': 'loss', 'content': 0.008770058862864971, 'timestamp': '2025-09-30 22:17:59.044061', 'step': 3531, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:59.087615', 'step': 3531, 'epoch': 2} {'type': 'loss', 'content': 0.011841779574751854, 'timestamp': '2025-09-30 22:17:59.121902', 'step': 3532, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:17:59.160113', 'step': 3532, 'epoch': 2} {'type': 'loss', 'content': 0.00944326352328062, 'timestamp': '2025-09-30 22:17:59.173289', 'step': 3533, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:59.206108', 'step': 3533, 'epoch': 2} {'type': 'loss', 'content': 0.005587635096162558, 'timestamp': '2025-09-30 22:17:59.216373', 'step': 3534, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:59.254155', 'step': 3534, 'epoch': 2} {'type': 'loss', 'content': 0.010135992430150509, 'timestamp': '2025-09-30 22:17:59.267917', 'step': 3535, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:59.311584', 'step': 3535, 'epoch': 2} {'type': 'loss', 'content': 0.010656927712261677, 'timestamp': '2025-09-30 22:17:59.344817', 'step': 3536, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:17:59.387294', 'step': 3536, 'epoch': 2} {'type': 'loss', 'content': 0.005624601151794195, 'timestamp': '2025-09-30 22:17:59.400338', 'step': 3537, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:59.448040', 'step': 3537, 'epoch': 2} {'type': 'loss', 'content': 0.004959089681506157, 'timestamp': '2025-09-30 22:17:59.460572', 'step': 3538, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:17:59.495632', 'step': 3538, 'epoch': 2} {'type': 'loss', 'content': 0.007135627791285515, 'timestamp': '2025-09-30 22:17:59.503316', 'step': 3539, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:17:59.541476', 'step': 3539, 'epoch': 2} {'type': 'loss', 'content': 0.009773723781108856, 'timestamp': '2025-09-30 22:17:59.575695', 'step': 3540, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:17:59.617889', 'step': 3540, 'epoch': 2} {'type': 'loss', 'content': 0.0032830105628818274, 'timestamp': '2025-09-30 22:17:59.633062', 'step': 3541, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:59.671348', 'step': 3541, 'epoch': 2} {'type': 'loss', 'content': 0.01609373278915882, 'timestamp': '2025-09-30 22:17:59.683650', 'step': 3542, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:17:59.721876', 'step': 3542, 'epoch': 2} {'type': 'loss', 'content': 0.0061156414449214935, 'timestamp': '2025-09-30 22:17:59.734455', 'step': 3543, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:17:59.769153', 'step': 3543, 'epoch': 2} {'type': 'loss', 'content': 0.0034365863539278507, 'timestamp': '2025-09-30 22:17:59.802349', 'step': 3544, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:59.835512', 'step': 3544, 'epoch': 2} {'type': 'loss', 'content': 0.005177946295589209, 'timestamp': '2025-09-30 22:17:59.844203', 'step': 3545, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:17:59.880452', 'step': 3545, 'epoch': 2} {'type': 'loss', 'content': 0.00885400827974081, 'timestamp': '2025-09-30 22:17:59.891367', 'step': 3546, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:59.934709', 'step': 3546, 'epoch': 2} {'type': 'loss', 'content': 0.00847975630313158, 'timestamp': '2025-09-30 22:17:59.945110', 'step': 3547, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:17:59.994731', 'step': 3547, 'epoch': 2} {'type': 'loss', 'content': 0.005916960071772337, 'timestamp': '2025-09-30 22:18:00.025886', 'step': 3548, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:00.063673', 'step': 3548, 'epoch': 2} {'type': 'loss', 'content': 0.0055039809085428715, 'timestamp': '2025-09-30 22:18:00.072294', 'step': 3549, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:00.115768', 'step': 3549, 'epoch': 2} {'type': 'loss', 'content': 0.005276626441627741, 'timestamp': '2025-09-30 22:18:00.126774', 'step': 3550, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:00.168581', 'step': 3550, 'epoch': 2} {'type': 'loss', 'content': 0.0064256805926561356, 'timestamp': '2025-09-30 22:18:00.176222', 'step': 3551, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:00.219163', 'step': 3551, 'epoch': 2} {'type': 'loss', 'content': 0.011961769312620163, 'timestamp': '2025-09-30 22:18:00.253763', 'step': 3552, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:00.298773', 'step': 3552, 'epoch': 2} {'type': 'loss', 'content': 0.0013833267148584127, 'timestamp': '2025-09-30 22:18:00.311767', 'step': 3553, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:00.345731', 'step': 3553, 'epoch': 2} {'type': 'loss', 'content': 0.0023183198645710945, 'timestamp': '2025-09-30 22:18:00.358288', 'step': 3554, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:00.393100', 'step': 3554, 'epoch': 2} {'type': 'loss', 'content': 0.009293297305703163, 'timestamp': '2025-09-30 22:18:00.403428', 'step': 3555, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:00.444093', 'step': 3555, 'epoch': 2} {'type': 'loss', 'content': 0.004751947708427906, 'timestamp': '2025-09-30 22:18:00.486685', 'step': 3556, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:00.533955', 'step': 3556, 'epoch': 2} {'type': 'loss', 'content': 0.005438275169581175, 'timestamp': '2025-09-30 22:18:00.547006', 'step': 3557, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:00.586092', 'step': 3557, 'epoch': 2} {'type': 'loss', 'content': 0.0027588580269366503, 'timestamp': '2025-09-30 22:18:00.599803', 'step': 3558, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:18:00.648346', 'step': 3558, 'epoch': 2} {'type': 'loss', 'content': 0.003551148111000657, 'timestamp': '2025-09-30 22:18:00.664703', 'step': 3559, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:00.701927', 'step': 3559, 'epoch': 2} {'type': 'loss', 'content': 0.005467879585921764, 'timestamp': '2025-09-30 22:18:00.734030', 'step': 3560, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:00.780843', 'step': 3560, 'epoch': 2} {'type': 'loss', 'content': 0.0032824459485709667, 'timestamp': '2025-09-30 22:18:00.790777', 'step': 3561, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:00.825505', 'step': 3561, 'epoch': 2} {'type': 'loss', 'content': 0.006720301229506731, 'timestamp': '2025-09-30 22:18:00.838056', 'step': 3562, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:00.891753', 'step': 3562, 'epoch': 2} {'type': 'loss', 'content': 0.004803223069757223, 'timestamp': '2025-09-30 22:18:00.902055', 'step': 3563, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:00.939996', 'step': 3563, 'epoch': 2} {'type': 'loss', 'content': 0.010715251788496971, 'timestamp': '2025-09-30 22:18:00.973434', 'step': 3564, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:01.014551', 'step': 3564, 'epoch': 2} {'type': 'loss', 'content': 0.01167300995439291, 'timestamp': '2025-09-30 22:18:01.023360', 'step': 3565, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:18:03.579866', 'step': 3565, 'epoch': 2} {'type': 'pplx', 'content': 5.9511250598265555, 'timestamp': '2025-09-30 22:18:03.587462', 'step': 3565, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:03.626031', 'step': 3565, 'epoch': 2} {'type': 'loss', 'content': 0.008313131518661976, 'timestamp': '2025-09-30 22:18:03.636593', 'step': 3566, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:03.676883', 'step': 3566, 'epoch': 2} {'type': 'loss', 'content': 0.00039068798650987446, 'timestamp': '2025-09-30 22:18:03.689479', 'step': 3567, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:03.725437', 'step': 3567, 'epoch': 2} {'type': 'loss', 'content': 0.0022331487853080034, 'timestamp': '2025-09-30 22:18:03.758546', 'step': 3568, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:03.799546', 'step': 3568, 'epoch': 2} {'type': 'loss', 'content': 0.0064695486798882484, 'timestamp': '2025-09-30 22:18:03.805245', 'step': 3569, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:03.847191', 'step': 3569, 'epoch': 2} {'type': 'loss', 'content': 0.005519408266991377, 'timestamp': '2025-09-30 22:18:03.860630', 'step': 3570, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:03.896338', 'step': 3570, 'epoch': 2} {'type': 'loss', 'content': 0.006128426641225815, 'timestamp': '2025-09-30 22:18:03.904310', 'step': 3571, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:03.943387', 'step': 3571, 'epoch': 2} {'type': 'loss', 'content': 0.00431226147338748, 'timestamp': '2025-09-30 22:18:03.978004', 'step': 3572, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:04.018027', 'step': 3572, 'epoch': 2} {'type': 'loss', 'content': 0.0014643726171925664, 'timestamp': '2025-09-30 22:18:04.025948', 'step': 3573, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:04.066185', 'step': 3573, 'epoch': 2} {'type': 'loss', 'content': 0.005359445232897997, 'timestamp': '2025-09-30 22:18:04.073064', 'step': 3574, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:04.107715', 'step': 3574, 'epoch': 2} {'type': 'loss', 'content': 0.0027361405082046986, 'timestamp': '2025-09-30 22:18:04.118580', 'step': 3575, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:04.164337', 'step': 3575, 'epoch': 2} {'type': 'loss', 'content': 0.005159073509275913, 'timestamp': '2025-09-30 22:18:04.195588', 'step': 3576, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:04.232694', 'step': 3576, 'epoch': 2} {'type': 'loss', 'content': 0.005905451253056526, 'timestamp': '2025-09-30 22:18:04.238343', 'step': 3577, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:04.276581', 'step': 3577, 'epoch': 2} {'type': 'loss', 'content': 0.0021716770716011524, 'timestamp': '2025-09-30 22:18:04.288924', 'step': 3578, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:04.323677', 'step': 3578, 'epoch': 2} {'type': 'loss', 'content': 0.0011484583374112844, 'timestamp': '2025-09-30 22:18:04.335832', 'step': 3579, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:04.386138', 'step': 3579, 'epoch': 2} {'type': 'loss', 'content': 0.008656934835016727, 'timestamp': '2025-09-30 22:18:04.420674', 'step': 3580, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:04.457080', 'step': 3580, 'epoch': 2} {'type': 'loss', 'content': 0.006916280835866928, 'timestamp': '2025-09-30 22:18:04.470157', 'step': 3581, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:04.518334', 'step': 3581, 'epoch': 2} {'type': 'loss', 'content': 0.0018486609915271401, 'timestamp': '2025-09-30 22:18:04.525231', 'step': 3582, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:18:04.564350', 'step': 3582, 'epoch': 2} {'type': 'loss', 'content': 0.02502571791410446, 'timestamp': '2025-09-30 22:18:04.573590', 'step': 3583, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:18:04.616304', 'step': 3583, 'epoch': 2} {'type': 'loss', 'content': 0.0017878885846585035, 'timestamp': '2025-09-30 22:18:04.641288', 'step': 3584, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:04.681221', 'step': 3584, 'epoch': 2} {'type': 'loss', 'content': 0.003630830440670252, 'timestamp': '2025-09-30 22:18:04.690989', 'step': 3585, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:04.726034', 'step': 3585, 'epoch': 2} {'type': 'loss', 'content': 0.0015692427987232804, 'timestamp': '2025-09-30 22:18:04.736547', 'step': 3586, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:04.769676', 'step': 3586, 'epoch': 2} {'type': 'loss', 'content': 0.002070869319140911, 'timestamp': '2025-09-30 22:18:04.781709', 'step': 3587, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:04.835350', 'step': 3587, 'epoch': 2} {'type': 'loss', 'content': 0.00642598420381546, 'timestamp': '2025-09-30 22:18:04.867140', 'step': 3588, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:04.905496', 'step': 3588, 'epoch': 2} {'type': 'loss', 'content': 0.0004540416703093797, 'timestamp': '2025-09-30 22:18:04.916457', 'step': 3589, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:04.963183', 'step': 3589, 'epoch': 2} {'type': 'loss', 'content': 0.0027480798307806253, 'timestamp': '2025-09-30 22:18:04.971201', 'step': 3590, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:05.017289', 'step': 3590, 'epoch': 2} {'type': 'loss', 'content': 0.004405462648719549, 'timestamp': '2025-09-30 22:18:05.029796', 'step': 3591, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:05.067979', 'step': 3591, 'epoch': 2} {'type': 'loss', 'content': 0.004299256484955549, 'timestamp': '2025-09-30 22:18:05.101481', 'step': 3592, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:05.146310', 'step': 3592, 'epoch': 2} {'type': 'loss', 'content': 0.02254783734679222, 'timestamp': '2025-09-30 22:18:05.154885', 'step': 3593, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:05.206608', 'step': 3593, 'epoch': 2} {'type': 'loss', 'content': 0.005831711459904909, 'timestamp': '2025-09-30 22:18:05.219945', 'step': 3594, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:05.266857', 'step': 3594, 'epoch': 2} {'type': 'loss', 'content': 0.011741677299141884, 'timestamp': '2025-09-30 22:18:05.274767', 'step': 3595, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:05.317630', 'step': 3595, 'epoch': 2} {'type': 'loss', 'content': 0.006558686029165983, 'timestamp': '2025-09-30 22:18:05.348870', 'step': 3596, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:05.392171', 'step': 3596, 'epoch': 2} {'type': 'loss', 'content': 0.0009061984019353986, 'timestamp': '2025-09-30 22:18:05.397522', 'step': 3597, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:05.437162', 'step': 3597, 'epoch': 2} {'type': 'loss', 'content': 0.012972662225365639, 'timestamp': '2025-09-30 22:18:05.445145', 'step': 3598, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:05.513240', 'step': 3598, 'epoch': 2} {'type': 'loss', 'content': 0.014107991009950638, 'timestamp': '2025-09-30 22:18:05.526629', 'step': 3599, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:05.562590', 'step': 3599, 'epoch': 2} {'type': 'loss', 'content': 0.009034661576151848, 'timestamp': '2025-09-30 22:18:05.594698', 'step': 3600, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:05.634592', 'step': 3600, 'epoch': 2} {'type': 'loss', 'content': 0.009804898872971535, 'timestamp': '2025-09-30 22:18:05.642229', 'step': 3601, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:05.695011', 'step': 3601, 'epoch': 2} {'type': 'loss', 'content': 0.006780870258808136, 'timestamp': '2025-09-30 22:18:05.708401', 'step': 3602, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-09-30 22:18:05.762370', 'step': 3602, 'epoch': 2} {'type': 'loss', 'content': 0.012236690148711205, 'timestamp': '2025-09-30 22:18:05.779964', 'step': 3603, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:05.826822', 'step': 3603, 'epoch': 2} {'type': 'loss', 'content': 0.0076903593726456165, 'timestamp': '2025-09-30 22:18:05.861410', 'step': 3604, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:18:05.900535', 'step': 3604, 'epoch': 2} {'type': 'loss', 'content': 0.010933350771665573, 'timestamp': '2025-09-30 22:18:05.915645', 'step': 3605, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:05.953390', 'step': 3605, 'epoch': 2} {'type': 'loss', 'content': 0.017923496663570404, 'timestamp': '2025-09-30 22:18:05.965949', 'step': 3606, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:06.017167', 'step': 3606, 'epoch': 2} {'type': 'loss', 'content': 0.007505920249968767, 'timestamp': '2025-09-30 22:18:06.029706', 'step': 3607, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:06.084058', 'step': 3607, 'epoch': 2} {'type': 'loss', 'content': 0.00268744770437479, 'timestamp': '2025-09-30 22:18:06.118572', 'step': 3608, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:06.156299', 'step': 3608, 'epoch': 2} {'type': 'loss', 'content': 0.005204895976930857, 'timestamp': '2025-09-30 22:18:06.166845', 'step': 3609, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:06.201351', 'step': 3609, 'epoch': 2} {'type': 'loss', 'content': 0.010513301938772202, 'timestamp': '2025-09-30 22:18:06.213741', 'step': 3610, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:06.252494', 'step': 3610, 'epoch': 2} {'type': 'loss', 'content': 0.005763137713074684, 'timestamp': '2025-09-30 22:18:06.266197', 'step': 3611, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:06.301990', 'step': 3611, 'epoch': 2} {'type': 'loss', 'content': 0.0038706306368112564, 'timestamp': '2025-09-30 22:18:06.333284', 'step': 3612, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:06.366805', 'step': 3612, 'epoch': 2} {'type': 'loss', 'content': 0.010753096081316471, 'timestamp': '2025-09-30 22:18:06.376876', 'step': 3613, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:06.424707', 'step': 3613, 'epoch': 2} {'type': 'loss', 'content': 0.01010825764387846, 'timestamp': '2025-09-30 22:18:06.438500', 'step': 3614, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:06.487863', 'step': 3614, 'epoch': 2} {'type': 'loss', 'content': 0.008509020321071148, 'timestamp': '2025-09-30 22:18:06.500388', 'step': 3615, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:06.534037', 'step': 3615, 'epoch': 2} {'type': 'loss', 'content': 0.004479021765291691, 'timestamp': '2025-09-30 22:18:06.565350', 'step': 3616, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:06.601866', 'step': 3616, 'epoch': 2} {'type': 'loss', 'content': 0.004267436917871237, 'timestamp': '2025-09-30 22:18:06.610562', 'step': 3617, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:06.655877', 'step': 3617, 'epoch': 2} {'type': 'loss', 'content': 0.003969523124396801, 'timestamp': '2025-09-30 22:18:06.667272', 'step': 3618, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:06.701524', 'step': 3618, 'epoch': 2} {'type': 'loss', 'content': 0.0031346294563263655, 'timestamp': '2025-09-30 22:18:06.709634', 'step': 3619, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:06.749965', 'step': 3619, 'epoch': 2} {'type': 'loss', 'content': 0.0017775017768144608, 'timestamp': '2025-09-30 22:18:06.784509', 'step': 3620, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:06.824116', 'step': 3620, 'epoch': 2} {'type': 'loss', 'content': 0.004031859338283539, 'timestamp': '2025-09-30 22:18:06.839557', 'step': 3621, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:06.877903', 'step': 3621, 'epoch': 2} {'type': 'loss', 'content': 0.003785045351833105, 'timestamp': '2025-09-30 22:18:06.891671', 'step': 3622, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:06.940169', 'step': 3622, 'epoch': 2} {'type': 'loss', 'content': 0.001441820291802287, 'timestamp': '2025-09-30 22:18:06.954052', 'step': 3623, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:07.013384', 'step': 3623, 'epoch': 2} {'type': 'loss', 'content': 0.005841195583343506, 'timestamp': '2025-09-30 22:18:07.046775', 'step': 3624, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:07.091239', 'step': 3624, 'epoch': 2} {'type': 'loss', 'content': 0.0024000804405659437, 'timestamp': '2025-09-30 22:18:07.100823', 'step': 3625, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:07.148537', 'step': 3625, 'epoch': 2} {'type': 'loss', 'content': 0.0025940327905118465, 'timestamp': '2025-09-30 22:18:07.161070', 'step': 3626, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:07.204166', 'step': 3626, 'epoch': 2} {'type': 'loss', 'content': 0.004042869433760643, 'timestamp': '2025-09-30 22:18:07.220090', 'step': 3627, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:07.255740', 'step': 3627, 'epoch': 2} {'type': 'loss', 'content': 0.0035541884135454893, 'timestamp': '2025-09-30 22:18:07.284225', 'step': 3628, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:18:07.326175', 'step': 3628, 'epoch': 2} {'type': 'loss', 'content': 0.01492700632661581, 'timestamp': '2025-09-30 22:18:07.341752', 'step': 3629, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:07.375854', 'step': 3629, 'epoch': 2} {'type': 'loss', 'content': 0.005095267202705145, 'timestamp': '2025-09-30 22:18:07.386831', 'step': 3630, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:07.422853', 'step': 3630, 'epoch': 2} {'type': 'loss', 'content': 0.0027024373412132263, 'timestamp': '2025-09-30 22:18:07.433809', 'step': 3631, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:07.470453', 'step': 3631, 'epoch': 2} {'type': 'loss', 'content': 0.01257782056927681, 'timestamp': '2025-09-30 22:18:07.501664', 'step': 3632, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:18:07.543839', 'step': 3632, 'epoch': 2} {'type': 'loss', 'content': 0.0018136730650439858, 'timestamp': '2025-09-30 22:18:07.559505', 'step': 3633, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:07.608170', 'step': 3633, 'epoch': 2} {'type': 'loss', 'content': 0.012361763045191765, 'timestamp': '2025-09-30 22:18:07.621980', 'step': 3634, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:07.662675', 'step': 3634, 'epoch': 2} {'type': 'loss', 'content': 0.006774898152798414, 'timestamp': '2025-09-30 22:18:07.676567', 'step': 3635, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:18:07.726282', 'step': 3635, 'epoch': 2} {'type': 'loss', 'content': 0.0051282295025885105, 'timestamp': '2025-09-30 22:18:07.764407', 'step': 3636, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:07.808806', 'step': 3636, 'epoch': 2} {'type': 'loss', 'content': 0.013438411988317966, 'timestamp': '2025-09-30 22:18:07.817381', 'step': 3637, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:07.863841', 'step': 3637, 'epoch': 2} {'type': 'loss', 'content': 0.005512099713087082, 'timestamp': '2025-09-30 22:18:07.876073', 'step': 3638, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:07.917012', 'step': 3638, 'epoch': 2} {'type': 'loss', 'content': 0.006519559770822525, 'timestamp': '2025-09-30 22:18:07.930701', 'step': 3639, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:07.968613', 'step': 3639, 'epoch': 2} {'type': 'loss', 'content': 0.004097971599549055, 'timestamp': '2025-09-30 22:18:08.003521', 'step': 3640, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:08.054465', 'step': 3640, 'epoch': 2} {'type': 'loss', 'content': 0.013717074878513813, 'timestamp': '2025-09-30 22:18:08.062783', 'step': 3641, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:08.172347', 'step': 3641, 'epoch': 2} {'type': 'loss', 'content': 0.003827743697911501, 'timestamp': '2025-09-30 22:18:08.185713', 'step': 3642, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:08.244585', 'step': 3642, 'epoch': 2} {'type': 'loss', 'content': 0.00614333339035511, 'timestamp': '2025-09-30 22:18:08.258295', 'step': 3643, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:08.326999', 'step': 3643, 'epoch': 2} {'type': 'loss', 'content': 0.001711672986857593, 'timestamp': '2025-09-30 22:18:08.360023', 'step': 3644, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:08.411885', 'step': 3644, 'epoch': 2} {'type': 'loss', 'content': 0.005505403969436884, 'timestamp': '2025-09-30 22:18:08.421670', 'step': 3645, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:08.484141', 'step': 3645, 'epoch': 2} {'type': 'loss', 'content': 0.002894895849749446, 'timestamp': '2025-09-30 22:18:08.494948', 'step': 3646, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:08.550174', 'step': 3646, 'epoch': 2} {'type': 'loss', 'content': 0.007011787500232458, 'timestamp': '2025-09-30 22:18:08.560245', 'step': 3647, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:08.673565', 'step': 3647, 'epoch': 2} {'type': 'loss', 'content': 0.00981463398784399, 'timestamp': '2025-09-30 22:18:08.705387', 'step': 3648, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:08.766177', 'step': 3648, 'epoch': 2} {'type': 'loss', 'content': 0.0028689431492239237, 'timestamp': '2025-09-30 22:18:08.782746', 'step': 3649, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:08.851521', 'step': 3649, 'epoch': 2} {'type': 'loss', 'content': 0.0021789041347801685, 'timestamp': '2025-09-30 22:18:08.858237', 'step': 3650, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:08.916571', 'step': 3650, 'epoch': 2} {'type': 'loss', 'content': 0.00205511343665421, 'timestamp': '2025-09-30 22:18:08.929929', 'step': 3651, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:08.995585', 'step': 3651, 'epoch': 2} {'type': 'loss', 'content': 0.007314031012356281, 'timestamp': '2025-09-30 22:18:09.028974', 'step': 3652, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:09.090624', 'step': 3652, 'epoch': 2} {'type': 'loss', 'content': 0.005639208946377039, 'timestamp': '2025-09-30 22:18:09.100618', 'step': 3653, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:09.162051', 'step': 3653, 'epoch': 2} {'type': 'loss', 'content': 0.009423138573765755, 'timestamp': '2025-09-30 22:18:09.174588', 'step': 3654, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:09.245390', 'step': 3654, 'epoch': 2} {'type': 'loss', 'content': 0.007394131738692522, 'timestamp': '2025-09-30 22:18:09.258764', 'step': 3655, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:09.334706', 'step': 3655, 'epoch': 2} {'type': 'loss', 'content': 0.004264110699295998, 'timestamp': '2025-09-30 22:18:09.367936', 'step': 3656, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:09.447573', 'step': 3656, 'epoch': 2} {'type': 'loss', 'content': 0.004879354499280453, 'timestamp': '2025-09-30 22:18:09.460917', 'step': 3657, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:09.513441', 'step': 3657, 'epoch': 2} {'type': 'loss', 'content': 0.004300492815673351, 'timestamp': '2025-09-30 22:18:09.525662', 'step': 3658, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 17560600598464}, 'timestamp': '2025-09-30 22:18:09.602920', 'step': 3658, 'epoch': 2} {'type': 'loss', 'content': 0.0033125756308436394, 'timestamp': '2025-09-30 22:18:09.624011', 'step': 3659, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:09.667155', 'step': 3659, 'epoch': 2} {'type': 'loss', 'content': 0.0047155385836958885, 'timestamp': '2025-09-30 22:18:09.701400', 'step': 3660, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:09.742719', 'step': 3660, 'epoch': 2} {'type': 'loss', 'content': 0.0008582398295402527, 'timestamp': '2025-09-30 22:18:09.747104', 'step': 3661, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:09.787113', 'step': 3661, 'epoch': 2} {'type': 'loss', 'content': 0.00034500984475016594, 'timestamp': '2025-09-30 22:18:09.799403', 'step': 3662, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:09.840305', 'step': 3662, 'epoch': 2} {'type': 'loss', 'content': 0.002383210463449359, 'timestamp': '2025-09-30 22:18:09.853698', 'step': 3663, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:09.910050', 'step': 3663, 'epoch': 2} {'type': 'loss', 'content': 0.0007570055895484984, 'timestamp': '2025-09-30 22:18:09.943472', 'step': 3664, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:09.982836', 'step': 3664, 'epoch': 2} {'type': 'loss', 'content': 0.0018659909255802631, 'timestamp': '2025-09-30 22:18:09.995484', 'step': 3665, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:10.032010', 'step': 3665, 'epoch': 2} {'type': 'loss', 'content': 0.004569348879158497, 'timestamp': '2025-09-30 22:18:10.043038', 'step': 3666, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:10.085512', 'step': 3666, 'epoch': 2} {'type': 'loss', 'content': 0.016304440796375275, 'timestamp': '2025-09-30 22:18:10.097866', 'step': 3667, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:10.146805', 'step': 3667, 'epoch': 2} {'type': 'loss', 'content': 0.016486702486872673, 'timestamp': '2025-09-30 22:18:10.181555', 'step': 3668, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:10.216748', 'step': 3668, 'epoch': 2} {'type': 'loss', 'content': 0.007853741757571697, 'timestamp': '2025-09-30 22:18:10.229392', 'step': 3669, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:10.276372', 'step': 3669, 'epoch': 2} {'type': 'loss', 'content': 0.008043302223086357, 'timestamp': '2025-09-30 22:18:10.288986', 'step': 3670, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:10.327790', 'step': 3670, 'epoch': 2} {'type': 'loss', 'content': 0.005524238105863333, 'timestamp': '2025-09-30 22:18:10.341529', 'step': 3671, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:10.398078', 'step': 3671, 'epoch': 2} {'type': 'loss', 'content': 0.013961317017674446, 'timestamp': '2025-09-30 22:18:10.431345', 'step': 3672, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:10.471043', 'step': 3672, 'epoch': 2} {'type': 'loss', 'content': 0.02146792970597744, 'timestamp': '2025-09-30 22:18:10.480894', 'step': 3673, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:10.523058', 'step': 3673, 'epoch': 2} {'type': 'loss', 'content': 0.013621116988360882, 'timestamp': '2025-09-30 22:18:10.534335', 'step': 3674, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:10.575568', 'step': 3674, 'epoch': 2} {'type': 'loss', 'content': 0.04091104492545128, 'timestamp': '2025-09-30 22:18:10.585857', 'step': 3675, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:10.631802', 'step': 3675, 'epoch': 2} {'type': 'loss', 'content': 0.005582859739661217, 'timestamp': '2025-09-30 22:18:10.666662', 'step': 3676, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:10.729822', 'step': 3676, 'epoch': 2} {'type': 'loss', 'content': 0.0049339765682816505, 'timestamp': '2025-09-30 22:18:10.742877', 'step': 3677, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:18:10.792187', 'step': 3677, 'epoch': 2} {'type': 'loss', 'content': 0.006935216952115297, 'timestamp': '2025-09-30 22:18:10.809936', 'step': 3678, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:10.853920', 'step': 3678, 'epoch': 2} {'type': 'loss', 'content': 0.011995434761047363, 'timestamp': '2025-09-30 22:18:10.869857', 'step': 3679, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:10.918129', 'step': 3679, 'epoch': 2} {'type': 'loss', 'content': 0.00030699989292770624, 'timestamp': '2025-09-30 22:18:10.952403', 'step': 3680, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:18:13.583130', 'step': 3680, 'epoch': 2} {'type': 'pplx', 'content': 5.94899863726725, 'timestamp': '2025-09-30 22:18:13.589379', 'step': 3680, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:13.624541', 'step': 3680, 'epoch': 2} {'type': 'loss', 'content': 0.0005120789282955229, 'timestamp': '2025-09-30 22:18:13.633037', 'step': 3681, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:13.669467', 'step': 3681, 'epoch': 2} {'type': 'loss', 'content': 0.005108590237796307, 'timestamp': '2025-09-30 22:18:13.682862', 'step': 3682, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:13.727819', 'step': 3682, 'epoch': 2} {'type': 'loss', 'content': 0.009081355296075344, 'timestamp': '2025-09-30 22:18:13.741147', 'step': 3683, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:13.782928', 'step': 3683, 'epoch': 2} {'type': 'loss', 'content': 0.0060650804080069065, 'timestamp': '2025-09-30 22:18:13.817521', 'step': 3684, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:13.857715', 'step': 3684, 'epoch': 2} {'type': 'loss', 'content': 0.004745157901197672, 'timestamp': '2025-09-30 22:18:13.870357', 'step': 3685, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:13.913884', 'step': 3685, 'epoch': 2} {'type': 'loss', 'content': 0.01413491740822792, 'timestamp': '2025-09-30 22:18:13.926461', 'step': 3686, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:13.969493', 'step': 3686, 'epoch': 2} {'type': 'loss', 'content': 0.009859028272330761, 'timestamp': '2025-09-30 22:18:13.983321', 'step': 3687, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:14.018550', 'step': 3687, 'epoch': 2} {'type': 'loss', 'content': 0.014908470213413239, 'timestamp': '2025-09-30 22:18:14.047320', 'step': 3688, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:14.085736', 'step': 3688, 'epoch': 3} {'type': 'loss', 'content': 0.032499223947525024, 'timestamp': '2025-09-30 22:18:14.091241', 'step': 3689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:14.129379', 'step': 3689, 'epoch': 3} {'type': 'loss', 'content': 0.0037434506230056286, 'timestamp': '2025-09-30 22:18:14.142741', 'step': 3690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:14.177667', 'step': 3690, 'epoch': 3} {'type': 'loss', 'content': 0.008036208339035511, 'timestamp': '2025-09-30 22:18:14.185276', 'step': 3691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:14.218654', 'step': 3691, 'epoch': 3} {'type': 'loss', 'content': 0.005498153623193502, 'timestamp': '2025-09-30 22:18:14.250584', 'step': 3692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:14.289796', 'step': 3692, 'epoch': 3} {'type': 'loss', 'content': 0.005650009959936142, 'timestamp': '2025-09-30 22:18:14.299679', 'step': 3693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:14.339116', 'step': 3693, 'epoch': 3} {'type': 'loss', 'content': 0.005255770869553089, 'timestamp': '2025-09-30 22:18:14.346721', 'step': 3694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:14.382786', 'step': 3694, 'epoch': 3} {'type': 'loss', 'content': 0.004523404873907566, 'timestamp': '2025-09-30 22:18:14.396185', 'step': 3695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:14.433585', 'step': 3695, 'epoch': 3} {'type': 'loss', 'content': 0.005276726558804512, 'timestamp': '2025-09-30 22:18:14.466981', 'step': 3696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:18:14.517339', 'step': 3696, 'epoch': 3} {'type': 'loss', 'content': 0.0043405890464782715, 'timestamp': '2025-09-30 22:18:14.534000', 'step': 3697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:14.567132', 'step': 3697, 'epoch': 3} {'type': 'loss', 'content': 0.008746081963181496, 'timestamp': '2025-09-30 22:18:14.578258', 'step': 3698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:14.615584', 'step': 3698, 'epoch': 3} {'type': 'loss', 'content': 0.0019219155656173825, 'timestamp': '2025-09-30 22:18:14.628935', 'step': 3699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:14.679949', 'step': 3699, 'epoch': 3} {'type': 'loss', 'content': 0.005548653658479452, 'timestamp': '2025-09-30 22:18:14.714160', 'step': 3700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:14.749749', 'step': 3700, 'epoch': 3} {'type': 'loss', 'content': 0.0061048720963299274, 'timestamp': '2025-09-30 22:18:14.754964', 'step': 3701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:14.788440', 'step': 3701, 'epoch': 3} {'type': 'loss', 'content': 0.007814295589923859, 'timestamp': '2025-09-30 22:18:14.796389', 'step': 3702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:14.839871', 'step': 3702, 'epoch': 3} {'type': 'loss', 'content': 0.009157909080386162, 'timestamp': '2025-09-30 22:18:14.850974', 'step': 3703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:14.890618', 'step': 3703, 'epoch': 3} {'type': 'loss', 'content': 0.013317212462425232, 'timestamp': '2025-09-30 22:18:14.924864', 'step': 3704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:14.969274', 'step': 3704, 'epoch': 3} {'type': 'loss', 'content': 0.012214075773954391, 'timestamp': '2025-09-30 22:18:14.977380', 'step': 3705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:15.025620', 'step': 3705, 'epoch': 3} {'type': 'loss', 'content': 0.005884271580725908, 'timestamp': '2025-09-30 22:18:15.038246', 'step': 3706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:15.083113', 'step': 3706, 'epoch': 3} {'type': 'loss', 'content': 0.010557309724390507, 'timestamp': '2025-09-30 22:18:15.090296', 'step': 3707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:15.125585', 'step': 3707, 'epoch': 3} {'type': 'loss', 'content': 0.0075211371295154095, 'timestamp': '2025-09-30 22:18:15.154332', 'step': 3708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:15.193522', 'step': 3708, 'epoch': 3} {'type': 'loss', 'content': 0.012411229312419891, 'timestamp': '2025-09-30 22:18:15.199191', 'step': 3709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:15.232039', 'step': 3709, 'epoch': 3} {'type': 'loss', 'content': 0.0031475538853555918, 'timestamp': '2025-09-30 22:18:15.240086', 'step': 3710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:15.272609', 'step': 3710, 'epoch': 3} {'type': 'loss', 'content': 0.003913676366209984, 'timestamp': '2025-09-30 22:18:15.279503', 'step': 3711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:15.312554', 'step': 3711, 'epoch': 3} {'type': 'loss', 'content': 0.005341598764061928, 'timestamp': '2025-09-30 22:18:15.340442', 'step': 3712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:15.379094', 'step': 3712, 'epoch': 3} {'type': 'loss', 'content': 0.005785258952528238, 'timestamp': '2025-09-30 22:18:15.392206', 'step': 3713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:15.432110', 'step': 3713, 'epoch': 3} {'type': 'loss', 'content': 0.017216932028532028, 'timestamp': '2025-09-30 22:18:15.439323', 'step': 3714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:15.476184', 'step': 3714, 'epoch': 3} {'type': 'loss', 'content': 0.010194359347224236, 'timestamp': '2025-09-30 22:18:15.489908', 'step': 3715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:15.523929', 'step': 3715, 'epoch': 3} {'type': 'loss', 'content': 0.014519540593028069, 'timestamp': '2025-09-30 22:18:15.557325', 'step': 3716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:15.593431', 'step': 3716, 'epoch': 3} {'type': 'loss', 'content': 0.008453921414911747, 'timestamp': '2025-09-30 22:18:15.603977', 'step': 3717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:15.642795', 'step': 3717, 'epoch': 3} {'type': 'loss', 'content': 0.006131226196885109, 'timestamp': '2025-09-30 22:18:15.656203', 'step': 3718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:15.691523', 'step': 3718, 'epoch': 3} {'type': 'loss', 'content': 0.00950410682708025, 'timestamp': '2025-09-30 22:18:15.702563', 'step': 3719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:15.738430', 'step': 3719, 'epoch': 3} {'type': 'loss', 'content': 0.002914604963734746, 'timestamp': '2025-09-30 22:18:15.767307', 'step': 3720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:15.813277', 'step': 3720, 'epoch': 3} {'type': 'loss', 'content': 0.0038131820037961006, 'timestamp': '2025-09-30 22:18:15.826248', 'step': 3721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:15.862374', 'step': 3721, 'epoch': 3} {'type': 'loss', 'content': 0.005772165954113007, 'timestamp': '2025-09-30 22:18:15.873484', 'step': 3722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:15.913366', 'step': 3722, 'epoch': 3} {'type': 'loss', 'content': 0.012822024524211884, 'timestamp': '2025-09-30 22:18:15.927075', 'step': 3723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:15.970664', 'step': 3723, 'epoch': 3} {'type': 'loss', 'content': 0.012181980535387993, 'timestamp': '2025-09-30 22:18:16.005223', 'step': 3724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:16.042346', 'step': 3724, 'epoch': 3} {'type': 'loss', 'content': 0.003662961069494486, 'timestamp': '2025-09-30 22:18:16.048077', 'step': 3725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:16.083318', 'step': 3725, 'epoch': 3} {'type': 'loss', 'content': 0.0059233191423118114, 'timestamp': '2025-09-30 22:18:16.094446', 'step': 3726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:16.130168', 'step': 3726, 'epoch': 3} {'type': 'loss', 'content': 0.00668327696621418, 'timestamp': '2025-09-30 22:18:16.137169', 'step': 3727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:16.177771', 'step': 3727, 'epoch': 3} {'type': 'loss', 'content': 0.0058019645512104034, 'timestamp': '2025-09-30 22:18:16.205353', 'step': 3728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:16.243485', 'step': 3728, 'epoch': 3} {'type': 'loss', 'content': 0.018236473202705383, 'timestamp': '2025-09-30 22:18:16.249032', 'step': 3729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:16.287176', 'step': 3729, 'epoch': 3} {'type': 'loss', 'content': 0.00995565950870514, 'timestamp': '2025-09-30 22:18:16.297575', 'step': 3730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:18:16.349980', 'step': 3730, 'epoch': 3} {'type': 'loss', 'content': 0.005770450923591852, 'timestamp': '2025-09-30 22:18:16.367682', 'step': 3731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:16.414736', 'step': 3731, 'epoch': 3} {'type': 'loss', 'content': 0.006130880210548639, 'timestamp': '2025-09-30 22:18:16.449245', 'step': 3732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:16.487474', 'step': 3732, 'epoch': 3} {'type': 'loss', 'content': 0.005417075008153915, 'timestamp': '2025-09-30 22:18:16.502927', 'step': 3733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:16.535994', 'step': 3733, 'epoch': 3} {'type': 'loss', 'content': 0.010411316528916359, 'timestamp': '2025-09-30 22:18:16.548101', 'step': 3734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:16.583282', 'step': 3734, 'epoch': 3} {'type': 'loss', 'content': 0.008753958158195019, 'timestamp': '2025-09-30 22:18:16.591092', 'step': 3735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:16.666646', 'step': 3735, 'epoch': 3} {'type': 'loss', 'content': 0.008556557819247246, 'timestamp': '2025-09-30 22:18:16.698704', 'step': 3736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:16.753508', 'step': 3736, 'epoch': 3} {'type': 'loss', 'content': 0.009406271390616894, 'timestamp': '2025-09-30 22:18:16.759246', 'step': 3737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:16.792796', 'step': 3737, 'epoch': 3} {'type': 'loss', 'content': 0.016798771917819977, 'timestamp': '2025-09-30 22:18:16.803919', 'step': 3738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:16.845608', 'step': 3738, 'epoch': 3} {'type': 'loss', 'content': 0.004465001169592142, 'timestamp': '2025-09-30 22:18:16.853180', 'step': 3739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:16.903296', 'step': 3739, 'epoch': 3} {'type': 'loss', 'content': 0.004485375713557005, 'timestamp': '2025-09-30 22:18:16.937502', 'step': 3740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:16.972287', 'step': 3740, 'epoch': 3} {'type': 'loss', 'content': 0.007075873203575611, 'timestamp': '2025-09-30 22:18:16.985382', 'step': 3741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:17.033112', 'step': 3741, 'epoch': 3} {'type': 'loss', 'content': 0.008225277997553349, 'timestamp': '2025-09-30 22:18:17.043940', 'step': 3742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:17.077473', 'step': 3742, 'epoch': 3} {'type': 'loss', 'content': 0.0036486752796918154, 'timestamp': '2025-09-30 22:18:17.088440', 'step': 3743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:17.124654', 'step': 3743, 'epoch': 3} {'type': 'loss', 'content': 0.0066786594688892365, 'timestamp': '2025-09-30 22:18:17.157610', 'step': 3744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:17.195367', 'step': 3744, 'epoch': 3} {'type': 'loss', 'content': 0.006404911633580923, 'timestamp': '2025-09-30 22:18:17.206069', 'step': 3745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:17.247997', 'step': 3745, 'epoch': 3} {'type': 'loss', 'content': 0.007119921967387199, 'timestamp': '2025-09-30 22:18:17.259023', 'step': 3746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:17.293777', 'step': 3746, 'epoch': 3} {'type': 'loss', 'content': 0.005634994246065617, 'timestamp': '2025-09-30 22:18:17.300905', 'step': 3747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:17.338543', 'step': 3747, 'epoch': 3} {'type': 'loss', 'content': 0.013412128202617168, 'timestamp': '2025-09-30 22:18:17.366620', 'step': 3748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:17.404523', 'step': 3748, 'epoch': 3} {'type': 'loss', 'content': 0.0031789415515959263, 'timestamp': '2025-09-30 22:18:17.411273', 'step': 3749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:17.448584', 'step': 3749, 'epoch': 3} {'type': 'loss', 'content': 0.005689022596925497, 'timestamp': '2025-09-30 22:18:17.457660', 'step': 3750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:17.498445', 'step': 3750, 'epoch': 3} {'type': 'loss', 'content': 0.009960222989320755, 'timestamp': '2025-09-30 22:18:17.510824', 'step': 3751, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:17.549958', 'step': 3751, 'epoch': 3} {'type': 'loss', 'content': 0.007598586846143007, 'timestamp': '2025-09-30 22:18:17.584605', 'step': 3752, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:17.624193', 'step': 3752, 'epoch': 3} {'type': 'loss', 'content': 0.007153657730668783, 'timestamp': '2025-09-30 22:18:17.637518', 'step': 3753, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:17.675946', 'step': 3753, 'epoch': 3} {'type': 'loss', 'content': 0.007242599036544561, 'timestamp': '2025-09-30 22:18:17.683228', 'step': 3754, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:17.717440', 'step': 3754, 'epoch': 3} {'type': 'loss', 'content': 0.012119963765144348, 'timestamp': '2025-09-30 22:18:17.729662', 'step': 3755, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:17.770034', 'step': 3755, 'epoch': 3} {'type': 'loss', 'content': 0.006791086867451668, 'timestamp': '2025-09-30 22:18:17.803276', 'step': 3756, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:17.848398', 'step': 3756, 'epoch': 3} {'type': 'loss', 'content': 0.008207529783248901, 'timestamp': '2025-09-30 22:18:17.857735', 'step': 3757, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:17.901325', 'step': 3757, 'epoch': 3} {'type': 'loss', 'content': 0.007791235111653805, 'timestamp': '2025-09-30 22:18:17.915167', 'step': 3758, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:17.957581', 'step': 3758, 'epoch': 3} {'type': 'loss', 'content': 0.0068681500852108, 'timestamp': '2025-09-30 22:18:17.966773', 'step': 3759, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:18.005725', 'step': 3759, 'epoch': 3} {'type': 'loss', 'content': 0.0025856555439531803, 'timestamp': '2025-09-30 22:18:18.038880', 'step': 3760, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:18.076082', 'step': 3760, 'epoch': 3} {'type': 'loss', 'content': 0.008086767978966236, 'timestamp': '2025-09-30 22:18:18.084004', 'step': 3761, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:18.134728', 'step': 3761, 'epoch': 3} {'type': 'loss', 'content': 0.004576331470161676, 'timestamp': '2025-09-30 22:18:18.147311', 'step': 3762, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:18.181583', 'step': 3762, 'epoch': 3} {'type': 'loss', 'content': 0.005068800412118435, 'timestamp': '2025-09-30 22:18:18.192599', 'step': 3763, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:18.230750', 'step': 3763, 'epoch': 3} {'type': 'loss', 'content': 0.006224495824426413, 'timestamp': '2025-09-30 22:18:18.259263', 'step': 3764, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:18.299343', 'step': 3764, 'epoch': 3} {'type': 'loss', 'content': 0.005041790194809437, 'timestamp': '2025-09-30 22:18:18.309092', 'step': 3765, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:18.358404', 'step': 3765, 'epoch': 3} {'type': 'loss', 'content': 0.004011180251836777, 'timestamp': '2025-09-30 22:18:18.371021', 'step': 3766, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:18.411470', 'step': 3766, 'epoch': 3} {'type': 'loss', 'content': 0.012970248237252235, 'timestamp': '2025-09-30 22:18:18.422444', 'step': 3767, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:18.458086', 'step': 3767, 'epoch': 3} {'type': 'loss', 'content': 0.010750544257462025, 'timestamp': '2025-09-30 22:18:18.489510', 'step': 3768, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:18.522637', 'step': 3768, 'epoch': 3} {'type': 'loss', 'content': 0.011017782613635063, 'timestamp': '2025-09-30 22:18:18.531500', 'step': 3769, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:18.565151', 'step': 3769, 'epoch': 3} {'type': 'loss', 'content': 0.003940001130104065, 'timestamp': '2025-09-30 22:18:18.575387', 'step': 3770, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:18.614676', 'step': 3770, 'epoch': 3} {'type': 'loss', 'content': 0.0042926715686917305, 'timestamp': '2025-09-30 22:18:18.628008', 'step': 3771, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:18:18.675700', 'step': 3771, 'epoch': 3} {'type': 'loss', 'content': 0.0032875791657716036, 'timestamp': '2025-09-30 22:18:18.712741', 'step': 3772, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:18.751794', 'step': 3772, 'epoch': 3} {'type': 'loss', 'content': 0.00762244313955307, 'timestamp': '2025-09-30 22:18:18.764132', 'step': 3773, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:18.810163', 'step': 3773, 'epoch': 3} {'type': 'loss', 'content': 0.006488482002168894, 'timestamp': '2025-09-30 22:18:18.824042', 'step': 3774, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:18.866935', 'step': 3774, 'epoch': 3} {'type': 'loss', 'content': 0.002553711412474513, 'timestamp': '2025-09-30 22:18:18.880657', 'step': 3775, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:18.928045', 'step': 3775, 'epoch': 3} {'type': 'loss', 'content': 0.012262634001672268, 'timestamp': '2025-09-30 22:18:18.962252', 'step': 3776, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:19.004165', 'step': 3776, 'epoch': 3} {'type': 'loss', 'content': 0.00850253738462925, 'timestamp': '2025-09-30 22:18:19.017355', 'step': 3777, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:19.053743', 'step': 3777, 'epoch': 3} {'type': 'loss', 'content': 0.00532106775790453, 'timestamp': '2025-09-30 22:18:19.060620', 'step': 3778, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:19.096076', 'step': 3778, 'epoch': 3} {'type': 'loss', 'content': 0.004747466649860144, 'timestamp': '2025-09-30 22:18:19.109663', 'step': 3779, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:19.149284', 'step': 3779, 'epoch': 3} {'type': 'loss', 'content': 0.008793232031166553, 'timestamp': '2025-09-30 22:18:19.178008', 'step': 3780, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:19.222410', 'step': 3780, 'epoch': 3} {'type': 'loss', 'content': 0.013200669549405575, 'timestamp': '2025-09-30 22:18:19.230390', 'step': 3781, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:19.276699', 'step': 3781, 'epoch': 3} {'type': 'loss', 'content': 0.008499288000166416, 'timestamp': '2025-09-30 22:18:19.287765', 'step': 3782, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:19.324157', 'step': 3782, 'epoch': 3} {'type': 'loss', 'content': 0.008562013506889343, 'timestamp': '2025-09-30 22:18:19.337912', 'step': 3783, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:19.378701', 'step': 3783, 'epoch': 3} {'type': 'loss', 'content': 0.0022412212565541267, 'timestamp': '2025-09-30 22:18:19.411851', 'step': 3784, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:19.453428', 'step': 3784, 'epoch': 3} {'type': 'loss', 'content': 0.018065424636006355, 'timestamp': '2025-09-30 22:18:19.463669', 'step': 3785, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:19.504192', 'step': 3785, 'epoch': 3} {'type': 'loss', 'content': 0.0039479536935687065, 'timestamp': '2025-09-30 22:18:19.516736', 'step': 3786, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:19.561715', 'step': 3786, 'epoch': 3} {'type': 'loss', 'content': 0.009036360308527946, 'timestamp': '2025-09-30 22:18:19.575422', 'step': 3787, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:19.616151', 'step': 3787, 'epoch': 3} {'type': 'loss', 'content': 0.01659063808619976, 'timestamp': '2025-09-30 22:18:19.650346', 'step': 3788, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:19.701756', 'step': 3788, 'epoch': 3} {'type': 'loss', 'content': 0.0018625481752678752, 'timestamp': '2025-09-30 22:18:19.714789', 'step': 3789, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:19.760397', 'step': 3789, 'epoch': 3} {'type': 'loss', 'content': 0.0016890355618670583, 'timestamp': '2025-09-30 22:18:19.772752', 'step': 3790, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:19.805736', 'step': 3790, 'epoch': 3} {'type': 'loss', 'content': 0.0023757508024573326, 'timestamp': '2025-09-30 22:18:19.817934', 'step': 3791, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:19.856916', 'step': 3791, 'epoch': 3} {'type': 'loss', 'content': 0.002403699792921543, 'timestamp': '2025-09-30 22:18:19.885237', 'step': 3792, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:19.929851', 'step': 3792, 'epoch': 3} {'type': 'loss', 'content': 0.0006492491811513901, 'timestamp': '2025-09-30 22:18:19.937803', 'step': 3793, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:19.977718', 'step': 3793, 'epoch': 3} {'type': 'loss', 'content': 0.0023878298234194517, 'timestamp': '2025-09-30 22:18:19.991548', 'step': 3794, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:20.031931', 'step': 3794, 'epoch': 3} {'type': 'loss', 'content': 0.0011837700149044394, 'timestamp': '2025-09-30 22:18:20.044324', 'step': 3795, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:18:22.618775', 'step': 3795, 'epoch': 3} {'type': 'pplx', 'content': 5.984622815359175, 'timestamp': '2025-09-30 22:18:22.627025', 'step': 3795, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:22.663615', 'step': 3795, 'epoch': 3} {'type': 'loss', 'content': 0.005366284865885973, 'timestamp': '2025-09-30 22:18:22.694570', 'step': 3796, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:22.732899', 'step': 3796, 'epoch': 3} {'type': 'loss', 'content': 0.0032648183405399323, 'timestamp': '2025-09-30 22:18:22.741131', 'step': 3797, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:22.781169', 'step': 3797, 'epoch': 3} {'type': 'loss', 'content': 0.018629658967256546, 'timestamp': '2025-09-30 22:18:22.793708', 'step': 3798, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:18:22.846167', 'step': 3798, 'epoch': 3} {'type': 'loss', 'content': 0.006669124588370323, 'timestamp': '2025-09-30 22:18:22.862375', 'step': 3799, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:22.909373', 'step': 3799, 'epoch': 3} {'type': 'loss', 'content': 0.009376229718327522, 'timestamp': '2025-09-30 22:18:22.942558', 'step': 3800, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:22.982068', 'step': 3800, 'epoch': 3} {'type': 'loss', 'content': 0.0030857212841510773, 'timestamp': '2025-09-30 22:18:22.995425', 'step': 3801, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:23.031864', 'step': 3801, 'epoch': 3} {'type': 'loss', 'content': 0.019586985930800438, 'timestamp': '2025-09-30 22:18:23.040066', 'step': 3802, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:23.078380', 'step': 3802, 'epoch': 3} {'type': 'loss', 'content': 0.0083575788885355, 'timestamp': '2025-09-30 22:18:23.085212', 'step': 3803, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:23.137035', 'step': 3803, 'epoch': 3} {'type': 'loss', 'content': 0.0033423728309571743, 'timestamp': '2025-09-30 22:18:23.173728', 'step': 3804, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:23.231733', 'step': 3804, 'epoch': 3} {'type': 'loss', 'content': 0.004513347055763006, 'timestamp': '2025-09-30 22:18:23.239850', 'step': 3805, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:23.279003', 'step': 3805, 'epoch': 3} {'type': 'loss', 'content': 0.008332960307598114, 'timestamp': '2025-09-30 22:18:23.291274', 'step': 3806, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:23.339024', 'step': 3806, 'epoch': 3} {'type': 'loss', 'content': 0.00962772872298956, 'timestamp': '2025-09-30 22:18:23.350185', 'step': 3807, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:23.394310', 'step': 3807, 'epoch': 3} {'type': 'loss', 'content': 0.004567010793834925, 'timestamp': '2025-09-30 22:18:23.428979', 'step': 3808, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:23.472171', 'step': 3808, 'epoch': 3} {'type': 'loss', 'content': 0.004992264788597822, 'timestamp': '2025-09-30 22:18:23.485372', 'step': 3809, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:23.531957', 'step': 3809, 'epoch': 3} {'type': 'loss', 'content': 0.006169665139168501, 'timestamp': '2025-09-30 22:18:23.544523', 'step': 3810, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:23.583335', 'step': 3810, 'epoch': 3} {'type': 'loss', 'content': 0.00960304494947195, 'timestamp': '2025-09-30 22:18:23.597131', 'step': 3811, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:23.644699', 'step': 3811, 'epoch': 3} {'type': 'loss', 'content': 0.004652353469282389, 'timestamp': '2025-09-30 22:18:23.681314', 'step': 3812, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:23.725157', 'step': 3812, 'epoch': 3} {'type': 'loss', 'content': 0.023862021043896675, 'timestamp': '2025-09-30 22:18:23.740582', 'step': 3813, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:18:23.789265', 'step': 3813, 'epoch': 3} {'type': 'loss', 'content': 0.012361971661448479, 'timestamp': '2025-09-30 22:18:23.804918', 'step': 3814, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:23.842579', 'step': 3814, 'epoch': 3} {'type': 'loss', 'content': 0.005945376120507717, 'timestamp': '2025-09-30 22:18:23.853646', 'step': 3815, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 16611393146432}, 'timestamp': '2025-09-30 22:18:23.905098', 'step': 3815, 'epoch': 3} {'type': 'loss', 'content': 0.010283276438713074, 'timestamp': '2025-09-30 22:18:23.945250', 'step': 3816, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:18:23.986991', 'step': 3816, 'epoch': 3} {'type': 'loss', 'content': 0.004800677765160799, 'timestamp': '2025-09-30 22:18:24.003662', 'step': 3817, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:24.040227', 'step': 3817, 'epoch': 3} {'type': 'loss', 'content': 0.005037254188209772, 'timestamp': '2025-09-30 22:18:24.054232', 'step': 3818, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:24.089345', 'step': 3818, 'epoch': 3} {'type': 'loss', 'content': 0.005796071607619524, 'timestamp': '2025-09-30 22:18:24.101490', 'step': 3819, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:24.139434', 'step': 3819, 'epoch': 3} {'type': 'loss', 'content': 0.008076971396803856, 'timestamp': '2025-09-30 22:18:24.173675', 'step': 3820, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:18:24.216698', 'step': 3820, 'epoch': 3} {'type': 'loss', 'content': 0.008668972179293633, 'timestamp': '2025-09-30 22:18:24.233433', 'step': 3821, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-09-30 22:18:24.278952', 'step': 3821, 'epoch': 3} {'type': 'loss', 'content': 0.005224195308983326, 'timestamp': '2025-09-30 22:18:24.296534', 'step': 3822, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:24.339588', 'step': 3822, 'epoch': 3} {'type': 'loss', 'content': 0.010332217440009117, 'timestamp': '2025-09-30 22:18:24.353305', 'step': 3823, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:24.393084', 'step': 3823, 'epoch': 3} {'type': 'loss', 'content': 0.01401583757251501, 'timestamp': '2025-09-30 22:18:24.421170', 'step': 3824, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:24.460237', 'step': 3824, 'epoch': 3} {'type': 'loss', 'content': 0.004565891809761524, 'timestamp': '2025-09-30 22:18:24.468096', 'step': 3825, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:24.505472', 'step': 3825, 'epoch': 3} {'type': 'loss', 'content': 0.00525381974875927, 'timestamp': '2025-09-30 22:18:24.518059', 'step': 3826, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:24.553698', 'step': 3826, 'epoch': 3} {'type': 'loss', 'content': 0.005812001880258322, 'timestamp': '2025-09-30 22:18:24.564784', 'step': 3827, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:24.596735', 'step': 3827, 'epoch': 3} {'type': 'loss', 'content': 0.011806738562881947, 'timestamp': '2025-09-30 22:18:24.625496', 'step': 3828, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:24.662898', 'step': 3828, 'epoch': 3} {'type': 'loss', 'content': 0.004773608408868313, 'timestamp': '2025-09-30 22:18:24.668565', 'step': 3829, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:24.708771', 'step': 3829, 'epoch': 3} {'type': 'loss', 'content': 0.017275353893637657, 'timestamp': '2025-09-30 22:18:24.722097', 'step': 3830, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:18:24.767901', 'step': 3830, 'epoch': 3} {'type': 'loss', 'content': 0.014677359722554684, 'timestamp': '2025-09-30 22:18:24.784095', 'step': 3831, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:24.815545', 'step': 3831, 'epoch': 3} {'type': 'loss', 'content': 0.007634055335074663, 'timestamp': '2025-09-30 22:18:24.844159', 'step': 3832, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:24.877349', 'step': 3832, 'epoch': 3} {'type': 'loss', 'content': 0.009904236532747746, 'timestamp': '2025-09-30 22:18:24.885428', 'step': 3833, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:24.923168', 'step': 3833, 'epoch': 3} {'type': 'loss', 'content': 0.01268276758491993, 'timestamp': '2025-09-30 22:18:24.931081', 'step': 3834, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:24.963562', 'step': 3834, 'epoch': 3} {'type': 'loss', 'content': 0.011875173076987267, 'timestamp': '2025-09-30 22:18:24.971164', 'step': 3835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:25.015227', 'step': 3835, 'epoch': 3} {'type': 'loss', 'content': 0.0084492526948452, 'timestamp': '2025-09-30 22:18:25.042960', 'step': 3836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:25.079600', 'step': 3836, 'epoch': 3} {'type': 'loss', 'content': 0.007780611515045166, 'timestamp': '2025-09-30 22:18:25.090165', 'step': 3837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:25.127255', 'step': 3837, 'epoch': 3} {'type': 'loss', 'content': 0.0065346043556928635, 'timestamp': '2025-09-30 22:18:25.138471', 'step': 3838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:25.190501', 'step': 3838, 'epoch': 3} {'type': 'loss', 'content': 0.008517752401530743, 'timestamp': '2025-09-30 22:18:25.201696', 'step': 3839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:25.247126', 'step': 3839, 'epoch': 3} {'type': 'loss', 'content': 0.009599901735782623, 'timestamp': '2025-09-30 22:18:25.282003', 'step': 3840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:25.324605', 'step': 3840, 'epoch': 3} {'type': 'loss', 'content': 0.010341773740947247, 'timestamp': '2025-09-30 22:18:25.335371', 'step': 3841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:25.377590', 'step': 3841, 'epoch': 3} {'type': 'loss', 'content': 0.008768843486905098, 'timestamp': '2025-09-30 22:18:25.391455', 'step': 3842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:25.440471', 'step': 3842, 'epoch': 3} {'type': 'loss', 'content': 0.012737488374114037, 'timestamp': '2025-09-30 22:18:25.454142', 'step': 3843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:18:25.518195', 'step': 3843, 'epoch': 3} {'type': 'loss', 'content': 0.008995888754725456, 'timestamp': '2025-09-30 22:18:25.554696', 'step': 3844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:25.587401', 'step': 3844, 'epoch': 3} {'type': 'loss', 'content': 0.008873233571648598, 'timestamp': '2025-09-30 22:18:25.597391', 'step': 3845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:25.653250', 'step': 3845, 'epoch': 3} {'type': 'loss', 'content': 0.005939814727753401, 'timestamp': '2025-09-30 22:18:25.667080', 'step': 3846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:25.703290', 'step': 3846, 'epoch': 3} {'type': 'loss', 'content': 0.01119601633399725, 'timestamp': '2025-09-30 22:18:25.715584', 'step': 3847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:25.754438', 'step': 3847, 'epoch': 3} {'type': 'loss', 'content': 0.00690952455624938, 'timestamp': '2025-09-30 22:18:25.789268', 'step': 3848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:25.824949', 'step': 3848, 'epoch': 3} {'type': 'loss', 'content': 0.007601768709719181, 'timestamp': '2025-09-30 22:18:25.830640', 'step': 3849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:25.866163', 'step': 3849, 'epoch': 3} {'type': 'loss', 'content': 0.00634763902053237, 'timestamp': '2025-09-30 22:18:25.878698', 'step': 3850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:25.920077', 'step': 3850, 'epoch': 3} {'type': 'loss', 'content': 0.007674683816730976, 'timestamp': '2025-09-30 22:18:25.931010', 'step': 3851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:25.967808', 'step': 3851, 'epoch': 3} {'type': 'loss', 'content': 0.00828044954687357, 'timestamp': '2025-09-30 22:18:26.001300', 'step': 3852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:26.044886', 'step': 3852, 'epoch': 3} {'type': 'loss', 'content': 0.009877484291791916, 'timestamp': '2025-09-30 22:18:26.055797', 'step': 3853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:26.093707', 'step': 3853, 'epoch': 3} {'type': 'loss', 'content': 0.00591434957459569, 'timestamp': '2025-09-30 22:18:26.103992', 'step': 3854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:26.140192', 'step': 3854, 'epoch': 3} {'type': 'loss', 'content': 0.003982429392635822, 'timestamp': '2025-09-30 22:18:26.152462', 'step': 3855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:26.196468', 'step': 3855, 'epoch': 3} {'type': 'loss', 'content': 0.008871220983564854, 'timestamp': '2025-09-30 22:18:26.225283', 'step': 3856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:26.260461', 'step': 3856, 'epoch': 3} {'type': 'loss', 'content': 0.009687988087534904, 'timestamp': '2025-09-30 22:18:26.265947', 'step': 3857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:26.301410', 'step': 3857, 'epoch': 3} {'type': 'loss', 'content': 0.007140415720641613, 'timestamp': '2025-09-30 22:18:26.309245', 'step': 3858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:26.350114', 'step': 3858, 'epoch': 3} {'type': 'loss', 'content': 0.0017469810554757714, 'timestamp': '2025-09-30 22:18:26.363714', 'step': 3859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:26.408542', 'step': 3859, 'epoch': 3} {'type': 'loss', 'content': 0.00501214899122715, 'timestamp': '2025-09-30 22:18:26.436960', 'step': 3860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:26.481463', 'step': 3860, 'epoch': 3} {'type': 'loss', 'content': 0.006503783632069826, 'timestamp': '2025-09-30 22:18:26.491532', 'step': 3861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:26.533909', 'step': 3861, 'epoch': 3} {'type': 'loss', 'content': 0.011045904830098152, 'timestamp': '2025-09-30 22:18:26.544219', 'step': 3862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:26.582027', 'step': 3862, 'epoch': 3} {'type': 'loss', 'content': 0.007116046734154224, 'timestamp': '2025-09-30 22:18:26.589410', 'step': 3863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:26.625089', 'step': 3863, 'epoch': 3} {'type': 'loss', 'content': 0.006851586047559977, 'timestamp': '2025-09-30 22:18:26.658113', 'step': 3864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:26.695516', 'step': 3864, 'epoch': 3} {'type': 'loss', 'content': 0.0008578465203754604, 'timestamp': '2025-09-30 22:18:26.708599', 'step': 3865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:26.755117', 'step': 3865, 'epoch': 3} {'type': 'loss', 'content': 0.006214868277311325, 'timestamp': '2025-09-30 22:18:26.763762', 'step': 3866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:18:26.811109', 'step': 3866, 'epoch': 3} {'type': 'loss', 'content': 0.004790063947439194, 'timestamp': '2025-09-30 22:18:26.827288', 'step': 3867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:26.871596', 'step': 3867, 'epoch': 3} {'type': 'loss', 'content': 0.006224909331649542, 'timestamp': '2025-09-30 22:18:26.905803', 'step': 3868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:26.949964', 'step': 3868, 'epoch': 3} {'type': 'loss', 'content': 0.0054021235555410385, 'timestamp': '2025-09-30 22:18:26.960670', 'step': 3869, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:26.998448', 'step': 3869, 'epoch': 3} {'type': 'loss', 'content': 0.007381778676062822, 'timestamp': '2025-09-30 22:18:27.009466', 'step': 3870, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:27.044310', 'step': 3870, 'epoch': 3} {'type': 'loss', 'content': 0.012521324679255486, 'timestamp': '2025-09-30 22:18:27.052354', 'step': 3871, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:27.090489', 'step': 3871, 'epoch': 3} {'type': 'loss', 'content': 0.007201758679002523, 'timestamp': '2025-09-30 22:18:27.123853', 'step': 3872, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:18:27.166188', 'step': 3872, 'epoch': 3} {'type': 'loss', 'content': 0.004940586630254984, 'timestamp': '2025-09-30 22:18:27.171494', 'step': 3873, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:27.210102', 'step': 3873, 'epoch': 3} {'type': 'loss', 'content': 0.008473108522593975, 'timestamp': '2025-09-30 22:18:27.220601', 'step': 3874, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:27.260120', 'step': 3874, 'epoch': 3} {'type': 'loss', 'content': 0.005420765373855829, 'timestamp': '2025-09-30 22:18:27.273873', 'step': 3875, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:27.306808', 'step': 3875, 'epoch': 3} {'type': 'loss', 'content': 0.011063523590564728, 'timestamp': '2025-09-30 22:18:27.335230', 'step': 3876, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:27.383162', 'step': 3876, 'epoch': 3} {'type': 'loss', 'content': 0.010778565891087055, 'timestamp': '2025-09-30 22:18:27.395787', 'step': 3877, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:27.470218', 'step': 3877, 'epoch': 3} {'type': 'loss', 'content': 0.005143335554748774, 'timestamp': '2025-09-30 22:18:27.482789', 'step': 3878, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:27.535931', 'step': 3878, 'epoch': 3} {'type': 'loss', 'content': 0.006342253182083368, 'timestamp': '2025-09-30 22:18:27.551835', 'step': 3879, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:27.589350', 'step': 3879, 'epoch': 3} {'type': 'loss', 'content': 0.0035624420270323753, 'timestamp': '2025-09-30 22:18:27.622817', 'step': 3880, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:27.671436', 'step': 3880, 'epoch': 3} {'type': 'loss', 'content': 0.010757009498775005, 'timestamp': '2025-09-30 22:18:27.676138', 'step': 3881, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:27.718742', 'step': 3881, 'epoch': 3} {'type': 'loss', 'content': 0.0025167877320200205, 'timestamp': '2025-09-30 22:18:27.729051', 'step': 3882, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:27.773613', 'step': 3882, 'epoch': 3} {'type': 'loss', 'content': 0.007043273653835058, 'timestamp': '2025-09-30 22:18:27.789434', 'step': 3883, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:27.829959', 'step': 3883, 'epoch': 3} {'type': 'loss', 'content': 0.006122028920799494, 'timestamp': '2025-09-30 22:18:27.864591', 'step': 3884, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:27.902894', 'step': 3884, 'epoch': 3} {'type': 'loss', 'content': 0.005711785517632961, 'timestamp': '2025-09-30 22:18:27.915338', 'step': 3885, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:27.964986', 'step': 3885, 'epoch': 3} {'type': 'loss', 'content': 0.0050542643293738365, 'timestamp': '2025-09-30 22:18:27.975391', 'step': 3886, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:28.025093', 'step': 3886, 'epoch': 3} {'type': 'loss', 'content': 0.012293345294892788, 'timestamp': '2025-09-30 22:18:28.035476', 'step': 3887, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:28.074132', 'step': 3887, 'epoch': 3} {'type': 'loss', 'content': 0.005285812076181173, 'timestamp': '2025-09-30 22:18:28.105291', 'step': 3888, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:28.148314', 'step': 3888, 'epoch': 3} {'type': 'loss', 'content': 0.007423700764775276, 'timestamp': '2025-09-30 22:18:28.153994', 'step': 3889, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:28.192415', 'step': 3889, 'epoch': 3} {'type': 'loss', 'content': 0.0060288263484835625, 'timestamp': '2025-09-30 22:18:28.205836', 'step': 3890, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:28.248177', 'step': 3890, 'epoch': 3} {'type': 'loss', 'content': 0.004893283825367689, 'timestamp': '2025-09-30 22:18:28.261984', 'step': 3891, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:28.305167', 'step': 3891, 'epoch': 3} {'type': 'loss', 'content': 0.0061093950644135475, 'timestamp': '2025-09-30 22:18:28.339808', 'step': 3892, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:28.379637', 'step': 3892, 'epoch': 3} {'type': 'loss', 'content': 0.0057939523831009865, 'timestamp': '2025-09-30 22:18:28.392243', 'step': 3893, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:28.434929', 'step': 3893, 'epoch': 3} {'type': 'loss', 'content': 0.00592371542006731, 'timestamp': '2025-09-30 22:18:28.447321', 'step': 3894, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:28.493332', 'step': 3894, 'epoch': 3} {'type': 'loss', 'content': 0.0073332227766513824, 'timestamp': '2025-09-30 22:18:28.506723', 'step': 3895, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:28.560900', 'step': 3895, 'epoch': 3} {'type': 'loss', 'content': 0.009659533388912678, 'timestamp': '2025-09-30 22:18:28.595542', 'step': 3896, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:28.659981', 'step': 3896, 'epoch': 3} {'type': 'loss', 'content': 0.011878445744514465, 'timestamp': '2025-09-30 22:18:28.668157', 'step': 3897, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:28.713444', 'step': 3897, 'epoch': 3} {'type': 'loss', 'content': 0.004296146798878908, 'timestamp': '2025-09-30 22:18:28.727126', 'step': 3898, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:28.770444', 'step': 3898, 'epoch': 3} {'type': 'loss', 'content': 0.005357000045478344, 'timestamp': '2025-09-30 22:18:28.778436', 'step': 3899, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:28.815814', 'step': 3899, 'epoch': 3} {'type': 'loss', 'content': 0.007863102480769157, 'timestamp': '2025-09-30 22:18:28.850118', 'step': 3900, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:28.901235', 'step': 3900, 'epoch': 3} {'type': 'loss', 'content': 0.008372662588953972, 'timestamp': '2025-09-30 22:18:28.911357', 'step': 3901, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:28.952095', 'step': 3901, 'epoch': 3} {'type': 'loss', 'content': 0.007149435579776764, 'timestamp': '2025-09-30 22:18:28.963217', 'step': 3902, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:29.001344', 'step': 3902, 'epoch': 3} {'type': 'loss', 'content': 0.007008867803961039, 'timestamp': '2025-09-30 22:18:29.013604', 'step': 3903, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:29.056577', 'step': 3903, 'epoch': 3} {'type': 'loss', 'content': 0.006546663120388985, 'timestamp': '2025-09-30 22:18:29.091170', 'step': 3904, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:29.141184', 'step': 3904, 'epoch': 3} {'type': 'loss', 'content': 0.004806919023394585, 'timestamp': '2025-09-30 22:18:29.153898', 'step': 3905, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:29.192991', 'step': 3905, 'epoch': 3} {'type': 'loss', 'content': 0.007764711976051331, 'timestamp': '2025-09-30 22:18:29.205574', 'step': 3906, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:29.262414', 'step': 3906, 'epoch': 3} {'type': 'loss', 'content': 0.006050893571227789, 'timestamp': '2025-09-30 22:18:29.275799', 'step': 3907, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:29.319053', 'step': 3907, 'epoch': 3} {'type': 'loss', 'content': 0.005481482483446598, 'timestamp': '2025-09-30 22:18:29.352270', 'step': 3908, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:29.415848', 'step': 3908, 'epoch': 3} {'type': 'loss', 'content': 0.008061875589191914, 'timestamp': '2025-09-30 22:18:29.428530', 'step': 3909, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:29.471366', 'step': 3909, 'epoch': 3} {'type': 'loss', 'content': 0.004771022126078606, 'timestamp': '2025-09-30 22:18:29.478610', 'step': 3910, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:18:32.259037', 'step': 3910, 'epoch': 3} {'type': 'pplx', 'content': 5.814706217503361, 'timestamp': '2025-09-30 22:18:32.261672', 'step': 3910, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:32.294648', 'step': 3910, 'epoch': 3} {'type': 'loss', 'content': 0.0073614949360489845, 'timestamp': '2025-09-30 22:18:32.306895', 'step': 3911, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:32.354228', 'step': 3911, 'epoch': 3} {'type': 'loss', 'content': 0.008830001577734947, 'timestamp': '2025-09-30 22:18:32.383121', 'step': 3912, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:32.444790', 'step': 3912, 'epoch': 3} {'type': 'loss', 'content': 0.009699201211333275, 'timestamp': '2025-09-30 22:18:32.455253', 'step': 3913, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:32.491250', 'step': 3913, 'epoch': 3} {'type': 'loss', 'content': 0.007017786148935556, 'timestamp': '2025-09-30 22:18:32.502320', 'step': 3914, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:32.568903', 'step': 3914, 'epoch': 3} {'type': 'loss', 'content': 0.009106282144784927, 'timestamp': '2025-09-30 22:18:32.576467', 'step': 3915, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:18:32.632651', 'step': 3915, 'epoch': 3} {'type': 'loss', 'content': 0.0056086317636072636, 'timestamp': '2025-09-30 22:18:32.670588', 'step': 3916, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:32.710909', 'step': 3916, 'epoch': 3} {'type': 'loss', 'content': 0.004006392788141966, 'timestamp': '2025-09-30 22:18:32.723547', 'step': 3917, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:32.765443', 'step': 3917, 'epoch': 3} {'type': 'loss', 'content': 0.010423400439321995, 'timestamp': '2025-09-30 22:18:32.777771', 'step': 3918, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:32.838754', 'step': 3918, 'epoch': 3} {'type': 'loss', 'content': 0.00975536648184061, 'timestamp': '2025-09-30 22:18:32.846598', 'step': 3919, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:32.884965', 'step': 3919, 'epoch': 3} {'type': 'loss', 'content': 0.007408326957374811, 'timestamp': '2025-09-30 22:18:32.918150', 'step': 3920, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:32.962424', 'step': 3920, 'epoch': 3} {'type': 'loss', 'content': 0.012103984132409096, 'timestamp': '2025-09-30 22:18:32.970610', 'step': 3921, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:33.011097', 'step': 3921, 'epoch': 3} {'type': 'loss', 'content': 0.0038203338626772165, 'timestamp': '2025-09-30 22:18:33.023633', 'step': 3922, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:33.059940', 'step': 3922, 'epoch': 3} {'type': 'loss', 'content': 0.002586872549727559, 'timestamp': '2025-09-30 22:18:33.067903', 'step': 3923, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:33.106161', 'step': 3923, 'epoch': 3} {'type': 'loss', 'content': 0.0026292195543646812, 'timestamp': '2025-09-30 22:18:33.140392', 'step': 3924, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:33.199633', 'step': 3924, 'epoch': 3} {'type': 'loss', 'content': 0.011017059907317162, 'timestamp': '2025-09-30 22:18:33.208410', 'step': 3925, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:33.256450', 'step': 3925, 'epoch': 3} {'type': 'loss', 'content': 0.006252105347812176, 'timestamp': '2025-09-30 22:18:33.268726', 'step': 3926, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:33.312775', 'step': 3926, 'epoch': 3} {'type': 'loss', 'content': 0.007171166129410267, 'timestamp': '2025-09-30 22:18:33.325119', 'step': 3927, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:33.363839', 'step': 3927, 'epoch': 3} {'type': 'loss', 'content': 0.007750141900032759, 'timestamp': '2025-09-30 22:18:33.398092', 'step': 3928, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:33.438349', 'step': 3928, 'epoch': 3} {'type': 'loss', 'content': 0.013963129371404648, 'timestamp': '2025-09-30 22:18:33.451391', 'step': 3929, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:33.495607', 'step': 3929, 'epoch': 3} {'type': 'loss', 'content': 0.008714791387319565, 'timestamp': '2025-09-30 22:18:33.506774', 'step': 3930, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:33.543823', 'step': 3930, 'epoch': 3} {'type': 'loss', 'content': 0.034263599663972855, 'timestamp': '2025-09-30 22:18:33.554882', 'step': 3931, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:33.605646', 'step': 3931, 'epoch': 3} {'type': 'loss', 'content': 0.005537922028452158, 'timestamp': '2025-09-30 22:18:33.640301', 'step': 3932, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:33.678271', 'step': 3932, 'epoch': 3} {'type': 'loss', 'content': 0.01095846202224493, 'timestamp': '2025-09-30 22:18:33.691235', 'step': 3933, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:18:33.736791', 'step': 3933, 'epoch': 3} {'type': 'loss', 'content': 0.0030966904014348984, 'timestamp': '2025-09-30 22:18:33.752451', 'step': 3934, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:33.800220', 'step': 3934, 'epoch': 3} {'type': 'loss', 'content': 0.002290707314386964, 'timestamp': '2025-09-30 22:18:33.813574', 'step': 3935, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:33.858852', 'step': 3935, 'epoch': 3} {'type': 'loss', 'content': 0.00609651068225503, 'timestamp': '2025-09-30 22:18:33.892243', 'step': 3936, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:33.954454', 'step': 3936, 'epoch': 3} {'type': 'loss', 'content': 0.00569250900298357, 'timestamp': '2025-09-30 22:18:33.967821', 'step': 3937, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:34.012184', 'step': 3937, 'epoch': 3} {'type': 'loss', 'content': 0.013253239914774895, 'timestamp': '2025-09-30 22:18:34.028048', 'step': 3938, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:34.076862', 'step': 3938, 'epoch': 3} {'type': 'loss', 'content': 0.0021241006907075644, 'timestamp': '2025-09-30 22:18:34.090674', 'step': 3939, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:34.167622', 'step': 3939, 'epoch': 3} {'type': 'loss', 'content': 0.0024524808395653963, 'timestamp': '2025-09-30 22:18:34.204414', 'step': 3940, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:34.255718', 'step': 3940, 'epoch': 3} {'type': 'loss', 'content': 0.007233856245875359, 'timestamp': '2025-09-30 22:18:34.265573', 'step': 3941, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:34.323678', 'step': 3941, 'epoch': 3} {'type': 'loss', 'content': 0.012206872925162315, 'timestamp': '2025-09-30 22:18:34.336997', 'step': 3942, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:34.383903', 'step': 3942, 'epoch': 3} {'type': 'loss', 'content': 0.007985597476363182, 'timestamp': '2025-09-30 22:18:34.397627', 'step': 3943, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:34.434976', 'step': 3943, 'epoch': 3} {'type': 'loss', 'content': 0.008505703881382942, 'timestamp': '2025-09-30 22:18:34.469477', 'step': 3944, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:34.521002', 'step': 3944, 'epoch': 3} {'type': 'loss', 'content': 0.008168045431375504, 'timestamp': '2025-09-30 22:18:34.534212', 'step': 3945, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:34.588227', 'step': 3945, 'epoch': 3} {'type': 'loss', 'content': 0.004361226689070463, 'timestamp': '2025-09-30 22:18:34.600636', 'step': 3946, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:34.647458', 'step': 3946, 'epoch': 3} {'type': 'loss', 'content': 0.012169808149337769, 'timestamp': '2025-09-30 22:18:34.660867', 'step': 3947, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:34.705853', 'step': 3947, 'epoch': 3} {'type': 'loss', 'content': 0.006339035928249359, 'timestamp': '2025-09-30 22:18:34.740478', 'step': 3948, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:18:34.792364', 'step': 3948, 'epoch': 3} {'type': 'loss', 'content': 0.004964579362422228, 'timestamp': '2025-09-30 22:18:34.809380', 'step': 3949, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:34.870754', 'step': 3949, 'epoch': 3} {'type': 'loss', 'content': 0.0011956646339967847, 'timestamp': '2025-09-30 22:18:34.884161', 'step': 3950, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:34.919046', 'step': 3950, 'epoch': 3} {'type': 'loss', 'content': 0.00508431950584054, 'timestamp': '2025-09-30 22:18:34.926944', 'step': 3951, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:34.982616', 'step': 3951, 'epoch': 3} {'type': 'loss', 'content': 0.007752721197903156, 'timestamp': '2025-09-30 22:18:35.013877', 'step': 3952, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:35.053195', 'step': 3952, 'epoch': 3} {'type': 'loss', 'content': 0.024671725928783417, 'timestamp': '2025-09-30 22:18:35.058838', 'step': 3953, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:35.118430', 'step': 3953, 'epoch': 3} {'type': 'loss', 'content': 0.004845732357352972, 'timestamp': '2025-09-30 22:18:35.130985', 'step': 3954, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:35.191301', 'step': 3954, 'epoch': 3} {'type': 'loss', 'content': 0.006331304553896189, 'timestamp': '2025-09-30 22:18:35.204688', 'step': 3955, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:35.246378', 'step': 3955, 'epoch': 3} {'type': 'loss', 'content': 0.0057160560972988605, 'timestamp': '2025-09-30 22:18:35.280800', 'step': 3956, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:35.321808', 'step': 3956, 'epoch': 3} {'type': 'loss', 'content': 0.0051583037711679935, 'timestamp': '2025-09-30 22:18:35.332309', 'step': 3957, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:35.379711', 'step': 3957, 'epoch': 3} {'type': 'loss', 'content': 0.006589068099856377, 'timestamp': '2025-09-30 22:18:35.393015', 'step': 3958, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:18:35.442600', 'step': 3958, 'epoch': 3} {'type': 'loss', 'content': 0.00613076938316226, 'timestamp': '2025-09-30 22:18:35.458904', 'step': 3959, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:35.511880', 'step': 3959, 'epoch': 3} {'type': 'loss', 'content': 0.003702125744894147, 'timestamp': '2025-09-30 22:18:35.548579', 'step': 3960, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:35.607235', 'step': 3960, 'epoch': 3} {'type': 'loss', 'content': 0.0032431448344141245, 'timestamp': '2025-09-30 22:18:35.620619', 'step': 3961, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:35.671986', 'step': 3961, 'epoch': 3} {'type': 'loss', 'content': 0.00313235679641366, 'timestamp': '2025-09-30 22:18:35.682535', 'step': 3962, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:18:35.734631', 'step': 3962, 'epoch': 3} {'type': 'loss', 'content': 0.003487497800961137, 'timestamp': '2025-09-30 22:18:35.750797', 'step': 3963, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:35.795430', 'step': 3963, 'epoch': 3} {'type': 'loss', 'content': 0.003668895922601223, 'timestamp': '2025-09-30 22:18:35.830039', 'step': 3964, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:35.864463', 'step': 3964, 'epoch': 3} {'type': 'loss', 'content': 0.0067418343387544155, 'timestamp': '2025-09-30 22:18:35.874247', 'step': 3965, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:35.926358', 'step': 3965, 'epoch': 3} {'type': 'loss', 'content': 0.0065846871584653854, 'timestamp': '2025-09-30 22:18:35.934000', 'step': 3966, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:35.967325', 'step': 3966, 'epoch': 3} {'type': 'loss', 'content': 0.0036082533188164234, 'timestamp': '2025-09-30 22:18:35.977750', 'step': 3967, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:36.014897', 'step': 3967, 'epoch': 3} {'type': 'loss', 'content': 0.004577101673930883, 'timestamp': '2025-09-30 22:18:36.046115', 'step': 3968, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:36.092447', 'step': 3968, 'epoch': 3} {'type': 'loss', 'content': 0.012322023510932922, 'timestamp': '2025-09-30 22:18:36.105592', 'step': 3969, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:36.141831', 'step': 3969, 'epoch': 3} {'type': 'loss', 'content': 0.002453200053423643, 'timestamp': '2025-09-30 22:18:36.152026', 'step': 3970, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:36.205457', 'step': 3970, 'epoch': 3} {'type': 'loss', 'content': 0.007268570829182863, 'timestamp': '2025-09-30 22:18:36.219466', 'step': 3971, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:36.258169', 'step': 3971, 'epoch': 3} {'type': 'loss', 'content': 0.007819430902600288, 'timestamp': '2025-09-30 22:18:36.291295', 'step': 3972, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:18:36.335473', 'step': 3972, 'epoch': 3} {'type': 'loss', 'content': 0.007106750272214413, 'timestamp': '2025-09-30 22:18:36.350616', 'step': 3973, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:36.396747', 'step': 3973, 'epoch': 3} {'type': 'loss', 'content': 0.010787696577608585, 'timestamp': '2025-09-30 22:18:36.412628', 'step': 3974, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:36.458589', 'step': 3974, 'epoch': 3} {'type': 'loss', 'content': 0.005603353027254343, 'timestamp': '2025-09-30 22:18:36.469670', 'step': 3975, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:36.520178', 'step': 3975, 'epoch': 3} {'type': 'loss', 'content': 0.004301795735955238, 'timestamp': '2025-09-30 22:18:36.551225', 'step': 3976, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:36.587429', 'step': 3976, 'epoch': 3} {'type': 'loss', 'content': 0.004883557092398405, 'timestamp': '2025-09-30 22:18:36.596163', 'step': 3977, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:36.644774', 'step': 3977, 'epoch': 3} {'type': 'loss', 'content': 0.006246346514672041, 'timestamp': '2025-09-30 22:18:36.658201', 'step': 3978, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:36.709559', 'step': 3978, 'epoch': 3} {'type': 'loss', 'content': 0.00462919007986784, 'timestamp': '2025-09-30 22:18:36.722979', 'step': 3979, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:36.769480', 'step': 3979, 'epoch': 3} {'type': 'loss', 'content': 0.008667691610753536, 'timestamp': '2025-09-30 22:18:36.802693', 'step': 3980, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:36.848405', 'step': 3980, 'epoch': 3} {'type': 'loss', 'content': 0.005111383739858866, 'timestamp': '2025-09-30 22:18:36.857147', 'step': 3981, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:36.899671', 'step': 3981, 'epoch': 3} {'type': 'loss', 'content': 0.0035054711624979973, 'timestamp': '2025-09-30 22:18:36.911977', 'step': 3982, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:36.957184', 'step': 3982, 'epoch': 3} {'type': 'loss', 'content': 0.0011335327289998531, 'timestamp': '2025-09-30 22:18:36.964399', 'step': 3983, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:37.016493', 'step': 3983, 'epoch': 3} {'type': 'loss', 'content': 0.004883974324911833, 'timestamp': '2025-09-30 22:18:37.053224', 'step': 3984, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:37.089444', 'step': 3984, 'epoch': 3} {'type': 'loss', 'content': 0.009602776728570461, 'timestamp': '2025-09-30 22:18:37.095338', 'step': 3985, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:37.141870', 'step': 3985, 'epoch': 3} {'type': 'loss', 'content': 0.014472908340394497, 'timestamp': '2025-09-30 22:18:37.148915', 'step': 3986, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:37.184512', 'step': 3986, 'epoch': 3} {'type': 'loss', 'content': 0.002878964878618717, 'timestamp': '2025-09-30 22:18:37.194883', 'step': 3987, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:37.240943', 'step': 3987, 'epoch': 3} {'type': 'loss', 'content': 0.007471582852303982, 'timestamp': '2025-09-30 22:18:37.272809', 'step': 3988, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:37.312852', 'step': 3988, 'epoch': 3} {'type': 'loss', 'content': 0.011246595531702042, 'timestamp': '2025-09-30 22:18:37.326152', 'step': 3989, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:37.361723', 'step': 3989, 'epoch': 3} {'type': 'loss', 'content': 0.004211165476590395, 'timestamp': '2025-09-30 22:18:37.369749', 'step': 3990, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:37.407055', 'step': 3990, 'epoch': 3} {'type': 'loss', 'content': 0.00475333584472537, 'timestamp': '2025-09-30 22:18:37.414272', 'step': 3991, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:37.454266', 'step': 3991, 'epoch': 3} {'type': 'loss', 'content': 0.006536404136568308, 'timestamp': '2025-09-30 22:18:37.486217', 'step': 3992, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:37.531183', 'step': 3992, 'epoch': 3} {'type': 'loss', 'content': 0.004319012630730867, 'timestamp': '2025-09-30 22:18:37.543767', 'step': 3993, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:37.578660', 'step': 3993, 'epoch': 3} {'type': 'loss', 'content': 0.004011413082480431, 'timestamp': '2025-09-30 22:18:37.586415', 'step': 3994, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:37.623451', 'step': 3994, 'epoch': 3} {'type': 'loss', 'content': 0.007326128892600536, 'timestamp': '2025-09-30 22:18:37.631307', 'step': 3995, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:37.672911', 'step': 3995, 'epoch': 3} {'type': 'loss', 'content': 0.007306681480258703, 'timestamp': '2025-09-30 22:18:37.707546', 'step': 3996, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:37.760139', 'step': 3996, 'epoch': 3} {'type': 'loss', 'content': 0.006555972620844841, 'timestamp': '2025-09-30 22:18:37.775552', 'step': 3997, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:18:37.835272', 'step': 3997, 'epoch': 3} {'type': 'loss', 'content': 0.007153916638344526, 'timestamp': '2025-09-30 22:18:37.852431', 'step': 3998, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:37.905432', 'step': 3998, 'epoch': 3} {'type': 'loss', 'content': 0.007474151905626059, 'timestamp': '2025-09-30 22:18:37.919245', 'step': 3999, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:37.961319', 'step': 3999, 'epoch': 3} {'type': 'loss', 'content': 0.004183096345514059, 'timestamp': '2025-09-30 22:18:37.996197', 'step': 4000, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 4000', 'timestamp': '2025-09-30 22:18:43.644436', 'step': 4000, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:43.694862', 'step': 4000, 'epoch': 3} {'type': 'loss', 'content': 0.009015398100018501, 'timestamp': '2025-09-30 22:18:43.701075', 'step': 4001, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:43.740755', 'step': 4001, 'epoch': 3} {'type': 'loss', 'content': 0.011754374019801617, 'timestamp': '2025-09-30 22:18:43.754477', 'step': 4002, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:43.791366', 'step': 4002, 'epoch': 3} {'type': 'loss', 'content': 0.006731737405061722, 'timestamp': '2025-09-30 22:18:43.803097', 'step': 4003, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:43.855445', 'step': 4003, 'epoch': 3} {'type': 'loss', 'content': 0.0032603219151496887, 'timestamp': '2025-09-30 22:18:43.889653', 'step': 4004, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:43.942019', 'step': 4004, 'epoch': 3} {'type': 'loss', 'content': 0.004216242115944624, 'timestamp': '2025-09-30 22:18:43.955376', 'step': 4005, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:44.001833', 'step': 4005, 'epoch': 3} {'type': 'loss', 'content': 0.006579623557627201, 'timestamp': '2025-09-30 22:18:44.015234', 'step': 4006, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:44.059013', 'step': 4006, 'epoch': 3} {'type': 'loss', 'content': 0.007562713697552681, 'timestamp': '2025-09-30 22:18:44.071626', 'step': 4007, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:44.109601', 'step': 4007, 'epoch': 3} {'type': 'loss', 'content': 0.005524170119315386, 'timestamp': '2025-09-30 22:18:44.144261', 'step': 4008, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:44.184357', 'step': 4008, 'epoch': 3} {'type': 'loss', 'content': 0.00575696025043726, 'timestamp': '2025-09-30 22:18:44.192374', 'step': 4009, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:44.237343', 'step': 4009, 'epoch': 3} {'type': 'loss', 'content': 0.004735125228762627, 'timestamp': '2025-09-30 22:18:44.249936', 'step': 4010, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:44.290493', 'step': 4010, 'epoch': 3} {'type': 'loss', 'content': 0.00848841480910778, 'timestamp': '2025-09-30 22:18:44.297656', 'step': 4011, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:44.336032', 'step': 4011, 'epoch': 3} {'type': 'loss', 'content': 0.005274408962577581, 'timestamp': '2025-09-30 22:18:44.364182', 'step': 4012, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:44.410082', 'step': 4012, 'epoch': 3} {'type': 'loss', 'content': 0.0018573726993054152, 'timestamp': '2025-09-30 22:18:44.414767', 'step': 4013, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:44.462993', 'step': 4013, 'epoch': 3} {'type': 'loss', 'content': 0.010334143415093422, 'timestamp': '2025-09-30 22:18:44.470329', 'step': 4014, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:44.514719', 'step': 4014, 'epoch': 3} {'type': 'loss', 'content': 0.009302409365773201, 'timestamp': '2025-09-30 22:18:44.527262', 'step': 4015, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:18:44.582526', 'step': 4015, 'epoch': 3} {'type': 'loss', 'content': 0.0037581834476441145, 'timestamp': '2025-09-30 22:18:44.607587', 'step': 4016, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:44.645433', 'step': 4016, 'epoch': 3} {'type': 'loss', 'content': 0.0035524829290807247, 'timestamp': '2025-09-30 22:18:44.658402', 'step': 4017, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:44.696112', 'step': 4017, 'epoch': 3} {'type': 'loss', 'content': 0.003935859072953463, 'timestamp': '2025-09-30 22:18:44.706343', 'step': 4018, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:44.744446', 'step': 4018, 'epoch': 3} {'type': 'loss', 'content': 0.005983736366033554, 'timestamp': '2025-09-30 22:18:44.758242', 'step': 4019, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:18:44.812181', 'step': 4019, 'epoch': 3} {'type': 'loss', 'content': 0.007891970686614513, 'timestamp': '2025-09-30 22:18:44.848617', 'step': 4020, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:44.894541', 'step': 4020, 'epoch': 3} {'type': 'loss', 'content': 0.008960836566984653, 'timestamp': '2025-09-30 22:18:44.907616', 'step': 4021, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:44.948956', 'step': 4021, 'epoch': 3} {'type': 'loss', 'content': 0.008770717307925224, 'timestamp': '2025-09-30 22:18:44.962472', 'step': 4022, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:45.006957', 'step': 4022, 'epoch': 3} {'type': 'loss', 'content': 0.006733729038387537, 'timestamp': '2025-09-30 22:18:45.020930', 'step': 4023, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:45.068308', 'step': 4023, 'epoch': 3} {'type': 'loss', 'content': 0.010043778456747532, 'timestamp': '2025-09-30 22:18:45.102656', 'step': 4024, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:45.153104', 'step': 4024, 'epoch': 3} {'type': 'loss', 'content': 0.011883224360644817, 'timestamp': '2025-09-30 22:18:45.161692', 'step': 4025, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:18:47.846177', 'step': 4025, 'epoch': 3} {'type': 'pplx', 'content': 5.809886067136617, 'timestamp': '2025-09-30 22:18:47.851662', 'step': 4025, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:47.894419', 'step': 4025, 'epoch': 3} {'type': 'loss', 'content': 0.004332016687840223, 'timestamp': '2025-09-30 22:18:47.907791', 'step': 4026, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:47.961457', 'step': 4026, 'epoch': 3} {'type': 'loss', 'content': 0.006139960139989853, 'timestamp': '2025-09-30 22:18:47.968640', 'step': 4027, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:48.015492', 'step': 4027, 'epoch': 3} {'type': 'loss', 'content': 0.003429949749261141, 'timestamp': '2025-09-30 22:18:48.044411', 'step': 4028, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:48.082663', 'step': 4028, 'epoch': 3} {'type': 'loss', 'content': 0.005339586641639471, 'timestamp': '2025-09-30 22:18:48.100991', 'step': 4029, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:48.136568', 'step': 4029, 'epoch': 3} {'type': 'loss', 'content': 0.0034270884934812784, 'timestamp': '2025-09-30 22:18:48.147577', 'step': 4030, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:48.186602', 'step': 4030, 'epoch': 3} {'type': 'loss', 'content': 0.002700925339013338, 'timestamp': '2025-09-30 22:18:48.193394', 'step': 4031, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:48.246419', 'step': 4031, 'epoch': 3} {'type': 'loss', 'content': 0.008455545641481876, 'timestamp': '2025-09-30 22:18:48.277518', 'step': 4032, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:48.323026', 'step': 4032, 'epoch': 3} {'type': 'loss', 'content': 0.010105387307703495, 'timestamp': '2025-09-30 22:18:48.331881', 'step': 4033, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:48.374076', 'step': 4033, 'epoch': 3} {'type': 'loss', 'content': 0.005560815799981356, 'timestamp': '2025-09-30 22:18:48.382397', 'step': 4034, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:48.435088', 'step': 4034, 'epoch': 3} {'type': 'loss', 'content': 0.009280094876885414, 'timestamp': '2025-09-30 22:18:48.445050', 'step': 4035, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:48.482514', 'step': 4035, 'epoch': 3} {'type': 'loss', 'content': 0.0043348814360797405, 'timestamp': '2025-09-30 22:18:48.518953', 'step': 4036, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:48.574409', 'step': 4036, 'epoch': 3} {'type': 'loss', 'content': 0.007800571154803038, 'timestamp': '2025-09-30 22:18:48.587588', 'step': 4037, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:48.641475', 'step': 4037, 'epoch': 3} {'type': 'loss', 'content': 0.006277676206082106, 'timestamp': '2025-09-30 22:18:48.652652', 'step': 4038, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:48.709752', 'step': 4038, 'epoch': 3} {'type': 'loss', 'content': 0.006815760396420956, 'timestamp': '2025-09-30 22:18:48.720955', 'step': 4039, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:48.770198', 'step': 4039, 'epoch': 3} {'type': 'loss', 'content': 0.003317673224955797, 'timestamp': '2025-09-30 22:18:48.804525', 'step': 4040, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:48.851356', 'step': 4040, 'epoch': 3} {'type': 'loss', 'content': 0.00768579775467515, 'timestamp': '2025-09-30 22:18:48.863979', 'step': 4041, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:48.900751', 'step': 4041, 'epoch': 3} {'type': 'loss', 'content': 0.004484781064093113, 'timestamp': '2025-09-30 22:18:48.914537', 'step': 4042, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:48.955563', 'step': 4042, 'epoch': 3} {'type': 'loss', 'content': 0.012383470311760902, 'timestamp': '2025-09-30 22:18:48.969517', 'step': 4043, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:49.018841', 'step': 4043, 'epoch': 3} {'type': 'loss', 'content': 0.007896116003394127, 'timestamp': '2025-09-30 22:18:49.053135', 'step': 4044, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:49.090178', 'step': 4044, 'epoch': 3} {'type': 'loss', 'content': 0.00294464617036283, 'timestamp': '2025-09-30 22:18:49.100143', 'step': 4045, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:49.146117', 'step': 4045, 'epoch': 3} {'type': 'loss', 'content': 0.0064725568518042564, 'timestamp': '2025-09-30 22:18:49.158462', 'step': 4046, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:49.193794', 'step': 4046, 'epoch': 3} {'type': 'loss', 'content': 0.005064612254500389, 'timestamp': '2025-09-30 22:18:49.200856', 'step': 4047, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:49.245051', 'step': 4047, 'epoch': 3} {'type': 'loss', 'content': 0.003626457182690501, 'timestamp': '2025-09-30 22:18:49.281756', 'step': 4048, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:49.341905', 'step': 4048, 'epoch': 3} {'type': 'loss', 'content': 0.006672897841781378, 'timestamp': '2025-09-30 22:18:49.355017', 'step': 4049, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:49.396912', 'step': 4049, 'epoch': 3} {'type': 'loss', 'content': 0.007566211279481649, 'timestamp': '2025-09-30 22:18:49.404700', 'step': 4050, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:49.457773', 'step': 4050, 'epoch': 3} {'type': 'loss', 'content': 0.009887561202049255, 'timestamp': '2025-09-30 22:18:49.473638', 'step': 4051, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:18:49.532939', 'step': 4051, 'epoch': 3} {'type': 'loss', 'content': 0.005307478364557028, 'timestamp': '2025-09-30 22:18:49.571205', 'step': 4052, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:49.619346', 'step': 4052, 'epoch': 3} {'type': 'loss', 'content': 0.009220928885042667, 'timestamp': '2025-09-30 22:18:49.629165', 'step': 4053, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:49.674453', 'step': 4053, 'epoch': 3} {'type': 'loss', 'content': 0.01152608823031187, 'timestamp': '2025-09-30 22:18:49.686981', 'step': 4054, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:49.729940', 'step': 4054, 'epoch': 3} {'type': 'loss', 'content': 0.0038574920035898685, 'timestamp': '2025-09-30 22:18:49.740222', 'step': 4055, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:49.788777', 'step': 4055, 'epoch': 3} {'type': 'loss', 'content': 0.010885195806622505, 'timestamp': '2025-09-30 22:18:49.821854', 'step': 4056, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:49.870966', 'step': 4056, 'epoch': 3} {'type': 'loss', 'content': 0.004181805998086929, 'timestamp': '2025-09-30 22:18:49.879445', 'step': 4057, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:49.932493', 'step': 4057, 'epoch': 3} {'type': 'loss', 'content': 0.004165918566286564, 'timestamp': '2025-09-30 22:18:49.942936', 'step': 4058, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:49.987852', 'step': 4058, 'epoch': 3} {'type': 'loss', 'content': 0.007592889945954084, 'timestamp': '2025-09-30 22:18:49.996012', 'step': 4059, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:50.045488', 'step': 4059, 'epoch': 3} {'type': 'loss', 'content': 0.009840689599514008, 'timestamp': '2025-09-30 22:18:50.076659', 'step': 4060, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:50.119002', 'step': 4060, 'epoch': 3} {'type': 'loss', 'content': 0.011874034069478512, 'timestamp': '2025-09-30 22:18:50.128989', 'step': 4061, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:50.173013', 'step': 4061, 'epoch': 3} {'type': 'loss', 'content': 0.0066198790445923805, 'timestamp': '2025-09-30 22:18:50.185511', 'step': 4062, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:50.240226', 'step': 4062, 'epoch': 3} {'type': 'loss', 'content': 0.009846655651926994, 'timestamp': '2025-09-30 22:18:50.254050', 'step': 4063, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:50.296765', 'step': 4063, 'epoch': 3} {'type': 'loss', 'content': 0.001079131499864161, 'timestamp': '2025-09-30 22:18:50.326947', 'step': 4064, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:50.362123', 'step': 4064, 'epoch': 3} {'type': 'loss', 'content': 0.00587060209363699, 'timestamp': '2025-09-30 22:18:50.367558', 'step': 4065, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:50.403145', 'step': 4065, 'epoch': 3} {'type': 'loss', 'content': 0.005940629635006189, 'timestamp': '2025-09-30 22:18:50.415680', 'step': 4066, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:50.458616', 'step': 4066, 'epoch': 3} {'type': 'loss', 'content': 0.0011660687159746885, 'timestamp': '2025-09-30 22:18:50.466199', 'step': 4067, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:50.506089', 'step': 4067, 'epoch': 3} {'type': 'loss', 'content': 0.002326065907254815, 'timestamp': '2025-09-30 22:18:50.539234', 'step': 4068, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:50.580698', 'step': 4068, 'epoch': 3} {'type': 'loss', 'content': 0.0008512693457305431, 'timestamp': '2025-09-30 22:18:50.589271', 'step': 4069, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:50.627165', 'step': 4069, 'epoch': 3} {'type': 'loss', 'content': 0.010879871435463428, 'timestamp': '2025-09-30 22:18:50.639447', 'step': 4070, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:50.690226', 'step': 4070, 'epoch': 3} {'type': 'loss', 'content': 0.00951231550425291, 'timestamp': '2025-09-30 22:18:50.702775', 'step': 4071, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:50.759843', 'step': 4071, 'epoch': 3} {'type': 'loss', 'content': 0.0026263566687703133, 'timestamp': '2025-09-30 22:18:50.794136', 'step': 4072, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:50.838345', 'step': 4072, 'epoch': 3} {'type': 'loss', 'content': 0.004461831878870726, 'timestamp': '2025-09-30 22:18:50.846894', 'step': 4073, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:50.889550', 'step': 4073, 'epoch': 3} {'type': 'loss', 'content': 0.008627448230981827, 'timestamp': '2025-09-30 22:18:50.897676', 'step': 4074, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:50.943787', 'step': 4074, 'epoch': 3} {'type': 'loss', 'content': 0.009395519271492958, 'timestamp': '2025-09-30 22:18:50.957506', 'step': 4075, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:50.997473', 'step': 4075, 'epoch': 3} {'type': 'loss', 'content': 0.0036263596266508102, 'timestamp': '2025-09-30 22:18:51.025540', 'step': 4076, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:51.058907', 'step': 4076, 'epoch': 3} {'type': 'loss', 'content': 0.01069303136318922, 'timestamp': '2025-09-30 22:18:51.068753', 'step': 4077, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:51.107921', 'step': 4077, 'epoch': 3} {'type': 'loss', 'content': 0.0033190203830599785, 'timestamp': '2025-09-30 22:18:51.120189', 'step': 4078, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:51.158668', 'step': 4078, 'epoch': 3} {'type': 'loss', 'content': 0.007404983509331942, 'timestamp': '2025-09-30 22:18:51.168973', 'step': 4079, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:51.209568', 'step': 4079, 'epoch': 3} {'type': 'loss', 'content': 0.002601143904030323, 'timestamp': '2025-09-30 22:18:51.243010', 'step': 4080, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:51.280124', 'step': 4080, 'epoch': 3} {'type': 'loss', 'content': 0.014199675992131233, 'timestamp': '2025-09-30 22:18:51.292770', 'step': 4081, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:51.334351', 'step': 4081, 'epoch': 3} {'type': 'loss', 'content': 0.004337809514254332, 'timestamp': '2025-09-30 22:18:51.346928', 'step': 4082, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:51.382342', 'step': 4082, 'epoch': 3} {'type': 'loss', 'content': 0.006910453084856272, 'timestamp': '2025-09-30 22:18:51.390303', 'step': 4083, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:51.435167', 'step': 4083, 'epoch': 3} {'type': 'loss', 'content': 0.0042189303785562515, 'timestamp': '2025-09-30 22:18:51.469746', 'step': 4084, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:51.509572', 'step': 4084, 'epoch': 3} {'type': 'loss', 'content': 0.006959874182939529, 'timestamp': '2025-09-30 22:18:51.519550', 'step': 4085, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:51.565429', 'step': 4085, 'epoch': 3} {'type': 'loss', 'content': 0.005139836110174656, 'timestamp': '2025-09-30 22:18:51.578844', 'step': 4086, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:51.628633', 'step': 4086, 'epoch': 3} {'type': 'loss', 'content': 0.0036931203212589025, 'timestamp': '2025-09-30 22:18:51.642624', 'step': 4087, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:51.695619', 'step': 4087, 'epoch': 3} {'type': 'loss', 'content': 0.005903789307922125, 'timestamp': '2025-09-30 22:18:51.729035', 'step': 4088, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:51.773651', 'step': 4088, 'epoch': 3} {'type': 'loss', 'content': 0.006845736410468817, 'timestamp': '2025-09-30 22:18:51.786321', 'step': 4089, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:51.834299', 'step': 4089, 'epoch': 3} {'type': 'loss', 'content': 0.006006104405969381, 'timestamp': '2025-09-30 22:18:51.844709', 'step': 4090, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:51.882781', 'step': 4090, 'epoch': 3} {'type': 'loss', 'content': 0.004835678264498711, 'timestamp': '2025-09-30 22:18:51.890443', 'step': 4091, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:51.931702', 'step': 4091, 'epoch': 3} {'type': 'loss', 'content': 0.008269528858363628, 'timestamp': '2025-09-30 22:18:51.960794', 'step': 4092, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:51.996031', 'step': 4092, 'epoch': 3} {'type': 'loss', 'content': 0.0042832400649785995, 'timestamp': '2025-09-30 22:18:52.003606', 'step': 4093, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:52.050170', 'step': 4093, 'epoch': 3} {'type': 'loss', 'content': 0.0063995858654379845, 'timestamp': '2025-09-30 22:18:52.058243', 'step': 4094, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:18:52.095785', 'step': 4094, 'epoch': 3} {'type': 'loss', 'content': 0.006296542473137379, 'timestamp': '2025-09-30 22:18:52.100027', 'step': 4095, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:18:52.138114', 'step': 4095, 'epoch': 3} {'type': 'loss', 'content': 0.004635804798454046, 'timestamp': '2025-09-30 22:18:52.166234', 'step': 4096, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:18:52.216739', 'step': 4096, 'epoch': 3} {'type': 'loss', 'content': 0.006758654490113258, 'timestamp': '2025-09-30 22:18:52.234103', 'step': 4097, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:52.273179', 'step': 4097, 'epoch': 3} {'type': 'loss', 'content': 0.00433450099080801, 'timestamp': '2025-09-30 22:18:52.286924', 'step': 4098, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:18:52.327342', 'step': 4098, 'epoch': 3} {'type': 'loss', 'content': 0.022350069135427475, 'timestamp': '2025-09-30 22:18:52.335559', 'step': 4099, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:52.375723', 'step': 4099, 'epoch': 3} {'type': 'loss', 'content': 0.006762483157217503, 'timestamp': '2025-09-30 22:18:52.407495', 'step': 4100, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:52.453283', 'step': 4100, 'epoch': 3} {'type': 'loss', 'content': 0.006077112630009651, 'timestamp': '2025-09-30 22:18:52.459842', 'step': 4101, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:52.508526', 'step': 4101, 'epoch': 3} {'type': 'loss', 'content': 0.009396334178745747, 'timestamp': '2025-09-30 22:18:52.517162', 'step': 4102, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-09-30 22:18:52.584744', 'step': 4102, 'epoch': 3} {'type': 'loss', 'content': 0.0033708529081195593, 'timestamp': '2025-09-30 22:18:52.602388', 'step': 4103, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:52.641561', 'step': 4103, 'epoch': 3} {'type': 'loss', 'content': 0.004656896460801363, 'timestamp': '2025-09-30 22:18:52.670170', 'step': 4104, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:52.705993', 'step': 4104, 'epoch': 3} {'type': 'loss', 'content': 0.009769827127456665, 'timestamp': '2025-09-30 22:18:52.711671', 'step': 4105, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:52.748407', 'step': 4105, 'epoch': 3} {'type': 'loss', 'content': 0.001809976645745337, 'timestamp': '2025-09-30 22:18:52.756139', 'step': 4106, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:52.802685', 'step': 4106, 'epoch': 3} {'type': 'loss', 'content': 0.005910452920943499, 'timestamp': '2025-09-30 22:18:52.811137', 'step': 4107, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:52.868617', 'step': 4107, 'epoch': 3} {'type': 'loss', 'content': 0.00518727907910943, 'timestamp': '2025-09-30 22:18:52.903250', 'step': 4108, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:52.944234', 'step': 4108, 'epoch': 3} {'type': 'loss', 'content': 0.007067784667015076, 'timestamp': '2025-09-30 22:18:52.951454', 'step': 4109, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:52.996175', 'step': 4109, 'epoch': 3} {'type': 'loss', 'content': 0.0049131568521261215, 'timestamp': '2025-09-30 22:18:53.009855', 'step': 4110, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:53.053646', 'step': 4110, 'epoch': 3} {'type': 'loss', 'content': 0.0052266800776124, 'timestamp': '2025-09-30 22:18:53.067007', 'step': 4111, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:53.127768', 'step': 4111, 'epoch': 3} {'type': 'loss', 'content': 0.004913975950330496, 'timestamp': '2025-09-30 22:18:53.162347', 'step': 4112, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:18:53.211080', 'step': 4112, 'epoch': 3} {'type': 'loss', 'content': 0.003879282856360078, 'timestamp': '2025-09-30 22:18:53.227739', 'step': 4113, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:53.268071', 'step': 4113, 'epoch': 3} {'type': 'loss', 'content': 0.004762736149132252, 'timestamp': '2025-09-30 22:18:53.278458', 'step': 4114, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:53.315117', 'step': 4114, 'epoch': 3} {'type': 'loss', 'content': 0.021380791440606117, 'timestamp': '2025-09-30 22:18:53.322995', 'step': 4115, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:53.379341', 'step': 4115, 'epoch': 3} {'type': 'loss', 'content': 0.0066805570386350155, 'timestamp': '2025-09-30 22:18:53.411124', 'step': 4116, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:53.447609', 'step': 4116, 'epoch': 3} {'type': 'loss', 'content': 0.006591428071260452, 'timestamp': '2025-09-30 22:18:53.454704', 'step': 4117, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:53.504531', 'step': 4117, 'epoch': 3} {'type': 'loss', 'content': 0.0032979913521558046, 'timestamp': '2025-09-30 22:18:53.520418', 'step': 4118, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:53.562101', 'step': 4118, 'epoch': 3} {'type': 'loss', 'content': 0.008285662159323692, 'timestamp': '2025-09-30 22:18:53.576008', 'step': 4119, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:53.614311', 'step': 4119, 'epoch': 3} {'type': 'loss', 'content': 0.005769718904048204, 'timestamp': '2025-09-30 22:18:53.648824', 'step': 4120, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:53.685187', 'step': 4120, 'epoch': 3} {'type': 'loss', 'content': 0.004447614308446646, 'timestamp': '2025-09-30 22:18:53.693894', 'step': 4121, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:18:53.746694', 'step': 4121, 'epoch': 3} {'type': 'loss', 'content': 0.0015994708519428968, 'timestamp': '2025-09-30 22:18:53.754520', 'step': 4122, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:53.791736', 'step': 4122, 'epoch': 3} {'type': 'loss', 'content': 0.0061613693833351135, 'timestamp': '2025-09-30 22:18:53.804074', 'step': 4123, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:53.844734', 'step': 4123, 'epoch': 3} {'type': 'loss', 'content': 0.00507601723074913, 'timestamp': '2025-09-30 22:18:53.876537', 'step': 4124, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:53.929647', 'step': 4124, 'epoch': 3} {'type': 'loss', 'content': 0.009173394180834293, 'timestamp': '2025-09-30 22:18:53.942329', 'step': 4125, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:53.987609', 'step': 4125, 'epoch': 3} {'type': 'loss', 'content': 0.0022419069427996874, 'timestamp': '2025-09-30 22:18:53.995246', 'step': 4126, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:54.037669', 'step': 4126, 'epoch': 3} {'type': 'loss', 'content': 0.008145290426909924, 'timestamp': '2025-09-30 22:18:54.048734', 'step': 4127, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:54.083897', 'step': 4127, 'epoch': 3} {'type': 'loss', 'content': 0.002505325712263584, 'timestamp': '2025-09-30 22:18:54.116004', 'step': 4128, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:54.154350', 'step': 4128, 'epoch': 3} {'type': 'loss', 'content': 0.006119324825704098, 'timestamp': '2025-09-30 22:18:54.164970', 'step': 4129, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:54.210351', 'step': 4129, 'epoch': 3} {'type': 'loss', 'content': 0.01269205380231142, 'timestamp': '2025-09-30 22:18:54.218278', 'step': 4130, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:54.266418', 'step': 4130, 'epoch': 3} {'type': 'loss', 'content': 0.009130466729402542, 'timestamp': '2025-09-30 22:18:54.278943', 'step': 4131, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:54.334100', 'step': 4131, 'epoch': 3} {'type': 'loss', 'content': 0.006617514882236719, 'timestamp': '2025-09-30 22:18:54.368788', 'step': 4132, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:54.414098', 'step': 4132, 'epoch': 3} {'type': 'loss', 'content': 0.0031109245028346777, 'timestamp': '2025-09-30 22:18:54.427273', 'step': 4133, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:54.473602', 'step': 4133, 'epoch': 3} {'type': 'loss', 'content': 0.008121561259031296, 'timestamp': '2025-09-30 22:18:54.487406', 'step': 4134, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:18:54.547693', 'step': 4134, 'epoch': 3} {'type': 'loss', 'content': 0.004598654806613922, 'timestamp': '2025-09-30 22:18:54.563988', 'step': 4135, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:54.601796', 'step': 4135, 'epoch': 3} {'type': 'loss', 'content': 0.0059106419794261456, 'timestamp': '2025-09-30 22:18:54.635169', 'step': 4136, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:54.678363', 'step': 4136, 'epoch': 3} {'type': 'loss', 'content': 0.0061576166190207005, 'timestamp': '2025-09-30 22:18:54.691028', 'step': 4137, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:54.741085', 'step': 4137, 'epoch': 3} {'type': 'loss', 'content': 0.0045078047551214695, 'timestamp': '2025-09-30 22:18:54.756988', 'step': 4138, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:54.799841', 'step': 4138, 'epoch': 3} {'type': 'loss', 'content': 0.007308666128665209, 'timestamp': '2025-09-30 22:18:54.809892', 'step': 4139, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:54.850422', 'step': 4139, 'epoch': 3} {'type': 'loss', 'content': 0.00295853428542614, 'timestamp': '2025-09-30 22:18:54.882382', 'step': 4140, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:18:57.607852', 'step': 4140, 'epoch': 3} {'type': 'pplx', 'content': 5.773015440842027, 'timestamp': '2025-09-30 22:18:57.612067', 'step': 4140, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:57.647540', 'step': 4140, 'epoch': 3} {'type': 'loss', 'content': 0.0040814983658492565, 'timestamp': '2025-09-30 22:18:57.654240', 'step': 4141, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:57.708759', 'step': 4141, 'epoch': 3} {'type': 'loss', 'content': 0.0023968368768692017, 'timestamp': '2025-09-30 22:18:57.722124', 'step': 4142, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:57.767755', 'step': 4142, 'epoch': 3} {'type': 'loss', 'content': 0.00340847484767437, 'timestamp': '2025-09-30 22:18:57.781757', 'step': 4143, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:57.819813', 'step': 4143, 'epoch': 3} {'type': 'loss', 'content': 0.005667718127369881, 'timestamp': '2025-09-30 22:18:57.853249', 'step': 4144, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:57.888506', 'step': 4144, 'epoch': 3} {'type': 'loss', 'content': 0.006874716840684414, 'timestamp': '2025-09-30 22:18:57.898433', 'step': 4145, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:57.936487', 'step': 4145, 'epoch': 3} {'type': 'loss', 'content': 0.010788613930344582, 'timestamp': '2025-09-30 22:18:57.949062', 'step': 4146, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:57.988173', 'step': 4146, 'epoch': 3} {'type': 'loss', 'content': 0.0027422248385846615, 'timestamp': '2025-09-30 22:18:57.996094', 'step': 4147, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:18:58.045290', 'step': 4147, 'epoch': 3} {'type': 'loss', 'content': 0.010014859959483147, 'timestamp': '2025-09-30 22:18:58.081802', 'step': 4148, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:18:58.124580', 'step': 4148, 'epoch': 3} {'type': 'loss', 'content': 0.0021662204526364803, 'timestamp': '2025-09-30 22:18:58.139978', 'step': 4149, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:18:58.185982', 'step': 4149, 'epoch': 3} {'type': 'loss', 'content': 0.006480331066995859, 'timestamp': '2025-09-30 22:18:58.199815', 'step': 4150, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:58.237927', 'step': 4150, 'epoch': 3} {'type': 'loss', 'content': 0.010116429068148136, 'timestamp': '2025-09-30 22:18:58.250513', 'step': 4151, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:18:58.306428', 'step': 4151, 'epoch': 3} {'type': 'loss', 'content': 0.0021906248293817043, 'timestamp': '2025-09-30 22:18:58.341313', 'step': 4152, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:18:58.379718', 'step': 4152, 'epoch': 3} {'type': 'loss', 'content': 0.0022437525913119316, 'timestamp': '2025-09-30 22:18:58.388776', 'step': 4153, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:58.433062', 'step': 4153, 'epoch': 3} {'type': 'loss', 'content': 0.0015901202568784356, 'timestamp': '2025-09-30 22:18:58.446743', 'step': 4154, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:58.485634', 'step': 4154, 'epoch': 3} {'type': 'loss', 'content': 0.0040994505397975445, 'timestamp': '2025-09-30 22:18:58.493514', 'step': 4155, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:58.540178', 'step': 4155, 'epoch': 3} {'type': 'loss', 'content': 0.005521212238818407, 'timestamp': '2025-09-30 22:18:58.568655', 'step': 4156, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:18:58.616000', 'step': 4156, 'epoch': 3} {'type': 'loss', 'content': 0.0036060779821127653, 'timestamp': '2025-09-30 22:18:58.633096', 'step': 4157, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:58.673057', 'step': 4157, 'epoch': 3} {'type': 'loss', 'content': 0.021071704104542732, 'timestamp': '2025-09-30 22:18:58.681170', 'step': 4158, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:58.721892', 'step': 4158, 'epoch': 3} {'type': 'loss', 'content': 0.00398464547470212, 'timestamp': '2025-09-30 22:18:58.729544', 'step': 4159, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:58.786858', 'step': 4159, 'epoch': 3} {'type': 'loss', 'content': 0.002623825566843152, 'timestamp': '2025-09-30 22:18:58.818126', 'step': 4160, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:58.853711', 'step': 4160, 'epoch': 3} {'type': 'loss', 'content': 0.007493019104003906, 'timestamp': '2025-09-30 22:18:58.864438', 'step': 4161, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:18:58.910465', 'step': 4161, 'epoch': 3} {'type': 'loss', 'content': 0.005534173455089331, 'timestamp': '2025-09-30 22:18:58.926721', 'step': 4162, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:58.973898', 'step': 4162, 'epoch': 3} {'type': 'loss', 'content': 0.0032436889596283436, 'timestamp': '2025-09-30 22:18:58.986187', 'step': 4163, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:59.024285', 'step': 4163, 'epoch': 3} {'type': 'loss', 'content': 0.007711523678153753, 'timestamp': '2025-09-30 22:18:59.057473', 'step': 4164, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:59.095498', 'step': 4164, 'epoch': 3} {'type': 'loss', 'content': 0.00906739104539156, 'timestamp': '2025-09-30 22:18:59.103651', 'step': 4165, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:59.157741', 'step': 4165, 'epoch': 3} {'type': 'loss', 'content': 0.00245729461312294, 'timestamp': '2025-09-30 22:18:59.165689', 'step': 4166, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:18:59.205803', 'step': 4166, 'epoch': 3} {'type': 'loss', 'content': 0.00661234138533473, 'timestamp': '2025-09-30 22:18:59.217132', 'step': 4167, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:18:59.259405', 'step': 4167, 'epoch': 3} {'type': 'loss', 'content': 0.0027672341093420982, 'timestamp': '2025-09-30 22:18:59.292541', 'step': 4168, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:59.331997', 'step': 4168, 'epoch': 3} {'type': 'loss', 'content': 0.00099122931715101, 'timestamp': '2025-09-30 22:18:59.344611', 'step': 4169, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:18:59.384233', 'step': 4169, 'epoch': 3} {'type': 'loss', 'content': 0.011586759239435196, 'timestamp': '2025-09-30 22:18:59.396762', 'step': 4170, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:59.435086', 'step': 4170, 'epoch': 3} {'type': 'loss', 'content': 0.003056830260902643, 'timestamp': '2025-09-30 22:18:59.448748', 'step': 4171, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:59.484176', 'step': 4171, 'epoch': 3} {'type': 'loss', 'content': 0.00391105841845274, 'timestamp': '2025-09-30 22:18:59.516365', 'step': 4172, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:59.563927', 'step': 4172, 'epoch': 3} {'type': 'loss', 'content': 0.001999323256313801, 'timestamp': '2025-09-30 22:18:59.572685', 'step': 4173, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:18:59.609068', 'step': 4173, 'epoch': 3} {'type': 'loss', 'content': 0.007335766684263945, 'timestamp': '2025-09-30 22:18:59.620178', 'step': 4174, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:59.657079', 'step': 4174, 'epoch': 3} {'type': 'loss', 'content': 0.004585715010762215, 'timestamp': '2025-09-30 22:18:59.667494', 'step': 4175, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:18:59.715849', 'step': 4175, 'epoch': 3} {'type': 'loss', 'content': 0.004428436979651451, 'timestamp': '2025-09-30 22:18:59.752327', 'step': 4176, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:18:59.813253', 'step': 4176, 'epoch': 3} {'type': 'loss', 'content': 0.006036048289388418, 'timestamp': '2025-09-30 22:18:59.825849', 'step': 4177, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:18:59.880058', 'step': 4177, 'epoch': 3} {'type': 'loss', 'content': 0.005262099672108889, 'timestamp': '2025-09-30 22:18:59.893721', 'step': 4178, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:18:59.945714', 'step': 4178, 'epoch': 3} {'type': 'loss', 'content': 0.004960318095982075, 'timestamp': '2025-09-30 22:18:59.953669', 'step': 4179, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:18:59.993644', 'step': 4179, 'epoch': 3} {'type': 'loss', 'content': 0.007269757799804211, 'timestamp': '2025-09-30 22:19:00.027731', 'step': 4180, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:00.063374', 'step': 4180, 'epoch': 3} {'type': 'loss', 'content': 0.005867009982466698, 'timestamp': '2025-09-30 22:19:00.074190', 'step': 4181, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:00.114517', 'step': 4181, 'epoch': 3} {'type': 'loss', 'content': 0.0028793776873499155, 'timestamp': '2025-09-30 22:19:00.128224', 'step': 4182, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:00.169338', 'step': 4182, 'epoch': 3} {'type': 'loss', 'content': 0.0024339838419109583, 'timestamp': '2025-09-30 22:19:00.183029', 'step': 4183, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:00.222213', 'step': 4183, 'epoch': 3} {'type': 'loss', 'content': 0.002243544440716505, 'timestamp': '2025-09-30 22:19:00.256808', 'step': 4184, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:00.302005', 'step': 4184, 'epoch': 3} {'type': 'loss', 'content': 0.005382244009524584, 'timestamp': '2025-09-30 22:19:00.315169', 'step': 4185, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:00.348820', 'step': 4185, 'epoch': 3} {'type': 'loss', 'content': 0.008612517267465591, 'timestamp': '2025-09-30 22:19:00.361097', 'step': 4186, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:00.396554', 'step': 4186, 'epoch': 3} {'type': 'loss', 'content': 0.006431583780795336, 'timestamp': '2025-09-30 22:19:00.404542', 'step': 4187, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:00.439345', 'step': 4187, 'epoch': 3} {'type': 'loss', 'content': 0.007725206669420004, 'timestamp': '2025-09-30 22:19:00.472501', 'step': 4188, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:00.508777', 'step': 4188, 'epoch': 3} {'type': 'loss', 'content': 0.00806194543838501, 'timestamp': '2025-09-30 22:19:00.517157', 'step': 4189, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:00.553602', 'step': 4189, 'epoch': 3} {'type': 'loss', 'content': 0.003619994968175888, 'timestamp': '2025-09-30 22:19:00.561585', 'step': 4190, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:00.617536', 'step': 4190, 'epoch': 3} {'type': 'loss', 'content': 0.001058433554135263, 'timestamp': '2025-09-30 22:19:00.633122', 'step': 4191, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:00.678045', 'step': 4191, 'epoch': 3} {'type': 'loss', 'content': 0.0038083905819803476, 'timestamp': '2025-09-30 22:19:00.706896', 'step': 4192, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:00.748299', 'step': 4192, 'epoch': 3} {'type': 'loss', 'content': 0.006782266311347485, 'timestamp': '2025-09-30 22:19:00.760929', 'step': 4193, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:00.800725', 'step': 4193, 'epoch': 3} {'type': 'loss', 'content': 0.008161071687936783, 'timestamp': '2025-09-30 22:19:00.812015', 'step': 4194, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:00.853572', 'step': 4194, 'epoch': 3} {'type': 'loss', 'content': 0.001998204505071044, 'timestamp': '2025-09-30 22:19:00.867338', 'step': 4195, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:00.923102', 'step': 4195, 'epoch': 3} {'type': 'loss', 'content': 0.001621428644284606, 'timestamp': '2025-09-30 22:19:00.957340', 'step': 4196, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:01.003286', 'step': 4196, 'epoch': 3} {'type': 'loss', 'content': 0.003493086202070117, 'timestamp': '2025-09-30 22:19:01.013903', 'step': 4197, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:01.049699', 'step': 4197, 'epoch': 3} {'type': 'loss', 'content': 0.001369775622151792, 'timestamp': '2025-09-30 22:19:01.062016', 'step': 4198, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:19:01.104786', 'step': 4198, 'epoch': 3} {'type': 'loss', 'content': 0.0035819520708173513, 'timestamp': '2025-09-30 22:19:01.120693', 'step': 4199, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:01.173809', 'step': 4199, 'epoch': 3} {'type': 'loss', 'content': 0.004347503650933504, 'timestamp': '2025-09-30 22:19:01.208632', 'step': 4200, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:01.249813', 'step': 4200, 'epoch': 3} {'type': 'loss', 'content': 0.0013405423378571868, 'timestamp': '2025-09-30 22:19:01.257674', 'step': 4201, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:01.317197', 'step': 4201, 'epoch': 3} {'type': 'loss', 'content': 0.00460792938247323, 'timestamp': '2025-09-30 22:19:01.330924', 'step': 4202, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:01.373942', 'step': 4202, 'epoch': 3} {'type': 'loss', 'content': 0.0026500532403588295, 'timestamp': '2025-09-30 22:19:01.386073', 'step': 4203, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:01.427312', 'step': 4203, 'epoch': 3} {'type': 'loss', 'content': 0.004506793338805437, 'timestamp': '2025-09-30 22:19:01.459137', 'step': 4204, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:01.503612', 'step': 4204, 'epoch': 3} {'type': 'loss', 'content': 0.009327051229774952, 'timestamp': '2025-09-30 22:19:01.513661', 'step': 4205, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:01.561340', 'step': 4205, 'epoch': 3} {'type': 'loss', 'content': 0.0073918732814490795, 'timestamp': '2025-09-30 22:19:01.571678', 'step': 4206, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:01.615606', 'step': 4206, 'epoch': 3} {'type': 'loss', 'content': 0.004413575399667025, 'timestamp': '2025-09-30 22:19:01.626826', 'step': 4207, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:01.664611', 'step': 4207, 'epoch': 3} {'type': 'loss', 'content': 0.003055840963497758, 'timestamp': '2025-09-30 22:19:01.692846', 'step': 4208, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:01.727747', 'step': 4208, 'epoch': 3} {'type': 'loss', 'content': 0.013104332610964775, 'timestamp': '2025-09-30 22:19:01.733327', 'step': 4209, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:01.782918', 'step': 4209, 'epoch': 3} {'type': 'loss', 'content': 0.011806270107626915, 'timestamp': '2025-09-30 22:19:01.790159', 'step': 4210, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:01.834407', 'step': 4210, 'epoch': 3} {'type': 'loss', 'content': 0.009919954463839531, 'timestamp': '2025-09-30 22:19:01.848170', 'step': 4211, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:01.886463', 'step': 4211, 'epoch': 3} {'type': 'loss', 'content': 0.00418841140344739, 'timestamp': '2025-09-30 22:19:01.915289', 'step': 4212, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:01.952394', 'step': 4212, 'epoch': 3} {'type': 'loss', 'content': 0.0029787688981741667, 'timestamp': '2025-09-30 22:19:01.965705', 'step': 4213, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:02.003590', 'step': 4213, 'epoch': 3} {'type': 'loss', 'content': 0.0026507768779993057, 'timestamp': '2025-09-30 22:19:02.016995', 'step': 4214, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:02.063406', 'step': 4214, 'epoch': 3} {'type': 'loss', 'content': 0.011196503415703773, 'timestamp': '2025-09-30 22:19:02.071327', 'step': 4215, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:02.109099', 'step': 4215, 'epoch': 3} {'type': 'loss', 'content': 0.011339684948325157, 'timestamp': '2025-09-30 22:19:02.142230', 'step': 4216, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:02.184627', 'step': 4216, 'epoch': 3} {'type': 'loss', 'content': 0.007355344947427511, 'timestamp': '2025-09-30 22:19:02.197893', 'step': 4217, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:02.242566', 'step': 4217, 'epoch': 3} {'type': 'loss', 'content': 0.0026141307316720486, 'timestamp': '2025-09-30 22:19:02.255092', 'step': 4218, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:02.296408', 'step': 4218, 'epoch': 3} {'type': 'loss', 'content': 0.0013344758190214634, 'timestamp': '2025-09-30 22:19:02.310222', 'step': 4219, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:02.345179', 'step': 4219, 'epoch': 3} {'type': 'loss', 'content': 0.004837046377360821, 'timestamp': '2025-09-30 22:19:02.373388', 'step': 4220, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:02.407432', 'step': 4220, 'epoch': 3} {'type': 'loss', 'content': 0.011121508665382862, 'timestamp': '2025-09-30 22:19:02.412350', 'step': 4221, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:02.446116', 'step': 4221, 'epoch': 3} {'type': 'loss', 'content': 0.017535170540213585, 'timestamp': '2025-09-30 22:19:02.454025', 'step': 4222, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:02.505142', 'step': 4222, 'epoch': 3} {'type': 'loss', 'content': 0.0030088103376328945, 'timestamp': '2025-09-30 22:19:02.524032', 'step': 4223, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:02.561067', 'step': 4223, 'epoch': 3} {'type': 'loss', 'content': 0.0013439098838716745, 'timestamp': '2025-09-30 22:19:02.589197', 'step': 4224, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:02.628464', 'step': 4224, 'epoch': 3} {'type': 'loss', 'content': 0.004040613770484924, 'timestamp': '2025-09-30 22:19:02.634920', 'step': 4225, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:02.678619', 'step': 4225, 'epoch': 3} {'type': 'loss', 'content': 0.0047653415240347385, 'timestamp': '2025-09-30 22:19:02.686253', 'step': 4226, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:02.723287', 'step': 4226, 'epoch': 3} {'type': 'loss', 'content': 0.005329497158527374, 'timestamp': '2025-09-30 22:19:02.736630', 'step': 4227, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:02.769458', 'step': 4227, 'epoch': 3} {'type': 'loss', 'content': 0.003973810467869043, 'timestamp': '2025-09-30 22:19:02.798172', 'step': 4228, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:02.835397', 'step': 4228, 'epoch': 3} {'type': 'loss', 'content': 0.003771683434024453, 'timestamp': '2025-09-30 22:19:02.848529', 'step': 4229, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:02.887001', 'step': 4229, 'epoch': 3} {'type': 'loss', 'content': 0.0023885814007371664, 'timestamp': '2025-09-30 22:19:02.900736', 'step': 4230, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:02.935781', 'step': 4230, 'epoch': 3} {'type': 'loss', 'content': 0.021918607875704765, 'timestamp': '2025-09-30 22:19:02.949151', 'step': 4231, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:02.985432', 'step': 4231, 'epoch': 3} {'type': 'loss', 'content': 0.004830060061067343, 'timestamp': '2025-09-30 22:19:03.020148', 'step': 4232, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:03.053543', 'step': 4232, 'epoch': 3} {'type': 'loss', 'content': 0.0036449062172323465, 'timestamp': '2025-09-30 22:19:03.063585', 'step': 4233, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:03.101175', 'step': 4233, 'epoch': 3} {'type': 'loss', 'content': 0.007518450729548931, 'timestamp': '2025-09-30 22:19:03.112406', 'step': 4234, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:03.148105', 'step': 4234, 'epoch': 3} {'type': 'loss', 'content': 0.004306568764150143, 'timestamp': '2025-09-30 22:19:03.160699', 'step': 4235, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:03.204916', 'step': 4235, 'epoch': 3} {'type': 'loss', 'content': 0.0038689924404025078, 'timestamp': '2025-09-30 22:19:03.238385', 'step': 4236, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:03.280175', 'step': 4236, 'epoch': 3} {'type': 'loss', 'content': 0.005623604636639357, 'timestamp': '2025-09-30 22:19:03.288049', 'step': 4237, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:03.327408', 'step': 4237, 'epoch': 3} {'type': 'loss', 'content': 0.005663564428687096, 'timestamp': '2025-09-30 22:19:03.334697', 'step': 4238, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:03.371457', 'step': 4238, 'epoch': 3} {'type': 'loss', 'content': 0.0026802935171872377, 'timestamp': '2025-09-30 22:19:03.379276', 'step': 4239, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:03.421167', 'step': 4239, 'epoch': 3} {'type': 'loss', 'content': 0.005176326725631952, 'timestamp': '2025-09-30 22:19:03.452502', 'step': 4240, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:03.488746', 'step': 4240, 'epoch': 3} {'type': 'loss', 'content': 0.009702826850116253, 'timestamp': '2025-09-30 22:19:03.494403', 'step': 4241, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:03.529508', 'step': 4241, 'epoch': 3} {'type': 'loss', 'content': 0.008449496701359749, 'timestamp': '2025-09-30 22:19:03.536389', 'step': 4242, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:03.576690', 'step': 4242, 'epoch': 3} {'type': 'loss', 'content': 0.006167837418615818, 'timestamp': '2025-09-30 22:19:03.592324', 'step': 4243, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:03.638147', 'step': 4243, 'epoch': 3} {'type': 'loss', 'content': 0.0028893337585031986, 'timestamp': '2025-09-30 22:19:03.672817', 'step': 4244, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:03.707762', 'step': 4244, 'epoch': 3} {'type': 'loss', 'content': 0.0020776870660483837, 'timestamp': '2025-09-30 22:19:03.713056', 'step': 4245, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:19:03.751860', 'step': 4245, 'epoch': 3} {'type': 'loss', 'content': 0.008614156395196915, 'timestamp': '2025-09-30 22:19:03.756002', 'step': 4246, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:03.791138', 'step': 4246, 'epoch': 3} {'type': 'loss', 'content': 0.0047997101210057735, 'timestamp': '2025-09-30 22:19:03.798731', 'step': 4247, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:03.837841', 'step': 4247, 'epoch': 3} {'type': 'loss', 'content': 0.006953793577849865, 'timestamp': '2025-09-30 22:19:03.871258', 'step': 4248, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:03.912452', 'step': 4248, 'epoch': 3} {'type': 'loss', 'content': 0.0016016876325011253, 'timestamp': '2025-09-30 22:19:03.917656', 'step': 4249, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:03.974172', 'step': 4249, 'epoch': 3} {'type': 'loss', 'content': 0.00614569429308176, 'timestamp': '2025-09-30 22:19:03.986584', 'step': 4250, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:04.031277', 'step': 4250, 'epoch': 3} {'type': 'loss', 'content': 0.0028142465744167566, 'timestamp': '2025-09-30 22:19:04.039207', 'step': 4251, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:04.080539', 'step': 4251, 'epoch': 3} {'type': 'loss', 'content': 0.005324391182512045, 'timestamp': '2025-09-30 22:19:04.114159', 'step': 4252, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:04.151741', 'step': 4252, 'epoch': 3} {'type': 'loss', 'content': 0.00806656014174223, 'timestamp': '2025-09-30 22:19:04.157311', 'step': 4253, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:04.226386', 'step': 4253, 'epoch': 3} {'type': 'loss', 'content': 0.005702858325093985, 'timestamp': '2025-09-30 22:19:04.236901', 'step': 4254, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:04.277929', 'step': 4254, 'epoch': 3} {'type': 'loss', 'content': 0.006622773595154285, 'timestamp': '2025-09-30 22:19:04.291643', 'step': 4255, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:19:06.850289', 'step': 4255, 'epoch': 3} {'type': 'pplx', 'content': 5.953026891361323, 'timestamp': '2025-09-30 22:19:06.853599', 'step': 4255, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:19:06.897051', 'step': 4255, 'epoch': 3} {'type': 'loss', 'content': 0.0011021263198927045, 'timestamp': '2025-09-30 22:19:06.927939', 'step': 4256, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:06.964283', 'step': 4256, 'epoch': 3} {'type': 'loss', 'content': 0.0023401672951877117, 'timestamp': '2025-09-30 22:19:06.972164', 'step': 4257, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:07.016186', 'step': 4257, 'epoch': 3} {'type': 'loss', 'content': 0.01058325543999672, 'timestamp': '2025-09-30 22:19:07.023509', 'step': 4258, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:07.066173', 'step': 4258, 'epoch': 3} {'type': 'loss', 'content': 0.0055806296877563, 'timestamp': '2025-09-30 22:19:07.079748', 'step': 4259, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:07.121618', 'step': 4259, 'epoch': 3} {'type': 'loss', 'content': 0.0015534240519627929, 'timestamp': '2025-09-30 22:19:07.156187', 'step': 4260, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:07.196271', 'step': 4260, 'epoch': 3} {'type': 'loss', 'content': 0.0026457849889993668, 'timestamp': '2025-09-30 22:19:07.208928', 'step': 4261, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:07.253446', 'step': 4261, 'epoch': 3} {'type': 'loss', 'content': 0.0017473185434937477, 'timestamp': '2025-09-30 22:19:07.261431', 'step': 4262, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:07.294895', 'step': 4262, 'epoch': 3} {'type': 'loss', 'content': 0.0018314578337594867, 'timestamp': '2025-09-30 22:19:07.305228', 'step': 4263, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:07.346132', 'step': 4263, 'epoch': 3} {'type': 'loss', 'content': 0.00169796880800277, 'timestamp': '2025-09-30 22:19:07.378009', 'step': 4264, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:19:07.422956', 'step': 4264, 'epoch': 3} {'type': 'loss', 'content': 0.00701497495174408, 'timestamp': '2025-09-30 22:19:07.438789', 'step': 4265, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:07.472061', 'step': 4265, 'epoch': 3} {'type': 'loss', 'content': 0.008701121434569359, 'timestamp': '2025-09-30 22:19:07.483172', 'step': 4266, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:07.518054', 'step': 4266, 'epoch': 3} {'type': 'loss', 'content': 0.002603327389806509, 'timestamp': '2025-09-30 22:19:07.530208', 'step': 4267, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:07.564482', 'step': 4267, 'epoch': 3} {'type': 'loss', 'content': 0.003772999159991741, 'timestamp': '2025-09-30 22:19:07.596672', 'step': 4268, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 17085996872448}, 'timestamp': '2025-09-30 22:19:07.655042', 'step': 4268, 'epoch': 3} {'type': 'loss', 'content': 0.0037890970706939697, 'timestamp': '2025-09-30 22:19:07.674316', 'step': 4269, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:07.719110', 'step': 4269, 'epoch': 3} {'type': 'loss', 'content': 0.004082482308149338, 'timestamp': '2025-09-30 22:19:07.733023', 'step': 4270, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:07.768301', 'step': 4270, 'epoch': 3} {'type': 'loss', 'content': 0.009964211843907833, 'timestamp': '2025-09-30 22:19:07.778591', 'step': 4271, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:07.813755', 'step': 4271, 'epoch': 3} {'type': 'loss', 'content': 0.007102191913872957, 'timestamp': '2025-09-30 22:19:07.845005', 'step': 4272, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:07.879765', 'step': 4272, 'epoch': 3} {'type': 'loss', 'content': 0.00931872334331274, 'timestamp': '2025-09-30 22:19:07.890305', 'step': 4273, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:07.930522', 'step': 4273, 'epoch': 3} {'type': 'loss', 'content': 0.0022495731245726347, 'timestamp': '2025-09-30 22:19:07.944446', 'step': 4274, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:07.985567', 'step': 4274, 'epoch': 3} {'type': 'loss', 'content': 0.004006609320640564, 'timestamp': '2025-09-30 22:19:07.999283', 'step': 4275, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:08.035564', 'step': 4275, 'epoch': 3} {'type': 'loss', 'content': 0.008295389823615551, 'timestamp': '2025-09-30 22:19:08.063693', 'step': 4276, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:08.099774', 'step': 4276, 'epoch': 3} {'type': 'loss', 'content': 0.0014035895001143217, 'timestamp': '2025-09-30 22:19:08.108223', 'step': 4277, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:08.150002', 'step': 4277, 'epoch': 3} {'type': 'loss', 'content': 0.0077087851241230965, 'timestamp': '2025-09-30 22:19:08.161070', 'step': 4278, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:08.203994', 'step': 4278, 'epoch': 3} {'type': 'loss', 'content': 0.002847875002771616, 'timestamp': '2025-09-30 22:19:08.215147', 'step': 4279, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:08.251323', 'step': 4279, 'epoch': 3} {'type': 'loss', 'content': 0.004052981734275818, 'timestamp': '2025-09-30 22:19:08.282513', 'step': 4280, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:19:08.325199', 'step': 4280, 'epoch': 3} {'type': 'loss', 'content': 0.0036239621695131063, 'timestamp': '2025-09-30 22:19:08.341919', 'step': 4281, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:08.378924', 'step': 4281, 'epoch': 3} {'type': 'loss', 'content': 0.00455652317032218, 'timestamp': '2025-09-30 22:19:08.385965', 'step': 4282, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:08.426331', 'step': 4282, 'epoch': 3} {'type': 'loss', 'content': 0.007087154779583216, 'timestamp': '2025-09-30 22:19:08.436573', 'step': 4283, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:08.492487', 'step': 4283, 'epoch': 3} {'type': 'loss', 'content': 0.008435919880867004, 'timestamp': '2025-09-30 22:19:08.521313', 'step': 4284, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:08.564592', 'step': 4284, 'epoch': 3} {'type': 'loss', 'content': 0.0013751048827543855, 'timestamp': '2025-09-30 22:19:08.570189', 'step': 4285, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:08.628834', 'step': 4285, 'epoch': 3} {'type': 'loss', 'content': 0.010922163724899292, 'timestamp': '2025-09-30 22:19:08.639694', 'step': 4286, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:08.695950', 'step': 4286, 'epoch': 3} {'type': 'loss', 'content': 0.002734346082434058, 'timestamp': '2025-09-30 22:19:08.709278', 'step': 4287, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:08.748917', 'step': 4287, 'epoch': 3} {'type': 'loss', 'content': 0.01130365114659071, 'timestamp': '2025-09-30 22:19:08.781022', 'step': 4288, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:08.825831', 'step': 4288, 'epoch': 3} {'type': 'loss', 'content': 0.014183576218783855, 'timestamp': '2025-09-30 22:19:08.835895', 'step': 4289, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:08.880131', 'step': 4289, 'epoch': 3} {'type': 'loss', 'content': 0.006344980094581842, 'timestamp': '2025-09-30 22:19:08.887420', 'step': 4290, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:08.925273', 'step': 4290, 'epoch': 3} {'type': 'loss', 'content': 0.004288821946829557, 'timestamp': '2025-09-30 22:19:08.932865', 'step': 4291, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:08.966631', 'step': 4291, 'epoch': 3} {'type': 'loss', 'content': 0.00489983893930912, 'timestamp': '2025-09-30 22:19:08.998653', 'step': 4292, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:09.044869', 'step': 4292, 'epoch': 3} {'type': 'loss', 'content': 0.005966820288449526, 'timestamp': '2025-09-30 22:19:09.050213', 'step': 4293, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:09.089173', 'step': 4293, 'epoch': 3} {'type': 'loss', 'content': 0.0028437243308871984, 'timestamp': '2025-09-30 22:19:09.099550', 'step': 4294, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:09.138196', 'step': 4294, 'epoch': 3} {'type': 'loss', 'content': 0.006014253944158554, 'timestamp': '2025-09-30 22:19:09.159570', 'step': 4295, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:09.196223', 'step': 4295, 'epoch': 3} {'type': 'loss', 'content': 0.00552031584084034, 'timestamp': '2025-09-30 22:19:09.224110', 'step': 4296, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:09.261283', 'step': 4296, 'epoch': 3} {'type': 'loss', 'content': 0.006494057364761829, 'timestamp': '2025-09-30 22:19:09.269886', 'step': 4297, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:09.304583', 'step': 4297, 'epoch': 3} {'type': 'loss', 'content': 0.006987850181758404, 'timestamp': '2025-09-30 22:19:09.315060', 'step': 4298, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:09.351181', 'step': 4298, 'epoch': 3} {'type': 'loss', 'content': 0.0029523270204663277, 'timestamp': '2025-09-30 22:19:09.362311', 'step': 4299, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:09.408582', 'step': 4299, 'epoch': 3} {'type': 'loss', 'content': 0.010710208676755428, 'timestamp': '2025-09-30 22:19:09.440314', 'step': 4300, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:09.474381', 'step': 4300, 'epoch': 3} {'type': 'loss', 'content': 0.0016859716270118952, 'timestamp': '2025-09-30 22:19:09.485510', 'step': 4301, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:09.521687', 'step': 4301, 'epoch': 3} {'type': 'loss', 'content': 0.005892353132367134, 'timestamp': '2025-09-30 22:19:09.532745', 'step': 4302, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:09.566787', 'step': 4302, 'epoch': 3} {'type': 'loss', 'content': 0.0008297090535052121, 'timestamp': '2025-09-30 22:19:09.577986', 'step': 4303, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:09.625530', 'step': 4303, 'epoch': 3} {'type': 'loss', 'content': 0.007029360625892878, 'timestamp': '2025-09-30 22:19:09.658953', 'step': 4304, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:09.691460', 'step': 4304, 'epoch': 3} {'type': 'loss', 'content': 0.0020163406152278185, 'timestamp': '2025-09-30 22:19:09.697056', 'step': 4305, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:09.730328', 'step': 4305, 'epoch': 3} {'type': 'loss', 'content': 0.008451982401311398, 'timestamp': '2025-09-30 22:19:09.740647', 'step': 4306, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:09.776823', 'step': 4306, 'epoch': 3} {'type': 'loss', 'content': 0.0029738156590610743, 'timestamp': '2025-09-30 22:19:09.788180', 'step': 4307, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:09.829871', 'step': 4307, 'epoch': 3} {'type': 'loss', 'content': 0.004055047873407602, 'timestamp': '2025-09-30 22:19:09.858695', 'step': 4308, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:09.903606', 'step': 4308, 'epoch': 3} {'type': 'loss', 'content': 0.004126362968236208, 'timestamp': '2025-09-30 22:19:09.916608', 'step': 4309, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:19:09.983910', 'step': 4309, 'epoch': 3} {'type': 'loss', 'content': 0.009861967526376247, 'timestamp': '2025-09-30 22:19:10.001168', 'step': 4310, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:10.041811', 'step': 4310, 'epoch': 3} {'type': 'loss', 'content': 0.006570646073669195, 'timestamp': '2025-09-30 22:19:10.055837', 'step': 4311, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:10.098272', 'step': 4311, 'epoch': 3} {'type': 'loss', 'content': 0.008274688385426998, 'timestamp': '2025-09-30 22:19:10.132784', 'step': 4312, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:10.170119', 'step': 4312, 'epoch': 3} {'type': 'loss', 'content': 0.006628716830164194, 'timestamp': '2025-09-30 22:19:10.175276', 'step': 4313, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:10.221181', 'step': 4313, 'epoch': 3} {'type': 'loss', 'content': 0.0055550276301801205, 'timestamp': '2025-09-30 22:19:10.234806', 'step': 4314, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:19:10.280874', 'step': 4314, 'epoch': 3} {'type': 'loss', 'content': 0.002743455581367016, 'timestamp': '2025-09-30 22:19:10.297930', 'step': 4315, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:10.339639', 'step': 4315, 'epoch': 3} {'type': 'loss', 'content': 0.009410511702299118, 'timestamp': '2025-09-30 22:19:10.374156', 'step': 4316, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:10.420579', 'step': 4316, 'epoch': 3} {'type': 'loss', 'content': 0.003582603530958295, 'timestamp': '2025-09-30 22:19:10.433919', 'step': 4317, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:19:10.476659', 'step': 4317, 'epoch': 3} {'type': 'loss', 'content': 0.0032016027253121138, 'timestamp': '2025-09-30 22:19:10.493041', 'step': 4318, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:19:10.545342', 'step': 4318, 'epoch': 3} {'type': 'loss', 'content': 0.0011657862924039364, 'timestamp': '2025-09-30 22:19:10.562565', 'step': 4319, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:10.606312', 'step': 4319, 'epoch': 3} {'type': 'loss', 'content': 0.009448532946407795, 'timestamp': '2025-09-30 22:19:10.641172', 'step': 4320, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:19:10.688526', 'step': 4320, 'epoch': 3} {'type': 'loss', 'content': 0.0023872260935604572, 'timestamp': '2025-09-30 22:19:10.704456', 'step': 4321, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:10.757888', 'step': 4321, 'epoch': 3} {'type': 'loss', 'content': 0.01825058087706566, 'timestamp': '2025-09-30 22:19:10.765562', 'step': 4322, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:10.798220', 'step': 4322, 'epoch': 3} {'type': 'loss', 'content': 0.0006977806915529072, 'timestamp': '2025-09-30 22:19:10.805991', 'step': 4323, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:10.844655', 'step': 4323, 'epoch': 3} {'type': 'loss', 'content': 0.007597570773214102, 'timestamp': '2025-09-30 22:19:10.877710', 'step': 4324, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:10.915272', 'step': 4324, 'epoch': 3} {'type': 'loss', 'content': 0.005277728196233511, 'timestamp': '2025-09-30 22:19:10.925181', 'step': 4325, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:10.972285', 'step': 4325, 'epoch': 3} {'type': 'loss', 'content': 0.0012779583921656013, 'timestamp': '2025-09-30 22:19:10.984616', 'step': 4326, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:11.031885', 'step': 4326, 'epoch': 3} {'type': 'loss', 'content': 0.006616917438805103, 'timestamp': '2025-09-30 22:19:11.044502', 'step': 4327, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:11.080175', 'step': 4327, 'epoch': 3} {'type': 'loss', 'content': 0.002342329826205969, 'timestamp': '2025-09-30 22:19:11.111461', 'step': 4328, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:11.150659', 'step': 4328, 'epoch': 3} {'type': 'loss', 'content': 0.002416786039248109, 'timestamp': '2025-09-30 22:19:11.163372', 'step': 4329, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:11.210888', 'step': 4329, 'epoch': 3} {'type': 'loss', 'content': 0.0071250577457249165, 'timestamp': '2025-09-30 22:19:11.224268', 'step': 4330, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:11.261897', 'step': 4330, 'epoch': 3} {'type': 'loss', 'content': 0.0012249717256054282, 'timestamp': '2025-09-30 22:19:11.269625', 'step': 4331, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:11.309379', 'step': 4331, 'epoch': 3} {'type': 'loss', 'content': 0.004613219760358334, 'timestamp': '2025-09-30 22:19:11.337194', 'step': 4332, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:11.378826', 'step': 4332, 'epoch': 3} {'type': 'loss', 'content': 0.007091932464390993, 'timestamp': '2025-09-30 22:19:11.387433', 'step': 4333, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:11.430415', 'step': 4333, 'epoch': 3} {'type': 'loss', 'content': 0.0019520223140716553, 'timestamp': '2025-09-30 22:19:11.442713', 'step': 4334, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:11.489068', 'step': 4334, 'epoch': 3} {'type': 'loss', 'content': 0.0044762929901480675, 'timestamp': '2025-09-30 22:19:11.496363', 'step': 4335, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:11.530196', 'step': 4335, 'epoch': 3} {'type': 'loss', 'content': 0.0015030631329864264, 'timestamp': '2025-09-30 22:19:11.563358', 'step': 4336, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:11.603094', 'step': 4336, 'epoch': 3} {'type': 'loss', 'content': 0.005056925117969513, 'timestamp': '2025-09-30 22:19:11.607958', 'step': 4337, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:11.647290', 'step': 4337, 'epoch': 3} {'type': 'loss', 'content': 0.011781950481235981, 'timestamp': '2025-09-30 22:19:11.657607', 'step': 4338, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:11.690406', 'step': 4338, 'epoch': 3} {'type': 'loss', 'content': 0.0026247689966112375, 'timestamp': '2025-09-30 22:19:11.697313', 'step': 4339, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:11.750057', 'step': 4339, 'epoch': 3} {'type': 'loss', 'content': 0.003992740530520678, 'timestamp': '2025-09-30 22:19:11.781343', 'step': 4340, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:11.825837', 'step': 4340, 'epoch': 3} {'type': 'loss', 'content': 0.0036123378667980433, 'timestamp': '2025-09-30 22:19:11.833783', 'step': 4341, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:11.874985', 'step': 4341, 'epoch': 3} {'type': 'loss', 'content': 0.010466721840202808, 'timestamp': '2025-09-30 22:19:11.887459', 'step': 4342, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:11.924533', 'step': 4342, 'epoch': 3} {'type': 'loss', 'content': 0.004870318807661533, 'timestamp': '2025-09-30 22:19:11.934804', 'step': 4343, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:11.974276', 'step': 4343, 'epoch': 3} {'type': 'loss', 'content': 0.0008079107501544058, 'timestamp': '2025-09-30 22:19:12.002988', 'step': 4344, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:12.043087', 'step': 4344, 'epoch': 3} {'type': 'loss', 'content': 0.0037499789614230394, 'timestamp': '2025-09-30 22:19:12.048338', 'step': 4345, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:12.087573', 'step': 4345, 'epoch': 3} {'type': 'loss', 'content': 0.0018727704882621765, 'timestamp': '2025-09-30 22:19:12.094474', 'step': 4346, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:12.136157', 'step': 4346, 'epoch': 3} {'type': 'loss', 'content': 0.004115007352083921, 'timestamp': '2025-09-30 22:19:12.144158', 'step': 4347, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:12.178609', 'step': 4347, 'epoch': 3} {'type': 'loss', 'content': 0.0139734772965312, 'timestamp': '2025-09-30 22:19:12.207451', 'step': 4348, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:12.244608', 'step': 4348, 'epoch': 3} {'type': 'loss', 'content': 0.003832270158454776, 'timestamp': '2025-09-30 22:19:12.252692', 'step': 4349, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:12.292601', 'step': 4349, 'epoch': 3} {'type': 'loss', 'content': 0.004294030833989382, 'timestamp': '2025-09-30 22:19:12.305194', 'step': 4350, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:12.349346', 'step': 4350, 'epoch': 3} {'type': 'loss', 'content': 0.0010019203182309866, 'timestamp': '2025-09-30 22:19:12.353960', 'step': 4351, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:12.395269', 'step': 4351, 'epoch': 3} {'type': 'loss', 'content': 0.0012013876112177968, 'timestamp': '2025-09-30 22:19:12.423199', 'step': 4352, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:12.463687', 'step': 4352, 'epoch': 3} {'type': 'loss', 'content': 0.005679008085280657, 'timestamp': '2025-09-30 22:19:12.473487', 'step': 4353, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:12.506292', 'step': 4353, 'epoch': 3} {'type': 'loss', 'content': 0.0026292474940419197, 'timestamp': '2025-09-30 22:19:12.516766', 'step': 4354, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:12.553864', 'step': 4354, 'epoch': 3} {'type': 'loss', 'content': 0.0029538250528275967, 'timestamp': '2025-09-30 22:19:12.567252', 'step': 4355, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:12.604588', 'step': 4355, 'epoch': 3} {'type': 'loss', 'content': 0.0026939180679619312, 'timestamp': '2025-09-30 22:19:12.633365', 'step': 4356, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:12.668085', 'step': 4356, 'epoch': 3} {'type': 'loss', 'content': 0.003916259855031967, 'timestamp': '2025-09-30 22:19:12.676122', 'step': 4357, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:19:12.726197', 'step': 4357, 'epoch': 3} {'type': 'loss', 'content': 0.005390469450503588, 'timestamp': '2025-09-30 22:19:12.742088', 'step': 4358, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:12.784714', 'step': 4358, 'epoch': 3} {'type': 'loss', 'content': 0.0019631667528301477, 'timestamp': '2025-09-30 22:19:12.798065', 'step': 4359, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:12.842955', 'step': 4359, 'epoch': 3} {'type': 'loss', 'content': 0.0065919035114347935, 'timestamp': '2025-09-30 22:19:12.877491', 'step': 4360, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:12.914866', 'step': 4360, 'epoch': 3} {'type': 'loss', 'content': 0.0034850030206143856, 'timestamp': '2025-09-30 22:19:12.927975', 'step': 4361, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:12.966951', 'step': 4361, 'epoch': 3} {'type': 'loss', 'content': 0.0016640233807265759, 'timestamp': '2025-09-30 22:19:12.979305', 'step': 4362, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:13.023953', 'step': 4362, 'epoch': 3} {'type': 'loss', 'content': 0.00251871719956398, 'timestamp': '2025-09-30 22:19:13.032013', 'step': 4363, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:13.075520', 'step': 4363, 'epoch': 3} {'type': 'loss', 'content': 0.0030696927569806576, 'timestamp': '2025-09-30 22:19:13.110037', 'step': 4364, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:13.153783', 'step': 4364, 'epoch': 3} {'type': 'loss', 'content': 0.00622408976778388, 'timestamp': '2025-09-30 22:19:13.166446', 'step': 4365, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:13.211109', 'step': 4365, 'epoch': 3} {'type': 'loss', 'content': 0.003771674120798707, 'timestamp': '2025-09-30 22:19:13.224800', 'step': 4366, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:13.258888', 'step': 4366, 'epoch': 3} {'type': 'loss', 'content': 0.0017379183555021882, 'timestamp': '2025-09-30 22:19:13.265803', 'step': 4367, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:13.299777', 'step': 4367, 'epoch': 3} {'type': 'loss', 'content': 0.0018934322288259864, 'timestamp': '2025-09-30 22:19:13.328256', 'step': 4368, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:13.375821', 'step': 4368, 'epoch': 3} {'type': 'loss', 'content': 0.006382144056260586, 'timestamp': '2025-09-30 22:19:13.390937', 'step': 4369, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:13.439713', 'step': 4369, 'epoch': 3} {'type': 'loss', 'content': 0.0010274213273078203, 'timestamp': '2025-09-30 22:19:13.450159', 'step': 4370, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:19:16.081076', 'step': 4370, 'epoch': 3} {'type': 'pplx', 'content': 5.953068817951526, 'timestamp': '2025-09-30 22:19:16.085603', 'step': 4370, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:16.121030', 'step': 4370, 'epoch': 3} {'type': 'loss', 'content': 0.008330886252224445, 'timestamp': '2025-09-30 22:19:16.127589', 'step': 4371, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:16.167944', 'step': 4371, 'epoch': 3} {'type': 'loss', 'content': 0.005211700685322285, 'timestamp': '2025-09-30 22:19:16.199921', 'step': 4372, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:16.237190', 'step': 4372, 'epoch': 3} {'type': 'loss', 'content': 0.004459177143871784, 'timestamp': '2025-09-30 22:19:16.242790', 'step': 4373, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:16.277880', 'step': 4373, 'epoch': 3} {'type': 'loss', 'content': 0.010086101479828358, 'timestamp': '2025-09-30 22:19:16.290247', 'step': 4374, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:16.327193', 'step': 4374, 'epoch': 3} {'type': 'loss', 'content': 0.0048518106341362, 'timestamp': '2025-09-30 22:19:16.334762', 'step': 4375, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:16.371764', 'step': 4375, 'epoch': 3} {'type': 'loss', 'content': 0.004126888699829578, 'timestamp': '2025-09-30 22:19:16.405238', 'step': 4376, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:16.442434', 'step': 4376, 'epoch': 3} {'type': 'loss', 'content': 0.002302473410964012, 'timestamp': '2025-09-30 22:19:16.455410', 'step': 4377, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:16.502713', 'step': 4377, 'epoch': 3} {'type': 'loss', 'content': 0.0057604811154305935, 'timestamp': '2025-09-30 22:19:16.513076', 'step': 4378, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:16.547407', 'step': 4378, 'epoch': 3} {'type': 'loss', 'content': 0.004845494404435158, 'timestamp': '2025-09-30 22:19:16.557846', 'step': 4379, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:16.593196', 'step': 4379, 'epoch': 3} {'type': 'loss', 'content': 0.0019276568200439215, 'timestamp': '2025-09-30 22:19:16.625128', 'step': 4380, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:16.668052', 'step': 4380, 'epoch': 3} {'type': 'loss', 'content': 0.002704145386815071, 'timestamp': '2025-09-30 22:19:16.676120', 'step': 4381, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:16.711014', 'step': 4381, 'epoch': 3} {'type': 'loss', 'content': 0.004898314829915762, 'timestamp': '2025-09-30 22:19:16.718217', 'step': 4382, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:16.759335', 'step': 4382, 'epoch': 3} {'type': 'loss', 'content': 0.0038968524895608425, 'timestamp': '2025-09-30 22:19:16.770405', 'step': 4383, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:19:16.814699', 'step': 4383, 'epoch': 3} {'type': 'loss', 'content': 0.0018282084492966533, 'timestamp': '2025-09-30 22:19:16.851645', 'step': 4384, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:16.888730', 'step': 4384, 'epoch': 3} {'type': 'loss', 'content': 0.0008353728335350752, 'timestamp': '2025-09-30 22:19:16.891394', 'step': 4385, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:16.925332', 'step': 4385, 'epoch': 3} {'type': 'loss', 'content': 0.0059968712739646435, 'timestamp': '2025-09-30 22:19:16.932517', 'step': 4386, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:16.969629', 'step': 4386, 'epoch': 3} {'type': 'loss', 'content': 0.008428453467786312, 'timestamp': '2025-09-30 22:19:16.977452', 'step': 4387, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:17.013635', 'step': 4387, 'epoch': 3} {'type': 'loss', 'content': 0.0045945607125759125, 'timestamp': '2025-09-30 22:19:17.045716', 'step': 4388, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:17.080013', 'step': 4388, 'epoch': 3} {'type': 'loss', 'content': 0.004945170134305954, 'timestamp': '2025-09-30 22:19:17.085582', 'step': 4389, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:17.131888', 'step': 4389, 'epoch': 3} {'type': 'loss', 'content': 0.006509590428322554, 'timestamp': '2025-09-30 22:19:17.142911', 'step': 4390, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:17.176529', 'step': 4390, 'epoch': 3} {'type': 'loss', 'content': 0.0025962223298847675, 'timestamp': '2025-09-30 22:19:17.188775', 'step': 4391, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:17.238274', 'step': 4391, 'epoch': 3} {'type': 'loss', 'content': 0.0030034177470952272, 'timestamp': '2025-09-30 22:19:17.266241', 'step': 4392, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:17.304115', 'step': 4392, 'epoch': 3} {'type': 'loss', 'content': 0.007417632266879082, 'timestamp': '2025-09-30 22:19:17.312720', 'step': 4393, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:17.358520', 'step': 4393, 'epoch': 3} {'type': 'loss', 'content': 0.0008212727261707187, 'timestamp': '2025-09-30 22:19:17.365434', 'step': 4394, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:17.401767', 'step': 4394, 'epoch': 3} {'type': 'loss', 'content': 0.004093260038644075, 'timestamp': '2025-09-30 22:19:17.408354', 'step': 4395, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:17.451740', 'step': 4395, 'epoch': 3} {'type': 'loss', 'content': 0.007276744581758976, 'timestamp': '2025-09-30 22:19:17.480296', 'step': 4396, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:17.516802', 'step': 4396, 'epoch': 3} {'type': 'loss', 'content': 0.001587934442795813, 'timestamp': '2025-09-30 22:19:17.522876', 'step': 4397, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:17.560879', 'step': 4397, 'epoch': 3} {'type': 'loss', 'content': 0.0006083215703256428, 'timestamp': '2025-09-30 22:19:17.567864', 'step': 4398, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:17.602069', 'step': 4398, 'epoch': 3} {'type': 'loss', 'content': 0.0009365178411826491, 'timestamp': '2025-09-30 22:19:17.609715', 'step': 4399, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:17.660608', 'step': 4399, 'epoch': 3} {'type': 'loss', 'content': 0.0034567248076200485, 'timestamp': '2025-09-30 22:19:17.691842', 'step': 4400, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:17.734310', 'step': 4400, 'epoch': 3} {'type': 'loss', 'content': 0.004850293975323439, 'timestamp': '2025-09-30 22:19:17.741172', 'step': 4401, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:17.776883', 'step': 4401, 'epoch': 3} {'type': 'loss', 'content': 0.01048145443201065, 'timestamp': '2025-09-30 22:19:17.787274', 'step': 4402, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:19:17.826689', 'step': 4402, 'epoch': 3} {'type': 'loss', 'content': 0.001172828022390604, 'timestamp': '2025-09-30 22:19:17.830842', 'step': 4403, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:17.868167', 'step': 4403, 'epoch': 3} {'type': 'loss', 'content': 0.0014994451776146889, 'timestamp': '2025-09-30 22:19:17.901819', 'step': 4404, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:17.945820', 'step': 4404, 'epoch': 3} {'type': 'loss', 'content': 0.0007781424792483449, 'timestamp': '2025-09-30 22:19:17.950475', 'step': 4405, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:17.993472', 'step': 4405, 'epoch': 3} {'type': 'loss', 'content': 0.018517231568694115, 'timestamp': '2025-09-30 22:19:18.000423', 'step': 4406, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:18.035649', 'step': 4406, 'epoch': 3} {'type': 'loss', 'content': 0.004184328485280275, 'timestamp': '2025-09-30 22:19:18.042917', 'step': 4407, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:18.090234', 'step': 4407, 'epoch': 3} {'type': 'loss', 'content': 0.0008757567848078907, 'timestamp': '2025-09-30 22:19:18.118391', 'step': 4408, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:18.158044', 'step': 4408, 'epoch': 3} {'type': 'loss', 'content': 0.001239742268808186, 'timestamp': '2025-09-30 22:19:18.164705', 'step': 4409, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:18.206995', 'step': 4409, 'epoch': 3} {'type': 'loss', 'content': 0.0006983289495110512, 'timestamp': '2025-09-30 22:19:18.213877', 'step': 4410, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:18.255830', 'step': 4410, 'epoch': 3} {'type': 'loss', 'content': 0.00456323241814971, 'timestamp': '2025-09-30 22:19:18.263054', 'step': 4411, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:19:18.308768', 'step': 4411, 'epoch': 3} {'type': 'loss', 'content': 0.007104892283678055, 'timestamp': '2025-09-30 22:19:18.345529', 'step': 4412, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:18.385415', 'step': 4412, 'epoch': 3} {'type': 'loss', 'content': 0.00333253457210958, 'timestamp': '2025-09-30 22:19:18.390423', 'step': 4413, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:18.424520', 'step': 4413, 'epoch': 3} {'type': 'loss', 'content': 0.0020394609309732914, 'timestamp': '2025-09-30 22:19:18.431756', 'step': 4414, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:18.481909', 'step': 4414, 'epoch': 3} {'type': 'loss', 'content': 0.003956458065658808, 'timestamp': '2025-09-30 22:19:18.492897', 'step': 4415, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:18.534671', 'step': 4415, 'epoch': 3} {'type': 'loss', 'content': 0.0042778183706104755, 'timestamp': '2025-09-30 22:19:18.571661', 'step': 4416, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:18.607058', 'step': 4416, 'epoch': 3} {'type': 'loss', 'content': 0.003295444417744875, 'timestamp': '2025-09-30 22:19:18.615551', 'step': 4417, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:18.654541', 'step': 4417, 'epoch': 3} {'type': 'loss', 'content': 0.003468771930783987, 'timestamp': '2025-09-30 22:19:18.670743', 'step': 4418, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:18.719928', 'step': 4418, 'epoch': 3} {'type': 'loss', 'content': 0.00900344830006361, 'timestamp': '2025-09-30 22:19:18.734348', 'step': 4419, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:18.783891', 'step': 4419, 'epoch': 3} {'type': 'loss', 'content': 0.0028677969239652157, 'timestamp': '2025-09-30 22:19:18.815088', 'step': 4420, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:18.862323', 'step': 4420, 'epoch': 3} {'type': 'loss', 'content': 0.002005532383918762, 'timestamp': '2025-09-30 22:19:18.875823', 'step': 4421, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:18.909578', 'step': 4421, 'epoch': 3} {'type': 'loss', 'content': 0.0024478633422404528, 'timestamp': '2025-09-30 22:19:18.922050', 'step': 4422, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:18.955193', 'step': 4422, 'epoch': 3} {'type': 'loss', 'content': 0.007036969996988773, 'timestamp': '2025-09-30 22:19:18.967169', 'step': 4423, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:19.009048', 'step': 4423, 'epoch': 3} {'type': 'loss', 'content': 0.009061479941010475, 'timestamp': '2025-09-30 22:19:19.039253', 'step': 4424, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:19.091977', 'step': 4424, 'epoch': 3} {'type': 'loss', 'content': 0.002791600301861763, 'timestamp': '2025-09-30 22:19:19.107069', 'step': 4425, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:19.155531', 'step': 4425, 'epoch': 3} {'type': 'loss', 'content': 0.007210858631879091, 'timestamp': '2025-09-30 22:19:19.163385', 'step': 4426, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:19.197677', 'step': 4426, 'epoch': 3} {'type': 'loss', 'content': 0.0018888312624767423, 'timestamp': '2025-09-30 22:19:19.208669', 'step': 4427, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:19.266341', 'step': 4427, 'epoch': 3} {'type': 'loss', 'content': 0.007950611412525177, 'timestamp': '2025-09-30 22:19:19.297482', 'step': 4428, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:19.351458', 'step': 4428, 'epoch': 3} {'type': 'loss', 'content': 0.008298023603856564, 'timestamp': '2025-09-30 22:19:19.357117', 'step': 4429, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:19:19.418515', 'step': 4429, 'epoch': 3} {'type': 'loss', 'content': 0.005394228268414736, 'timestamp': '2025-09-30 22:19:19.435838', 'step': 4430, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:19.491633', 'step': 4430, 'epoch': 3} {'type': 'loss', 'content': 0.000268581003183499, 'timestamp': '2025-09-30 22:19:19.504008', 'step': 4431, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:19.541651', 'step': 4431, 'epoch': 3} {'type': 'loss', 'content': 0.002814779058098793, 'timestamp': '2025-09-30 22:19:19.570146', 'step': 4432, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:19.606405', 'step': 4432, 'epoch': 3} {'type': 'loss', 'content': 0.006743252277374268, 'timestamp': '2025-09-30 22:19:19.616846', 'step': 4433, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:19.664633', 'step': 4433, 'epoch': 3} {'type': 'loss', 'content': 0.002839987864717841, 'timestamp': '2025-09-30 22:19:19.672521', 'step': 4434, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:19.712066', 'step': 4434, 'epoch': 3} {'type': 'loss', 'content': 0.0068497913889586926, 'timestamp': '2025-09-30 22:19:19.723016', 'step': 4435, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:19.780263', 'step': 4435, 'epoch': 3} {'type': 'loss', 'content': 0.00204927078448236, 'timestamp': '2025-09-30 22:19:19.808892', 'step': 4436, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:19.865423', 'step': 4436, 'epoch': 3} {'type': 'loss', 'content': 0.0019519092747941613, 'timestamp': '2025-09-30 22:19:19.879232', 'step': 4437, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:19.926929', 'step': 4437, 'epoch': 3} {'type': 'loss', 'content': 0.0030627211090177298, 'timestamp': '2025-09-30 22:19:19.937974', 'step': 4438, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:19.976075', 'step': 4438, 'epoch': 3} {'type': 'loss', 'content': 0.003808629233390093, 'timestamp': '2025-09-30 22:19:19.987073', 'step': 4439, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:20.026474', 'step': 4439, 'epoch': 3} {'type': 'loss', 'content': 0.004874465521425009, 'timestamp': '2025-09-30 22:19:20.061089', 'step': 4440, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:20.103790', 'step': 4440, 'epoch': 3} {'type': 'loss', 'content': 0.00819898210465908, 'timestamp': '2025-09-30 22:19:20.112264', 'step': 4441, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:20.149733', 'step': 4441, 'epoch': 3} {'type': 'loss', 'content': 0.007409947458654642, 'timestamp': '2025-09-30 22:19:20.160775', 'step': 4442, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:20.208537', 'step': 4442, 'epoch': 3} {'type': 'loss', 'content': 0.0024290422443300486, 'timestamp': '2025-09-30 22:19:20.220307', 'step': 4443, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:20.261432', 'step': 4443, 'epoch': 3} {'type': 'loss', 'content': 0.00824358407407999, 'timestamp': '2025-09-30 22:19:20.294862', 'step': 4444, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:20.333650', 'step': 4444, 'epoch': 3} {'type': 'loss', 'content': 0.0036941298749297857, 'timestamp': '2025-09-30 22:19:20.344272', 'step': 4445, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:20.380994', 'step': 4445, 'epoch': 3} {'type': 'loss', 'content': 0.000923198414966464, 'timestamp': '2025-09-30 22:19:20.392046', 'step': 4446, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:20.442016', 'step': 4446, 'epoch': 3} {'type': 'loss', 'content': 0.006019048858433962, 'timestamp': '2025-09-30 22:19:20.455848', 'step': 4447, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:20.492765', 'step': 4447, 'epoch': 3} {'type': 'loss', 'content': 0.010113263502717018, 'timestamp': '2025-09-30 22:19:20.527319', 'step': 4448, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:20.563947', 'step': 4448, 'epoch': 3} {'type': 'loss', 'content': 0.004297421779483557, 'timestamp': '2025-09-30 22:19:20.577071', 'step': 4449, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:20.618789', 'step': 4449, 'epoch': 3} {'type': 'loss', 'content': 0.0028478566091507673, 'timestamp': '2025-09-30 22:19:20.633013', 'step': 4450, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:20.672329', 'step': 4450, 'epoch': 3} {'type': 'loss', 'content': 0.003244199324399233, 'timestamp': '2025-09-30 22:19:20.686099', 'step': 4451, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:20.722592', 'step': 4451, 'epoch': 3} {'type': 'loss', 'content': 0.004726372193545103, 'timestamp': '2025-09-30 22:19:20.754393', 'step': 4452, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:20.798280', 'step': 4452, 'epoch': 3} {'type': 'loss', 'content': 0.004119518678635359, 'timestamp': '2025-09-30 22:19:20.813189', 'step': 4453, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 18984411776512}, 'timestamp': '2025-09-30 22:19:20.891633', 'step': 4453, 'epoch': 3} {'type': 'loss', 'content': 0.0023224842734634876, 'timestamp': '2025-09-30 22:19:20.913372', 'step': 4454, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:20.955799', 'step': 4454, 'epoch': 3} {'type': 'loss', 'content': 0.0011501448461785913, 'timestamp': '2025-09-30 22:19:20.966264', 'step': 4455, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:21.008962', 'step': 4455, 'epoch': 3} {'type': 'loss', 'content': 0.0017262777546420693, 'timestamp': '2025-09-30 22:19:21.034266', 'step': 4456, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:21.068207', 'step': 4456, 'epoch': 3} {'type': 'loss', 'content': 0.0025809993967413902, 'timestamp': '2025-09-30 22:19:21.082493', 'step': 4457, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:21.124441', 'step': 4457, 'epoch': 3} {'type': 'loss', 'content': 0.0014467294095084071, 'timestamp': '2025-09-30 22:19:21.134682', 'step': 4458, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:21.177834', 'step': 4458, 'epoch': 3} {'type': 'loss', 'content': 0.004279765300452709, 'timestamp': '2025-09-30 22:19:21.184853', 'step': 4459, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:21.224372', 'step': 4459, 'epoch': 3} {'type': 'loss', 'content': 0.0008534055668860674, 'timestamp': '2025-09-30 22:19:21.258934', 'step': 4460, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:21.302589', 'step': 4460, 'epoch': 3} {'type': 'loss', 'content': 0.004186238162219524, 'timestamp': '2025-09-30 22:19:21.312027', 'step': 4461, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:21.363409', 'step': 4461, 'epoch': 3} {'type': 'loss', 'content': 0.003403113689273596, 'timestamp': '2025-09-30 22:19:21.376764', 'step': 4462, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:21.413346', 'step': 4462, 'epoch': 3} {'type': 'loss', 'content': 0.0028697995003312826, 'timestamp': '2025-09-30 22:19:21.428278', 'step': 4463, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:21.490774', 'step': 4463, 'epoch': 3} {'type': 'loss', 'content': 0.007357324473559856, 'timestamp': '2025-09-30 22:19:21.525447', 'step': 4464, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:21.569338', 'step': 4464, 'epoch': 3} {'type': 'loss', 'content': 0.004044624045491219, 'timestamp': '2025-09-30 22:19:21.582653', 'step': 4465, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:21.628087', 'step': 4465, 'epoch': 3} {'type': 'loss', 'content': 0.0034907220397144556, 'timestamp': '2025-09-30 22:19:21.636082', 'step': 4466, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:21.674342', 'step': 4466, 'epoch': 3} {'type': 'loss', 'content': 0.005186514463275671, 'timestamp': '2025-09-30 22:19:21.681889', 'step': 4467, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:21.724822', 'step': 4467, 'epoch': 3} {'type': 'loss', 'content': 0.0030226618982851505, 'timestamp': '2025-09-30 22:19:21.758221', 'step': 4468, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:19:21.798078', 'step': 4468, 'epoch': 3} {'type': 'loss', 'content': 0.002067551016807556, 'timestamp': '2025-09-30 22:19:21.813482', 'step': 4469, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:21.861177', 'step': 4469, 'epoch': 3} {'type': 'loss', 'content': 0.0033264169469475746, 'timestamp': '2025-09-30 22:19:21.869032', 'step': 4470, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:21.915432', 'step': 4470, 'epoch': 3} {'type': 'loss', 'content': 0.0025733960792422295, 'timestamp': '2025-09-30 22:19:21.928758', 'step': 4471, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:21.969013', 'step': 4471, 'epoch': 3} {'type': 'loss', 'content': 0.049540817737579346, 'timestamp': '2025-09-30 22:19:22.002427', 'step': 4472, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:22.049690', 'step': 4472, 'epoch': 3} {'type': 'loss', 'content': 0.005924369674175978, 'timestamp': '2025-09-30 22:19:22.060279', 'step': 4473, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:22.128334', 'step': 4473, 'epoch': 3} {'type': 'loss', 'content': 0.0011542935390025377, 'timestamp': '2025-09-30 22:19:22.135574', 'step': 4474, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-09-30 22:19:22.187271', 'step': 4474, 'epoch': 3} {'type': 'loss', 'content': 0.0035477534402161837, 'timestamp': '2025-09-30 22:19:22.206285', 'step': 4475, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:22.262631', 'step': 4475, 'epoch': 3} {'type': 'loss', 'content': 0.010069957002997398, 'timestamp': '2025-09-30 22:19:22.290553', 'step': 4476, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:22.331817', 'step': 4476, 'epoch': 3} {'type': 'loss', 'content': 0.0012309798039495945, 'timestamp': '2025-09-30 22:19:22.337509', 'step': 4477, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:22.377893', 'step': 4477, 'epoch': 3} {'type': 'loss', 'content': 0.010741564445197582, 'timestamp': '2025-09-30 22:19:22.391732', 'step': 4478, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:22.447550', 'step': 4478, 'epoch': 3} {'type': 'loss', 'content': 0.00607129605486989, 'timestamp': '2025-09-30 22:19:22.455257', 'step': 4479, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:22.503074', 'step': 4479, 'epoch': 3} {'type': 'loss', 'content': 0.005703271832317114, 'timestamp': '2025-09-30 22:19:22.532020', 'step': 4480, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:22.568373', 'step': 4480, 'epoch': 3} {'type': 'loss', 'content': 0.0015396331436932087, 'timestamp': '2025-09-30 22:19:22.578988', 'step': 4481, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:22.614057', 'step': 4481, 'epoch': 3} {'type': 'loss', 'content': 0.003246902721002698, 'timestamp': '2025-09-30 22:19:22.624342', 'step': 4482, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:22.659338', 'step': 4482, 'epoch': 3} {'type': 'loss', 'content': 0.004048208240419626, 'timestamp': '2025-09-30 22:19:22.670807', 'step': 4483, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:22.712806', 'step': 4483, 'epoch': 3} {'type': 'loss', 'content': 0.008900952525436878, 'timestamp': '2025-09-30 22:19:22.744077', 'step': 4484, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:22.780796', 'step': 4484, 'epoch': 3} {'type': 'loss', 'content': 0.0058488743379712105, 'timestamp': '2025-09-30 22:19:22.791327', 'step': 4485, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:19:25.343183', 'step': 4485, 'epoch': 3} {'type': 'pplx', 'content': 5.795513532637302, 'timestamp': '2025-09-30 22:19:25.355820', 'step': 4485, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:25.394515', 'step': 4485, 'epoch': 3} {'type': 'loss', 'content': 0.002202738542109728, 'timestamp': '2025-09-30 22:19:25.400758', 'step': 4486, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:25.438632', 'step': 4486, 'epoch': 3} {'type': 'loss', 'content': 0.001024956232868135, 'timestamp': '2025-09-30 22:19:25.449111', 'step': 4487, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:25.489632', 'step': 4487, 'epoch': 3} {'type': 'loss', 'content': 0.001970956800505519, 'timestamp': '2025-09-30 22:19:25.523713', 'step': 4488, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:25.562660', 'step': 4488, 'epoch': 3} {'type': 'loss', 'content': 0.010650524869561195, 'timestamp': '2025-09-30 22:19:25.571471', 'step': 4489, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:25.607493', 'step': 4489, 'epoch': 3} {'type': 'loss', 'content': 0.003535045078024268, 'timestamp': '2025-09-30 22:19:25.617205', 'step': 4490, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:25.656034', 'step': 4490, 'epoch': 3} {'type': 'loss', 'content': 0.0014868737198412418, 'timestamp': '2025-09-30 22:19:25.668287', 'step': 4491, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:25.707603', 'step': 4491, 'epoch': 3} {'type': 'loss', 'content': 0.004532901104539633, 'timestamp': '2025-09-30 22:19:25.740018', 'step': 4492, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:25.777722', 'step': 4492, 'epoch': 3} {'type': 'loss', 'content': 0.009004509076476097, 'timestamp': '2025-09-30 22:19:25.785619', 'step': 4493, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:25.845473', 'step': 4493, 'epoch': 3} {'type': 'loss', 'content': 0.001528367749415338, 'timestamp': '2025-09-30 22:19:25.856924', 'step': 4494, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:25.910174', 'step': 4494, 'epoch': 3} {'type': 'loss', 'content': 0.004119920544326305, 'timestamp': '2025-09-30 22:19:25.925795', 'step': 4495, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:25.972438', 'step': 4495, 'epoch': 3} {'type': 'loss', 'content': 0.005202796310186386, 'timestamp': '2025-09-30 22:19:26.000510', 'step': 4496, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:26.061552', 'step': 4496, 'epoch': 3} {'type': 'loss', 'content': 0.002058766083791852, 'timestamp': '2025-09-30 22:19:26.074841', 'step': 4497, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:19:26.141053', 'step': 4497, 'epoch': 3} {'type': 'loss', 'content': 0.00176495430059731, 'timestamp': '2025-09-30 22:19:26.158449', 'step': 4498, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:26.202911', 'step': 4498, 'epoch': 3} {'type': 'loss', 'content': 0.008682888932526112, 'timestamp': '2025-09-30 22:19:26.215560', 'step': 4499, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:26.252182', 'step': 4499, 'epoch': 3} {'type': 'loss', 'content': 0.00501663563773036, 'timestamp': '2025-09-30 22:19:26.283402', 'step': 4500, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 4500', 'timestamp': '2025-09-30 22:19:31.309914', 'step': 4500, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:31.359179', 'step': 4500, 'epoch': 3} {'type': 'loss', 'content': 0.003126199124380946, 'timestamp': '2025-09-30 22:19:31.372107', 'step': 4501, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:31.406558', 'step': 4501, 'epoch': 3} {'type': 'loss', 'content': 0.01422073319554329, 'timestamp': '2025-09-30 22:19:31.417430', 'step': 4502, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:31.463988', 'step': 4502, 'epoch': 3} {'type': 'loss', 'content': 0.004692488815635443, 'timestamp': '2025-09-30 22:19:31.471643', 'step': 4503, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:31.520737', 'step': 4503, 'epoch': 3} {'type': 'loss', 'content': 0.0015276926569640636, 'timestamp': '2025-09-30 22:19:31.555586', 'step': 4504, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:31.594486', 'step': 4504, 'epoch': 3} {'type': 'loss', 'content': 0.004904737696051598, 'timestamp': '2025-09-30 22:19:31.605122', 'step': 4505, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:31.644218', 'step': 4505, 'epoch': 3} {'type': 'loss', 'content': 0.005495449062436819, 'timestamp': '2025-09-30 22:19:31.655271', 'step': 4506, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:31.697492', 'step': 4506, 'epoch': 3} {'type': 'loss', 'content': 0.008430728688836098, 'timestamp': '2025-09-30 22:19:31.711452', 'step': 4507, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:31.756307', 'step': 4507, 'epoch': 3} {'type': 'loss', 'content': 0.007869812659919262, 'timestamp': '2025-09-30 22:19:31.784700', 'step': 4508, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:31.830203', 'step': 4508, 'epoch': 3} {'type': 'loss', 'content': 0.002214880893006921, 'timestamp': '2025-09-30 22:19:31.840630', 'step': 4509, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:31.878999', 'step': 4509, 'epoch': 3} {'type': 'loss', 'content': 0.0030361057724803686, 'timestamp': '2025-09-30 22:19:31.886720', 'step': 4510, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:31.926991', 'step': 4510, 'epoch': 3} {'type': 'loss', 'content': 0.002759290626272559, 'timestamp': '2025-09-30 22:19:31.934745', 'step': 4511, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:31.973961', 'step': 4511, 'epoch': 3} {'type': 'loss', 'content': 0.0018279353389516473, 'timestamp': '2025-09-30 22:19:32.005389', 'step': 4512, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:32.045766', 'step': 4512, 'epoch': 3} {'type': 'loss', 'content': 0.011083441786468029, 'timestamp': '2025-09-30 22:19:32.051414', 'step': 4513, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:32.093994', 'step': 4513, 'epoch': 3} {'type': 'loss', 'content': 0.0008053503697738051, 'timestamp': '2025-09-30 22:19:32.106568', 'step': 4514, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:32.146869', 'step': 4514, 'epoch': 3} {'type': 'loss', 'content': 0.0013490010751411319, 'timestamp': '2025-09-30 22:19:32.159453', 'step': 4515, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:32.228080', 'step': 4515, 'epoch': 3} {'type': 'loss', 'content': 0.003745648544281721, 'timestamp': '2025-09-30 22:19:32.260156', 'step': 4516, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:32.311933', 'step': 4516, 'epoch': 3} {'type': 'loss', 'content': 0.0028096225578337908, 'timestamp': '2025-09-30 22:19:32.318028', 'step': 4517, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:32.360083', 'step': 4517, 'epoch': 3} {'type': 'loss', 'content': 0.004467432387173176, 'timestamp': '2025-09-30 22:19:32.371190', 'step': 4518, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:32.416700', 'step': 4518, 'epoch': 3} {'type': 'loss', 'content': 0.00493756914511323, 'timestamp': '2025-09-30 22:19:32.430430', 'step': 4519, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:32.478984', 'step': 4519, 'epoch': 3} {'type': 'loss', 'content': 0.004701991565525532, 'timestamp': '2025-09-30 22:19:32.510094', 'step': 4520, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:32.557141', 'step': 4520, 'epoch': 3} {'type': 'loss', 'content': 0.008878360502421856, 'timestamp': '2025-09-30 22:19:32.562893', 'step': 4521, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:32.597872', 'step': 4521, 'epoch': 3} {'type': 'loss', 'content': 0.003462841734290123, 'timestamp': '2025-09-30 22:19:32.608997', 'step': 4522, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:32.665017', 'step': 4522, 'epoch': 3} {'type': 'loss', 'content': 0.015827376395463943, 'timestamp': '2025-09-30 22:19:32.675269', 'step': 4523, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:32.711444', 'step': 4523, 'epoch': 3} {'type': 'loss', 'content': 0.003019185969606042, 'timestamp': '2025-09-30 22:19:32.744561', 'step': 4524, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:32.789171', 'step': 4524, 'epoch': 3} {'type': 'loss', 'content': 0.0021726477425545454, 'timestamp': '2025-09-30 22:19:32.797833', 'step': 4525, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:32.837458', 'step': 4525, 'epoch': 3} {'type': 'loss', 'content': 0.0042687393724918365, 'timestamp': '2025-09-30 22:19:32.845431', 'step': 4526, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:32.882653', 'step': 4526, 'epoch': 3} {'type': 'loss', 'content': 0.004522709175944328, 'timestamp': '2025-09-30 22:19:32.889794', 'step': 4527, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:32.922349', 'step': 4527, 'epoch': 3} {'type': 'loss', 'content': 0.005108777899295092, 'timestamp': '2025-09-30 22:19:32.950849', 'step': 4528, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:32.988486', 'step': 4528, 'epoch': 3} {'type': 'loss', 'content': 0.0053230454213917255, 'timestamp': '2025-09-30 22:19:32.998325', 'step': 4529, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:33.032370', 'step': 4529, 'epoch': 3} {'type': 'loss', 'content': 0.005572678055614233, 'timestamp': '2025-09-30 22:19:33.044810', 'step': 4530, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:33.085025', 'step': 4530, 'epoch': 3} {'type': 'loss', 'content': 0.003167995484545827, 'timestamp': '2025-09-30 22:19:33.098737', 'step': 4531, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:33.137690', 'step': 4531, 'epoch': 3} {'type': 'loss', 'content': 0.0014535030350089073, 'timestamp': '2025-09-30 22:19:33.166124', 'step': 4532, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:33.200752', 'step': 4532, 'epoch': 3} {'type': 'loss', 'content': 0.010079230181872845, 'timestamp': '2025-09-30 22:19:33.206397', 'step': 4533, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:33.247966', 'step': 4533, 'epoch': 3} {'type': 'loss', 'content': 0.0051791672594845295, 'timestamp': '2025-09-30 22:19:33.258505', 'step': 4534, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:33.297368', 'step': 4534, 'epoch': 3} {'type': 'loss', 'content': 0.005085056647658348, 'timestamp': '2025-09-30 22:19:33.308492', 'step': 4535, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:33.348339', 'step': 4535, 'epoch': 3} {'type': 'loss', 'content': 0.0078585809096694, 'timestamp': '2025-09-30 22:19:33.377208', 'step': 4536, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 15662185694400}, 'timestamp': '2025-09-30 22:19:33.423072', 'step': 4536, 'epoch': 3} {'type': 'loss', 'content': 0.004234223160892725, 'timestamp': '2025-09-30 22:19:33.442306', 'step': 4537, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:33.481406', 'step': 4537, 'epoch': 3} {'type': 'loss', 'content': 0.002334218705072999, 'timestamp': '2025-09-30 22:19:33.493957', 'step': 4538, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:33.531691', 'step': 4538, 'epoch': 3} {'type': 'loss', 'content': 0.011637231335043907, 'timestamp': '2025-09-30 22:19:33.545518', 'step': 4539, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:33.584546', 'step': 4539, 'epoch': 3} {'type': 'loss', 'content': 0.0036545591428875923, 'timestamp': '2025-09-30 22:19:33.618787', 'step': 4540, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:33.658187', 'step': 4540, 'epoch': 3} {'type': 'loss', 'content': 0.0018542211037129164, 'timestamp': '2025-09-30 22:19:33.667590', 'step': 4541, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:33.719858', 'step': 4541, 'epoch': 3} {'type': 'loss', 'content': 0.0016864044591784477, 'timestamp': '2025-09-30 22:19:33.733520', 'step': 4542, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:33.767590', 'step': 4542, 'epoch': 3} {'type': 'loss', 'content': 0.0020412022713571787, 'timestamp': '2025-09-30 22:19:33.779761', 'step': 4543, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:33.825168', 'step': 4543, 'epoch': 3} {'type': 'loss', 'content': 0.004759989213198423, 'timestamp': '2025-09-30 22:19:33.858343', 'step': 4544, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:33.898086', 'step': 4544, 'epoch': 3} {'type': 'loss', 'content': 0.00546990055590868, 'timestamp': '2025-09-30 22:19:33.908074', 'step': 4545, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:33.962281', 'step': 4545, 'epoch': 3} {'type': 'loss', 'content': 0.00273419008590281, 'timestamp': '2025-09-30 22:19:33.974656', 'step': 4546, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:34.016467', 'step': 4546, 'epoch': 3} {'type': 'loss', 'content': 0.010769542306661606, 'timestamp': '2025-09-30 22:19:34.030472', 'step': 4547, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:34.074285', 'step': 4547, 'epoch': 3} {'type': 'loss', 'content': 0.0031620825175195932, 'timestamp': '2025-09-30 22:19:34.103023', 'step': 4548, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:34.147515', 'step': 4548, 'epoch': 3} {'type': 'loss', 'content': 0.003730887547135353, 'timestamp': '2025-09-30 22:19:34.163035', 'step': 4549, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:34.203874', 'step': 4549, 'epoch': 3} {'type': 'loss', 'content': 0.002586633199825883, 'timestamp': '2025-09-30 22:19:34.216436', 'step': 4550, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:34.255486', 'step': 4550, 'epoch': 3} {'type': 'loss', 'content': 0.0012792575871571898, 'timestamp': '2025-09-30 22:19:34.263083', 'step': 4551, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:34.302280', 'step': 4551, 'epoch': 3} {'type': 'loss', 'content': 0.003631437197327614, 'timestamp': '2025-09-30 22:19:34.331090', 'step': 4552, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:34.368618', 'step': 4552, 'epoch': 3} {'type': 'loss', 'content': 0.004950222093611956, 'timestamp': '2025-09-30 22:19:34.379083', 'step': 4553, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:34.422989', 'step': 4553, 'epoch': 3} {'type': 'loss', 'content': 0.0036054716911166906, 'timestamp': '2025-09-30 22:19:34.436797', 'step': 4554, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:34.477069', 'step': 4554, 'epoch': 3} {'type': 'loss', 'content': 0.0020827697589993477, 'timestamp': '2025-09-30 22:19:34.489445', 'step': 4555, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:34.525800', 'step': 4555, 'epoch': 3} {'type': 'loss', 'content': 0.003716694889590144, 'timestamp': '2025-09-30 22:19:34.559014', 'step': 4556, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:34.600518', 'step': 4556, 'epoch': 3} {'type': 'loss', 'content': 0.0007190012838691473, 'timestamp': '2025-09-30 22:19:34.610510', 'step': 4557, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:34.657518', 'step': 4557, 'epoch': 3} {'type': 'loss', 'content': 0.005343511700630188, 'timestamp': '2025-09-30 22:19:34.669788', 'step': 4558, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:34.713873', 'step': 4558, 'epoch': 3} {'type': 'loss', 'content': 0.002123891608789563, 'timestamp': '2025-09-30 22:19:34.724974', 'step': 4559, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:34.765335', 'step': 4559, 'epoch': 3} {'type': 'loss', 'content': 0.0020689836237579584, 'timestamp': '2025-09-30 22:19:34.793883', 'step': 4560, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:34.832596', 'step': 4560, 'epoch': 3} {'type': 'loss', 'content': 0.005065195262432098, 'timestamp': '2025-09-30 22:19:34.845951', 'step': 4561, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:34.884767', 'step': 4561, 'epoch': 3} {'type': 'loss', 'content': 0.00638920022174716, 'timestamp': '2025-09-30 22:19:34.898497', 'step': 4562, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:34.937061', 'step': 4562, 'epoch': 3} {'type': 'loss', 'content': 0.0024130765814334154, 'timestamp': '2025-09-30 22:19:34.949688', 'step': 4563, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:34.991363', 'step': 4563, 'epoch': 3} {'type': 'loss', 'content': 0.009124759584665298, 'timestamp': '2025-09-30 22:19:35.025565', 'step': 4564, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:35.064008', 'step': 4564, 'epoch': 3} {'type': 'loss', 'content': 0.0013621627585962415, 'timestamp': '2025-09-30 22:19:35.073866', 'step': 4565, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:35.115166', 'step': 4565, 'epoch': 3} {'type': 'loss', 'content': 0.005780582316219807, 'timestamp': '2025-09-30 22:19:35.125457', 'step': 4566, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:35.161516', 'step': 4566, 'epoch': 3} {'type': 'loss', 'content': 0.004457356408238411, 'timestamp': '2025-09-30 22:19:35.168562', 'step': 4567, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:35.218344', 'step': 4567, 'epoch': 3} {'type': 'loss', 'content': 0.000518388522323221, 'timestamp': '2025-09-30 22:19:35.253237', 'step': 4568, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:35.294771', 'step': 4568, 'epoch': 3} {'type': 'loss', 'content': 0.005421688314527273, 'timestamp': '2025-09-30 22:19:35.304611', 'step': 4569, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:35.346315', 'step': 4569, 'epoch': 3} {'type': 'loss', 'content': 0.0036547333002090454, 'timestamp': '2025-09-30 22:19:35.360070', 'step': 4570, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:35.396271', 'step': 4570, 'epoch': 3} {'type': 'loss', 'content': 0.007334040943533182, 'timestamp': '2025-09-30 22:19:35.408659', 'step': 4571, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:35.446802', 'step': 4571, 'epoch': 3} {'type': 'loss', 'content': 0.0014599317219108343, 'timestamp': '2025-09-30 22:19:35.478668', 'step': 4572, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:35.514337', 'step': 4572, 'epoch': 3} {'type': 'loss', 'content': 0.0040939245373010635, 'timestamp': '2025-09-30 22:19:35.522476', 'step': 4573, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:35.564827', 'step': 4573, 'epoch': 3} {'type': 'loss', 'content': 0.0051482319831848145, 'timestamp': '2025-09-30 22:19:35.578547', 'step': 4574, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:35.622731', 'step': 4574, 'epoch': 3} {'type': 'loss', 'content': 0.002121998928487301, 'timestamp': '2025-09-30 22:19:35.635040', 'step': 4575, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:35.684360', 'step': 4575, 'epoch': 3} {'type': 'loss', 'content': 0.00047555830678902566, 'timestamp': '2025-09-30 22:19:35.720822', 'step': 4576, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:35.760202', 'step': 4576, 'epoch': 3} {'type': 'loss', 'content': 0.0017047971487045288, 'timestamp': '2025-09-30 22:19:35.770457', 'step': 4577, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:35.810577', 'step': 4577, 'epoch': 3} {'type': 'loss', 'content': 0.0023253338877111673, 'timestamp': '2025-09-30 22:19:35.822895', 'step': 4578, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:35.856316', 'step': 4578, 'epoch': 3} {'type': 'loss', 'content': 0.002054597483947873, 'timestamp': '2025-09-30 22:19:35.870507', 'step': 4579, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:35.908699', 'step': 4579, 'epoch': 3} {'type': 'loss', 'content': 0.0006414930685423315, 'timestamp': '2025-09-30 22:19:35.936668', 'step': 4580, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:35.971631', 'step': 4580, 'epoch': 3} {'type': 'loss', 'content': 0.0006736897048540413, 'timestamp': '2025-09-30 22:19:35.976553', 'step': 4581, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:36.021701', 'step': 4581, 'epoch': 3} {'type': 'loss', 'content': 0.002001025015488267, 'timestamp': '2025-09-30 22:19:36.032984', 'step': 4582, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:36.067324', 'step': 4582, 'epoch': 3} {'type': 'loss', 'content': 0.0002340712962904945, 'timestamp': '2025-09-30 22:19:36.077736', 'step': 4583, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:36.112009', 'step': 4583, 'epoch': 3} {'type': 'loss', 'content': 0.0006811887142248452, 'timestamp': '2025-09-30 22:19:36.143946', 'step': 4584, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:36.184921', 'step': 4584, 'epoch': 3} {'type': 'loss', 'content': 0.0010683821747079492, 'timestamp': '2025-09-30 22:19:36.193513', 'step': 4585, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:36.235100', 'step': 4585, 'epoch': 3} {'type': 'loss', 'content': 0.005461221560835838, 'timestamp': '2025-09-30 22:19:36.248455', 'step': 4586, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:36.281608', 'step': 4586, 'epoch': 3} {'type': 'loss', 'content': 0.0006103842169977725, 'timestamp': '2025-09-30 22:19:36.292621', 'step': 4587, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:36.333461', 'step': 4587, 'epoch': 3} {'type': 'loss', 'content': 0.0017355261370539665, 'timestamp': '2025-09-30 22:19:36.360914', 'step': 4588, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:36.397875', 'step': 4588, 'epoch': 3} {'type': 'loss', 'content': 0.0022532320581376553, 'timestamp': '2025-09-30 22:19:36.406639', 'step': 4589, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:36.455523', 'step': 4589, 'epoch': 3} {'type': 'loss', 'content': 0.004017478786408901, 'timestamp': '2025-09-30 22:19:36.466679', 'step': 4590, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:36.505417', 'step': 4590, 'epoch': 3} {'type': 'loss', 'content': 0.0012854663655161858, 'timestamp': '2025-09-30 22:19:36.515902', 'step': 4591, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:36.579035', 'step': 4591, 'epoch': 3} {'type': 'loss', 'content': 0.008219798095524311, 'timestamp': '2025-09-30 22:19:36.608591', 'step': 4592, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:36.657192', 'step': 4592, 'epoch': 3} {'type': 'loss', 'content': 0.0021332986652851105, 'timestamp': '2025-09-30 22:19:36.666962', 'step': 4593, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:36.715361', 'step': 4593, 'epoch': 3} {'type': 'loss', 'content': 0.0011237307917326689, 'timestamp': '2025-09-30 22:19:36.719861', 'step': 4594, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:36.763505', 'step': 4594, 'epoch': 3} {'type': 'loss', 'content': 0.0002990429929923266, 'timestamp': '2025-09-30 22:19:36.771576', 'step': 4595, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:36.814773', 'step': 4595, 'epoch': 3} {'type': 'loss', 'content': 0.003179828403517604, 'timestamp': '2025-09-30 22:19:36.840896', 'step': 4596, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:36.881684', 'step': 4596, 'epoch': 3} {'type': 'loss', 'content': 0.0031162381637841463, 'timestamp': '2025-09-30 22:19:36.886663', 'step': 4597, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:36.929348', 'step': 4597, 'epoch': 3} {'type': 'loss', 'content': 0.000504596158862114, 'timestamp': '2025-09-30 22:19:36.936979', 'step': 4598, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:36.989186', 'step': 4598, 'epoch': 3} {'type': 'loss', 'content': 0.0007450035191141069, 'timestamp': '2025-09-30 22:19:36.997008', 'step': 4599, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:37.033174', 'step': 4599, 'epoch': 3} {'type': 'loss', 'content': 0.00017571434727869928, 'timestamp': '2025-09-30 22:19:37.064481', 'step': 4600, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:19:39.766720', 'step': 4600, 'epoch': 3} {'type': 'pplx', 'content': 5.656691005321658, 'timestamp': '2025-09-30 22:19:39.775260', 'step': 4600, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:39.805167', 'step': 4600, 'epoch': 3} {'type': 'loss', 'content': 0.0016262733843177557, 'timestamp': '2025-09-30 22:19:39.812645', 'step': 4601, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:39.857086', 'step': 4601, 'epoch': 3} {'type': 'loss', 'content': 0.0020167173352092505, 'timestamp': '2025-09-30 22:19:39.867368', 'step': 4602, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:19:39.912503', 'step': 4602, 'epoch': 3} {'type': 'loss', 'content': 0.005614968482404947, 'timestamp': '2025-09-30 22:19:39.930231', 'step': 4603, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:39.968839', 'step': 4603, 'epoch': 3} {'type': 'loss', 'content': 0.005495783872902393, 'timestamp': '2025-09-30 22:19:40.002221', 'step': 4604, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:40.046830', 'step': 4604, 'epoch': 3} {'type': 'loss', 'content': 0.018731797114014626, 'timestamp': '2025-09-30 22:19:40.059484', 'step': 4605, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:40.103731', 'step': 4605, 'epoch': 3} {'type': 'loss', 'content': 0.0006927954382263124, 'timestamp': '2025-09-30 22:19:40.113991', 'step': 4606, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:40.152335', 'step': 4606, 'epoch': 3} {'type': 'loss', 'content': 0.0013639118988066912, 'timestamp': '2025-09-30 22:19:40.162699', 'step': 4607, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:40.197857', 'step': 4607, 'epoch': 3} {'type': 'loss', 'content': 0.0010228747269138694, 'timestamp': '2025-09-30 22:19:40.233554', 'step': 4608, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:40.280263', 'step': 4608, 'epoch': 3} {'type': 'loss', 'content': 0.0013792128302156925, 'timestamp': '2025-09-30 22:19:40.288248', 'step': 4609, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:40.323107', 'step': 4609, 'epoch': 3} {'type': 'loss', 'content': 0.002117737429216504, 'timestamp': '2025-09-30 22:19:40.335460', 'step': 4610, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:19:40.381733', 'step': 4610, 'epoch': 3} {'type': 'loss', 'content': 0.0012078828876838088, 'timestamp': '2025-09-30 22:19:40.398857', 'step': 4611, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:40.437781', 'step': 4611, 'epoch': 3} {'type': 'loss', 'content': 0.002663145773112774, 'timestamp': '2025-09-30 22:19:40.469111', 'step': 4612, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:40.503691', 'step': 4612, 'epoch': 3} {'type': 'loss', 'content': 0.001201400882564485, 'timestamp': '2025-09-30 22:19:40.508981', 'step': 4613, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:40.543120', 'step': 4613, 'epoch': 3} {'type': 'loss', 'content': 0.0009282511891797185, 'timestamp': '2025-09-30 22:19:40.550316', 'step': 4614, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:40.589720', 'step': 4614, 'epoch': 3} {'type': 'loss', 'content': 0.005988210439682007, 'timestamp': '2025-09-30 22:19:40.600119', 'step': 4615, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:40.644818', 'step': 4615, 'epoch': 3} {'type': 'loss', 'content': 7.458464096998796e-05, 'timestamp': '2025-09-30 22:19:40.670151', 'step': 4616, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:40.713428', 'step': 4616, 'epoch': 3} {'type': 'loss', 'content': 0.0013050598790869117, 'timestamp': '2025-09-30 22:19:40.718264', 'step': 4617, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:40.770462', 'step': 4617, 'epoch': 3} {'type': 'loss', 'content': 0.0040612309239804745, 'timestamp': '2025-09-30 22:19:40.784326', 'step': 4618, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:40.841919', 'step': 4618, 'epoch': 3} {'type': 'loss', 'content': 0.000192474210052751, 'timestamp': '2025-09-30 22:19:40.858243', 'step': 4619, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:40.891975', 'step': 4619, 'epoch': 3} {'type': 'loss', 'content': 0.00040366427856497467, 'timestamp': '2025-09-30 22:19:40.920797', 'step': 4620, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:40.961687', 'step': 4620, 'epoch': 3} {'type': 'loss', 'content': 0.005431856960058212, 'timestamp': '2025-09-30 22:19:40.974238', 'step': 4621, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:41.023215', 'step': 4621, 'epoch': 3} {'type': 'loss', 'content': 0.009540246799588203, 'timestamp': '2025-09-30 22:19:41.028508', 'step': 4622, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:41.068857', 'step': 4622, 'epoch': 3} {'type': 'loss', 'content': 0.00016501954814884812, 'timestamp': '2025-09-30 22:19:41.076044', 'step': 4623, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:41.131513', 'step': 4623, 'epoch': 3} {'type': 'loss', 'content': 0.0013068681582808495, 'timestamp': '2025-09-30 22:19:41.165789', 'step': 4624, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:19:41.206596', 'step': 4624, 'epoch': 3} {'type': 'loss', 'content': 0.005706985015422106, 'timestamp': '2025-09-30 22:19:41.221978', 'step': 4625, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:41.282173', 'step': 4625, 'epoch': 3} {'type': 'loss', 'content': 0.002301833126693964, 'timestamp': '2025-09-30 22:19:41.292744', 'step': 4626, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:41.342512', 'step': 4626, 'epoch': 3} {'type': 'loss', 'content': 0.010077630169689655, 'timestamp': '2025-09-30 22:19:41.356219', 'step': 4627, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:19:41.408316', 'step': 4627, 'epoch': 3} {'type': 'loss', 'content': 0.003587552113458514, 'timestamp': '2025-09-30 22:19:41.446230', 'step': 4628, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:41.485501', 'step': 4628, 'epoch': 3} {'type': 'loss', 'content': 0.002079867059364915, 'timestamp': '2025-09-30 22:19:41.495288', 'step': 4629, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:41.534082', 'step': 4629, 'epoch': 3} {'type': 'loss', 'content': 0.005988697987049818, 'timestamp': '2025-09-30 22:19:41.546494', 'step': 4630, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:41.589623', 'step': 4630, 'epoch': 3} {'type': 'loss', 'content': 0.0020013691391795874, 'timestamp': '2025-09-30 22:19:41.603360', 'step': 4631, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:41.638617', 'step': 4631, 'epoch': 3} {'type': 'loss', 'content': 0.0010503423400223255, 'timestamp': '2025-09-30 22:19:41.669780', 'step': 4632, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:41.707403', 'step': 4632, 'epoch': 3} {'type': 'loss', 'content': 0.002322463784366846, 'timestamp': '2025-09-30 22:19:41.715325', 'step': 4633, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:41.760474', 'step': 4633, 'epoch': 3} {'type': 'loss', 'content': 0.0016447566449642181, 'timestamp': '2025-09-30 22:19:41.776142', 'step': 4634, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:41.810510', 'step': 4634, 'epoch': 3} {'type': 'loss', 'content': 0.003105347277596593, 'timestamp': '2025-09-30 22:19:41.823078', 'step': 4635, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:41.860386', 'step': 4635, 'epoch': 3} {'type': 'loss', 'content': 0.004947391804307699, 'timestamp': '2025-09-30 22:19:41.895038', 'step': 4636, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:41.950048', 'step': 4636, 'epoch': 3} {'type': 'loss', 'content': 0.0003328040475025773, 'timestamp': '2025-09-30 22:19:41.959936', 'step': 4637, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:41.993101', 'step': 4637, 'epoch': 3} {'type': 'loss', 'content': 0.0006075861747376621, 'timestamp': '2025-09-30 22:19:42.000545', 'step': 4638, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:42.057172', 'step': 4638, 'epoch': 3} {'type': 'loss', 'content': 0.0014844061806797981, 'timestamp': '2025-09-30 22:19:42.064867', 'step': 4639, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:42.106063', 'step': 4639, 'epoch': 3} {'type': 'loss', 'content': 0.005587319377809763, 'timestamp': '2025-09-30 22:19:42.133826', 'step': 4640, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:42.183493', 'step': 4640, 'epoch': 3} {'type': 'loss', 'content': 0.0011380692012608051, 'timestamp': '2025-09-30 22:19:42.188688', 'step': 4641, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:42.238509', 'step': 4641, 'epoch': 3} {'type': 'loss', 'content': 0.0002756123139988631, 'timestamp': '2025-09-30 22:19:42.249595', 'step': 4642, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:42.316745', 'step': 4642, 'epoch': 3} {'type': 'loss', 'content': 0.0017162506701424718, 'timestamp': '2025-09-30 22:19:42.328400', 'step': 4643, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:42.378517', 'step': 4643, 'epoch': 3} {'type': 'loss', 'content': 0.0008271224214695394, 'timestamp': '2025-09-30 22:19:42.410499', 'step': 4644, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:42.446342', 'step': 4644, 'epoch': 3} {'type': 'loss', 'content': 0.0012134123826399446, 'timestamp': '2025-09-30 22:19:42.456770', 'step': 4645, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:42.496616', 'step': 4645, 'epoch': 3} {'type': 'loss', 'content': 0.007950005121529102, 'timestamp': '2025-09-30 22:19:42.507694', 'step': 4646, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:42.549047', 'step': 4646, 'epoch': 3} {'type': 'loss', 'content': 0.0023825380485504866, 'timestamp': '2025-09-30 22:19:42.561643', 'step': 4647, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:42.602439', 'step': 4647, 'epoch': 3} {'type': 'loss', 'content': 0.0019718753173947334, 'timestamp': '2025-09-30 22:19:42.632142', 'step': 4648, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:42.673672', 'step': 4648, 'epoch': 3} {'type': 'loss', 'content': 0.014634182676672935, 'timestamp': '2025-09-30 22:19:42.682308', 'step': 4649, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:42.716078', 'step': 4649, 'epoch': 3} {'type': 'loss', 'content': 0.0015215821331366897, 'timestamp': '2025-09-30 22:19:42.723182', 'step': 4650, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:42.767185', 'step': 4650, 'epoch': 3} {'type': 'loss', 'content': 0.007147200405597687, 'timestamp': '2025-09-30 22:19:42.778329', 'step': 4651, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:42.820664', 'step': 4651, 'epoch': 3} {'type': 'loss', 'content': 5.3520649089477956e-05, 'timestamp': '2025-09-30 22:19:42.852633', 'step': 4652, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:42.891055', 'step': 4652, 'epoch': 3} {'type': 'loss', 'content': 0.0012179957702755928, 'timestamp': '2025-09-30 22:19:42.904463', 'step': 4653, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:42.946556', 'step': 4653, 'epoch': 3} {'type': 'loss', 'content': 0.010880755260586739, 'timestamp': '2025-09-30 22:19:42.958239', 'step': 4654, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:43.005839', 'step': 4654, 'epoch': 3} {'type': 'loss', 'content': 0.001264199847355485, 'timestamp': '2025-09-30 22:19:43.012997', 'step': 4655, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:43.059069', 'step': 4655, 'epoch': 3} {'type': 'loss', 'content': 0.001374272396788001, 'timestamp': '2025-09-30 22:19:43.093699', 'step': 4656, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:43.131002', 'step': 4656, 'epoch': 3} {'type': 'loss', 'content': 0.0026250374503433704, 'timestamp': '2025-09-30 22:19:43.143605', 'step': 4657, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:43.183035', 'step': 4657, 'epoch': 3} {'type': 'loss', 'content': 0.001169042312540114, 'timestamp': '2025-09-30 22:19:43.195286', 'step': 4658, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:43.229861', 'step': 4658, 'epoch': 3} {'type': 'loss', 'content': 0.0010033926228061318, 'timestamp': '2025-09-30 22:19:43.242385', 'step': 4659, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:43.281345', 'step': 4659, 'epoch': 3} {'type': 'loss', 'content': 0.007937717251479626, 'timestamp': '2025-09-30 22:19:43.315966', 'step': 4660, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 15662185694400}, 'timestamp': '2025-09-30 22:19:43.367063', 'step': 4660, 'epoch': 3} {'type': 'loss', 'content': 0.002611410105600953, 'timestamp': '2025-09-30 22:19:43.386394', 'step': 4661, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:43.428308', 'step': 4661, 'epoch': 3} {'type': 'loss', 'content': 0.001537830918096006, 'timestamp': '2025-09-30 22:19:43.443976', 'step': 4662, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:43.484425', 'step': 4662, 'epoch': 3} {'type': 'loss', 'content': 0.0025844555348157883, 'timestamp': '2025-09-30 22:19:43.495595', 'step': 4663, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:43.539062', 'step': 4663, 'epoch': 3} {'type': 'loss', 'content': 0.007824945263564587, 'timestamp': '2025-09-30 22:19:43.573655', 'step': 4664, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:43.614690', 'step': 4664, 'epoch': 3} {'type': 'loss', 'content': 0.0009558065794408321, 'timestamp': '2025-09-30 22:19:43.623966', 'step': 4665, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:43.663348', 'step': 4665, 'epoch': 3} {'type': 'loss', 'content': 0.01655222289264202, 'timestamp': '2025-09-30 22:19:43.675841', 'step': 4666, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:43.719324', 'step': 4666, 'epoch': 3} {'type': 'loss', 'content': 0.0012177954195067286, 'timestamp': '2025-09-30 22:19:43.734899', 'step': 4667, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:43.786009', 'step': 4667, 'epoch': 3} {'type': 'loss', 'content': 0.0031382772140204906, 'timestamp': '2025-09-30 22:19:43.820731', 'step': 4668, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:43.860008', 'step': 4668, 'epoch': 3} {'type': 'loss', 'content': 0.0005979533889330924, 'timestamp': '2025-09-30 22:19:43.868549', 'step': 4669, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:43.920216', 'step': 4669, 'epoch': 3} {'type': 'loss', 'content': 0.0010151851456612349, 'timestamp': '2025-09-30 22:19:43.933517', 'step': 4670, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:43.970539', 'step': 4670, 'epoch': 3} {'type': 'loss', 'content': 0.0009803612483665347, 'timestamp': '2025-09-30 22:19:43.977699', 'step': 4671, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:44.017376', 'step': 4671, 'epoch': 3} {'type': 'loss', 'content': 0.0012478310381993651, 'timestamp': '2025-09-30 22:19:44.048548', 'step': 4672, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:44.087850', 'step': 4672, 'epoch': 3} {'type': 'loss', 'content': 0.008808085694909096, 'timestamp': '2025-09-30 22:19:44.099007', 'step': 4673, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:44.134380', 'step': 4673, 'epoch': 3} {'type': 'loss', 'content': 0.0010443759383633733, 'timestamp': '2025-09-30 22:19:44.141579', 'step': 4674, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:44.188309', 'step': 4674, 'epoch': 3} {'type': 'loss', 'content': 0.002289071911945939, 'timestamp': '2025-09-30 22:19:44.199260', 'step': 4675, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-09-30 22:19:44.248904', 'step': 4675, 'epoch': 3} {'type': 'loss', 'content': 0.0038310252130031586, 'timestamp': '2025-09-30 22:19:44.288856', 'step': 4676, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:44.338180', 'step': 4676, 'epoch': 3} {'type': 'loss', 'content': 0.0003230969305150211, 'timestamp': '2025-09-30 22:19:44.346774', 'step': 4677, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:44.384951', 'step': 4677, 'epoch': 3} {'type': 'loss', 'content': 0.00023587503528688103, 'timestamp': '2025-09-30 22:19:44.392194', 'step': 4678, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:19:44.440927', 'step': 4678, 'epoch': 3} {'type': 'loss', 'content': 0.0006135299918241799, 'timestamp': '2025-09-30 22:19:44.457240', 'step': 4679, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:44.506173', 'step': 4679, 'epoch': 3} {'type': 'loss', 'content': 0.0010174872586503625, 'timestamp': '2025-09-30 22:19:44.535051', 'step': 4680, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:44.569337', 'step': 4680, 'epoch': 3} {'type': 'loss', 'content': 0.0034730613697320223, 'timestamp': '2025-09-30 22:19:44.579241', 'step': 4681, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:44.613427', 'step': 4681, 'epoch': 3} {'type': 'loss', 'content': 0.003659060224890709, 'timestamp': '2025-09-30 22:19:44.621172', 'step': 4682, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:44.665480', 'step': 4682, 'epoch': 3} {'type': 'loss', 'content': 0.004465331789106131, 'timestamp': '2025-09-30 22:19:44.673396', 'step': 4683, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:44.710044', 'step': 4683, 'epoch': 3} {'type': 'loss', 'content': 0.0053755613043904305, 'timestamp': '2025-09-30 22:19:44.744291', 'step': 4684, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:44.777990', 'step': 4684, 'epoch': 3} {'type': 'loss', 'content': 0.001365147065371275, 'timestamp': '2025-09-30 22:19:44.783298', 'step': 4685, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:44.826299', 'step': 4685, 'epoch': 3} {'type': 'loss', 'content': 0.0025931692216545343, 'timestamp': '2025-09-30 22:19:44.837848', 'step': 4686, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:44.879541', 'step': 4686, 'epoch': 3} {'type': 'loss', 'content': 0.004931640345603228, 'timestamp': '2025-09-30 22:19:44.891238', 'step': 4687, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:44.933341', 'step': 4687, 'epoch': 3} {'type': 'loss', 'content': 0.004364571999758482, 'timestamp': '2025-09-30 22:19:44.965204', 'step': 4688, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:45.003072', 'step': 4688, 'epoch': 3} {'type': 'loss', 'content': 0.00468175346031785, 'timestamp': '2025-09-30 22:19:45.005771', 'step': 4689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:45.047548', 'step': 4689, 'epoch': 3} {'type': 'loss', 'content': 0.0012734520714730024, 'timestamp': '2025-09-30 22:19:45.058551', 'step': 4690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:19:45.092072', 'step': 4690, 'epoch': 3} {'type': 'loss', 'content': 0.002087516477331519, 'timestamp': '2025-09-30 22:19:45.101670', 'step': 4691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:45.136529', 'step': 4691, 'epoch': 3} {'type': 'loss', 'content': 0.0008861172827892005, 'timestamp': '2025-09-30 22:19:45.166658', 'step': 4692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:45.204268', 'step': 4692, 'epoch': 3} {'type': 'loss', 'content': 0.0002589194045867771, 'timestamp': '2025-09-30 22:19:45.212857', 'step': 4693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:45.244925', 'step': 4693, 'epoch': 3} {'type': 'loss', 'content': 0.004392869770526886, 'timestamp': '2025-09-30 22:19:45.249359', 'step': 4694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:45.283113', 'step': 4694, 'epoch': 3} {'type': 'loss', 'content': 0.001689436612650752, 'timestamp': '2025-09-30 22:19:45.293505', 'step': 4695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:45.331621', 'step': 4695, 'epoch': 3} {'type': 'loss', 'content': 0.0020471159368753433, 'timestamp': '2025-09-30 22:19:45.366301', 'step': 4696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:45.403012', 'step': 4696, 'epoch': 3} {'type': 'loss', 'content': 0.003873740555718541, 'timestamp': '2025-09-30 22:19:45.416051', 'step': 4697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:45.456013', 'step': 4697, 'epoch': 3} {'type': 'loss', 'content': 0.0026147381868213415, 'timestamp': '2025-09-30 22:19:45.463286', 'step': 4698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:45.509674', 'step': 4698, 'epoch': 3} {'type': 'loss', 'content': 0.0006632203003391623, 'timestamp': '2025-09-30 22:19:45.517249', 'step': 4699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:45.557208', 'step': 4699, 'epoch': 3} {'type': 'loss', 'content': 0.004550919868052006, 'timestamp': '2025-09-30 22:19:45.591516', 'step': 4700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:45.641723', 'step': 4700, 'epoch': 3} {'type': 'loss', 'content': 0.0008368680137209594, 'timestamp': '2025-09-30 22:19:45.650357', 'step': 4701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:45.686989', 'step': 4701, 'epoch': 3} {'type': 'loss', 'content': 0.002739719580858946, 'timestamp': '2025-09-30 22:19:45.698049', 'step': 4702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:45.737941', 'step': 4702, 'epoch': 3} {'type': 'loss', 'content': 0.000583208107855171, 'timestamp': '2025-09-30 22:19:45.751669', 'step': 4703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:45.793151', 'step': 4703, 'epoch': 3} {'type': 'loss', 'content': 0.010152159258723259, 'timestamp': '2025-09-30 22:19:45.825217', 'step': 4704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:45.873179', 'step': 4704, 'epoch': 3} {'type': 'loss', 'content': 0.0006749342428520322, 'timestamp': '2025-09-30 22:19:45.877914', 'step': 4705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:45.924602', 'step': 4705, 'epoch': 3} {'type': 'loss', 'content': 0.0002722602221183479, 'timestamp': '2025-09-30 22:19:45.932173', 'step': 4706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:45.996214', 'step': 4706, 'epoch': 3} {'type': 'loss', 'content': 0.002924448810517788, 'timestamp': '2025-09-30 22:19:46.010194', 'step': 4707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 17560600598464}, 'timestamp': '2025-09-30 22:19:46.068069', 'step': 4707, 'epoch': 3} {'type': 'loss', 'content': 0.002039122162386775, 'timestamp': '2025-09-30 22:19:46.110020', 'step': 4708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:46.146288', 'step': 4708, 'epoch': 3} {'type': 'loss', 'content': 0.0011562302242964506, 'timestamp': '2025-09-30 22:19:46.155943', 'step': 4709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:46.203784', 'step': 4709, 'epoch': 3} {'type': 'loss', 'content': 0.00036889605689793825, 'timestamp': '2025-09-30 22:19:46.217473', 'step': 4710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:46.270105', 'step': 4710, 'epoch': 3} {'type': 'loss', 'content': 0.0008371215080842376, 'timestamp': '2025-09-30 22:19:46.276936', 'step': 4711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:46.321107', 'step': 4711, 'epoch': 3} {'type': 'loss', 'content': 0.0016866663936525583, 'timestamp': '2025-09-30 22:19:46.352194', 'step': 4712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:46.396317', 'step': 4712, 'epoch': 3} {'type': 'loss', 'content': 0.0002764615637715906, 'timestamp': '2025-09-30 22:19:46.401667', 'step': 4713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:46.436917', 'step': 4713, 'epoch': 3} {'type': 'loss', 'content': 0.00044769837404601276, 'timestamp': '2025-09-30 22:19:46.446931', 'step': 4714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:46.495594', 'step': 4714, 'epoch': 3} {'type': 'loss', 'content': 0.003973923623561859, 'timestamp': '2025-09-30 22:19:46.502644', 'step': 4715, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:19:49.159786', 'step': 4715, 'epoch': 3} {'type': 'pplx', 'content': 5.916283949698796, 'timestamp': '2025-09-30 22:19:49.165907', 'step': 4715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:49.198024', 'step': 4715, 'epoch': 3} {'type': 'loss', 'content': 0.008571183308959007, 'timestamp': '2025-09-30 22:19:49.231133', 'step': 4716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:49.280396', 'step': 4716, 'epoch': 3} {'type': 'loss', 'content': 0.0006484166369773448, 'timestamp': '2025-09-30 22:19:49.288754', 'step': 4717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:49.335668', 'step': 4717, 'epoch': 3} {'type': 'loss', 'content': 0.0005155609687790275, 'timestamp': '2025-09-30 22:19:49.348306', 'step': 4718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:49.392772', 'step': 4718, 'epoch': 3} {'type': 'loss', 'content': 0.003010595915839076, 'timestamp': '2025-09-30 22:19:49.400358', 'step': 4719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:49.440611', 'step': 4719, 'epoch': 3} {'type': 'loss', 'content': 0.002692155074328184, 'timestamp': '2025-09-30 22:19:49.471788', 'step': 4720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:19:49.504329', 'step': 4720, 'epoch': 3} {'type': 'loss', 'content': 0.001199776423163712, 'timestamp': '2025-09-30 22:19:49.506514', 'step': 4721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:49.541885', 'step': 4721, 'epoch': 3} {'type': 'loss', 'content': 0.0006059607258066535, 'timestamp': '2025-09-30 22:19:49.549424', 'step': 4722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:49.591228', 'step': 4722, 'epoch': 3} {'type': 'loss', 'content': 0.0011583056766539812, 'timestamp': '2025-09-30 22:19:49.604659', 'step': 4723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:49.643155', 'step': 4723, 'epoch': 3} {'type': 'loss', 'content': 0.0011154541280120611, 'timestamp': '2025-09-30 22:19:49.671796', 'step': 4724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:49.706889', 'step': 4724, 'epoch': 3} {'type': 'loss', 'content': 0.001738877734169364, 'timestamp': '2025-09-30 22:19:49.714893', 'step': 4725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:49.748139', 'step': 4725, 'epoch': 3} {'type': 'loss', 'content': 0.0023189731873571873, 'timestamp': '2025-09-30 22:19:49.756106', 'step': 4726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:49.792938', 'step': 4726, 'epoch': 3} {'type': 'loss', 'content': 0.003144277259707451, 'timestamp': '2025-09-30 22:19:49.805038', 'step': 4727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:49.849840', 'step': 4727, 'epoch': 3} {'type': 'loss', 'content': 0.009790902957320213, 'timestamp': '2025-09-30 22:19:49.884156', 'step': 4728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:49.924978', 'step': 4728, 'epoch': 3} {'type': 'loss', 'content': 0.000623914529569447, 'timestamp': '2025-09-30 22:19:49.930454', 'step': 4729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:49.963705', 'step': 4729, 'epoch': 3} {'type': 'loss', 'content': 0.010114804841578007, 'timestamp': '2025-09-30 22:19:49.971723', 'step': 4730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:50.010049', 'step': 4730, 'epoch': 3} {'type': 'loss', 'content': 0.0009162042988464236, 'timestamp': '2025-09-30 22:19:50.018097', 'step': 4731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:50.056047', 'step': 4731, 'epoch': 3} {'type': 'loss', 'content': 0.00019842210167553276, 'timestamp': '2025-09-30 22:19:50.089084', 'step': 4732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:50.137549', 'step': 4732, 'epoch': 3} {'type': 'loss', 'content': 0.002871887059882283, 'timestamp': '2025-09-30 22:19:50.152815', 'step': 4733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:50.189714', 'step': 4733, 'epoch': 3} {'type': 'loss', 'content': 0.003444153117015958, 'timestamp': '2025-09-30 22:19:50.203455', 'step': 4734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:50.241485', 'step': 4734, 'epoch': 3} {'type': 'loss', 'content': 0.004989412147551775, 'timestamp': '2025-09-30 22:19:50.248331', 'step': 4735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:19:50.302747', 'step': 4735, 'epoch': 3} {'type': 'loss', 'content': 0.006008580792695284, 'timestamp': '2025-09-30 22:19:50.339932', 'step': 4736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:50.383337', 'step': 4736, 'epoch': 3} {'type': 'loss', 'content': 0.0024037344846874475, 'timestamp': '2025-09-30 22:19:50.395881', 'step': 4737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:50.435039', 'step': 4737, 'epoch': 3} {'type': 'loss', 'content': 0.0007503728847950697, 'timestamp': '2025-09-30 22:19:50.448716', 'step': 4738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:50.485018', 'step': 4738, 'epoch': 3} {'type': 'loss', 'content': 0.0027330864686518908, 'timestamp': '2025-09-30 22:19:50.492885', 'step': 4739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:50.530740', 'step': 4739, 'epoch': 3} {'type': 'loss', 'content': 0.005178096238523722, 'timestamp': '2025-09-30 22:19:50.565531', 'step': 4740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:50.624153', 'step': 4740, 'epoch': 3} {'type': 'loss', 'content': 0.0003495750133879483, 'timestamp': '2025-09-30 22:19:50.634871', 'step': 4741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:50.679545', 'step': 4741, 'epoch': 3} {'type': 'loss', 'content': 0.0003605498350225389, 'timestamp': '2025-09-30 22:19:50.687500', 'step': 4742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:50.729502', 'step': 4742, 'epoch': 3} {'type': 'loss', 'content': 0.002601269632577896, 'timestamp': '2025-09-30 22:19:50.740075', 'step': 4743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:50.787436', 'step': 4743, 'epoch': 3} {'type': 'loss', 'content': 0.0007903297082521021, 'timestamp': '2025-09-30 22:19:50.820818', 'step': 4744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:19:50.864280', 'step': 4744, 'epoch': 3} {'type': 'loss', 'content': 0.000361712125595659, 'timestamp': '2025-09-30 22:19:50.880085', 'step': 4745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:50.919572', 'step': 4745, 'epoch': 3} {'type': 'loss', 'content': 0.01337670162320137, 'timestamp': '2025-09-30 22:19:50.929988', 'step': 4746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:50.962672', 'step': 4746, 'epoch': 3} {'type': 'loss', 'content': 0.0022669811733067036, 'timestamp': '2025-09-30 22:19:50.970607', 'step': 4747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:51.013616', 'step': 4747, 'epoch': 3} {'type': 'loss', 'content': 0.0003634454042185098, 'timestamp': '2025-09-30 22:19:51.042074', 'step': 4748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:51.077298', 'step': 4748, 'epoch': 3} {'type': 'loss', 'content': 0.004122935235500336, 'timestamp': '2025-09-30 22:19:51.090513', 'step': 4749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:51.129413', 'step': 4749, 'epoch': 3} {'type': 'loss', 'content': 0.0012344943825155497, 'timestamp': '2025-09-30 22:19:51.136228', 'step': 4750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:51.170077', 'step': 4750, 'epoch': 3} {'type': 'loss', 'content': 0.0031255418434739113, 'timestamp': '2025-09-30 22:19:51.174527', 'step': 4751, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:51.208859', 'step': 4751, 'epoch': 3} {'type': 'loss', 'content': 0.0018705984111875296, 'timestamp': '2025-09-30 22:19:51.240840', 'step': 4752, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:51.277634', 'step': 4752, 'epoch': 3} {'type': 'loss', 'content': 0.00835074856877327, 'timestamp': '2025-09-30 22:19:51.283274', 'step': 4753, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:51.321212', 'step': 4753, 'epoch': 3} {'type': 'loss', 'content': 0.0031465431675314903, 'timestamp': '2025-09-30 22:19:51.331623', 'step': 4754, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:51.384832', 'step': 4754, 'epoch': 3} {'type': 'loss', 'content': 0.0033585333731025457, 'timestamp': '2025-09-30 22:19:51.389860', 'step': 4755, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:51.429842', 'step': 4755, 'epoch': 3} {'type': 'loss', 'content': 0.00037267437437549233, 'timestamp': '2025-09-30 22:19:51.464158', 'step': 4756, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:51.509366', 'step': 4756, 'epoch': 3} {'type': 'loss', 'content': 0.002945550484582782, 'timestamp': '2025-09-30 22:19:51.517312', 'step': 4757, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:51.553013', 'step': 4757, 'epoch': 3} {'type': 'loss', 'content': 0.003742699744179845, 'timestamp': '2025-09-30 22:19:51.560746', 'step': 4758, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:51.594390', 'step': 4758, 'epoch': 3} {'type': 'loss', 'content': 0.0038363654166460037, 'timestamp': '2025-09-30 22:19:51.598863', 'step': 4759, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:51.642669', 'step': 4759, 'epoch': 3} {'type': 'loss', 'content': 0.0011934564681723714, 'timestamp': '2025-09-30 22:19:51.673819', 'step': 4760, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:51.708526', 'step': 4760, 'epoch': 3} {'type': 'loss', 'content': 0.0024356788489967585, 'timestamp': '2025-09-30 22:19:51.711917', 'step': 4761, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:51.751471', 'step': 4761, 'epoch': 3} {'type': 'loss', 'content': 0.00025372591335326433, 'timestamp': '2025-09-30 22:19:51.758390', 'step': 4762, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:19:51.791196', 'step': 4762, 'epoch': 3} {'type': 'loss', 'content': 0.0007249244372360408, 'timestamp': '2025-09-30 22:19:51.795468', 'step': 4763, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:51.839825', 'step': 4763, 'epoch': 3} {'type': 'loss', 'content': 0.0019645406864583492, 'timestamp': '2025-09-30 22:19:51.873195', 'step': 4764, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:19:51.920353', 'step': 4764, 'epoch': 3} {'type': 'loss', 'content': 0.0048051439225673676, 'timestamp': '2025-09-30 22:19:51.927563', 'step': 4765, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:51.962335', 'step': 4765, 'epoch': 3} {'type': 'loss', 'content': 0.0010794244008138776, 'timestamp': '2025-09-30 22:19:51.966738', 'step': 4766, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:19:51.999503', 'step': 4766, 'epoch': 3} {'type': 'loss', 'content': 0.00022505532251670957, 'timestamp': '2025-09-30 22:19:52.003882', 'step': 4767, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:52.038392', 'step': 4767, 'epoch': 3} {'type': 'loss', 'content': 0.0019170681480318308, 'timestamp': '2025-09-30 22:19:52.067010', 'step': 4768, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:52.106339', 'step': 4768, 'epoch': 3} {'type': 'loss', 'content': 0.0008401435916312039, 'timestamp': '2025-09-30 22:19:52.111424', 'step': 4769, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:52.150870', 'step': 4769, 'epoch': 3} {'type': 'loss', 'content': 0.0033256863243877888, 'timestamp': '2025-09-30 22:19:52.160827', 'step': 4770, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:52.197730', 'step': 4770, 'epoch': 3} {'type': 'loss', 'content': 0.004256395623087883, 'timestamp': '2025-09-30 22:19:52.205367', 'step': 4771, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:52.239837', 'step': 4771, 'epoch': 3} {'type': 'loss', 'content': 0.001991269877180457, 'timestamp': '2025-09-30 22:19:52.267819', 'step': 4772, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:52.301773', 'step': 4772, 'epoch': 3} {'type': 'loss', 'content': 0.0005228807567618787, 'timestamp': '2025-09-30 22:19:52.306328', 'step': 4773, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:52.346044', 'step': 4773, 'epoch': 3} {'type': 'loss', 'content': 0.0021591621916741133, 'timestamp': '2025-09-30 22:19:52.356981', 'step': 4774, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:52.390548', 'step': 4774, 'epoch': 3} {'type': 'loss', 'content': 0.002881488995626569, 'timestamp': '2025-09-30 22:19:52.402822', 'step': 4775, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:52.439950', 'step': 4775, 'epoch': 3} {'type': 'loss', 'content': 0.004071081057190895, 'timestamp': '2025-09-30 22:19:52.474829', 'step': 4776, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:52.514831', 'step': 4776, 'epoch': 3} {'type': 'loss', 'content': 0.0008434464689344168, 'timestamp': '2025-09-30 22:19:52.525133', 'step': 4777, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:52.558323', 'step': 4777, 'epoch': 3} {'type': 'loss', 'content': 0.0006649593124166131, 'timestamp': '2025-09-30 22:19:52.569433', 'step': 4778, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:52.602032', 'step': 4778, 'epoch': 3} {'type': 'loss', 'content': 0.0019085907842963934, 'timestamp': '2025-09-30 22:19:52.612269', 'step': 4779, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:52.645900', 'step': 4779, 'epoch': 3} {'type': 'loss', 'content': 0.0013334425166249275, 'timestamp': '2025-09-30 22:19:52.674176', 'step': 4780, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:52.713040', 'step': 4780, 'epoch': 3} {'type': 'loss', 'content': 0.004540843889117241, 'timestamp': '2025-09-30 22:19:52.728256', 'step': 4781, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:52.767615', 'step': 4781, 'epoch': 3} {'type': 'loss', 'content': 0.002883870154619217, 'timestamp': '2025-09-30 22:19:52.781370', 'step': 4782, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:52.817438', 'step': 4782, 'epoch': 3} {'type': 'loss', 'content': 0.004271261394023895, 'timestamp': '2025-09-30 22:19:52.827972', 'step': 4783, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:52.862346', 'step': 4783, 'epoch': 3} {'type': 'loss', 'content': 0.002068759873509407, 'timestamp': '2025-09-30 22:19:52.891048', 'step': 4784, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:52.926739', 'step': 4784, 'epoch': 3} {'type': 'loss', 'content': 0.002156310947611928, 'timestamp': '2025-09-30 22:19:52.931933', 'step': 4785, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:52.969114', 'step': 4785, 'epoch': 3} {'type': 'loss', 'content': 0.0015850166091695428, 'timestamp': '2025-09-30 22:19:52.976453', 'step': 4786, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:53.019116', 'step': 4786, 'epoch': 3} {'type': 'loss', 'content': 0.0010362897301092744, 'timestamp': '2025-09-30 22:19:53.033165', 'step': 4787, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:53.073942', 'step': 4787, 'epoch': 3} {'type': 'loss', 'content': 0.0005819321959279478, 'timestamp': '2025-09-30 22:19:53.102558', 'step': 4788, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:19:53.145141', 'step': 4788, 'epoch': 3} {'type': 'loss', 'content': 0.005462463945150375, 'timestamp': '2025-09-30 22:19:53.162067', 'step': 4789, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:53.197579', 'step': 4789, 'epoch': 3} {'type': 'loss', 'content': 0.0010457502212375402, 'timestamp': '2025-09-30 22:19:53.205505', 'step': 4790, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:53.241714', 'step': 4790, 'epoch': 3} {'type': 'loss', 'content': 0.0010831314139068127, 'timestamp': '2025-09-30 22:19:53.248644', 'step': 4791, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:53.285422', 'step': 4791, 'epoch': 3} {'type': 'loss', 'content': 0.005183074623346329, 'timestamp': '2025-09-30 22:19:53.313812', 'step': 4792, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:53.348349', 'step': 4792, 'epoch': 3} {'type': 'loss', 'content': 0.0035677261184901, 'timestamp': '2025-09-30 22:19:53.353841', 'step': 4793, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:53.386978', 'step': 4793, 'epoch': 3} {'type': 'loss', 'content': 0.0006080110324546695, 'timestamp': '2025-09-30 22:19:53.394277', 'step': 4794, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:53.429953', 'step': 4794, 'epoch': 3} {'type': 'loss', 'content': 0.002011936390772462, 'timestamp': '2025-09-30 22:19:53.442165', 'step': 4795, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:53.475436', 'step': 4795, 'epoch': 3} {'type': 'loss', 'content': 0.001812252216041088, 'timestamp': '2025-09-30 22:19:53.503743', 'step': 4796, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:53.569862', 'step': 4796, 'epoch': 3} {'type': 'loss', 'content': 0.0021046362817287445, 'timestamp': '2025-09-30 22:19:53.574914', 'step': 4797, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:53.608040', 'step': 4797, 'epoch': 3} {'type': 'loss', 'content': 0.0029523340053856373, 'timestamp': '2025-09-30 22:19:53.620683', 'step': 4798, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:53.653481', 'step': 4798, 'epoch': 3} {'type': 'loss', 'content': 0.0034778157714754343, 'timestamp': '2025-09-30 22:19:53.662876', 'step': 4799, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:53.701581', 'step': 4799, 'epoch': 3} {'type': 'loss', 'content': 0.0006414828239940107, 'timestamp': '2025-09-30 22:19:53.729273', 'step': 4800, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:53.778220', 'step': 4800, 'epoch': 3} {'type': 'loss', 'content': 0.0027236840687692165, 'timestamp': '2025-09-30 22:19:53.791260', 'step': 4801, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:53.825221', 'step': 4801, 'epoch': 3} {'type': 'loss', 'content': 0.001283834339119494, 'timestamp': '2025-09-30 22:19:53.832145', 'step': 4802, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:53.874548', 'step': 4802, 'epoch': 3} {'type': 'loss', 'content': 0.0005156965926289558, 'timestamp': '2025-09-30 22:19:53.884804', 'step': 4803, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:53.927121', 'step': 4803, 'epoch': 3} {'type': 'loss', 'content': 0.0042557427659630775, 'timestamp': '2025-09-30 22:19:53.955294', 'step': 4804, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:19:53.994287', 'step': 4804, 'epoch': 3} {'type': 'loss', 'content': 0.0015035402029752731, 'timestamp': '2025-09-30 22:19:54.010280', 'step': 4805, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:54.047057', 'step': 4805, 'epoch': 3} {'type': 'loss', 'content': 0.002708726329728961, 'timestamp': '2025-09-30 22:19:54.054508', 'step': 4806, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:54.098052', 'step': 4806, 'epoch': 3} {'type': 'loss', 'content': 0.001241795253008604, 'timestamp': '2025-09-30 22:19:54.110691', 'step': 4807, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:19:54.154326', 'step': 4807, 'epoch': 3} {'type': 'loss', 'content': 0.00076678377809003, 'timestamp': '2025-09-30 22:19:54.185596', 'step': 4808, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:54.227596', 'step': 4808, 'epoch': 3} {'type': 'loss', 'content': 0.0068432642146945, 'timestamp': '2025-09-30 22:19:54.240998', 'step': 4809, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:54.285095', 'step': 4809, 'epoch': 3} {'type': 'loss', 'content': 0.0026549880858510733, 'timestamp': '2025-09-30 22:19:54.299135', 'step': 4810, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:54.333302', 'step': 4810, 'epoch': 3} {'type': 'loss', 'content': 0.0008714778232388198, 'timestamp': '2025-09-30 22:19:54.345554', 'step': 4811, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:54.379315', 'step': 4811, 'epoch': 3} {'type': 'loss', 'content': 0.003857155330479145, 'timestamp': '2025-09-30 22:19:54.407641', 'step': 4812, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:19:54.443622', 'step': 4812, 'epoch': 3} {'type': 'loss', 'content': 0.010509241372346878, 'timestamp': '2025-09-30 22:19:54.456980', 'step': 4813, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:54.495788', 'step': 4813, 'epoch': 3} {'type': 'loss', 'content': 0.013247305527329445, 'timestamp': '2025-09-30 22:19:54.508415', 'step': 4814, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:54.555100', 'step': 4814, 'epoch': 3} {'type': 'loss', 'content': 0.008652194403111935, 'timestamp': '2025-09-30 22:19:54.570734', 'step': 4815, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:54.608167', 'step': 4815, 'epoch': 3} {'type': 'loss', 'content': 0.006328233517706394, 'timestamp': '2025-09-30 22:19:54.638271', 'step': 4816, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:54.672896', 'step': 4816, 'epoch': 3} {'type': 'loss', 'content': 0.006439377553761005, 'timestamp': '2025-09-30 22:19:54.685917', 'step': 4817, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:54.724449', 'step': 4817, 'epoch': 3} {'type': 'loss', 'content': 0.007747083902359009, 'timestamp': '2025-09-30 22:19:54.738261', 'step': 4818, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:54.777415', 'step': 4818, 'epoch': 3} {'type': 'loss', 'content': 0.00303424964658916, 'timestamp': '2025-09-30 22:19:54.787128', 'step': 4819, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:54.827660', 'step': 4819, 'epoch': 3} {'type': 'loss', 'content': 0.0028589745052158833, 'timestamp': '2025-09-30 22:19:54.861879', 'step': 4820, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:54.898352', 'step': 4820, 'epoch': 3} {'type': 'loss', 'content': 0.002803644398227334, 'timestamp': '2025-09-30 22:19:54.908489', 'step': 4821, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:54.944604', 'step': 4821, 'epoch': 3} {'type': 'loss', 'content': 0.004195516929030418, 'timestamp': '2025-09-30 22:19:54.956569', 'step': 4822, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:54.993327', 'step': 4822, 'epoch': 3} {'type': 'loss', 'content': 0.005452101118862629, 'timestamp': '2025-09-30 22:19:55.004232', 'step': 4823, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:19:55.041894', 'step': 4823, 'epoch': 3} {'type': 'loss', 'content': 0.003121099667623639, 'timestamp': '2025-09-30 22:19:55.072230', 'step': 4824, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:55.115066', 'step': 4824, 'epoch': 3} {'type': 'loss', 'content': 0.00045138385030440986, 'timestamp': '2025-09-30 22:19:55.119470', 'step': 4825, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:55.162302', 'step': 4825, 'epoch': 3} {'type': 'loss', 'content': 0.004608877934515476, 'timestamp': '2025-09-30 22:19:55.174368', 'step': 4826, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:55.207428', 'step': 4826, 'epoch': 3} {'type': 'loss', 'content': 0.01124146394431591, 'timestamp': '2025-09-30 22:19:55.218269', 'step': 4827, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:55.258376', 'step': 4827, 'epoch': 3} {'type': 'loss', 'content': 0.0018338344525545835, 'timestamp': '2025-09-30 22:19:55.289988', 'step': 4828, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:55.327265', 'step': 4828, 'epoch': 3} {'type': 'loss', 'content': 0.0024090490769594908, 'timestamp': '2025-09-30 22:19:55.340246', 'step': 4829, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:55.382022', 'step': 4829, 'epoch': 3} {'type': 'loss', 'content': 0.007087676785886288, 'timestamp': '2025-09-30 22:19:55.389352', 'step': 4830, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:19:57.831717', 'step': 4830, 'epoch': 3} {'type': 'pplx', 'content': 6.00403182745848, 'timestamp': '2025-09-30 22:19:57.835117', 'step': 4830, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:57.867527', 'step': 4830, 'epoch': 3} {'type': 'loss', 'content': 0.0020713661797344685, 'timestamp': '2025-09-30 22:19:57.877508', 'step': 4831, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:57.911376', 'step': 4831, 'epoch': 3} {'type': 'loss', 'content': 0.003641373012214899, 'timestamp': '2025-09-30 22:19:57.942477', 'step': 4832, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:19:57.983911', 'step': 4832, 'epoch': 3} {'type': 'loss', 'content': 0.00029382065986283123, 'timestamp': '2025-09-30 22:19:57.992717', 'step': 4833, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:58.032411', 'step': 4833, 'epoch': 3} {'type': 'loss', 'content': 0.0003123684728052467, 'timestamp': '2025-09-30 22:19:58.044397', 'step': 4834, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:19:58.080813', 'step': 4834, 'epoch': 3} {'type': 'loss', 'content': 0.0014360976638272405, 'timestamp': '2025-09-30 22:19:58.087632', 'step': 4835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:58.130984', 'step': 4835, 'epoch': 3} {'type': 'loss', 'content': 0.0006729178712703288, 'timestamp': '2025-09-30 22:19:58.165329', 'step': 4836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:58.198399', 'step': 4836, 'epoch': 3} {'type': 'loss', 'content': 0.0005326576065272093, 'timestamp': '2025-09-30 22:19:58.212066', 'step': 4837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:19:58.256949', 'step': 4837, 'epoch': 3} {'type': 'loss', 'content': 0.0012045191833749413, 'timestamp': '2025-09-30 22:19:58.270290', 'step': 4838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:58.305562', 'step': 4838, 'epoch': 3} {'type': 'loss', 'content': 0.003865152597427368, 'timestamp': '2025-09-30 22:19:58.316500', 'step': 4839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:58.361056', 'step': 4839, 'epoch': 3} {'type': 'loss', 'content': 0.010859113186597824, 'timestamp': '2025-09-30 22:19:58.392643', 'step': 4840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:19:58.438372', 'step': 4840, 'epoch': 3} {'type': 'loss', 'content': 0.0014040463138371706, 'timestamp': '2025-09-30 22:19:58.456456', 'step': 4841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:58.522837', 'step': 4841, 'epoch': 3} {'type': 'loss', 'content': 0.0022020265460014343, 'timestamp': '2025-09-30 22:19:58.542824', 'step': 4842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:58.596703', 'step': 4842, 'epoch': 3} {'type': 'loss', 'content': 0.002500901697203517, 'timestamp': '2025-09-30 22:19:58.614464', 'step': 4843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:58.662835', 'step': 4843, 'epoch': 3} {'type': 'loss', 'content': 0.009635166265070438, 'timestamp': '2025-09-30 22:19:58.702492', 'step': 4844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:58.742975', 'step': 4844, 'epoch': 3} {'type': 'loss', 'content': 0.000577495142351836, 'timestamp': '2025-09-30 22:19:58.756188', 'step': 4845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:58.803952', 'step': 4845, 'epoch': 3} {'type': 'loss', 'content': 0.003777291625738144, 'timestamp': '2025-09-30 22:19:58.816538', 'step': 4846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:58.861487', 'step': 4846, 'epoch': 3} {'type': 'loss', 'content': 0.0007583480328321457, 'timestamp': '2025-09-30 22:19:58.873666', 'step': 4847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:58.921695', 'step': 4847, 'epoch': 3} {'type': 'loss', 'content': 0.002258284017443657, 'timestamp': '2025-09-30 22:19:58.956370', 'step': 4848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:19:59.001847', 'step': 4848, 'epoch': 3} {'type': 'loss', 'content': 0.004016295075416565, 'timestamp': '2025-09-30 22:19:59.017499', 'step': 4849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:59.065748', 'step': 4849, 'epoch': 3} {'type': 'loss', 'content': 0.005633024498820305, 'timestamp': '2025-09-30 22:19:59.081262', 'step': 4850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:59.123684', 'step': 4850, 'epoch': 3} {'type': 'loss', 'content': 0.0025410479865968227, 'timestamp': '2025-09-30 22:19:59.136214', 'step': 4851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:59.188446', 'step': 4851, 'epoch': 3} {'type': 'loss', 'content': 0.0015249482821673155, 'timestamp': '2025-09-30 22:19:59.223137', 'step': 4852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:59.267398', 'step': 4852, 'epoch': 3} {'type': 'loss', 'content': 0.006067855749279261, 'timestamp': '2025-09-30 22:19:59.282725', 'step': 4853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:59.317344', 'step': 4853, 'epoch': 3} {'type': 'loss', 'content': 0.002697502262890339, 'timestamp': '2025-09-30 22:19:59.340439', 'step': 4854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:59.393458', 'step': 4854, 'epoch': 3} {'type': 'loss', 'content': 0.0076017738319933414, 'timestamp': '2025-09-30 22:19:59.418794', 'step': 4855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:59.460281', 'step': 4855, 'epoch': 3} {'type': 'loss', 'content': 0.0035977442748844624, 'timestamp': '2025-09-30 22:19:59.500647', 'step': 4856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:19:59.540118', 'step': 4856, 'epoch': 3} {'type': 'loss', 'content': 0.003823881270363927, 'timestamp': '2025-09-30 22:19:59.550554', 'step': 4857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:19:59.598185', 'step': 4857, 'epoch': 3} {'type': 'loss', 'content': 0.008830385282635689, 'timestamp': '2025-09-30 22:19:59.612014', 'step': 4858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:19:59.658641', 'step': 4858, 'epoch': 3} {'type': 'loss', 'content': 0.010811922140419483, 'timestamp': '2025-09-30 22:19:59.674249', 'step': 4859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:19:59.718442', 'step': 4859, 'epoch': 3} {'type': 'loss', 'content': 0.003709632670506835, 'timestamp': '2025-09-30 22:19:59.753149', 'step': 4860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:19:59.796300', 'step': 4860, 'epoch': 3} {'type': 'loss', 'content': 0.0012815343216061592, 'timestamp': '2025-09-30 22:19:59.813689', 'step': 4861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:19:59.857633', 'step': 4861, 'epoch': 3} {'type': 'loss', 'content': 0.011897546239197254, 'timestamp': '2025-09-30 22:19:59.870973', 'step': 4862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:19:59.913239', 'step': 4862, 'epoch': 3} {'type': 'loss', 'content': 0.004625137895345688, 'timestamp': '2025-09-30 22:19:59.924051', 'step': 4863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:19:59.972091', 'step': 4863, 'epoch': 3} {'type': 'loss', 'content': 0.004630404058843851, 'timestamp': '2025-09-30 22:20:00.005276', 'step': 4864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:00.059337', 'step': 4864, 'epoch': 3} {'type': 'loss', 'content': 0.0031580578070133924, 'timestamp': '2025-09-30 22:20:00.071931', 'step': 4865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:00.112764', 'step': 4865, 'epoch': 3} {'type': 'loss', 'content': 0.005686325021088123, 'timestamp': '2025-09-30 22:20:00.126599', 'step': 4866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:00.161908', 'step': 4866, 'epoch': 3} {'type': 'loss', 'content': 0.0021785860881209373, 'timestamp': '2025-09-30 22:20:00.175636', 'step': 4867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:00.217110', 'step': 4867, 'epoch': 3} {'type': 'loss', 'content': 0.0067062643356621265, 'timestamp': '2025-09-30 22:20:00.251278', 'step': 4868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:00.291486', 'step': 4868, 'epoch': 3} {'type': 'loss', 'content': 0.0007970409351401031, 'timestamp': '2025-09-30 22:20:00.301439', 'step': 4869, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:00.335479', 'step': 4869, 'epoch': 3} {'type': 'loss', 'content': 0.0006516333087347448, 'timestamp': '2025-09-30 22:20:00.346272', 'step': 4870, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:00.379224', 'step': 4870, 'epoch': 3} {'type': 'loss', 'content': 0.002726598409935832, 'timestamp': '2025-09-30 22:20:00.389978', 'step': 4871, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:00.422906', 'step': 4871, 'epoch': 3} {'type': 'loss', 'content': 0.002618856495246291, 'timestamp': '2025-09-30 22:20:00.456113', 'step': 4872, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:00.489336', 'step': 4872, 'epoch': 3} {'type': 'loss', 'content': 0.0014631540980190039, 'timestamp': '2025-09-30 22:20:00.493963', 'step': 4873, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:00.534430', 'step': 4873, 'epoch': 3} {'type': 'loss', 'content': 0.0011044855928048491, 'timestamp': '2025-09-30 22:20:00.545435', 'step': 4874, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:20:00.596335', 'step': 4874, 'epoch': 3} {'type': 'loss', 'content': 0.014512617141008377, 'timestamp': '2025-09-30 22:20:00.613349', 'step': 4875, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:00.653202', 'step': 4875, 'epoch': 3} {'type': 'loss', 'content': 0.0019889986142516136, 'timestamp': '2025-09-30 22:20:00.686346', 'step': 4876, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:00.729198', 'step': 4876, 'epoch': 3} {'type': 'loss', 'content': 0.0013323453022167087, 'timestamp': '2025-09-30 22:20:00.741932', 'step': 4877, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:20:00.788681', 'step': 4877, 'epoch': 3} {'type': 'loss', 'content': 0.0036682288628071547, 'timestamp': '2025-09-30 22:20:00.804398', 'step': 4878, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-09-30 22:20:00.858563', 'step': 4878, 'epoch': 3} {'type': 'loss', 'content': 0.004160485230386257, 'timestamp': '2025-09-30 22:20:00.877643', 'step': 4879, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:20:00.918275', 'step': 4879, 'epoch': 3} {'type': 'loss', 'content': 0.003462860593572259, 'timestamp': '2025-09-30 22:20:00.953097', 'step': 4880, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:00.996869', 'step': 4880, 'epoch': 3} {'type': 'loss', 'content': 0.004931773990392685, 'timestamp': '2025-09-30 22:20:01.007667', 'step': 4881, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:01.055795', 'step': 4881, 'epoch': 3} {'type': 'loss', 'content': 0.0018768792506307364, 'timestamp': '2025-09-30 22:20:01.068120', 'step': 4882, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:01.119732', 'step': 4882, 'epoch': 3} {'type': 'loss', 'content': 0.0018635703017935157, 'timestamp': '2025-09-30 22:20:01.133437', 'step': 4883, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:01.179444', 'step': 4883, 'epoch': 3} {'type': 'loss', 'content': 0.0034107728861272335, 'timestamp': '2025-09-30 22:20:01.214207', 'step': 4884, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:20:01.272375', 'step': 4884, 'epoch': 3} {'type': 'loss', 'content': 0.001060501323081553, 'timestamp': '2025-09-30 22:20:01.287779', 'step': 4885, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:01.334182', 'step': 4885, 'epoch': 3} {'type': 'loss', 'content': 0.0005893037305213511, 'timestamp': '2025-09-30 22:20:01.347887', 'step': 4886, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:01.380956', 'step': 4886, 'epoch': 3} {'type': 'loss', 'content': 0.0013945624232292175, 'timestamp': '2025-09-30 22:20:01.391880', 'step': 4887, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:01.439785', 'step': 4887, 'epoch': 3} {'type': 'loss', 'content': 0.0019568903371691704, 'timestamp': '2025-09-30 22:20:01.473211', 'step': 4888, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:20:01.513333', 'step': 4888, 'epoch': 3} {'type': 'loss', 'content': 0.005334537476301193, 'timestamp': '2025-09-30 22:20:01.529157', 'step': 4889, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:01.562951', 'step': 4889, 'epoch': 3} {'type': 'loss', 'content': 0.004985298495739698, 'timestamp': '2025-09-30 22:20:01.574186', 'step': 4890, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:01.613354', 'step': 4890, 'epoch': 3} {'type': 'loss', 'content': 0.004774213302880526, 'timestamp': '2025-09-30 22:20:01.626748', 'step': 4891, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:20:01.674578', 'step': 4891, 'epoch': 3} {'type': 'loss', 'content': 0.001032113330438733, 'timestamp': '2025-09-30 22:20:01.711652', 'step': 4892, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:01.750767', 'step': 4892, 'epoch': 3} {'type': 'loss', 'content': 0.0032533309422433376, 'timestamp': '2025-09-30 22:20:01.763906', 'step': 4893, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:01.805856', 'step': 4893, 'epoch': 3} {'type': 'loss', 'content': 0.0033527498599141836, 'timestamp': '2025-09-30 22:20:01.818433', 'step': 4894, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:20:01.863286', 'step': 4894, 'epoch': 3} {'type': 'loss', 'content': 0.0034811438526958227, 'timestamp': '2025-09-30 22:20:01.880580', 'step': 4895, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:20:01.922598', 'step': 4895, 'epoch': 3} {'type': 'loss', 'content': 0.0022857696749269962, 'timestamp': '2025-09-30 22:20:01.957522', 'step': 4896, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:01.992399', 'step': 4896, 'epoch': 3} {'type': 'loss', 'content': 0.0008828876889310777, 'timestamp': '2025-09-30 22:20:01.997708', 'step': 4897, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:02.038404', 'step': 4897, 'epoch': 3} {'type': 'loss', 'content': 0.006894740276038647, 'timestamp': '2025-09-30 22:20:02.047934', 'step': 4898, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:02.082137', 'step': 4898, 'epoch': 3} {'type': 'loss', 'content': 0.0022010121028870344, 'timestamp': '2025-09-30 22:20:02.093282', 'step': 4899, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:02.126092', 'step': 4899, 'epoch': 3} {'type': 'loss', 'content': 0.003999904729425907, 'timestamp': '2025-09-30 22:20:02.154926', 'step': 4900, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:02.196354', 'step': 4900, 'epoch': 3} {'type': 'loss', 'content': 0.002763181459158659, 'timestamp': '2025-09-30 22:20:02.201974', 'step': 4901, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:02.233901', 'step': 4901, 'epoch': 3} {'type': 'loss', 'content': 0.0005524757434614003, 'timestamp': '2025-09-30 22:20:02.242211', 'step': 4902, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:02.280643', 'step': 4902, 'epoch': 3} {'type': 'loss', 'content': 0.0009317506337538362, 'timestamp': '2025-09-30 22:20:02.288603', 'step': 4903, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:02.325818', 'step': 4903, 'epoch': 3} {'type': 'loss', 'content': 0.0023062448017299175, 'timestamp': '2025-09-30 22:20:02.357054', 'step': 4904, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:02.396426', 'step': 4904, 'epoch': 3} {'type': 'loss', 'content': 0.0009080026648007333, 'timestamp': '2025-09-30 22:20:02.404353', 'step': 4905, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:02.447633', 'step': 4905, 'epoch': 3} {'type': 'loss', 'content': 0.0010759907308965921, 'timestamp': '2025-09-30 22:20:02.458052', 'step': 4906, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:02.497621', 'step': 4906, 'epoch': 3} {'type': 'loss', 'content': 0.002068843925371766, 'timestamp': '2025-09-30 22:20:02.510208', 'step': 4907, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:02.557110', 'step': 4907, 'epoch': 3} {'type': 'loss', 'content': 0.0005585406324826181, 'timestamp': '2025-09-30 22:20:02.589242', 'step': 4908, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:02.624360', 'step': 4908, 'epoch': 3} {'type': 'loss', 'content': 0.0036755427718162537, 'timestamp': '2025-09-30 22:20:02.636168', 'step': 4909, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:02.676745', 'step': 4909, 'epoch': 3} {'type': 'loss', 'content': 0.0012291098246350884, 'timestamp': '2025-09-30 22:20:02.689557', 'step': 4910, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:02.753063', 'step': 4910, 'epoch': 3} {'type': 'loss', 'content': 0.0008901917026378214, 'timestamp': '2025-09-30 22:20:02.760310', 'step': 4911, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:02.801478', 'step': 4911, 'epoch': 3} {'type': 'loss', 'content': 0.0024204845540225506, 'timestamp': '2025-09-30 22:20:02.829224', 'step': 4912, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:20:02.866527', 'step': 4912, 'epoch': 3} {'type': 'loss', 'content': 0.002846227027475834, 'timestamp': '2025-09-30 22:20:02.876580', 'step': 4913, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:02.916586', 'step': 4913, 'epoch': 3} {'type': 'loss', 'content': 0.002848659874871373, 'timestamp': '2025-09-30 22:20:02.925383', 'step': 4914, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:02.968181', 'step': 4914, 'epoch': 3} {'type': 'loss', 'content': 0.000996431801468134, 'timestamp': '2025-09-30 22:20:02.980357', 'step': 4915, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:03.017893', 'step': 4915, 'epoch': 3} {'type': 'loss', 'content': 0.0005112317157909274, 'timestamp': '2025-09-30 22:20:03.047229', 'step': 4916, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:03.088400', 'step': 4916, 'epoch': 3} {'type': 'loss', 'content': 0.000986065249890089, 'timestamp': '2025-09-30 22:20:03.100841', 'step': 4917, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:03.134455', 'step': 4917, 'epoch': 3} {'type': 'loss', 'content': 0.002913024974986911, 'timestamp': '2025-09-30 22:20:03.146750', 'step': 4918, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:03.188641', 'step': 4918, 'epoch': 3} {'type': 'loss', 'content': 0.0006965301581658423, 'timestamp': '2025-09-30 22:20:03.202012', 'step': 4919, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:03.236631', 'step': 4919, 'epoch': 3} {'type': 'loss', 'content': 0.002137935720384121, 'timestamp': '2025-09-30 22:20:03.266008', 'step': 4920, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:03.300349', 'step': 4920, 'epoch': 3} {'type': 'loss', 'content': 0.0022465272340923548, 'timestamp': '2025-09-30 22:20:03.306086', 'step': 4921, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:03.340703', 'step': 4921, 'epoch': 3} {'type': 'loss', 'content': 0.0003808860492426902, 'timestamp': '2025-09-30 22:20:03.354710', 'step': 4922, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:03.389992', 'step': 4922, 'epoch': 3} {'type': 'loss', 'content': 0.0016280546551570296, 'timestamp': '2025-09-30 22:20:03.401028', 'step': 4923, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:03.438833', 'step': 4923, 'epoch': 3} {'type': 'loss', 'content': 0.002310381270945072, 'timestamp': '2025-09-30 22:20:03.470120', 'step': 4924, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:03.506525', 'step': 4924, 'epoch': 3} {'type': 'loss', 'content': 0.008243824355304241, 'timestamp': '2025-09-30 22:20:03.514154', 'step': 4925, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:03.561807', 'step': 4925, 'epoch': 3} {'type': 'loss', 'content': 0.0007182768895290792, 'timestamp': '2025-09-30 22:20:03.568986', 'step': 4926, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:03.631333', 'step': 4926, 'epoch': 3} {'type': 'loss', 'content': 0.0027369027957320213, 'timestamp': '2025-09-30 22:20:03.638561', 'step': 4927, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:03.685843', 'step': 4927, 'epoch': 3} {'type': 'loss', 'content': 0.0006846529431641102, 'timestamp': '2025-09-30 22:20:03.726993', 'step': 4928, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:03.766064', 'step': 4928, 'epoch': 3} {'type': 'loss', 'content': 0.0016806876519694924, 'timestamp': '2025-09-30 22:20:03.783157', 'step': 4929, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:03.842748', 'step': 4929, 'epoch': 3} {'type': 'loss', 'content': 0.0005215826095081866, 'timestamp': '2025-09-30 22:20:03.863782', 'step': 4930, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:03.905420', 'step': 4930, 'epoch': 3} {'type': 'loss', 'content': 0.0003204425738658756, 'timestamp': '2025-09-30 22:20:03.913001', 'step': 4931, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:03.953071', 'step': 4931, 'epoch': 3} {'type': 'loss', 'content': 0.0004890135023742914, 'timestamp': '2025-09-30 22:20:03.983205', 'step': 4932, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:04.048277', 'step': 4932, 'epoch': 3} {'type': 'loss', 'content': 0.00029881924274377525, 'timestamp': '2025-09-30 22:20:04.053114', 'step': 4933, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:04.103877', 'step': 4933, 'epoch': 3} {'type': 'loss', 'content': 0.0007652129279449582, 'timestamp': '2025-09-30 22:20:04.111772', 'step': 4934, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:04.145907', 'step': 4934, 'epoch': 3} {'type': 'loss', 'content': 0.002809490542858839, 'timestamp': '2025-09-30 22:20:04.156944', 'step': 4935, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:04.192975', 'step': 4935, 'epoch': 3} {'type': 'loss', 'content': 0.001373854698613286, 'timestamp': '2025-09-30 22:20:04.224343', 'step': 4936, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:04.258773', 'step': 4936, 'epoch': 3} {'type': 'loss', 'content': 0.0024177818559110165, 'timestamp': '2025-09-30 22:20:04.267563', 'step': 4937, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:04.308463', 'step': 4937, 'epoch': 3} {'type': 'loss', 'content': 0.000511349004227668, 'timestamp': '2025-09-30 22:20:04.325857', 'step': 4938, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:04.365670', 'step': 4938, 'epoch': 3} {'type': 'loss', 'content': 0.013990801759064198, 'timestamp': '2025-09-30 22:20:04.373393', 'step': 4939, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:04.425924', 'step': 4939, 'epoch': 3} {'type': 'loss', 'content': 0.0008692088886164129, 'timestamp': '2025-09-30 22:20:04.454526', 'step': 4940, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:04.487584', 'step': 4940, 'epoch': 3} {'type': 'loss', 'content': 0.0023386774118989706, 'timestamp': '2025-09-30 22:20:04.501836', 'step': 4941, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:04.543872', 'step': 4941, 'epoch': 3} {'type': 'loss', 'content': 0.0004770174855366349, 'timestamp': '2025-09-30 22:20:04.554180', 'step': 4942, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:04.586014', 'step': 4942, 'epoch': 3} {'type': 'loss', 'content': 0.0015950370579957962, 'timestamp': '2025-09-30 22:20:04.596529', 'step': 4943, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:04.630914', 'step': 4943, 'epoch': 3} {'type': 'loss', 'content': 0.0012634805170819163, 'timestamp': '2025-09-30 22:20:04.662667', 'step': 4944, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:04.697185', 'step': 4944, 'epoch': 3} {'type': 'loss', 'content': 0.0018261810764670372, 'timestamp': '2025-09-30 22:20:04.707109', 'step': 4945, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:20:07.195307', 'step': 4945, 'epoch': 3} {'type': 'pplx', 'content': 6.016984404568712, 'timestamp': '2025-09-30 22:20:07.199300', 'step': 4945, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:07.231856', 'step': 4945, 'epoch': 3} {'type': 'loss', 'content': 0.0005609581130556762, 'timestamp': '2025-09-30 22:20:07.241691', 'step': 4946, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:07.290686', 'step': 4946, 'epoch': 3} {'type': 'loss', 'content': 0.0029263258911669254, 'timestamp': '2025-09-30 22:20:07.304404', 'step': 4947, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:07.348977', 'step': 4947, 'epoch': 3} {'type': 'loss', 'content': 0.0006811014609411359, 'timestamp': '2025-09-30 22:20:07.380755', 'step': 4948, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:07.413620', 'step': 4948, 'epoch': 3} {'type': 'loss', 'content': 0.0005025799619033933, 'timestamp': '2025-09-30 22:20:07.422260', 'step': 4949, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:07.459848', 'step': 4949, 'epoch': 3} {'type': 'loss', 'content': 0.004274230916053057, 'timestamp': '2025-09-30 22:20:07.472374', 'step': 4950, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:07.516625', 'step': 4950, 'epoch': 3} {'type': 'loss', 'content': 0.0005864065024070442, 'timestamp': '2025-09-30 22:20:07.529218', 'step': 4951, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:07.579379', 'step': 4951, 'epoch': 3} {'type': 'loss', 'content': 0.00278579187579453, 'timestamp': '2025-09-30 22:20:07.611270', 'step': 4952, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:07.645371', 'step': 4952, 'epoch': 3} {'type': 'loss', 'content': 0.0011230125091969967, 'timestamp': '2025-09-30 22:20:07.653947', 'step': 4953, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:07.686018', 'step': 4953, 'epoch': 3} {'type': 'loss', 'content': 0.00035609089536592364, 'timestamp': '2025-09-30 22:20:07.697189', 'step': 4954, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:07.730898', 'step': 4954, 'epoch': 3} {'type': 'loss', 'content': 0.0022988419514149427, 'timestamp': '2025-09-30 22:20:07.743240', 'step': 4955, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:07.778623', 'step': 4955, 'epoch': 3} {'type': 'loss', 'content': 0.0022796562407165766, 'timestamp': '2025-09-30 22:20:07.807476', 'step': 4956, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:07.840947', 'step': 4956, 'epoch': 3} {'type': 'loss', 'content': 0.002927593421190977, 'timestamp': '2025-09-30 22:20:07.853662', 'step': 4957, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:07.886443', 'step': 4957, 'epoch': 3} {'type': 'loss', 'content': 0.0007647660095244646, 'timestamp': '2025-09-30 22:20:07.897514', 'step': 4958, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:07.941372', 'step': 4958, 'epoch': 3} {'type': 'loss', 'content': 0.0022333392407745123, 'timestamp': '2025-09-30 22:20:07.953674', 'step': 4959, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:07.998548', 'step': 4959, 'epoch': 3} {'type': 'loss', 'content': 0.0015479567227885127, 'timestamp': '2025-09-30 22:20:08.032839', 'step': 4960, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:08.072423', 'step': 4960, 'epoch': 3} {'type': 'loss', 'content': 9.858738485490903e-05, 'timestamp': '2025-09-30 22:20:08.080491', 'step': 4961, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:08.124165', 'step': 4961, 'epoch': 3} {'type': 'loss', 'content': 0.0009962372714653611, 'timestamp': '2025-09-30 22:20:08.136749', 'step': 4962, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:08.173761', 'step': 4962, 'epoch': 3} {'type': 'loss', 'content': 0.0013241246342658997, 'timestamp': '2025-09-30 22:20:08.186043', 'step': 4963, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:08.221733', 'step': 4963, 'epoch': 3} {'type': 'loss', 'content': 0.00218255165964365, 'timestamp': '2025-09-30 22:20:08.255168', 'step': 4964, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:08.316475', 'step': 4964, 'epoch': 3} {'type': 'loss', 'content': 0.0009654370369389653, 'timestamp': '2025-09-30 22:20:08.329147', 'step': 4965, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:08.363268', 'step': 4965, 'epoch': 3} {'type': 'loss', 'content': 0.000785224256105721, 'timestamp': '2025-09-30 22:20:08.374789', 'step': 4966, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:08.418988', 'step': 4966, 'epoch': 3} {'type': 'loss', 'content': 0.0007269098423421383, 'timestamp': '2025-09-30 22:20:08.433512', 'step': 4967, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:08.471546', 'step': 4967, 'epoch': 3} {'type': 'loss', 'content': 0.0024295735638588667, 'timestamp': '2025-09-30 22:20:08.504778', 'step': 4968, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:08.545227', 'step': 4968, 'epoch': 3} {'type': 'loss', 'content': 0.0007269516936503351, 'timestamp': '2025-09-30 22:20:08.553999', 'step': 4969, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:08.592788', 'step': 4969, 'epoch': 3} {'type': 'loss', 'content': 0.001029142295010388, 'timestamp': '2025-09-30 22:20:08.603925', 'step': 4970, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:08.641414', 'step': 4970, 'epoch': 3} {'type': 'loss', 'content': 0.0019772483501583338, 'timestamp': '2025-09-30 22:20:08.653986', 'step': 4971, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:08.699420', 'step': 4971, 'epoch': 3} {'type': 'loss', 'content': 0.003952382132411003, 'timestamp': '2025-09-30 22:20:08.731474', 'step': 4972, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:08.764520', 'step': 4972, 'epoch': 3} {'type': 'loss', 'content': 0.0005930595798417926, 'timestamp': '2025-09-30 22:20:08.775895', 'step': 4973, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:08.810330', 'step': 4973, 'epoch': 3} {'type': 'loss', 'content': 0.0009736541542224586, 'timestamp': '2025-09-30 22:20:08.820721', 'step': 4974, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:08.860718', 'step': 4974, 'epoch': 3} {'type': 'loss', 'content': 0.011569741182029247, 'timestamp': '2025-09-30 22:20:08.868208', 'step': 4975, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:08.901806', 'step': 4975, 'epoch': 3} {'type': 'loss', 'content': 0.0005655374843627214, 'timestamp': '2025-09-30 22:20:08.935015', 'step': 4976, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:08.977844', 'step': 4976, 'epoch': 3} {'type': 'loss', 'content': 0.002230295678600669, 'timestamp': '2025-09-30 22:20:08.988062', 'step': 4977, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:09.027488', 'step': 4977, 'epoch': 3} {'type': 'loss', 'content': 0.0006978387827984989, 'timestamp': '2025-09-30 22:20:09.039013', 'step': 4978, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:09.085753', 'step': 4978, 'epoch': 3} {'type': 'loss', 'content': 0.0005524156731553376, 'timestamp': '2025-09-30 22:20:09.098137', 'step': 4979, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:09.131276', 'step': 4979, 'epoch': 3} {'type': 'loss', 'content': 0.00037273086491040885, 'timestamp': '2025-09-30 22:20:09.166188', 'step': 4980, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:09.199316', 'step': 4980, 'epoch': 3} {'type': 'loss', 'content': 0.00028574716998264194, 'timestamp': '2025-09-30 22:20:09.210012', 'step': 4981, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:09.258417', 'step': 4981, 'epoch': 3} {'type': 'loss', 'content': 0.001474974793381989, 'timestamp': '2025-09-30 22:20:09.270690', 'step': 4982, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:09.303613', 'step': 4982, 'epoch': 3} {'type': 'loss', 'content': 0.0012398564722388983, 'timestamp': '2025-09-30 22:20:09.315884', 'step': 4983, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:09.356394', 'step': 4983, 'epoch': 3} {'type': 'loss', 'content': 0.0007058014743961394, 'timestamp': '2025-09-30 22:20:09.389849', 'step': 4984, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:09.424168', 'step': 4984, 'epoch': 3} {'type': 'loss', 'content': 0.0010078602936118841, 'timestamp': '2025-09-30 22:20:09.436862', 'step': 4985, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:09.471482', 'step': 4985, 'epoch': 3} {'type': 'loss', 'content': 0.0005812669987790287, 'timestamp': '2025-09-30 22:20:09.484021', 'step': 4986, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:09.519168', 'step': 4986, 'epoch': 3} {'type': 'loss', 'content': 0.002003656467422843, 'timestamp': '2025-09-30 22:20:09.530368', 'step': 4987, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:09.568815', 'step': 4987, 'epoch': 3} {'type': 'loss', 'content': 0.011207995936274529, 'timestamp': '2025-09-30 22:20:09.602096', 'step': 4988, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:09.636364', 'step': 4988, 'epoch': 3} {'type': 'loss', 'content': 0.0014819592470303178, 'timestamp': '2025-09-30 22:20:09.649034', 'step': 4989, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:09.685931', 'step': 4989, 'epoch': 3} {'type': 'loss', 'content': 0.004838210996240377, 'timestamp': '2025-09-30 22:20:09.699631', 'step': 4990, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:09.733798', 'step': 4990, 'epoch': 3} {'type': 'loss', 'content': 0.00016354575927834958, 'timestamp': '2025-09-30 22:20:09.746366', 'step': 4991, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:09.783152', 'step': 4991, 'epoch': 3} {'type': 'loss', 'content': 0.0002751017455011606, 'timestamp': '2025-09-30 22:20:09.816582', 'step': 4992, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:09.855504', 'step': 4992, 'epoch': 3} {'type': 'loss', 'content': 0.001686186995357275, 'timestamp': '2025-09-30 22:20:09.866405', 'step': 4993, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:09.909331', 'step': 4993, 'epoch': 3} {'type': 'loss', 'content': 0.0007971947197802365, 'timestamp': '2025-09-30 22:20:09.921978', 'step': 4994, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:09.954729', 'step': 4994, 'epoch': 3} {'type': 'loss', 'content': 0.0003467929782345891, 'timestamp': '2025-09-30 22:20:09.967319', 'step': 4995, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:09.999720', 'step': 4995, 'epoch': 3} {'type': 'loss', 'content': 0.000309546769130975, 'timestamp': '2025-09-30 22:20:10.032822', 'step': 4996, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:10.078952', 'step': 4996, 'epoch': 3} {'type': 'loss', 'content': 0.0012426752364262938, 'timestamp': '2025-09-30 22:20:10.098018', 'step': 4997, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:10.145792', 'step': 4997, 'epoch': 3} {'type': 'loss', 'content': 0.0015795336803421378, 'timestamp': '2025-09-30 22:20:10.158002', 'step': 4998, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:10.210350', 'step': 4998, 'epoch': 3} {'type': 'loss', 'content': 0.005597970448434353, 'timestamp': '2025-09-30 22:20:10.222900', 'step': 4999, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:10.273464', 'step': 4999, 'epoch': 3} {'type': 'loss', 'content': 0.001970804063603282, 'timestamp': '2025-09-30 22:20:10.305057', 'step': 5000, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 5000', 'timestamp': '2025-09-30 22:20:15.807087', 'step': 5000, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:15.845853', 'step': 5000, 'epoch': 3} {'type': 'loss', 'content': 0.0008431835449300706, 'timestamp': '2025-09-30 22:20:15.853094', 'step': 5001, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:15.888096', 'step': 5001, 'epoch': 3} {'type': 'loss', 'content': 0.00022042497585061938, 'timestamp': '2025-09-30 22:20:15.898327', 'step': 5002, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:15.949923', 'step': 5002, 'epoch': 3} {'type': 'loss', 'content': 0.0012976464349776506, 'timestamp': '2025-09-30 22:20:15.958730', 'step': 5003, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:15.998396', 'step': 5003, 'epoch': 3} {'type': 'loss', 'content': 0.0027910852804780006, 'timestamp': '2025-09-30 22:20:16.027833', 'step': 5004, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:16.061025', 'step': 5004, 'epoch': 3} {'type': 'loss', 'content': 0.003565196180716157, 'timestamp': '2025-09-30 22:20:16.066345', 'step': 5005, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:16.104647', 'step': 5005, 'epoch': 3} {'type': 'loss', 'content': 0.0007262213621288538, 'timestamp': '2025-09-30 22:20:16.112284', 'step': 5006, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:16.151245', 'step': 5006, 'epoch': 3} {'type': 'loss', 'content': 0.0004352441756054759, 'timestamp': '2025-09-30 22:20:16.158990', 'step': 5007, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:16.199166', 'step': 5007, 'epoch': 3} {'type': 'loss', 'content': 0.0002115533861797303, 'timestamp': '2025-09-30 22:20:16.227796', 'step': 5008, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:16.264336', 'step': 5008, 'epoch': 3} {'type': 'loss', 'content': 0.0023786493111401796, 'timestamp': '2025-09-30 22:20:16.272357', 'step': 5009, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:16.311612', 'step': 5009, 'epoch': 3} {'type': 'loss', 'content': 0.002945238258689642, 'timestamp': '2025-09-30 22:20:16.321793', 'step': 5010, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:16.357312', 'step': 5010, 'epoch': 3} {'type': 'loss', 'content': 0.0014447633875533938, 'timestamp': '2025-09-30 22:20:16.366645', 'step': 5011, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:16.415344', 'step': 5011, 'epoch': 3} {'type': 'loss', 'content': 0.005410191603004932, 'timestamp': '2025-09-30 22:20:16.445607', 'step': 5012, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:16.477476', 'step': 5012, 'epoch': 3} {'type': 'loss', 'content': 0.0032125164289027452, 'timestamp': '2025-09-30 22:20:16.483049', 'step': 5013, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:20:16.514767', 'step': 5013, 'epoch': 3} {'type': 'loss', 'content': 0.004742915742099285, 'timestamp': '2025-09-30 22:20:16.522775', 'step': 5014, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:16.561876', 'step': 5014, 'epoch': 3} {'type': 'loss', 'content': 0.0009069226798601449, 'timestamp': '2025-09-30 22:20:16.572292', 'step': 5015, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:16.605187', 'step': 5015, 'epoch': 3} {'type': 'loss', 'content': 0.00934299360960722, 'timestamp': '2025-09-30 22:20:16.638305', 'step': 5016, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:16.676225', 'step': 5016, 'epoch': 3} {'type': 'loss', 'content': 0.003764711320400238, 'timestamp': '2025-09-30 22:20:16.689222', 'step': 5017, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:16.740946', 'step': 5017, 'epoch': 3} {'type': 'loss', 'content': 0.0002709300024434924, 'timestamp': '2025-09-30 22:20:16.748265', 'step': 5018, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:16.791772', 'step': 5018, 'epoch': 3} {'type': 'loss', 'content': 0.005229764152318239, 'timestamp': '2025-09-30 22:20:16.804128', 'step': 5019, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:16.841050', 'step': 5019, 'epoch': 3} {'type': 'loss', 'content': 0.0009594621951691806, 'timestamp': '2025-09-30 22:20:16.883195', 'step': 5020, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:16.915666', 'step': 5020, 'epoch': 3} {'type': 'loss', 'content': 0.0020179443527013063, 'timestamp': '2025-09-30 22:20:16.924516', 'step': 5021, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:16.981422', 'step': 5021, 'epoch': 3} {'type': 'loss', 'content': 0.0021082826424390078, 'timestamp': '2025-09-30 22:20:16.994866', 'step': 5022, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:17.027975', 'step': 5022, 'epoch': 3} {'type': 'loss', 'content': 0.011751977726817131, 'timestamp': '2025-09-30 22:20:17.040371', 'step': 5023, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:17.074558', 'step': 5023, 'epoch': 3} {'type': 'loss', 'content': 0.0023646140471100807, 'timestamp': '2025-09-30 22:20:17.106742', 'step': 5024, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:17.153377', 'step': 5024, 'epoch': 3} {'type': 'loss', 'content': 0.0007554754265584052, 'timestamp': '2025-09-30 22:20:17.161507', 'step': 5025, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:17.197600', 'step': 5025, 'epoch': 3} {'type': 'loss', 'content': 0.0005631324602290988, 'timestamp': '2025-09-30 22:20:17.210151', 'step': 5026, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:17.266489', 'step': 5026, 'epoch': 3} {'type': 'loss', 'content': 0.0003204508393537253, 'timestamp': '2025-09-30 22:20:17.278794', 'step': 5027, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:17.317502', 'step': 5027, 'epoch': 3} {'type': 'loss', 'content': 0.008249454200267792, 'timestamp': '2025-09-30 22:20:17.350687', 'step': 5028, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:17.387331', 'step': 5028, 'epoch': 3} {'type': 'loss', 'content': 0.005023043602705002, 'timestamp': '2025-09-30 22:20:17.396077', 'step': 5029, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:17.435520', 'step': 5029, 'epoch': 3} {'type': 'loss', 'content': 0.0007688560290262103, 'timestamp': '2025-09-30 22:20:17.446811', 'step': 5030, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:17.478563', 'step': 5030, 'epoch': 3} {'type': 'loss', 'content': 0.0028127082623541355, 'timestamp': '2025-09-30 22:20:17.489162', 'step': 5031, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:17.529278', 'step': 5031, 'epoch': 3} {'type': 'loss', 'content': 0.004154357593506575, 'timestamp': '2025-09-30 22:20:17.560597', 'step': 5032, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:17.598751', 'step': 5032, 'epoch': 3} {'type': 'loss', 'content': 0.001402303110808134, 'timestamp': '2025-09-30 22:20:17.615024', 'step': 5033, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:17.658997', 'step': 5033, 'epoch': 3} {'type': 'loss', 'content': 0.0005616036360152066, 'timestamp': '2025-09-30 22:20:17.671340', 'step': 5034, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:17.704018', 'step': 5034, 'epoch': 3} {'type': 'loss', 'content': 0.0007149986340664327, 'timestamp': '2025-09-30 22:20:17.715071', 'step': 5035, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:17.758087', 'step': 5035, 'epoch': 3} {'type': 'loss', 'content': 0.0006970600225031376, 'timestamp': '2025-09-30 22:20:17.792801', 'step': 5036, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:17.840948', 'step': 5036, 'epoch': 3} {'type': 'loss', 'content': 0.0001771541137713939, 'timestamp': '2025-09-30 22:20:17.851519', 'step': 5037, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:17.897262', 'step': 5037, 'epoch': 3} {'type': 'loss', 'content': 0.002388948807492852, 'timestamp': '2025-09-30 22:20:17.905190', 'step': 5038, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:17.950640', 'step': 5038, 'epoch': 3} {'type': 'loss', 'content': 0.010566468350589275, 'timestamp': '2025-09-30 22:20:17.963228', 'step': 5039, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:17.997513', 'step': 5039, 'epoch': 3} {'type': 'loss', 'content': 0.0237954743206501, 'timestamp': '2025-09-30 22:20:18.025970', 'step': 5040, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:18.067672', 'step': 5040, 'epoch': 3} {'type': 'loss', 'content': 0.0006434906972572207, 'timestamp': '2025-09-30 22:20:18.079665', 'step': 5041, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:18.121037', 'step': 5041, 'epoch': 3} {'type': 'loss', 'content': 0.000888917304109782, 'timestamp': '2025-09-30 22:20:18.132315', 'step': 5042, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:18.178891', 'step': 5042, 'epoch': 3} {'type': 'loss', 'content': 0.0016442594351246953, 'timestamp': '2025-09-30 22:20:18.190234', 'step': 5043, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:18.230054', 'step': 5043, 'epoch': 3} {'type': 'loss', 'content': 0.009254230186343193, 'timestamp': '2025-09-30 22:20:18.258638', 'step': 5044, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:18.291921', 'step': 5044, 'epoch': 3} {'type': 'loss', 'content': 0.00074589136056602, 'timestamp': '2025-09-30 22:20:18.296473', 'step': 5045, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:20:18.331178', 'step': 5045, 'epoch': 3} {'type': 'loss', 'content': 0.0008237607544288039, 'timestamp': '2025-09-30 22:20:18.336049', 'step': 5046, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:18.369174', 'step': 5046, 'epoch': 3} {'type': 'loss', 'content': 0.0040649957954883575, 'timestamp': '2025-09-30 22:20:18.385038', 'step': 5047, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:18.418435', 'step': 5047, 'epoch': 3} {'type': 'loss', 'content': 0.005607422906905413, 'timestamp': '2025-09-30 22:20:18.449790', 'step': 5048, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:18.491379', 'step': 5048, 'epoch': 3} {'type': 'loss', 'content': 0.0010886044474318624, 'timestamp': '2025-09-30 22:20:18.503756', 'step': 5049, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:18.537558', 'step': 5049, 'epoch': 3} {'type': 'loss', 'content': 0.002487512305378914, 'timestamp': '2025-09-30 22:20:18.550094', 'step': 5050, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:18.607654', 'step': 5050, 'epoch': 3} {'type': 'loss', 'content': 0.002762896940112114, 'timestamp': '2025-09-30 22:20:18.621394', 'step': 5051, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:18.663181', 'step': 5051, 'epoch': 3} {'type': 'loss', 'content': 0.001024669618345797, 'timestamp': '2025-09-30 22:20:18.697269', 'step': 5052, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:18.730431', 'step': 5052, 'epoch': 3} {'type': 'loss', 'content': 0.0033020072150975466, 'timestamp': '2025-09-30 22:20:18.742139', 'step': 5053, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:18.783491', 'step': 5053, 'epoch': 3} {'type': 'loss', 'content': 0.006183733697980642, 'timestamp': '2025-09-30 22:20:18.795904', 'step': 5054, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:18.828389', 'step': 5054, 'epoch': 3} {'type': 'loss', 'content': 0.0011442236136645079, 'timestamp': '2025-09-30 22:20:18.839367', 'step': 5055, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:18.873178', 'step': 5055, 'epoch': 3} {'type': 'loss', 'content': 0.003287734929472208, 'timestamp': '2025-09-30 22:20:18.907436', 'step': 5056, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:18.941851', 'step': 5056, 'epoch': 3} {'type': 'loss', 'content': 0.0019181531388312578, 'timestamp': '2025-09-30 22:20:18.954859', 'step': 5057, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:18.999825', 'step': 5057, 'epoch': 3} {'type': 'loss', 'content': 0.004124120809137821, 'timestamp': '2025-09-30 22:20:19.013681', 'step': 5058, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:19.050015', 'step': 5058, 'epoch': 3} {'type': 'loss', 'content': 0.0011872005416080356, 'timestamp': '2025-09-30 22:20:19.062504', 'step': 5059, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:19.104044', 'step': 5059, 'epoch': 3} {'type': 'loss', 'content': 0.003657316090539098, 'timestamp': '2025-09-30 22:20:19.141364', 'step': 5060, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:20:21.541585', 'step': 5060, 'epoch': 3} {'type': 'pplx', 'content': 5.869142437621591, 'timestamp': '2025-09-30 22:20:21.552575', 'step': 5060, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:21.584332', 'step': 5060, 'epoch': 3} {'type': 'loss', 'content': 0.002698945812880993, 'timestamp': '2025-09-30 22:20:21.592275', 'step': 5061, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:21.625848', 'step': 5061, 'epoch': 3} {'type': 'loss', 'content': 0.002139536663889885, 'timestamp': '2025-09-30 22:20:21.638457', 'step': 5062, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:21.678589', 'step': 5062, 'epoch': 3} {'type': 'loss', 'content': 0.0012386830057948828, 'timestamp': '2025-09-30 22:20:21.688933', 'step': 5063, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:21.722650', 'step': 5063, 'epoch': 3} {'type': 'loss', 'content': 0.0005841401871293783, 'timestamp': '2025-09-30 22:20:21.754480', 'step': 5064, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:21.797283', 'step': 5064, 'epoch': 3} {'type': 'loss', 'content': 0.0005301560158841312, 'timestamp': '2025-09-30 22:20:21.807431', 'step': 5065, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:21.840276', 'step': 5065, 'epoch': 3} {'type': 'loss', 'content': 0.005327840335667133, 'timestamp': '2025-09-30 22:20:21.851464', 'step': 5066, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:21.891879', 'step': 5066, 'epoch': 3} {'type': 'loss', 'content': 0.0005545703461393714, 'timestamp': '2025-09-30 22:20:21.899460', 'step': 5067, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:21.931033', 'step': 5067, 'epoch': 3} {'type': 'loss', 'content': 0.001500859041698277, 'timestamp': '2025-09-30 22:20:21.963736', 'step': 5068, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:22.004078', 'step': 5068, 'epoch': 3} {'type': 'loss', 'content': 0.0016331304796040058, 'timestamp': '2025-09-30 22:20:22.017143', 'step': 5069, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:22.057346', 'step': 5069, 'epoch': 3} {'type': 'loss', 'content': 0.0013913362054154277, 'timestamp': '2025-09-30 22:20:22.065169', 'step': 5070, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:22.107825', 'step': 5070, 'epoch': 3} {'type': 'loss', 'content': 0.011057810857892036, 'timestamp': '2025-09-30 22:20:22.121565', 'step': 5071, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:22.162276', 'step': 5071, 'epoch': 3} {'type': 'loss', 'content': 0.0009770106989890337, 'timestamp': '2025-09-30 22:20:22.196472', 'step': 5072, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:22.229855', 'step': 5072, 'epoch': 3} {'type': 'loss', 'content': 0.0020410018041729927, 'timestamp': '2025-09-30 22:20:22.240626', 'step': 5073, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:22.282091', 'step': 5073, 'epoch': 3} {'type': 'loss', 'content': 0.003938453271985054, 'timestamp': '2025-09-30 22:20:22.295469', 'step': 5074, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:22.342443', 'step': 5074, 'epoch': 3} {'type': 'loss', 'content': 0.00034149156999774277, 'timestamp': '2025-09-30 22:20:22.356095', 'step': 5075, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:22.397790', 'step': 5075, 'epoch': 3} {'type': 'loss', 'content': 0.0015579075552523136, 'timestamp': '2025-09-30 22:20:22.429927', 'step': 5076, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:22.471366', 'step': 5076, 'epoch': 3} {'type': 'loss', 'content': 0.0011037306394428015, 'timestamp': '2025-09-30 22:20:22.481746', 'step': 5077, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:22.529593', 'step': 5077, 'epoch': 3} {'type': 'loss', 'content': 0.002535564359277487, 'timestamp': '2025-09-30 22:20:22.538953', 'step': 5078, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:22.578446', 'step': 5078, 'epoch': 3} {'type': 'loss', 'content': 0.0018124495400115848, 'timestamp': '2025-09-30 22:20:22.595330', 'step': 5079, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:22.635621', 'step': 5079, 'epoch': 3} {'type': 'loss', 'content': 0.003327887039631605, 'timestamp': '2025-09-30 22:20:22.664036', 'step': 5080, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:22.704286', 'step': 5080, 'epoch': 3} {'type': 'loss', 'content': 0.0026340847834944725, 'timestamp': '2025-09-30 22:20:22.716848', 'step': 5081, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:22.750905', 'step': 5081, 'epoch': 3} {'type': 'loss', 'content': 0.0014675736892968416, 'timestamp': '2025-09-30 22:20:22.762085', 'step': 5082, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:22.800421', 'step': 5082, 'epoch': 3} {'type': 'loss', 'content': 0.0008998352568596601, 'timestamp': '2025-09-30 22:20:22.811610', 'step': 5083, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:22.855065', 'step': 5083, 'epoch': 3} {'type': 'loss', 'content': 0.0049322196282446384, 'timestamp': '2025-09-30 22:20:22.889650', 'step': 5084, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:22.933551', 'step': 5084, 'epoch': 3} {'type': 'loss', 'content': 0.00042208569357171655, 'timestamp': '2025-09-30 22:20:22.942174', 'step': 5085, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:22.975520', 'step': 5085, 'epoch': 3} {'type': 'loss', 'content': 0.002821327419951558, 'timestamp': '2025-09-30 22:20:22.986544', 'step': 5086, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:23.027136', 'step': 5086, 'epoch': 3} {'type': 'loss', 'content': 0.0022981923539191484, 'timestamp': '2025-09-30 22:20:23.037458', 'step': 5087, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:23.090413', 'step': 5087, 'epoch': 3} {'type': 'loss', 'content': 0.0007188955205492675, 'timestamp': '2025-09-30 22:20:23.122218', 'step': 5088, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:23.166275', 'step': 5088, 'epoch': 3} {'type': 'loss', 'content': 0.00043219528743065894, 'timestamp': '2025-09-30 22:20:23.174335', 'step': 5089, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:23.212596', 'step': 5089, 'epoch': 3} {'type': 'loss', 'content': 0.0014171154471114278, 'timestamp': '2025-09-30 22:20:23.222844', 'step': 5090, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:23.257514', 'step': 5090, 'epoch': 3} {'type': 'loss', 'content': 0.0023080131504684687, 'timestamp': '2025-09-30 22:20:23.270766', 'step': 5091, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:23.317405', 'step': 5091, 'epoch': 3} {'type': 'loss', 'content': 0.002961727324873209, 'timestamp': '2025-09-30 22:20:23.347894', 'step': 5092, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:23.381768', 'step': 5092, 'epoch': 3} {'type': 'loss', 'content': 0.0007711683865636587, 'timestamp': '2025-09-30 22:20:23.394422', 'step': 5093, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:23.431988', 'step': 5093, 'epoch': 3} {'type': 'loss', 'content': 0.002112046116963029, 'timestamp': '2025-09-30 22:20:23.441171', 'step': 5094, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:23.481488', 'step': 5094, 'epoch': 3} {'type': 'loss', 'content': 0.0012389529729261994, 'timestamp': '2025-09-30 22:20:23.492530', 'step': 5095, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:23.527565', 'step': 5095, 'epoch': 3} {'type': 'loss', 'content': 0.006873283069580793, 'timestamp': '2025-09-30 22:20:23.561001', 'step': 5096, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:23.601288', 'step': 5096, 'epoch': 3} {'type': 'loss', 'content': 0.0029062642715871334, 'timestamp': '2025-09-30 22:20:23.609892', 'step': 5097, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:23.641598', 'step': 5097, 'epoch': 3} {'type': 'loss', 'content': 0.004804032389074564, 'timestamp': '2025-09-30 22:20:23.653875', 'step': 5098, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:23.694926', 'step': 5098, 'epoch': 3} {'type': 'loss', 'content': 0.001358595909550786, 'timestamp': '2025-09-30 22:20:23.702702', 'step': 5099, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:23.742504', 'step': 5099, 'epoch': 3} {'type': 'loss', 'content': 0.0027899867855012417, 'timestamp': '2025-09-30 22:20:23.773952', 'step': 5100, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:23.821382', 'step': 5100, 'epoch': 3} {'type': 'loss', 'content': 0.0014267588267102838, 'timestamp': '2025-09-30 22:20:23.831676', 'step': 5101, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:23.872720', 'step': 5101, 'epoch': 3} {'type': 'loss', 'content': 0.0021460719872266054, 'timestamp': '2025-09-30 22:20:23.885084', 'step': 5102, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:23.922788', 'step': 5102, 'epoch': 3} {'type': 'loss', 'content': 0.0053979563526809216, 'timestamp': '2025-09-30 22:20:23.936097', 'step': 5103, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:23.997633', 'step': 5103, 'epoch': 3} {'type': 'loss', 'content': 0.0006758447270840406, 'timestamp': '2025-09-30 22:20:24.031545', 'step': 5104, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:24.078213', 'step': 5104, 'epoch': 3} {'type': 'loss', 'content': 0.0037395928520709276, 'timestamp': '2025-09-30 22:20:24.084524', 'step': 5105, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:24.121548', 'step': 5105, 'epoch': 3} {'type': 'loss', 'content': 0.00035706738708540797, 'timestamp': '2025-09-30 22:20:24.139222', 'step': 5106, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:24.191593', 'step': 5106, 'epoch': 3} {'type': 'loss', 'content': 0.002709871158003807, 'timestamp': '2025-09-30 22:20:24.198911', 'step': 5107, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:24.243361', 'step': 5107, 'epoch': 3} {'type': 'loss', 'content': 0.0020465017296373844, 'timestamp': '2025-09-30 22:20:24.278108', 'step': 5108, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:24.315975', 'step': 5108, 'epoch': 3} {'type': 'loss', 'content': 0.001779739512130618, 'timestamp': '2025-09-30 22:20:24.324723', 'step': 5109, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:24.356177', 'step': 5109, 'epoch': 3} {'type': 'loss', 'content': 0.0033832560293376446, 'timestamp': '2025-09-30 22:20:24.364232', 'step': 5110, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:24.395781', 'step': 5110, 'epoch': 3} {'type': 'loss', 'content': 0.002700228476896882, 'timestamp': '2025-09-30 22:20:24.406780', 'step': 5111, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:24.439400', 'step': 5111, 'epoch': 3} {'type': 'loss', 'content': 0.0015108698280528188, 'timestamp': '2025-09-30 22:20:24.467535', 'step': 5112, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:24.502587', 'step': 5112, 'epoch': 3} {'type': 'loss', 'content': 0.0027247534599155188, 'timestamp': '2025-09-30 22:20:24.507447', 'step': 5113, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:24.539150', 'step': 5113, 'epoch': 3} {'type': 'loss', 'content': 0.0008614298421889544, 'timestamp': '2025-09-30 22:20:24.546106', 'step': 5114, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:20:24.581046', 'step': 5114, 'epoch': 3} {'type': 'loss', 'content': 0.0016441134503111243, 'timestamp': '2025-09-30 22:20:24.589468', 'step': 5115, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:24.621617', 'step': 5115, 'epoch': 3} {'type': 'loss', 'content': 0.0015888211783021688, 'timestamp': '2025-09-30 22:20:24.652777', 'step': 5116, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:24.685887', 'step': 5116, 'epoch': 3} {'type': 'loss', 'content': 0.0004532829625532031, 'timestamp': '2025-09-30 22:20:24.696576', 'step': 5117, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:24.729373', 'step': 5117, 'epoch': 3} {'type': 'loss', 'content': 0.009183548390865326, 'timestamp': '2025-09-30 22:20:24.737259', 'step': 5118, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:20:24.784572', 'step': 5118, 'epoch': 3} {'type': 'loss', 'content': 0.0008579294662922621, 'timestamp': '2025-09-30 22:20:24.789141', 'step': 5119, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:24.830750', 'step': 5119, 'epoch': 3} {'type': 'loss', 'content': 0.003431705292314291, 'timestamp': '2025-09-30 22:20:24.859252', 'step': 5120, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:20:24.891755', 'step': 5120, 'epoch': 3} {'type': 'loss', 'content': 0.00036426790757104754, 'timestamp': '2025-09-30 22:20:24.898406', 'step': 5121, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:24.931043', 'step': 5121, 'epoch': 3} {'type': 'loss', 'content': 0.0005698530003428459, 'timestamp': '2025-09-30 22:20:24.939405', 'step': 5122, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:24.976596', 'step': 5122, 'epoch': 3} {'type': 'loss', 'content': 0.003810714930295944, 'timestamp': '2025-09-30 22:20:24.986954', 'step': 5123, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:25.026821', 'step': 5123, 'epoch': 3} {'type': 'loss', 'content': 0.0008139772689901292, 'timestamp': '2025-09-30 22:20:25.058186', 'step': 5124, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:25.095840', 'step': 5124, 'epoch': 3} {'type': 'loss', 'content': 0.0010362503817304969, 'timestamp': '2025-09-30 22:20:25.101423', 'step': 5125, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:25.134031', 'step': 5125, 'epoch': 3} {'type': 'loss', 'content': 0.0014636110281571746, 'timestamp': '2025-09-30 22:20:25.144591', 'step': 5126, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:25.180560', 'step': 5126, 'epoch': 3} {'type': 'loss', 'content': 0.00022271781926974654, 'timestamp': '2025-09-30 22:20:25.191807', 'step': 5127, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:25.224558', 'step': 5127, 'epoch': 3} {'type': 'loss', 'content': 8.554430678486824e-05, 'timestamp': '2025-09-30 22:20:25.253444', 'step': 5128, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:25.286751', 'step': 5128, 'epoch': 3} {'type': 'loss', 'content': 0.005259825848042965, 'timestamp': '2025-09-30 22:20:25.291861', 'step': 5129, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:25.329086', 'step': 5129, 'epoch': 3} {'type': 'loss', 'content': 0.0031255753710865974, 'timestamp': '2025-09-30 22:20:25.337088', 'step': 5130, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:25.376168', 'step': 5130, 'epoch': 3} {'type': 'loss', 'content': 0.006884410977363586, 'timestamp': '2025-09-30 22:20:25.384897', 'step': 5131, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:25.425789', 'step': 5131, 'epoch': 3} {'type': 'loss', 'content': 0.00804687850177288, 'timestamp': '2025-09-30 22:20:25.458904', 'step': 5132, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:25.510823', 'step': 5132, 'epoch': 3} {'type': 'loss', 'content': 0.0006524841883219779, 'timestamp': '2025-09-30 22:20:25.517170', 'step': 5133, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:25.560331', 'step': 5133, 'epoch': 3} {'type': 'loss', 'content': 0.0006048069335520267, 'timestamp': '2025-09-30 22:20:25.568176', 'step': 5134, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:25.607791', 'step': 5134, 'epoch': 3} {'type': 'loss', 'content': 8.057076775003225e-05, 'timestamp': '2025-09-30 22:20:25.621117', 'step': 5135, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:25.657377', 'step': 5135, 'epoch': 3} {'type': 'loss', 'content': 0.002809246303513646, 'timestamp': '2025-09-30 22:20:25.686950', 'step': 5136, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:25.721322', 'step': 5136, 'epoch': 3} {'type': 'loss', 'content': 0.0012514720438048244, 'timestamp': '2025-09-30 22:20:25.732114', 'step': 5137, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:25.775466', 'step': 5137, 'epoch': 3} {'type': 'loss', 'content': 0.00027411506744101644, 'timestamp': '2025-09-30 22:20:25.789311', 'step': 5138, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:25.837741', 'step': 5138, 'epoch': 3} {'type': 'loss', 'content': 0.004303140100091696, 'timestamp': '2025-09-30 22:20:25.851174', 'step': 5139, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:25.894558', 'step': 5139, 'epoch': 3} {'type': 'loss', 'content': 0.0010680407285690308, 'timestamp': '2025-09-30 22:20:25.922570', 'step': 5140, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:20:25.954086', 'step': 5140, 'epoch': 3} {'type': 'loss', 'content': 0.0021734959445893764, 'timestamp': '2025-09-30 22:20:25.957403', 'step': 5141, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:25.990150', 'step': 5141, 'epoch': 3} {'type': 'loss', 'content': 0.0010639060055837035, 'timestamp': '2025-09-30 22:20:25.997226', 'step': 5142, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:26.031424', 'step': 5142, 'epoch': 3} {'type': 'loss', 'content': 0.0004178355447947979, 'timestamp': '2025-09-30 22:20:26.039293', 'step': 5143, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:26.087030', 'step': 5143, 'epoch': 3} {'type': 'loss', 'content': 0.00021914293756708503, 'timestamp': '2025-09-30 22:20:26.118064', 'step': 5144, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:26.154458', 'step': 5144, 'epoch': 3} {'type': 'loss', 'content': 0.00019750260980799794, 'timestamp': '2025-09-30 22:20:26.162865', 'step': 5145, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:26.195393', 'step': 5145, 'epoch': 3} {'type': 'loss', 'content': 0.0038122800178825855, 'timestamp': '2025-09-30 22:20:26.205844', 'step': 5146, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-30 22:20:26.242585', 'step': 5146, 'epoch': 3} {'type': 'loss', 'content': 0.0025838292203843594, 'timestamp': '2025-09-30 22:20:26.250123', 'step': 5147, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:26.288272', 'step': 5147, 'epoch': 3} {'type': 'loss', 'content': 0.0002004976267926395, 'timestamp': '2025-09-30 22:20:26.317461', 'step': 5148, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:26.350936', 'step': 5148, 'epoch': 3} {'type': 'loss', 'content': 0.009289993904531002, 'timestamp': '2025-09-30 22:20:26.360059', 'step': 5149, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:26.417430', 'step': 5149, 'epoch': 3} {'type': 'loss', 'content': 0.0010501905344426632, 'timestamp': '2025-09-30 22:20:26.425310', 'step': 5150, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:26.460385', 'step': 5150, 'epoch': 3} {'type': 'loss', 'content': 5.500621045939624e-05, 'timestamp': '2025-09-30 22:20:26.471432', 'step': 5151, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:26.511083', 'step': 5151, 'epoch': 3} {'type': 'loss', 'content': 0.0026809093542397022, 'timestamp': '2025-09-30 22:20:26.540146', 'step': 5152, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:26.572819', 'step': 5152, 'epoch': 3} {'type': 'loss', 'content': 0.001439956366084516, 'timestamp': '2025-09-30 22:20:26.582770', 'step': 5153, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:26.621925', 'step': 5153, 'epoch': 3} {'type': 'loss', 'content': 0.00030368121224455535, 'timestamp': '2025-09-30 22:20:26.629435', 'step': 5154, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:26.670075', 'step': 5154, 'epoch': 3} {'type': 'loss', 'content': 0.0015567500377073884, 'timestamp': '2025-09-30 22:20:26.682712', 'step': 5155, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:26.723354', 'step': 5155, 'epoch': 3} {'type': 'loss', 'content': 0.0007795571000315249, 'timestamp': '2025-09-30 22:20:26.756592', 'step': 5156, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:26.799544', 'step': 5156, 'epoch': 3} {'type': 'loss', 'content': 0.0017613848904147744, 'timestamp': '2025-09-30 22:20:26.808727', 'step': 5157, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:26.848721', 'step': 5157, 'epoch': 3} {'type': 'loss', 'content': 0.010004185140132904, 'timestamp': '2025-09-30 22:20:26.859698', 'step': 5158, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:26.894196', 'step': 5158, 'epoch': 3} {'type': 'loss', 'content': 0.00019910503760911524, 'timestamp': '2025-09-30 22:20:26.902087', 'step': 5159, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:26.941508', 'step': 5159, 'epoch': 3} {'type': 'loss', 'content': 0.0004115682386327535, 'timestamp': '2025-09-30 22:20:26.973178', 'step': 5160, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:27.010989', 'step': 5160, 'epoch': 3} {'type': 'loss', 'content': 0.001299984403885901, 'timestamp': '2025-09-30 22:20:27.015898', 'step': 5161, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:27.057827', 'step': 5161, 'epoch': 3} {'type': 'loss', 'content': 0.0034290680196136236, 'timestamp': '2025-09-30 22:20:27.069564', 'step': 5162, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:27.103709', 'step': 5162, 'epoch': 3} {'type': 'loss', 'content': 0.0007851793197914958, 'timestamp': '2025-09-30 22:20:27.114069', 'step': 5163, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:27.148201', 'step': 5163, 'epoch': 3} {'type': 'loss', 'content': 0.00461316155269742, 'timestamp': '2025-09-30 22:20:27.181044', 'step': 5164, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:27.214486', 'step': 5164, 'epoch': 3} {'type': 'loss', 'content': 0.00031694734934717417, 'timestamp': '2025-09-30 22:20:27.222420', 'step': 5165, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:27.255503', 'step': 5165, 'epoch': 3} {'type': 'loss', 'content': 0.0018249392742291093, 'timestamp': '2025-09-30 22:20:27.263099', 'step': 5166, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:27.306568', 'step': 5166, 'epoch': 3} {'type': 'loss', 'content': 0.00027169540408067405, 'timestamp': '2025-09-30 22:20:27.314474', 'step': 5167, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:27.360754', 'step': 5167, 'epoch': 3} {'type': 'loss', 'content': 0.0006059342413209379, 'timestamp': '2025-09-30 22:20:27.391899', 'step': 5168, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:27.438306', 'step': 5168, 'epoch': 3} {'type': 'loss', 'content': 0.024362290278077126, 'timestamp': '2025-09-30 22:20:27.447068', 'step': 5169, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:27.484372', 'step': 5169, 'epoch': 3} {'type': 'loss', 'content': 0.000262056099018082, 'timestamp': '2025-09-30 22:20:27.495393', 'step': 5170, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:27.531964', 'step': 5170, 'epoch': 3} {'type': 'loss', 'content': 0.0019671518821269274, 'timestamp': '2025-09-30 22:20:27.543174', 'step': 5171, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:27.583221', 'step': 5171, 'epoch': 3} {'type': 'loss', 'content': 0.00355559797026217, 'timestamp': '2025-09-30 22:20:27.617755', 'step': 5172, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:27.657834', 'step': 5172, 'epoch': 3} {'type': 'loss', 'content': 0.0005548345507122576, 'timestamp': '2025-09-30 22:20:27.670882', 'step': 5173, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:27.713145', 'step': 5173, 'epoch': 3} {'type': 'loss', 'content': 0.0007509011193178594, 'timestamp': '2025-09-30 22:20:27.720494', 'step': 5174, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:27.773740', 'step': 5174, 'epoch': 3} {'type': 'loss', 'content': 0.0009994101710617542, 'timestamp': '2025-09-30 22:20:27.786281', 'step': 5175, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:20:30.397236', 'step': 5175, 'epoch': 3} {'type': 'pplx', 'content': 5.938044559665769, 'timestamp': '2025-09-30 22:20:30.406806', 'step': 5175, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:20:30.457948', 'step': 5175, 'epoch': 3} {'type': 'loss', 'content': 0.0033478126861155033, 'timestamp': '2025-09-30 22:20:30.496544', 'step': 5176, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:30.534459', 'step': 5176, 'epoch': 3} {'type': 'loss', 'content': 0.006446576677262783, 'timestamp': '2025-09-30 22:20:30.544136', 'step': 5177, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:30.579979', 'step': 5177, 'epoch': 3} {'type': 'loss', 'content': 0.00023951790353748947, 'timestamp': '2025-09-30 22:20:30.592285', 'step': 5178, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:30.625485', 'step': 5178, 'epoch': 3} {'type': 'loss', 'content': 0.00028509373078122735, 'timestamp': '2025-09-30 22:20:30.635754', 'step': 5179, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:30.672135', 'step': 5179, 'epoch': 3} {'type': 'loss', 'content': 0.001627097255550325, 'timestamp': '2025-09-30 22:20:30.703372', 'step': 5180, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:30.736927', 'step': 5180, 'epoch': 3} {'type': 'loss', 'content': 0.016529927030205727, 'timestamp': '2025-09-30 22:20:30.742622', 'step': 5181, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:30.778174', 'step': 5181, 'epoch': 3} {'type': 'loss', 'content': 0.00010932084842352197, 'timestamp': '2025-09-30 22:20:30.786223', 'step': 5182, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:30.824697', 'step': 5182, 'epoch': 3} {'type': 'loss', 'content': 0.008478675037622452, 'timestamp': '2025-09-30 22:20:30.835729', 'step': 5183, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:30.885503', 'step': 5183, 'epoch': 3} {'type': 'loss', 'content': 0.0004986929707229137, 'timestamp': '2025-09-30 22:20:30.916590', 'step': 5184, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:30.955441', 'step': 5184, 'epoch': 3} {'type': 'loss', 'content': 0.0013999422080814838, 'timestamp': '2025-09-30 22:20:30.968507', 'step': 5185, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:31.017174', 'step': 5185, 'epoch': 3} {'type': 'loss', 'content': 0.0019102203659713268, 'timestamp': '2025-09-30 22:20:31.024784', 'step': 5186, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:31.058202', 'step': 5186, 'epoch': 3} {'type': 'loss', 'content': 0.002953177783638239, 'timestamp': '2025-09-30 22:20:31.065633', 'step': 5187, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:31.101080', 'step': 5187, 'epoch': 3} {'type': 'loss', 'content': 0.012278086505830288, 'timestamp': '2025-09-30 22:20:31.132862', 'step': 5188, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:31.166258', 'step': 5188, 'epoch': 3} {'type': 'loss', 'content': 0.002866664668545127, 'timestamp': '2025-09-30 22:20:31.174860', 'step': 5189, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:31.208799', 'step': 5189, 'epoch': 3} {'type': 'loss', 'content': 0.005537381861358881, 'timestamp': '2025-09-30 22:20:31.221348', 'step': 5190, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:31.265227', 'step': 5190, 'epoch': 3} {'type': 'loss', 'content': 0.0009773658821359277, 'timestamp': '2025-09-30 22:20:31.278516', 'step': 5191, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:31.318770', 'step': 5191, 'epoch': 3} {'type': 'loss', 'content': 0.00014409396681003273, 'timestamp': '2025-09-30 22:20:31.349869', 'step': 5192, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:31.382776', 'step': 5192, 'epoch': 3} {'type': 'loss', 'content': 0.005357799585908651, 'timestamp': '2025-09-30 22:20:31.391287', 'step': 5193, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:31.437125', 'step': 5193, 'epoch': 3} {'type': 'loss', 'content': 0.0030632750131189823, 'timestamp': '2025-09-30 22:20:31.448159', 'step': 5194, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:31.484449', 'step': 5194, 'epoch': 3} {'type': 'loss', 'content': 0.0006114967400208116, 'timestamp': '2025-09-30 22:20:31.491706', 'step': 5195, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:31.540795', 'step': 5195, 'epoch': 3} {'type': 'loss', 'content': 0.0019180022645741701, 'timestamp': '2025-09-30 22:20:31.569389', 'step': 5196, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:31.603603', 'step': 5196, 'epoch': 3} {'type': 'loss', 'content': 0.00090098223881796, 'timestamp': '2025-09-30 22:20:31.616585', 'step': 5197, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:31.659672', 'step': 5197, 'epoch': 3} {'type': 'loss', 'content': 0.0028920513577759266, 'timestamp': '2025-09-30 22:20:31.670691', 'step': 5198, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:31.707723', 'step': 5198, 'epoch': 3} {'type': 'loss', 'content': 0.0017432866152375937, 'timestamp': '2025-09-30 22:20:31.718872', 'step': 5199, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:31.750195', 'step': 5199, 'epoch': 3} {'type': 'loss', 'content': 0.0003439671127125621, 'timestamp': '2025-09-30 22:20:31.781536', 'step': 5200, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:31.815571', 'step': 5200, 'epoch': 3} {'type': 'loss', 'content': 0.005172444973140955, 'timestamp': '2025-09-30 22:20:31.825574', 'step': 5201, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:31.859713', 'step': 5201, 'epoch': 3} {'type': 'loss', 'content': 0.0005199200822971761, 'timestamp': '2025-09-30 22:20:31.872023', 'step': 5202, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:31.922284', 'step': 5202, 'epoch': 3} {'type': 'loss', 'content': 0.0014771092683076859, 'timestamp': '2025-09-30 22:20:31.930093', 'step': 5203, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:31.963629', 'step': 5203, 'epoch': 3} {'type': 'loss', 'content': 0.003369405400007963, 'timestamp': '2025-09-30 22:20:31.995512', 'step': 5204, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:32.028677', 'step': 5204, 'epoch': 3} {'type': 'loss', 'content': 0.0018177288584411144, 'timestamp': '2025-09-30 22:20:32.036563', 'step': 5205, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:32.072676', 'step': 5205, 'epoch': 3} {'type': 'loss', 'content': 0.0003869224456138909, 'timestamp': '2025-09-30 22:20:32.080301', 'step': 5206, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:32.113509', 'step': 5206, 'epoch': 3} {'type': 'loss', 'content': 0.0002616798155941069, 'timestamp': '2025-09-30 22:20:32.120523', 'step': 5207, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:32.155512', 'step': 5207, 'epoch': 3} {'type': 'loss', 'content': 0.003533238312229514, 'timestamp': '2025-09-30 22:20:32.184369', 'step': 5208, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:32.223681', 'step': 5208, 'epoch': 3} {'type': 'loss', 'content': 0.002410394372418523, 'timestamp': '2025-09-30 22:20:32.228986', 'step': 5209, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:32.260956', 'step': 5209, 'epoch': 3} {'type': 'loss', 'content': 0.0016966222319751978, 'timestamp': '2025-09-30 22:20:32.273316', 'step': 5210, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:32.309486', 'step': 5210, 'epoch': 3} {'type': 'loss', 'content': 0.0007273682276718318, 'timestamp': '2025-09-30 22:20:32.322850', 'step': 5211, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:32.364176', 'step': 5211, 'epoch': 3} {'type': 'loss', 'content': 0.000545423710718751, 'timestamp': '2025-09-30 22:20:32.393092', 'step': 5212, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:32.429172', 'step': 5212, 'epoch': 3} {'type': 'loss', 'content': 0.0007850287365727127, 'timestamp': '2025-09-30 22:20:32.434818', 'step': 5213, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:32.475036', 'step': 5213, 'epoch': 3} {'type': 'loss', 'content': 0.0016018090536817908, 'timestamp': '2025-09-30 22:20:32.478737', 'step': 5214, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:32.512436', 'step': 5214, 'epoch': 3} {'type': 'loss', 'content': 0.00016927003161981702, 'timestamp': '2025-09-30 22:20:32.523527', 'step': 5215, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:32.558872', 'step': 5215, 'epoch': 3} {'type': 'loss', 'content': 0.005434432066977024, 'timestamp': '2025-09-30 22:20:32.593075', 'step': 5216, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:32.629591', 'step': 5216, 'epoch': 3} {'type': 'loss', 'content': 0.0004774282278958708, 'timestamp': '2025-09-30 22:20:32.637430', 'step': 5217, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:32.673076', 'step': 5217, 'epoch': 3} {'type': 'loss', 'content': 0.0003328354796394706, 'timestamp': '2025-09-30 22:20:32.684226', 'step': 5218, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:32.726073', 'step': 5218, 'epoch': 3} {'type': 'loss', 'content': 0.000514206534717232, 'timestamp': '2025-09-30 22:20:32.739686', 'step': 5219, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:32.776011', 'step': 5219, 'epoch': 3} {'type': 'loss', 'content': 0.00044423979124985635, 'timestamp': '2025-09-30 22:20:32.809413', 'step': 5220, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:32.846641', 'step': 5220, 'epoch': 3} {'type': 'loss', 'content': 0.0009071379899978638, 'timestamp': '2025-09-30 22:20:32.855464', 'step': 5221, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:32.889382', 'step': 5221, 'epoch': 3} {'type': 'loss', 'content': 0.00041850502020679414, 'timestamp': '2025-09-30 22:20:32.901927', 'step': 5222, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:32.935762', 'step': 5222, 'epoch': 3} {'type': 'loss', 'content': 0.002450287574902177, 'timestamp': '2025-09-30 22:20:32.946753', 'step': 5223, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:32.985650', 'step': 5223, 'epoch': 3} {'type': 'loss', 'content': 0.0014607037883251905, 'timestamp': '2025-09-30 22:20:33.019133', 'step': 5224, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:33.057776', 'step': 5224, 'epoch': 3} {'type': 'loss', 'content': 0.0015632001450285316, 'timestamp': '2025-09-30 22:20:33.070439', 'step': 5225, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:33.110415', 'step': 5225, 'epoch': 3} {'type': 'loss', 'content': 0.0019418747397139668, 'timestamp': '2025-09-30 22:20:33.122403', 'step': 5226, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:33.159264', 'step': 5226, 'epoch': 3} {'type': 'loss', 'content': 0.00225188210606575, 'timestamp': '2025-09-30 22:20:33.171790', 'step': 5227, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:33.205574', 'step': 5227, 'epoch': 3} {'type': 'loss', 'content': 0.0004304961476009339, 'timestamp': '2025-09-30 22:20:33.238799', 'step': 5228, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:33.275459', 'step': 5228, 'epoch': 3} {'type': 'loss', 'content': 0.007783898152410984, 'timestamp': '2025-09-30 22:20:33.283634', 'step': 5229, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:33.324598', 'step': 5229, 'epoch': 3} {'type': 'loss', 'content': 0.000680253840982914, 'timestamp': '2025-09-30 22:20:33.338330', 'step': 5230, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:33.374389', 'step': 5230, 'epoch': 3} {'type': 'loss', 'content': 0.008012272417545319, 'timestamp': '2025-09-30 22:20:33.386779', 'step': 5231, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:33.422279', 'step': 5231, 'epoch': 3} {'type': 'loss', 'content': 0.001280724536627531, 'timestamp': '2025-09-30 22:20:33.455439', 'step': 5232, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:33.497268', 'step': 5232, 'epoch': 3} {'type': 'loss', 'content': 0.0010424808133393526, 'timestamp': '2025-09-30 22:20:33.507103', 'step': 5233, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:33.547504', 'step': 5233, 'epoch': 3} {'type': 'loss', 'content': 0.00045934141962789, 'timestamp': '2025-09-30 22:20:33.560896', 'step': 5234, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:33.597460', 'step': 5234, 'epoch': 3} {'type': 'loss', 'content': 0.008249049074947834, 'timestamp': '2025-09-30 22:20:33.609585', 'step': 5235, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:33.643445', 'step': 5235, 'epoch': 3} {'type': 'loss', 'content': 0.0005358067573979497, 'timestamp': '2025-09-30 22:20:33.676420', 'step': 5236, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:33.712599', 'step': 5236, 'epoch': 3} {'type': 'loss', 'content': 0.0032138959504663944, 'timestamp': '2025-09-30 22:20:33.725288', 'step': 5237, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:33.761490', 'step': 5237, 'epoch': 3} {'type': 'loss', 'content': 0.00473925331607461, 'timestamp': '2025-09-30 22:20:33.774046', 'step': 5238, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:33.816568', 'step': 5238, 'epoch': 3} {'type': 'loss', 'content': 0.0025055333971977234, 'timestamp': '2025-09-30 22:20:33.827764', 'step': 5239, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:33.864876', 'step': 5239, 'epoch': 3} {'type': 'loss', 'content': 0.00852253008633852, 'timestamp': '2025-09-30 22:20:33.899156', 'step': 5240, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:33.932394', 'step': 5240, 'epoch': 3} {'type': 'loss', 'content': 0.014438341371715069, 'timestamp': '2025-09-30 22:20:33.943061', 'step': 5241, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:33.998064', 'step': 5241, 'epoch': 3} {'type': 'loss', 'content': 0.0006247243145480752, 'timestamp': '2025-09-30 22:20:34.009118', 'step': 5242, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:34.045474', 'step': 5242, 'epoch': 3} {'type': 'loss', 'content': 0.0036119078285992146, 'timestamp': '2025-09-30 22:20:34.058814', 'step': 5243, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:34.101596', 'step': 5243, 'epoch': 3} {'type': 'loss', 'content': 0.005055837798863649, 'timestamp': '2025-09-30 22:20:34.130365', 'step': 5244, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:34.165120', 'step': 5244, 'epoch': 3} {'type': 'loss', 'content': 0.0008276871521957219, 'timestamp': '2025-09-30 22:20:34.173174', 'step': 5245, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:34.218116', 'step': 5245, 'epoch': 3} {'type': 'loss', 'content': 0.001044438686221838, 'timestamp': '2025-09-30 22:20:34.229154', 'step': 5246, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:34.264786', 'step': 5246, 'epoch': 3} {'type': 'loss', 'content': 0.0003220552462153137, 'timestamp': '2025-09-30 22:20:34.278504', 'step': 5247, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:34.330796', 'step': 5247, 'epoch': 3} {'type': 'loss', 'content': 0.0025070966221392155, 'timestamp': '2025-09-30 22:20:34.359689', 'step': 5248, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:34.412179', 'step': 5248, 'epoch': 3} {'type': 'loss', 'content': 0.0007652582135051489, 'timestamp': '2025-09-30 22:20:34.420284', 'step': 5249, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:20:34.466731', 'step': 5249, 'epoch': 3} {'type': 'loss', 'content': 0.001774575561285019, 'timestamp': '2025-09-30 22:20:34.480719', 'step': 5250, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:34.514465', 'step': 5250, 'epoch': 3} {'type': 'loss', 'content': 0.02314218133687973, 'timestamp': '2025-09-30 22:20:34.525574', 'step': 5251, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:34.571285', 'step': 5251, 'epoch': 3} {'type': 'loss', 'content': 0.007349980063736439, 'timestamp': '2025-09-30 22:20:34.602497', 'step': 5252, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:34.646748', 'step': 5252, 'epoch': 3} {'type': 'loss', 'content': 0.0015652753645554185, 'timestamp': '2025-09-30 22:20:34.655354', 'step': 5253, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:34.695670', 'step': 5253, 'epoch': 3} {'type': 'loss', 'content': 0.0008087062160484493, 'timestamp': '2025-09-30 22:20:34.708004', 'step': 5254, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:34.746280', 'step': 5254, 'epoch': 3} {'type': 'loss', 'content': 0.006365319713950157, 'timestamp': '2025-09-30 22:20:34.759676', 'step': 5255, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:34.796812', 'step': 5255, 'epoch': 3} {'type': 'loss', 'content': 0.0005592239904217422, 'timestamp': '2025-09-30 22:20:34.828055', 'step': 5256, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:34.860427', 'step': 5256, 'epoch': 3} {'type': 'loss', 'content': 0.0007176320650614798, 'timestamp': '2025-09-30 22:20:34.868293', 'step': 5257, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:34.906279', 'step': 5257, 'epoch': 3} {'type': 'loss', 'content': 0.0006524588097818196, 'timestamp': '2025-09-30 22:20:34.918829', 'step': 5258, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:34.956194', 'step': 5258, 'epoch': 3} {'type': 'loss', 'content': 0.0019349107751622796, 'timestamp': '2025-09-30 22:20:34.967311', 'step': 5259, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:35.004851', 'step': 5259, 'epoch': 3} {'type': 'loss', 'content': 0.004506261087954044, 'timestamp': '2025-09-30 22:20:35.038294', 'step': 5260, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:35.075235', 'step': 5260, 'epoch': 3} {'type': 'loss', 'content': 0.0013760910369455814, 'timestamp': '2025-09-30 22:20:35.085792', 'step': 5261, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:35.124341', 'step': 5261, 'epoch': 3} {'type': 'loss', 'content': 0.002927417866885662, 'timestamp': '2025-09-30 22:20:35.136755', 'step': 5262, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:35.177175', 'step': 5262, 'epoch': 3} {'type': 'loss', 'content': 0.0003989685792475939, 'timestamp': '2025-09-30 22:20:35.189530', 'step': 5263, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:35.237739', 'step': 5263, 'epoch': 3} {'type': 'loss', 'content': 0.000184141201316379, 'timestamp': '2025-09-30 22:20:35.272385', 'step': 5264, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:35.312270', 'step': 5264, 'epoch': 3} {'type': 'loss', 'content': 0.0003238429198972881, 'timestamp': '2025-09-30 22:20:35.324865', 'step': 5265, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:20:35.371176', 'step': 5265, 'epoch': 3} {'type': 'loss', 'content': 9.319038508692756e-05, 'timestamp': '2025-09-30 22:20:35.387082', 'step': 5266, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:20:35.430175', 'step': 5266, 'epoch': 3} {'type': 'loss', 'content': 0.0006466032355092466, 'timestamp': '2025-09-30 22:20:35.445799', 'step': 5267, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:35.502586', 'step': 5267, 'epoch': 3} {'type': 'loss', 'content': 0.004853468853980303, 'timestamp': '2025-09-30 22:20:35.537100', 'step': 5268, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:35.569918', 'step': 5268, 'epoch': 3} {'type': 'loss', 'content': 0.010006862692534924, 'timestamp': '2025-09-30 22:20:35.578581', 'step': 5269, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:35.616650', 'step': 5269, 'epoch': 3} {'type': 'loss', 'content': 0.0016347052296623588, 'timestamp': '2025-09-30 22:20:35.627806', 'step': 5270, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:35.661170', 'step': 5270, 'epoch': 3} {'type': 'loss', 'content': 0.000581001047976315, 'timestamp': '2025-09-30 22:20:35.671529', 'step': 5271, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:35.706630', 'step': 5271, 'epoch': 3} {'type': 'loss', 'content': 0.0010013194987550378, 'timestamp': '2025-09-30 22:20:35.740109', 'step': 5272, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:35.773585', 'step': 5272, 'epoch': 3} {'type': 'loss', 'content': 0.007017929572612047, 'timestamp': '2025-09-30 22:20:35.781484', 'step': 5273, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:35.826335', 'step': 5273, 'epoch': 3} {'type': 'loss', 'content': 0.00020266005594749004, 'timestamp': '2025-09-30 22:20:35.862472', 'step': 5274, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:35.917093', 'step': 5274, 'epoch': 3} {'type': 'loss', 'content': 0.00014247871877159923, 'timestamp': '2025-09-30 22:20:35.927471', 'step': 5275, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:35.964544', 'step': 5275, 'epoch': 3} {'type': 'loss', 'content': 0.0002896743535529822, 'timestamp': '2025-09-30 22:20:35.997994', 'step': 5276, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:36.033731', 'step': 5276, 'epoch': 3} {'type': 'loss', 'content': 0.0008965826709754765, 'timestamp': '2025-09-30 22:20:36.046395', 'step': 5277, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:36.084641', 'step': 5277, 'epoch': 3} {'type': 'loss', 'content': 0.0016090476419776678, 'timestamp': '2025-09-30 22:20:36.095641', 'step': 5278, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:36.146740', 'step': 5278, 'epoch': 3} {'type': 'loss', 'content': 0.0015315264463424683, 'timestamp': '2025-09-30 22:20:36.154395', 'step': 5279, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:36.191425', 'step': 5279, 'epoch': 3} {'type': 'loss', 'content': 0.0013392592081800103, 'timestamp': '2025-09-30 22:20:36.222347', 'step': 5280, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:36.266446', 'step': 5280, 'epoch': 3} {'type': 'loss', 'content': 0.00462631369009614, 'timestamp': '2025-09-30 22:20:36.275215', 'step': 5281, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:36.311795', 'step': 5281, 'epoch': 3} {'type': 'loss', 'content': 0.00020836050680372864, 'timestamp': '2025-09-30 22:20:36.324124', 'step': 5282, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:36.365008', 'step': 5282, 'epoch': 3} {'type': 'loss', 'content': 0.0002244135393993929, 'timestamp': '2025-09-30 22:20:36.378865', 'step': 5283, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:36.422522', 'step': 5283, 'epoch': 3} {'type': 'loss', 'content': 0.0002135797985829413, 'timestamp': '2025-09-30 22:20:36.451273', 'step': 5284, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:36.488841', 'step': 5284, 'epoch': 3} {'type': 'loss', 'content': 0.0008744591032154858, 'timestamp': '2025-09-30 22:20:36.498808', 'step': 5285, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:36.542770', 'step': 5285, 'epoch': 3} {'type': 'loss', 'content': 0.008620786480605602, 'timestamp': '2025-09-30 22:20:36.556629', 'step': 5286, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:36.594477', 'step': 5286, 'epoch': 3} {'type': 'loss', 'content': 0.00015636156604159623, 'timestamp': '2025-09-30 22:20:36.604903', 'step': 5287, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:36.646000', 'step': 5287, 'epoch': 3} {'type': 'loss', 'content': 0.0020843285601586103, 'timestamp': '2025-09-30 22:20:36.679184', 'step': 5288, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:20:36.730250', 'step': 5288, 'epoch': 3} {'type': 'loss', 'content': 0.004855290055274963, 'timestamp': '2025-09-30 22:20:36.748651', 'step': 5289, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:36.797612', 'step': 5289, 'epoch': 3} {'type': 'loss', 'content': 0.0006399782723747194, 'timestamp': '2025-09-30 22:20:36.813054', 'step': 5290, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:20:39.518380', 'step': 5290, 'epoch': 3} {'type': 'pplx', 'content': 6.157207483776002, 'timestamp': '2025-09-30 22:20:39.520921', 'step': 5290, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:39.556360', 'step': 5290, 'epoch': 3} {'type': 'loss', 'content': 0.0005087924073450267, 'timestamp': '2025-09-30 22:20:39.569702', 'step': 5291, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:39.610286', 'step': 5291, 'epoch': 3} {'type': 'loss', 'content': 0.00027282178052701056, 'timestamp': '2025-09-30 22:20:39.644467', 'step': 5292, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:39.700668', 'step': 5292, 'epoch': 3} {'type': 'loss', 'content': 0.0003268631116952747, 'timestamp': '2025-09-30 22:20:39.710885', 'step': 5293, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:39.761528', 'step': 5293, 'epoch': 3} {'type': 'loss', 'content': 0.010251539759337902, 'timestamp': '2025-09-30 22:20:39.775270', 'step': 5294, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:39.813329', 'step': 5294, 'epoch': 3} {'type': 'loss', 'content': 0.0006060664891265333, 'timestamp': '2025-09-30 22:20:39.826665', 'step': 5295, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:39.871660', 'step': 5295, 'epoch': 3} {'type': 'loss', 'content': 0.0006611746503040195, 'timestamp': '2025-09-30 22:20:39.904833', 'step': 5296, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:39.939104', 'step': 5296, 'epoch': 3} {'type': 'loss', 'content': 0.0018542595207691193, 'timestamp': '2025-09-30 22:20:39.944450', 'step': 5297, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:39.982181', 'step': 5297, 'epoch': 3} {'type': 'loss', 'content': 0.002848684089258313, 'timestamp': '2025-09-30 22:20:39.993175', 'step': 5298, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:40.027805', 'step': 5298, 'epoch': 3} {'type': 'loss', 'content': 0.0006789477774873376, 'timestamp': '2025-09-30 22:20:40.040408', 'step': 5299, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:20:40.093326', 'step': 5299, 'epoch': 3} {'type': 'loss', 'content': 0.0015816806117072701, 'timestamp': '2025-09-30 22:20:40.130374', 'step': 5300, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:40.175484', 'step': 5300, 'epoch': 3} {'type': 'loss', 'content': 0.0007194660720415413, 'timestamp': '2025-09-30 22:20:40.188133', 'step': 5301, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:20:40.245109', 'step': 5301, 'epoch': 3} {'type': 'loss', 'content': 0.0011649384396150708, 'timestamp': '2025-09-30 22:20:40.260743', 'step': 5302, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:40.328261', 'step': 5302, 'epoch': 3} {'type': 'loss', 'content': 0.0008334691519849002, 'timestamp': '2025-09-30 22:20:40.340585', 'step': 5303, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:40.376233', 'step': 5303, 'epoch': 3} {'type': 'loss', 'content': 0.0025359534192830324, 'timestamp': '2025-09-30 22:20:40.409660', 'step': 5304, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:40.444824', 'step': 5304, 'epoch': 3} {'type': 'loss', 'content': 0.002035485114902258, 'timestamp': '2025-09-30 22:20:40.452800', 'step': 5305, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:40.498002', 'step': 5305, 'epoch': 3} {'type': 'loss', 'content': 0.003279902972280979, 'timestamp': '2025-09-30 22:20:40.510343', 'step': 5306, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:40.560290', 'step': 5306, 'epoch': 3} {'type': 'loss', 'content': 0.0007503706146962941, 'timestamp': '2025-09-30 22:20:40.573956', 'step': 5307, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:20:40.618255', 'step': 5307, 'epoch': 3} {'type': 'loss', 'content': 0.0024530754890292883, 'timestamp': '2025-09-30 22:20:40.643483', 'step': 5308, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:20:40.688463', 'step': 5308, 'epoch': 3} {'type': 'loss', 'content': 0.0009420658461749554, 'timestamp': '2025-09-30 22:20:40.703510', 'step': 5309, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-30 22:20:40.741677', 'step': 5309, 'epoch': 3} {'type': 'loss', 'content': 6.078934529796243e-05, 'timestamp': '2025-09-30 22:20:40.746119', 'step': 5310, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:20:40.799835', 'step': 5310, 'epoch': 3} {'type': 'loss', 'content': 8.060686377575621e-05, 'timestamp': '2025-09-30 22:20:40.816066', 'step': 5311, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:40.864495', 'step': 5311, 'epoch': 3} {'type': 'loss', 'content': 0.0018246863037347794, 'timestamp': '2025-09-30 22:20:40.892405', 'step': 5312, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:40.952332', 'step': 5312, 'epoch': 3} {'type': 'loss', 'content': 0.0007188029121607542, 'timestamp': '2025-09-30 22:20:40.960695', 'step': 5313, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:40.995751', 'step': 5313, 'epoch': 3} {'type': 'loss', 'content': 0.0002277821331517771, 'timestamp': '2025-09-30 22:20:41.006797', 'step': 5314, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:41.052679', 'step': 5314, 'epoch': 3} {'type': 'loss', 'content': 0.0015718286158517003, 'timestamp': '2025-09-30 22:20:41.063136', 'step': 5315, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:41.097681', 'step': 5315, 'epoch': 3} {'type': 'loss', 'content': 0.002976819407194853, 'timestamp': '2025-09-30 22:20:41.130855', 'step': 5316, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:41.165424', 'step': 5316, 'epoch': 3} {'type': 'loss', 'content': 0.0011995661770924926, 'timestamp': '2025-09-30 22:20:41.170286', 'step': 5317, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:41.213789', 'step': 5317, 'epoch': 3} {'type': 'loss', 'content': 0.00014211825327947736, 'timestamp': '2025-09-30 22:20:41.225606', 'step': 5318, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:41.268612', 'step': 5318, 'epoch': 3} {'type': 'loss', 'content': 0.000843456422444433, 'timestamp': '2025-09-30 22:20:41.279518', 'step': 5319, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:41.326659', 'step': 5319, 'epoch': 3} {'type': 'loss', 'content': 0.0014417750062420964, 'timestamp': '2025-09-30 22:20:41.355443', 'step': 5320, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:20:41.390689', 'step': 5320, 'epoch': 3} {'type': 'loss', 'content': 0.0017638927092775702, 'timestamp': '2025-09-30 22:20:41.394355', 'step': 5321, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:41.429245', 'step': 5321, 'epoch': 3} {'type': 'loss', 'content': 0.00016600944218225777, 'timestamp': '2025-09-30 22:20:41.441381', 'step': 5322, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:20:41.496131', 'step': 5322, 'epoch': 3} {'type': 'loss', 'content': 0.000154578490764834, 'timestamp': '2025-09-30 22:20:41.511959', 'step': 5323, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:41.546816', 'step': 5323, 'epoch': 3} {'type': 'loss', 'content': 0.00029848323902115226, 'timestamp': '2025-09-30 22:20:41.573935', 'step': 5324, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:41.625919', 'step': 5324, 'epoch': 3} {'type': 'loss', 'content': 0.0010769865475594997, 'timestamp': '2025-09-30 22:20:41.635992', 'step': 5325, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:41.684496', 'step': 5325, 'epoch': 3} {'type': 'loss', 'content': 0.009188102558255196, 'timestamp': '2025-09-30 22:20:41.696127', 'step': 5326, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:41.738724', 'step': 5326, 'epoch': 3} {'type': 'loss', 'content': 0.0021131394896656275, 'timestamp': '2025-09-30 22:20:41.749109', 'step': 5327, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:41.791431', 'step': 5327, 'epoch': 3} {'type': 'loss', 'content': 0.000720141630154103, 'timestamp': '2025-09-30 22:20:41.826112', 'step': 5328, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:41.867175', 'step': 5328, 'epoch': 3} {'type': 'loss', 'content': 0.0013144101249054074, 'timestamp': '2025-09-30 22:20:41.875934', 'step': 5329, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:41.919620', 'step': 5329, 'epoch': 3} {'type': 'loss', 'content': 0.00027391567709855735, 'timestamp': '2025-09-30 22:20:41.931995', 'step': 5330, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:41.974162', 'step': 5330, 'epoch': 3} {'type': 'loss', 'content': 0.0004535318003036082, 'timestamp': '2025-09-30 22:20:41.986678', 'step': 5331, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:42.021345', 'step': 5331, 'epoch': 3} {'type': 'loss', 'content': 0.00015783542767167091, 'timestamp': '2025-09-30 22:20:42.054488', 'step': 5332, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:42.089363', 'step': 5332, 'epoch': 3} {'type': 'loss', 'content': 0.0016249046893790364, 'timestamp': '2025-09-30 22:20:42.102400', 'step': 5333, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:42.148800', 'step': 5333, 'epoch': 3} {'type': 'loss', 'content': 0.0012738303048536181, 'timestamp': '2025-09-30 22:20:42.162267', 'step': 5334, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:42.207882', 'step': 5334, 'epoch': 3} {'type': 'loss', 'content': 0.0007393105188384652, 'timestamp': '2025-09-30 22:20:42.221302', 'step': 5335, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:42.265253', 'step': 5335, 'epoch': 3} {'type': 'loss', 'content': 0.008159262128174305, 'timestamp': '2025-09-30 22:20:42.298161', 'step': 5336, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:42.346737', 'step': 5336, 'epoch': 3} {'type': 'loss', 'content': 0.004605071619153023, 'timestamp': '2025-09-30 22:20:42.356313', 'step': 5337, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:42.403054', 'step': 5337, 'epoch': 3} {'type': 'loss', 'content': 0.00010321156878490001, 'timestamp': '2025-09-30 22:20:42.413202', 'step': 5338, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:42.460354', 'step': 5338, 'epoch': 3} {'type': 'loss', 'content': 0.0001988293806789443, 'timestamp': '2025-09-30 22:20:42.466492', 'step': 5339, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:42.510940', 'step': 5339, 'epoch': 3} {'type': 'loss', 'content': 0.0008656299905851483, 'timestamp': '2025-09-30 22:20:42.542279', 'step': 5340, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:42.580546', 'step': 5340, 'epoch': 3} {'type': 'loss', 'content': 0.0016622475814074278, 'timestamp': '2025-09-30 22:20:42.593619', 'step': 5341, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:42.630049', 'step': 5341, 'epoch': 3} {'type': 'loss', 'content': 0.0063072070479393005, 'timestamp': '2025-09-30 22:20:42.637372', 'step': 5342, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:42.676457', 'step': 5342, 'epoch': 3} {'type': 'loss', 'content': 0.0005689572426490486, 'timestamp': '2025-09-30 22:20:42.690183', 'step': 5343, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:42.733094', 'step': 5343, 'epoch': 3} {'type': 'loss', 'content': 0.00042845390271395445, 'timestamp': '2025-09-30 22:20:42.761998', 'step': 5344, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:42.806381', 'step': 5344, 'epoch': 3} {'type': 'loss', 'content': 0.0014855386689305305, 'timestamp': '2025-09-30 22:20:42.814561', 'step': 5345, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:42.851778', 'step': 5345, 'epoch': 3} {'type': 'loss', 'content': 0.0006970709073357284, 'timestamp': '2025-09-30 22:20:42.864320', 'step': 5346, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:42.897836', 'step': 5346, 'epoch': 3} {'type': 'loss', 'content': 0.00020175872487016022, 'timestamp': '2025-09-30 22:20:42.905442', 'step': 5347, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:42.950083', 'step': 5347, 'epoch': 3} {'type': 'loss', 'content': 0.0008992621442303061, 'timestamp': '2025-09-30 22:20:42.984291', 'step': 5348, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:43.030136', 'step': 5348, 'epoch': 3} {'type': 'loss', 'content': 0.025950860232114792, 'timestamp': '2025-09-30 22:20:43.038943', 'step': 5349, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:43.083207', 'step': 5349, 'epoch': 3} {'type': 'loss', 'content': 0.0006523510091938078, 'timestamp': '2025-09-30 22:20:43.091248', 'step': 5350, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:43.125114', 'step': 5350, 'epoch': 3} {'type': 'loss', 'content': 0.0005884814891032875, 'timestamp': '2025-09-30 22:20:43.135670', 'step': 5351, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:43.169338', 'step': 5351, 'epoch': 3} {'type': 'loss', 'content': 0.023698223754763603, 'timestamp': '2025-09-30 22:20:43.198211', 'step': 5352, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:43.238318', 'step': 5352, 'epoch': 3} {'type': 'loss', 'content': 0.00043184056994505227, 'timestamp': '2025-09-30 22:20:43.255173', 'step': 5353, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:43.315358', 'step': 5353, 'epoch': 3} {'type': 'loss', 'content': 0.00028453642153181136, 'timestamp': '2025-09-30 22:20:43.328792', 'step': 5354, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:43.381360', 'step': 5354, 'epoch': 3} {'type': 'loss', 'content': 0.000272301520453766, 'timestamp': '2025-09-30 22:20:43.401062', 'step': 5355, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:20:43.464266', 'step': 5355, 'epoch': 3} {'type': 'loss', 'content': 0.012864899821579456, 'timestamp': '2025-09-30 22:20:43.501279', 'step': 5356, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:43.540332', 'step': 5356, 'epoch': 3} {'type': 'loss', 'content': 0.0010666355956345797, 'timestamp': '2025-09-30 22:20:43.553018', 'step': 5357, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:43.589037', 'step': 5357, 'epoch': 3} {'type': 'loss', 'content': 0.0013121003285050392, 'timestamp': '2025-09-30 22:20:43.601595', 'step': 5358, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:43.651520', 'step': 5358, 'epoch': 3} {'type': 'loss', 'content': 0.001673312857747078, 'timestamp': '2025-09-30 22:20:43.662676', 'step': 5359, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:43.714889', 'step': 5359, 'epoch': 3} {'type': 'loss', 'content': 0.010204694233834743, 'timestamp': '2025-09-30 22:20:43.749511', 'step': 5360, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:20:43.798275', 'step': 5360, 'epoch': 3} {'type': 'loss', 'content': 0.0004900519852526486, 'timestamp': '2025-09-30 22:20:43.811648', 'step': 5361, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:43.855331', 'step': 5361, 'epoch': 3} {'type': 'loss', 'content': 8.291222911793739e-05, 'timestamp': '2025-09-30 22:20:43.863374', 'step': 5362, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:43.898066', 'step': 5362, 'epoch': 3} {'type': 'loss', 'content': 0.00014946149894967675, 'timestamp': '2025-09-30 22:20:43.910598', 'step': 5363, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:43.947048', 'step': 5363, 'epoch': 3} {'type': 'loss', 'content': 6.812860374338925e-05, 'timestamp': '2025-09-30 22:20:43.975881', 'step': 5364, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-30 22:20:44.008283', 'step': 5364, 'epoch': 3} {'type': 'loss', 'content': 0.00020966037118341774, 'timestamp': '2025-09-30 22:20:44.013164', 'step': 5365, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:44.046423', 'step': 5365, 'epoch': 3} {'type': 'loss', 'content': 0.0008381285006180406, 'timestamp': '2025-09-30 22:20:44.054499', 'step': 5366, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:44.095836', 'step': 5366, 'epoch': 3} {'type': 'loss', 'content': 0.013554837554693222, 'timestamp': '2025-09-30 22:20:44.106549', 'step': 5367, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:44.149265', 'step': 5367, 'epoch': 3} {'type': 'loss', 'content': 0.003462092485278845, 'timestamp': '2025-09-30 22:20:44.177796', 'step': 5368, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:44.216957', 'step': 5368, 'epoch': 3} {'type': 'loss', 'content': 0.0015900923172011971, 'timestamp': '2025-09-30 22:20:44.226879', 'step': 5369, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-09-30 22:20:44.276180', 'step': 5369, 'epoch': 3} {'type': 'loss', 'content': 0.002574779326096177, 'timestamp': '2025-09-30 22:20:44.293295', 'step': 5370, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:20:44.337048', 'step': 5370, 'epoch': 3} {'type': 'loss', 'content': 0.0006000144057907164, 'timestamp': '2025-09-30 22:20:44.353177', 'step': 5371, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:44.389279', 'step': 5371, 'epoch': 3} {'type': 'loss', 'content': 0.015625132247805595, 'timestamp': '2025-09-30 22:20:44.423501', 'step': 5372, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:44.480047', 'step': 5372, 'epoch': 3} {'type': 'loss', 'content': 0.00029536019428633153, 'timestamp': '2025-09-30 22:20:44.490243', 'step': 5373, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:44.534152', 'step': 5373, 'epoch': 3} {'type': 'loss', 'content': 0.0016645942814648151, 'timestamp': '2025-09-30 22:20:44.547502', 'step': 5374, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:44.593183', 'step': 5374, 'epoch': 3} {'type': 'loss', 'content': 0.006586844101548195, 'timestamp': '2025-09-30 22:20:44.606401', 'step': 5375, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:44.645070', 'step': 5375, 'epoch': 3} {'type': 'loss', 'content': 0.010785430669784546, 'timestamp': '2025-09-30 22:20:44.679334', 'step': 5376, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:44.715488', 'step': 5376, 'epoch': 3} {'type': 'loss', 'content': 0.006151542533189058, 'timestamp': '2025-09-30 22:20:44.728665', 'step': 5377, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:44.762109', 'step': 5377, 'epoch': 3} {'type': 'loss', 'content': 0.004861661233007908, 'timestamp': '2025-09-30 22:20:44.772780', 'step': 5378, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:44.809751', 'step': 5378, 'epoch': 3} {'type': 'loss', 'content': 0.00281528034247458, 'timestamp': '2025-09-30 22:20:44.823491', 'step': 5379, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:44.856719', 'step': 5379, 'epoch': 3} {'type': 'loss', 'content': 0.0017934244824573398, 'timestamp': '2025-09-30 22:20:44.889952', 'step': 5380, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:44.934312', 'step': 5380, 'epoch': 3} {'type': 'loss', 'content': 0.0002498268731869757, 'timestamp': '2025-09-30 22:20:44.947276', 'step': 5381, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:44.981882', 'step': 5381, 'epoch': 3} {'type': 'loss', 'content': 0.0021900867577642202, 'timestamp': '2025-09-30 22:20:44.994432', 'step': 5382, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:45.027886', 'step': 5382, 'epoch': 3} {'type': 'loss', 'content': 0.0006635947502218187, 'timestamp': '2025-09-30 22:20:45.035783', 'step': 5383, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:45.073738', 'step': 5383, 'epoch': 3} {'type': 'loss', 'content': 0.0046465955674648285, 'timestamp': '2025-09-30 22:20:45.107983', 'step': 5384, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:20:45.151573', 'step': 5384, 'epoch': 3} {'type': 'loss', 'content': 0.0010856845183297992, 'timestamp': '2025-09-30 22:20:45.166766', 'step': 5385, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:45.211473', 'step': 5385, 'epoch': 3} {'type': 'loss', 'content': 0.0001665302988840267, 'timestamp': '2025-09-30 22:20:45.223808', 'step': 5386, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:45.263857', 'step': 5386, 'epoch': 3} {'type': 'loss', 'content': 0.0006953799165785313, 'timestamp': '2025-09-30 22:20:45.276436', 'step': 5387, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:45.321565', 'step': 5387, 'epoch': 3} {'type': 'loss', 'content': 0.0009348868625238538, 'timestamp': '2025-09-30 22:20:45.354808', 'step': 5388, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:45.394481', 'step': 5388, 'epoch': 3} {'type': 'loss', 'content': 0.0010101856896653771, 'timestamp': '2025-09-30 22:20:45.403149', 'step': 5389, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:45.466172', 'step': 5389, 'epoch': 3} {'type': 'loss', 'content': 0.004780636169016361, 'timestamp': '2025-09-30 22:20:45.477376', 'step': 5390, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:45.511881', 'step': 5390, 'epoch': 3} {'type': 'loss', 'content': 0.012082463130354881, 'timestamp': '2025-09-30 22:20:45.522380', 'step': 5391, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:45.557177', 'step': 5391, 'epoch': 3} {'type': 'loss', 'content': 0.0008092407369986176, 'timestamp': '2025-09-30 22:20:45.588368', 'step': 5392, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:45.631944', 'step': 5392, 'epoch': 3} {'type': 'loss', 'content': 0.0012615135638043284, 'timestamp': '2025-09-30 22:20:45.640940', 'step': 5393, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:45.685595', 'step': 5393, 'epoch': 3} {'type': 'loss', 'content': 0.002899823011830449, 'timestamp': '2025-09-30 22:20:45.696704', 'step': 5394, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:45.733505', 'step': 5394, 'epoch': 3} {'type': 'loss', 'content': 0.0009003611630760133, 'timestamp': '2025-09-30 22:20:45.741384', 'step': 5395, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:45.791089', 'step': 5395, 'epoch': 3} {'type': 'loss', 'content': 0.004248927813023329, 'timestamp': '2025-09-30 22:20:45.825696', 'step': 5396, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:45.864260', 'step': 5396, 'epoch': 3} {'type': 'loss', 'content': 0.0022605466656386852, 'timestamp': '2025-09-30 22:20:45.877291', 'step': 5397, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:45.924895', 'step': 5397, 'epoch': 3} {'type': 'loss', 'content': 0.0018691695295274258, 'timestamp': '2025-09-30 22:20:45.937479', 'step': 5398, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:45.982123', 'step': 5398, 'epoch': 3} {'type': 'loss', 'content': 0.004714638460427523, 'timestamp': '2025-09-30 22:20:45.992644', 'step': 5399, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:46.036767', 'step': 5399, 'epoch': 3} {'type': 'loss', 'content': 0.0013302817242220044, 'timestamp': '2025-09-30 22:20:46.068947', 'step': 5400, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:46.105525', 'step': 5400, 'epoch': 3} {'type': 'loss', 'content': 0.0004780096060130745, 'timestamp': '2025-09-30 22:20:46.118593', 'step': 5401, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:46.177681', 'step': 5401, 'epoch': 3} {'type': 'loss', 'content': 0.010950141586363316, 'timestamp': '2025-09-30 22:20:46.191362', 'step': 5402, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-09-30 22:20:46.262163', 'step': 5402, 'epoch': 3} {'type': 'loss', 'content': 0.0008549345657229424, 'timestamp': '2025-09-30 22:20:46.278501', 'step': 5403, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:46.322339', 'step': 5403, 'epoch': 3} {'type': 'loss', 'content': 0.0003098884189967066, 'timestamp': '2025-09-30 22:20:46.354326', 'step': 5404, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:46.394559', 'step': 5404, 'epoch': 3} {'type': 'loss', 'content': 0.00030104260076768696, 'timestamp': '2025-09-30 22:20:46.399710', 'step': 5405, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:20:49.232301', 'step': 5405, 'epoch': 3} {'type': 'pplx', 'content': 6.176700245754593, 'timestamp': '2025-09-30 22:20:49.234471', 'step': 5405, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:49.265286', 'step': 5405, 'epoch': 3} {'type': 'loss', 'content': 0.0015386255690827966, 'timestamp': '2025-09-30 22:20:49.277795', 'step': 5406, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:49.313932', 'step': 5406, 'epoch': 3} {'type': 'loss', 'content': 0.003452225122600794, 'timestamp': '2025-09-30 22:20:49.324443', 'step': 5407, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:49.357550', 'step': 5407, 'epoch': 3} {'type': 'loss', 'content': 0.006190893240272999, 'timestamp': '2025-09-30 22:20:49.391004', 'step': 5408, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:49.428190', 'step': 5408, 'epoch': 3} {'type': 'loss', 'content': 0.004402454011142254, 'timestamp': '2025-09-30 22:20:49.436914', 'step': 5409, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:49.486175', 'step': 5409, 'epoch': 3} {'type': 'loss', 'content': 0.001000831020064652, 'timestamp': '2025-09-30 22:20:49.497265', 'step': 5410, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:49.558018', 'step': 5410, 'epoch': 3} {'type': 'loss', 'content': 0.00031259850948117673, 'timestamp': '2025-09-30 22:20:49.570589', 'step': 5411, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:49.603567', 'step': 5411, 'epoch': 3} {'type': 'loss', 'content': 0.008803533390164375, 'timestamp': '2025-09-30 22:20:49.636555', 'step': 5412, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:49.673417', 'step': 5412, 'epoch': 3} {'type': 'loss', 'content': 0.0019300552085042, 'timestamp': '2025-09-30 22:20:49.679146', 'step': 5413, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:49.717014', 'step': 5413, 'epoch': 3} {'type': 'loss', 'content': 0.003091174876317382, 'timestamp': '2025-09-30 22:20:49.730428', 'step': 5414, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:49.772962', 'step': 5414, 'epoch': 3} {'type': 'loss', 'content': 0.00227998080663383, 'timestamp': '2025-09-30 22:20:49.781100', 'step': 5415, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:49.818516', 'step': 5415, 'epoch': 3} {'type': 'loss', 'content': 0.001922230003401637, 'timestamp': '2025-09-30 22:20:49.853156', 'step': 5416, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:49.886187', 'step': 5416, 'epoch': 3} {'type': 'loss', 'content': 0.004396355245262384, 'timestamp': '2025-09-30 22:20:49.894315', 'step': 5417, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:49.937412', 'step': 5417, 'epoch': 3} {'type': 'loss', 'content': 0.003643837058916688, 'timestamp': '2025-09-30 22:20:49.944502', 'step': 5418, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:49.989098', 'step': 5418, 'epoch': 3} {'type': 'loss', 'content': 0.0007700038840994239, 'timestamp': '2025-09-30 22:20:50.000357', 'step': 5419, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:50.036147', 'step': 5419, 'epoch': 3} {'type': 'loss', 'content': 0.00034244020935148, 'timestamp': '2025-09-30 22:20:50.067650', 'step': 5420, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:50.103007', 'step': 5420, 'epoch': 3} {'type': 'loss', 'content': 0.0015500112203881145, 'timestamp': '2025-09-30 22:20:50.108636', 'step': 5421, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:50.146260', 'step': 5421, 'epoch': 3} {'type': 'loss', 'content': 0.0006771894986741245, 'timestamp': '2025-09-30 22:20:50.158607', 'step': 5422, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:50.203742', 'step': 5422, 'epoch': 3} {'type': 'loss', 'content': 0.0002916664816439152, 'timestamp': '2025-09-30 22:20:50.216104', 'step': 5423, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:50.264018', 'step': 5423, 'epoch': 3} {'type': 'loss', 'content': 0.0021485788747668266, 'timestamp': '2025-09-30 22:20:50.298540', 'step': 5424, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:50.360698', 'step': 5424, 'epoch': 3} {'type': 'loss', 'content': 0.0031237241346389055, 'timestamp': '2025-09-30 22:20:50.373798', 'step': 5425, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:50.407195', 'step': 5425, 'epoch': 3} {'type': 'loss', 'content': 0.0006846352480351925, 'timestamp': '2025-09-30 22:20:50.414223', 'step': 5426, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:20:50.455165', 'step': 5426, 'epoch': 3} {'type': 'loss', 'content': 0.0031432094983756542, 'timestamp': '2025-09-30 22:20:50.461250', 'step': 5427, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-30 22:20:50.496108', 'step': 5427, 'epoch': 3} {'type': 'loss', 'content': 0.0017296245787292719, 'timestamp': '2025-09-30 22:20:50.521285', 'step': 5428, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:50.558571', 'step': 5428, 'epoch': 3} {'type': 'loss', 'content': 0.0013727423502132297, 'timestamp': '2025-09-30 22:20:50.568701', 'step': 5429, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:50.621634', 'step': 5429, 'epoch': 3} {'type': 'loss', 'content': 0.0017471632454544306, 'timestamp': '2025-09-30 22:20:50.632104', 'step': 5430, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:50.678338', 'step': 5430, 'epoch': 3} {'type': 'loss', 'content': 0.0008682655752636492, 'timestamp': '2025-09-30 22:20:50.688655', 'step': 5431, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:50.723624', 'step': 5431, 'epoch': 3} {'type': 'loss', 'content': 0.0007178595406003296, 'timestamp': '2025-09-30 22:20:50.752109', 'step': 5432, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:50.787262', 'step': 5432, 'epoch': 3} {'type': 'loss', 'content': 0.007646486163139343, 'timestamp': '2025-09-30 22:20:50.793924', 'step': 5433, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:50.847055', 'step': 5433, 'epoch': 3} {'type': 'loss', 'content': 0.0029912583995610476, 'timestamp': '2025-09-30 22:20:50.855089', 'step': 5434, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:50.906659', 'step': 5434, 'epoch': 3} {'type': 'loss', 'content': 0.00047764150076545775, 'timestamp': '2025-09-30 22:20:50.919264', 'step': 5435, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:50.962956', 'step': 5435, 'epoch': 3} {'type': 'loss', 'content': 0.00027994034462608397, 'timestamp': '2025-09-30 22:20:50.996385', 'step': 5436, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:51.043702', 'step': 5436, 'epoch': 3} {'type': 'loss', 'content': 0.011703962460160255, 'timestamp': '2025-09-30 22:20:51.052395', 'step': 5437, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:51.096411', 'step': 5437, 'epoch': 3} {'type': 'loss', 'content': 0.001737725455313921, 'timestamp': '2025-09-30 22:20:51.109808', 'step': 5438, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:51.148509', 'step': 5438, 'epoch': 3} {'type': 'loss', 'content': 0.0026286798529326916, 'timestamp': '2025-09-30 22:20:51.156531', 'step': 5439, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:51.190060', 'step': 5439, 'epoch': 3} {'type': 'loss', 'content': 0.003658075351268053, 'timestamp': '2025-09-30 22:20:51.221526', 'step': 5440, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:51.253910', 'step': 5440, 'epoch': 3} {'type': 'loss', 'content': 0.0006490391097031534, 'timestamp': '2025-09-30 22:20:51.262842', 'step': 5441, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:51.304060', 'step': 5441, 'epoch': 3} {'type': 'loss', 'content': 0.003733021439984441, 'timestamp': '2025-09-30 22:20:51.311956', 'step': 5442, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:51.350856', 'step': 5442, 'epoch': 3} {'type': 'loss', 'content': 0.001515704789198935, 'timestamp': '2025-09-30 22:20:51.364233', 'step': 5443, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:51.412638', 'step': 5443, 'epoch': 3} {'type': 'loss', 'content': 0.0006004043971188366, 'timestamp': '2025-09-30 22:20:51.444682', 'step': 5444, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:51.481210', 'step': 5444, 'epoch': 3} {'type': 'loss', 'content': 0.0005910450126975775, 'timestamp': '2025-09-30 22:20:51.486960', 'step': 5445, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:51.535640', 'step': 5445, 'epoch': 3} {'type': 'loss', 'content': 0.0017474086489528418, 'timestamp': '2025-09-30 22:20:51.549021', 'step': 5446, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-09-30 22:20:51.603456', 'step': 5446, 'epoch': 3} {'type': 'loss', 'content': 0.0037502232007682323, 'timestamp': '2025-09-30 22:20:51.621070', 'step': 5447, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:51.669938', 'step': 5447, 'epoch': 3} {'type': 'loss', 'content': 0.004871891345828772, 'timestamp': '2025-09-30 22:20:51.704467', 'step': 5448, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-09-30 22:20:51.761803', 'step': 5448, 'epoch': 3} {'type': 'loss', 'content': 0.003989064134657383, 'timestamp': '2025-09-30 22:20:51.777015', 'step': 5449, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:51.818006', 'step': 5449, 'epoch': 3} {'type': 'loss', 'content': 0.013066486455500126, 'timestamp': '2025-09-30 22:20:51.830597', 'step': 5450, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:51.870697', 'step': 5450, 'epoch': 3} {'type': 'loss', 'content': 0.002019469393417239, 'timestamp': '2025-09-30 22:20:51.883213', 'step': 5451, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:51.930014', 'step': 5451, 'epoch': 3} {'type': 'loss', 'content': 0.010137668810784817, 'timestamp': '2025-09-30 22:20:51.964573', 'step': 5452, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:51.998069', 'step': 5452, 'epoch': 3} {'type': 'loss', 'content': 0.0009108879021368921, 'timestamp': '2025-09-30 22:20:52.008622', 'step': 5453, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:52.061534', 'step': 5453, 'epoch': 3} {'type': 'loss', 'content': 0.0012465942418202758, 'timestamp': '2025-09-30 22:20:52.073960', 'step': 5454, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:52.133473', 'step': 5454, 'epoch': 3} {'type': 'loss', 'content': 0.004608785267919302, 'timestamp': '2025-09-30 22:20:52.147199', 'step': 5455, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:52.196103', 'step': 5455, 'epoch': 3} {'type': 'loss', 'content': 0.0007762848399579525, 'timestamp': '2025-09-30 22:20:52.227939', 'step': 5456, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:52.266593', 'step': 5456, 'epoch': 3} {'type': 'loss', 'content': 0.005198488011956215, 'timestamp': '2025-09-30 22:20:52.276593', 'step': 5457, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:52.323726', 'step': 5457, 'epoch': 3} {'type': 'loss', 'content': 0.0027350198943167925, 'timestamp': '2025-09-30 22:20:52.337550', 'step': 5458, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:52.379356', 'step': 5458, 'epoch': 3} {'type': 'loss', 'content': 0.0009995006257668138, 'timestamp': '2025-09-30 22:20:52.391937', 'step': 5459, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:52.427163', 'step': 5459, 'epoch': 3} {'type': 'loss', 'content': 0.0010448892135173082, 'timestamp': '2025-09-30 22:20:52.458650', 'step': 5460, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:52.499964', 'step': 5460, 'epoch': 3} {'type': 'loss', 'content': 0.0008325534290634096, 'timestamp': '2025-09-30 22:20:52.516146', 'step': 5461, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:52.553299', 'step': 5461, 'epoch': 3} {'type': 'loss', 'content': 0.002204886171966791, 'timestamp': '2025-09-30 22:20:52.564692', 'step': 5462, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:52.630410', 'step': 5462, 'epoch': 3} {'type': 'loss', 'content': 0.0005283255595713854, 'timestamp': '2025-09-30 22:20:52.649602', 'step': 5463, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:52.696554', 'step': 5463, 'epoch': 3} {'type': 'loss', 'content': 0.0007552019087597728, 'timestamp': '2025-09-30 22:20:52.731094', 'step': 5464, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:20:52.776441', 'step': 5464, 'epoch': 3} {'type': 'loss', 'content': 0.0012059420114383101, 'timestamp': '2025-09-30 22:20:52.791890', 'step': 5465, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:52.836147', 'step': 5465, 'epoch': 3} {'type': 'loss', 'content': 0.0018100308952853084, 'timestamp': '2025-09-30 22:20:52.849918', 'step': 5466, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:52.891998', 'step': 5466, 'epoch': 3} {'type': 'loss', 'content': 0.0005931039340794086, 'timestamp': '2025-09-30 22:20:52.905922', 'step': 5467, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:52.948160', 'step': 5467, 'epoch': 3} {'type': 'loss', 'content': 0.0002712627174332738, 'timestamp': '2025-09-30 22:20:52.981582', 'step': 5468, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:53.027342', 'step': 5468, 'epoch': 3} {'type': 'loss', 'content': 0.0028416018467396498, 'timestamp': '2025-09-30 22:20:53.037242', 'step': 5469, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:53.075209', 'step': 5469, 'epoch': 3} {'type': 'loss', 'content': 0.0011812286684289575, 'timestamp': '2025-09-30 22:20:53.087800', 'step': 5470, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:20:53.130085', 'step': 5470, 'epoch': 3} {'type': 'loss', 'content': 0.00046421645674854517, 'timestamp': '2025-09-30 22:20:53.146022', 'step': 5471, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:20:53.188102', 'step': 5471, 'epoch': 3} {'type': 'loss', 'content': 0.0011313254944980145, 'timestamp': '2025-09-30 22:20:53.220023', 'step': 5472, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:20:53.264112', 'step': 5472, 'epoch': 3} {'type': 'loss', 'content': 0.0023931600153446198, 'timestamp': '2025-09-30 22:20:53.279806', 'step': 5473, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:53.315078', 'step': 5473, 'epoch': 3} {'type': 'loss', 'content': 0.0008334001176990569, 'timestamp': '2025-09-30 22:20:53.326422', 'step': 5474, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:53.368283', 'step': 5474, 'epoch': 3} {'type': 'loss', 'content': 0.00035439120256341994, 'timestamp': '2025-09-30 22:20:53.379364', 'step': 5475, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:53.424988', 'step': 5475, 'epoch': 3} {'type': 'loss', 'content': 0.0028823537286370993, 'timestamp': '2025-09-30 22:20:53.456516', 'step': 5476, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-09-30 22:20:53.516608', 'step': 5476, 'epoch': 3} {'type': 'loss', 'content': 0.0009436103282496333, 'timestamp': '2025-09-30 22:20:53.532318', 'step': 5477, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:53.582510', 'step': 5477, 'epoch': 3} {'type': 'loss', 'content': 0.002098672790452838, 'timestamp': '2025-09-30 22:20:53.601689', 'step': 5478, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:20:53.666161', 'step': 5478, 'epoch': 3} {'type': 'loss', 'content': 0.004675887059420347, 'timestamp': '2025-09-30 22:20:53.680075', 'step': 5479, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-09-30 22:20:53.738357', 'step': 5479, 'epoch': 3} {'type': 'loss', 'content': 0.0021486529149115086, 'timestamp': '2025-09-30 22:20:53.776605', 'step': 5480, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:53.813850', 'step': 5480, 'epoch': 3} {'type': 'loss', 'content': 0.003002091310918331, 'timestamp': '2025-09-30 22:20:53.822684', 'step': 5481, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:53.860633', 'step': 5481, 'epoch': 3} {'type': 'loss', 'content': 0.00045192582183517516, 'timestamp': '2025-09-30 22:20:53.872986', 'step': 5482, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:53.921415', 'step': 5482, 'epoch': 3} {'type': 'loss', 'content': 0.0009709245641715825, 'timestamp': '2025-09-30 22:20:53.935176', 'step': 5483, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:20:53.984733', 'step': 5483, 'epoch': 3} {'type': 'loss', 'content': 0.0003787397581618279, 'timestamp': '2025-09-30 22:20:54.027112', 'step': 5484, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:54.065489', 'step': 5484, 'epoch': 3} {'type': 'loss', 'content': 0.003653504652902484, 'timestamp': '2025-09-30 22:20:54.074463', 'step': 5485, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:54.110825', 'step': 5485, 'epoch': 3} {'type': 'loss', 'content': 0.002409368986263871, 'timestamp': '2025-09-30 22:20:54.124209', 'step': 5486, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:20:54.162760', 'step': 5486, 'epoch': 3} {'type': 'loss', 'content': 0.005153408274054527, 'timestamp': '2025-09-30 22:20:54.176478', 'step': 5487, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:54.219308', 'step': 5487, 'epoch': 3} {'type': 'loss', 'content': 0.0031536994501948357, 'timestamp': '2025-09-30 22:20:54.252572', 'step': 5488, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:54.298481', 'step': 5488, 'epoch': 3} {'type': 'loss', 'content': 0.0007980418158695102, 'timestamp': '2025-09-30 22:20:54.308636', 'step': 5489, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:54.342262', 'step': 5489, 'epoch': 3} {'type': 'loss', 'content': 0.009608576074242592, 'timestamp': '2025-09-30 22:20:54.353455', 'step': 5490, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:20:54.392958', 'step': 5490, 'epoch': 3} {'type': 'loss', 'content': 0.007950414903461933, 'timestamp': '2025-09-30 22:20:54.403635', 'step': 5491, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:20:54.444519', 'step': 5491, 'epoch': 3} {'type': 'loss', 'content': 0.0034113333094865084, 'timestamp': '2025-09-30 22:20:54.476463', 'step': 5492, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-30 22:20:54.518991', 'step': 5492, 'epoch': 3} {'type': 'loss', 'content': 0.0006860059220343828, 'timestamp': '2025-09-30 22:20:54.526994', 'step': 5493, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:20:54.564196', 'step': 5493, 'epoch': 3} {'type': 'loss', 'content': 0.0003513667033985257, 'timestamp': '2025-09-30 22:20:54.573032', 'step': 5494, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:54.621320', 'step': 5494, 'epoch': 3} {'type': 'loss', 'content': 0.000984512735158205, 'timestamp': '2025-09-30 22:20:54.634656', 'step': 5495, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:54.692078', 'step': 5495, 'epoch': 3} {'type': 'loss', 'content': 0.004931764677166939, 'timestamp': '2025-09-30 22:20:54.725509', 'step': 5496, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:54.765897', 'step': 5496, 'epoch': 3} {'type': 'loss', 'content': 0.008381886407732964, 'timestamp': '2025-09-30 22:20:54.776118', 'step': 5497, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:20:54.813180', 'step': 5497, 'epoch': 3} {'type': 'loss', 'content': 0.0005858208751305938, 'timestamp': '2025-09-30 22:20:54.825832', 'step': 5498, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:20:54.867284', 'step': 5498, 'epoch': 3} {'type': 'loss', 'content': 0.0003775583754759282, 'timestamp': '2025-09-30 22:20:54.880689', 'step': 5499, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:20:54.929026', 'step': 5499, 'epoch': 3} {'type': 'loss', 'content': 0.001292352331802249, 'timestamp': '2025-09-30 22:20:54.962242', 'step': 5500, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 5500', 'timestamp': '2025-09-30 22:21:00.015988', 'step': 5500, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:21:00.069353', 'step': 5500, 'epoch': 3} {'type': 'loss', 'content': 0.0005804897518828511, 'timestamp': '2025-09-30 22:21:00.082695', 'step': 5501, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:21:00.135359', 'step': 5501, 'epoch': 3} {'type': 'loss', 'content': 0.00038161006523296237, 'timestamp': '2025-09-30 22:21:00.147484', 'step': 5502, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 17560600598464}, 'timestamp': '2025-09-30 22:21:00.209709', 'step': 5502, 'epoch': 3} {'type': 'loss', 'content': 0.0006065990310162306, 'timestamp': '2025-09-30 22:21:00.230754', 'step': 5503, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:21:00.279084', 'step': 5503, 'epoch': 3} {'type': 'loss', 'content': 0.0013182084076106548, 'timestamp': '2025-09-30 22:21:00.313432', 'step': 5504, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-30 22:21:00.359227', 'step': 5504, 'epoch': 3} {'type': 'loss', 'content': 0.0002575173566583544, 'timestamp': '2025-09-30 22:21:00.372387', 'step': 5505, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:21:00.418829', 'step': 5505, 'epoch': 3} {'type': 'loss', 'content': 0.00010437212768010795, 'timestamp': '2025-09-30 22:21:00.431142', 'step': 5506, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:21:00.477294', 'step': 5506, 'epoch': 3} {'type': 'loss', 'content': 0.0005292315036058426, 'timestamp': '2025-09-30 22:21:00.490689', 'step': 5507, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:21:00.538793', 'step': 5507, 'epoch': 3} {'type': 'loss', 'content': 0.0005687709781341255, 'timestamp': '2025-09-30 22:21:00.572192', 'step': 5508, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:21:00.615810', 'step': 5508, 'epoch': 3} {'type': 'loss', 'content': 0.0007103682146407664, 'timestamp': '2025-09-30 22:21:00.628486', 'step': 5509, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:21:00.675749', 'step': 5509, 'epoch': 3} {'type': 'loss', 'content': 0.005789314396679401, 'timestamp': '2025-09-30 22:21:00.686761', 'step': 5510, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:21:00.721663', 'step': 5510, 'epoch': 3} {'type': 'loss', 'content': 0.0008344220696017146, 'timestamp': '2025-09-30 22:21:00.738657', 'step': 5511, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:21:00.799727', 'step': 5511, 'epoch': 3} {'type': 'loss', 'content': 0.0006502811447717249, 'timestamp': '2025-09-30 22:21:00.834682', 'step': 5512, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:21:00.888949', 'step': 5512, 'epoch': 3} {'type': 'loss', 'content': 0.00043549088877625763, 'timestamp': '2025-09-30 22:21:00.901659', 'step': 5513, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:21:00.954339', 'step': 5513, 'epoch': 3} {'type': 'loss', 'content': 0.00593127217143774, 'timestamp': '2025-09-30 22:21:00.966974', 'step': 5514, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:21:01.014981', 'step': 5514, 'epoch': 3} {'type': 'loss', 'content': 0.006700599100440741, 'timestamp': '2025-09-30 22:21:01.028727', 'step': 5515, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:21:01.068924', 'step': 5515, 'epoch': 3} {'type': 'loss', 'content': 0.0027859918773174286, 'timestamp': '2025-09-30 22:21:01.105943', 'step': 5516, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:21:01.146730', 'step': 5516, 'epoch': 3} {'type': 'loss', 'content': 0.003578424919396639, 'timestamp': '2025-09-30 22:21:01.156527', 'step': 5517, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-30 22:21:01.203669', 'step': 5517, 'epoch': 3} {'type': 'loss', 'content': 0.0036419949028640985, 'timestamp': '2025-09-30 22:21:01.214874', 'step': 5518, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-30 22:21:01.258877', 'step': 5518, 'epoch': 3} {'type': 'loss', 'content': 0.019855622202157974, 'timestamp': '2025-09-30 22:21:01.278038', 'step': 5519, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-09-30 22:21:01.319944', 'step': 5519, 'epoch': 3} {'type': 'loss', 'content': 0.00021206910605542362, 'timestamp': '2025-09-30 22:21:01.354786', 'step': 5520, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:21:04.043012', 'step': 5520, 'epoch': 3} {'type': 'pplx', 'content': 6.126922369940893, 'timestamp': '2025-09-30 22:21:04.047684', 'step': 5520, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:21:04.088921', 'step': 5520, 'epoch': 3} {'type': 'loss', 'content': 0.0011069091269746423, 'timestamp': '2025-09-30 22:21:04.101936', 'step': 5521, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-09-30 22:21:04.157703', 'step': 5521, 'epoch': 3} {'type': 'loss', 'content': 0.0017075618961825967, 'timestamp': '2025-09-30 22:21:04.175482', 'step': 5522, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-09-30 22:21:04.229684', 'step': 5522, 'epoch': 3} {'type': 'loss', 'content': 0.004409335553646088, 'timestamp': '2025-09-30 22:21:04.245572', 'step': 5523, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:21:04.283940', 'step': 5523, 'epoch': 3} {'type': 'loss', 'content': 0.00010646448208717629, 'timestamp': '2025-09-30 22:21:04.326210', 'step': 5524, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-30 22:21:04.370348', 'step': 5524, 'epoch': 3} {'type': 'loss', 'content': 0.00014393814490176737, 'timestamp': '2025-09-30 22:21:04.380405', 'step': 5525, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:21:04.421144', 'step': 5525, 'epoch': 3} {'type': 'loss', 'content': 0.0021583314519375563, 'timestamp': '2025-09-30 22:21:04.434490', 'step': 5526, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:21:04.492165', 'step': 5526, 'epoch': 3} {'type': 'loss', 'content': 0.0008602791931480169, 'timestamp': '2025-09-30 22:21:04.505496', 'step': 5527, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-09-30 22:21:04.561474', 'step': 5527, 'epoch': 3} {'type': 'loss', 'content': 0.0017894088523462415, 'timestamp': '2025-09-30 22:21:04.596053', 'step': 5528, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-30 22:21:04.644908', 'step': 5528, 'epoch': 3} {'type': 'loss', 'content': 0.003701946698129177, 'timestamp': '2025-09-30 22:21:04.657583', 'step': 5529, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-09-30 22:21:04.703046', 'step': 5529, 'epoch': 3} {'type': 'loss', 'content': 0.0004692415823228657, 'timestamp': '2025-09-30 22:21:04.715627', 'step': 5530, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-09-30 22:21:04.757924', 'step': 5530, 'epoch': 3} {'type': 'loss', 'content': 0.007988791912794113, 'timestamp': '2025-09-30 22:21:04.774942', 'step': 5531, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-30 22:21:04.811155', 'step': 5531, 'epoch': 3} {'type': 'loss', 'content': 0.010778653435409069, 'timestamp': '2025-09-30 22:21:04.839679', 'step': 5532, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-09-30 22:21:07.479130', 'step': 5532, 'epoch': 3} {'type': 'pplx', 'content': 6.123145012094139, 'timestamp': '2025-09-30 22:21:07.498095', 'step': 5532, 'epoch': 3} {'type': 'best_pplx', 'content': 5.286725956468357, 'timestamp': '2025-09-30 22:21:07.515096', 'step': 5532, 'epoch': 3} {'type': 'best_step', 'content': 1380, 'timestamp': '2025-09-30 22:21:07.530192', 'step': 5532, 'epoch': 3} {'type': 'total_pplx_flops', 'content': 24378677094380800, 'timestamp': '2025-09-30 22:21:07.545013', 'step': 5532, 'epoch': 3} {'type': 'total_train_flops', 'content': 49118667663965760, 'timestamp': '2025-09-30 22:21:07.564275', 'step': 5532, 'epoch': 3}