{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:15:43.388165', 'step': 0, 'epoch': 0} {'type': 'pplx', 'content': 9.2736354286407, 'timestamp': '2025-10-01 04:15:43.391379', 'step': 0, 'epoch': 0} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:15:43.480683', 'step': 0, 'epoch': 1} {'type': 'loss', 'content': 0.3148724436759949, 'timestamp': '2025-10-01 04:15:43.482991', 'step': 1, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:43.545382', 'step': 1, 'epoch': 1} {'type': 'loss', 'content': 0.1496344953775406, 'timestamp': '2025-10-01 04:15:43.554244', 'step': 2, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:15:43.607278', 'step': 2, 'epoch': 1} {'type': 'loss', 'content': 0.20663881301879883, 'timestamp': '2025-10-01 04:15:43.615134', 'step': 3, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:43.685827', 'step': 3, 'epoch': 1} {'type': 'loss', 'content': 0.22344671189785004, 'timestamp': '2025-10-01 04:15:43.734531', 'step': 4, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:43.781103', 'step': 4, 'epoch': 1} {'type': 'loss', 'content': 0.07606936246156693, 'timestamp': '2025-10-01 04:15:43.785967', 'step': 5, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:15:43.843906', 'step': 5, 'epoch': 1} {'type': 'loss', 'content': 0.0697832852602005, 'timestamp': '2025-10-01 04:15:43.851803', 'step': 6, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:43.903272', 'step': 6, 'epoch': 1} {'type': 'loss', 'content': 0.05548838898539543, 'timestamp': '2025-10-01 04:15:43.916684', 'step': 7, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:43.956285', 'step': 7, 'epoch': 1} {'type': 'loss', 'content': 0.05366652458906174, 'timestamp': '2025-10-01 04:15:43.986179', 'step': 8, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:15:44.053544', 'step': 8, 'epoch': 1} {'type': 'loss', 'content': 0.02459055185317993, 'timestamp': '2025-10-01 04:15:44.066190', 'step': 9, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:44.114509', 'step': 9, 'epoch': 1} {'type': 'loss', 'content': 0.021615447476506233, 'timestamp': '2025-10-01 04:15:44.126238', 'step': 10, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:44.179996', 'step': 10, 'epoch': 1} {'type': 'loss', 'content': 0.014977546408772469, 'timestamp': '2025-10-01 04:15:44.193484', 'step': 11, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:44.246784', 'step': 11, 'epoch': 1} {'type': 'loss', 'content': 0.031654540449380875, 'timestamp': '2025-10-01 04:15:44.281214', 'step': 12, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:15:44.335731', 'step': 12, 'epoch': 1} {'type': 'loss', 'content': 0.023420626297593117, 'timestamp': '2025-10-01 04:15:44.341399', 'step': 13, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:44.402590', 'step': 13, 'epoch': 1} {'type': 'loss', 'content': 0.025920502841472626, 'timestamp': '2025-10-01 04:15:44.407475', 'step': 14, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:44.449369', 'step': 14, 'epoch': 1} {'type': 'loss', 'content': 0.017238708212971687, 'timestamp': '2025-10-01 04:15:44.460849', 'step': 15, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:44.515942', 'step': 15, 'epoch': 1} {'type': 'loss', 'content': 0.01428209524601698, 'timestamp': '2025-10-01 04:15:44.550337', 'step': 16, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:44.611292', 'step': 16, 'epoch': 1} {'type': 'loss', 'content': 0.04219168424606323, 'timestamp': '2025-10-01 04:15:44.615305', 'step': 17, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:44.665846', 'step': 17, 'epoch': 1} {'type': 'loss', 'content': 0.02214355394244194, 'timestamp': '2025-10-01 04:15:44.678533', 'step': 18, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:15:44.749215', 'step': 18, 'epoch': 1} {'type': 'loss', 'content': 0.0346294529736042, 'timestamp': '2025-10-01 04:15:44.753779', 'step': 19, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:44.803552', 'step': 19, 'epoch': 1} {'type': 'loss', 'content': 0.03569815680384636, 'timestamp': '2025-10-01 04:15:44.832526', 'step': 20, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:44.884376', 'step': 20, 'epoch': 1} {'type': 'loss', 'content': 0.02472688816487789, 'timestamp': '2025-10-01 04:15:44.890164', 'step': 21, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:44.939266', 'step': 21, 'epoch': 1} {'type': 'loss', 'content': 0.036727070808410645, 'timestamp': '2025-10-01 04:15:44.947691', 'step': 22, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:15:45.012911', 'step': 22, 'epoch': 1} {'type': 'loss', 'content': 0.035241659730672836, 'timestamp': '2025-10-01 04:15:45.017267', 'step': 23, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:15:45.069415', 'step': 23, 'epoch': 1} {'type': 'loss', 'content': 0.0369919016957283, 'timestamp': '2025-10-01 04:15:45.097921', 'step': 24, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:45.161624', 'step': 24, 'epoch': 1} {'type': 'loss', 'content': 0.015591696836054325, 'timestamp': '2025-10-01 04:15:45.169054', 'step': 25, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:15:45.213371', 'step': 25, 'epoch': 1} {'type': 'loss', 'content': 0.03152967989444733, 'timestamp': '2025-10-01 04:15:45.220748', 'step': 26, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:45.272289', 'step': 26, 'epoch': 1} {'type': 'loss', 'content': 0.014276011846959591, 'timestamp': '2025-10-01 04:15:45.286241', 'step': 27, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:45.339956', 'step': 27, 'epoch': 1} {'type': 'loss', 'content': 0.02697276696562767, 'timestamp': '2025-10-01 04:15:45.373523', 'step': 28, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:45.415205', 'step': 28, 'epoch': 1} {'type': 'loss', 'content': 0.016183609142899513, 'timestamp': '2025-10-01 04:15:45.426296', 'step': 29, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:45.466495', 'step': 29, 'epoch': 1} {'type': 'loss', 'content': 0.018347211182117462, 'timestamp': '2025-10-01 04:15:45.480150', 'step': 30, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:45.539675', 'step': 30, 'epoch': 1} {'type': 'loss', 'content': 0.041274331510066986, 'timestamp': '2025-10-01 04:15:45.551112', 'step': 31, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:45.606123', 'step': 31, 'epoch': 1} {'type': 'loss', 'content': 0.009860141202807426, 'timestamp': '2025-10-01 04:15:45.635310', 'step': 32, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:45.691853', 'step': 32, 'epoch': 1} {'type': 'loss', 'content': 0.023594791069626808, 'timestamp': '2025-10-01 04:15:45.705178', 'step': 33, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:45.739683', 'step': 33, 'epoch': 1} {'type': 'loss', 'content': 0.031733106821775436, 'timestamp': '2025-10-01 04:15:45.751251', 'step': 34, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:45.790921', 'step': 34, 'epoch': 1} {'type': 'loss', 'content': 0.022514784708619118, 'timestamp': '2025-10-01 04:15:45.804947', 'step': 35, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:45.844205', 'step': 35, 'epoch': 1} {'type': 'loss', 'content': 0.017519265413284302, 'timestamp': '2025-10-01 04:15:45.879057', 'step': 36, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:45.917667', 'step': 36, 'epoch': 1} {'type': 'loss', 'content': 0.02474364824593067, 'timestamp': '2025-10-01 04:15:45.923497', 'step': 37, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:45.958116', 'step': 37, 'epoch': 1} {'type': 'loss', 'content': 0.016819903627038002, 'timestamp': '2025-10-01 04:15:45.969454', 'step': 38, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:15:46.021352', 'step': 38, 'epoch': 1} {'type': 'loss', 'content': 0.021695135161280632, 'timestamp': '2025-10-01 04:15:46.028822', 'step': 39, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:15:46.082199', 'step': 39, 'epoch': 1} {'type': 'loss', 'content': 0.03070221282541752, 'timestamp': '2025-10-01 04:15:46.110421', 'step': 40, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:46.165709', 'step': 40, 'epoch': 1} {'type': 'loss', 'content': 0.02313392050564289, 'timestamp': '2025-10-01 04:15:46.171563', 'step': 41, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:46.217145', 'step': 41, 'epoch': 1} {'type': 'loss', 'content': 0.025325890630483627, 'timestamp': '2025-10-01 04:15:46.227660', 'step': 42, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:15:46.289662', 'step': 42, 'epoch': 1} {'type': 'loss', 'content': 0.010489081963896751, 'timestamp': '2025-10-01 04:15:46.307497', 'step': 43, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:46.364246', 'step': 43, 'epoch': 1} {'type': 'loss', 'content': 0.011836855672299862, 'timestamp': '2025-10-01 04:15:46.399273', 'step': 44, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:15:46.470352', 'step': 44, 'epoch': 1} {'type': 'loss', 'content': 0.012925690039992332, 'timestamp': '2025-10-01 04:15:46.480976', 'step': 45, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:46.527558', 'step': 45, 'epoch': 1} {'type': 'loss', 'content': 0.022553613409399986, 'timestamp': '2025-10-01 04:15:46.540156', 'step': 46, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:46.577724', 'step': 46, 'epoch': 1} {'type': 'loss', 'content': 0.014072427526116371, 'timestamp': '2025-10-01 04:15:46.585906', 'step': 47, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:46.645124', 'step': 47, 'epoch': 1} {'type': 'loss', 'content': 0.02024070918560028, 'timestamp': '2025-10-01 04:15:46.677496', 'step': 48, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:46.736184', 'step': 48, 'epoch': 1} {'type': 'loss', 'content': 0.02084747701883316, 'timestamp': '2025-10-01 04:15:46.742028', 'step': 49, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:46.795799', 'step': 49, 'epoch': 1} {'type': 'loss', 'content': 0.024721672758460045, 'timestamp': '2025-10-01 04:15:46.807387', 'step': 50, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:15:46.856872', 'step': 50, 'epoch': 1} {'type': 'loss', 'content': 0.0444813147187233, 'timestamp': '2025-10-01 04:15:46.864900', 'step': 51, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:46.919409', 'step': 51, 'epoch': 1} {'type': 'loss', 'content': 0.018756715580821037, 'timestamp': '2025-10-01 04:15:46.953840', 'step': 52, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:47.007384', 'step': 52, 'epoch': 1} {'type': 'loss', 'content': 0.010533892549574375, 'timestamp': '2025-10-01 04:15:47.020745', 'step': 53, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:47.072600', 'step': 53, 'epoch': 1} {'type': 'loss', 'content': 0.019328080117702484, 'timestamp': '2025-10-01 04:15:47.083969', 'step': 54, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:47.136382', 'step': 54, 'epoch': 1} {'type': 'loss', 'content': 0.028576405718922615, 'timestamp': '2025-10-01 04:15:47.147596', 'step': 55, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:47.195380', 'step': 55, 'epoch': 1} {'type': 'loss', 'content': 0.050611186772584915, 'timestamp': '2025-10-01 04:15:47.228887', 'step': 56, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:47.289426', 'step': 56, 'epoch': 1} {'type': 'loss', 'content': 0.01814517378807068, 'timestamp': '2025-10-01 04:15:47.306622', 'step': 57, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:47.366219', 'step': 57, 'epoch': 1} {'type': 'loss', 'content': 0.017344679683446884, 'timestamp': '2025-10-01 04:15:47.383252', 'step': 58, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:15:47.441393', 'step': 58, 'epoch': 1} {'type': 'loss', 'content': 0.03285380080342293, 'timestamp': '2025-10-01 04:15:47.453658', 'step': 59, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:15:47.513310', 'step': 59, 'epoch': 1} {'type': 'loss', 'content': 0.028442176058888435, 'timestamp': '2025-10-01 04:15:47.541908', 'step': 60, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:15:47.602648', 'step': 60, 'epoch': 1} {'type': 'loss', 'content': 0.020839227363467216, 'timestamp': '2025-10-01 04:15:47.609393', 'step': 61, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:15:47.660813', 'step': 61, 'epoch': 1} {'type': 'loss', 'content': 0.01868492178618908, 'timestamp': '2025-10-01 04:15:47.677160', 'step': 62, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:47.743221', 'step': 62, 'epoch': 1} {'type': 'loss', 'content': 0.011659521609544754, 'timestamp': '2025-10-01 04:15:47.755694', 'step': 63, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:47.822289', 'step': 63, 'epoch': 1} {'type': 'loss', 'content': 0.010985899716615677, 'timestamp': '2025-10-01 04:15:47.857324', 'step': 64, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:15:47.920282', 'step': 64, 'epoch': 1} {'type': 'loss', 'content': 0.01268599834293127, 'timestamp': '2025-10-01 04:15:47.935503', 'step': 65, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:15:47.989794', 'step': 65, 'epoch': 1} {'type': 'loss', 'content': 0.020359791815280914, 'timestamp': '2025-10-01 04:15:47.998187', 'step': 66, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:48.057136', 'step': 66, 'epoch': 1} {'type': 'loss', 'content': 0.02078658528625965, 'timestamp': '2025-10-01 04:15:48.075899', 'step': 67, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:48.139512', 'step': 67, 'epoch': 1} {'type': 'loss', 'content': 0.011878191493451595, 'timestamp': '2025-10-01 04:15:48.177137', 'step': 68, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:48.229672', 'step': 68, 'epoch': 1} {'type': 'loss', 'content': 0.012522432953119278, 'timestamp': '2025-10-01 04:15:48.241338', 'step': 69, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:48.295921', 'step': 69, 'epoch': 1} {'type': 'loss', 'content': 0.013056221418082714, 'timestamp': '2025-10-01 04:15:48.309908', 'step': 70, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:48.367423', 'step': 70, 'epoch': 1} {'type': 'loss', 'content': 0.017279231920838356, 'timestamp': '2025-10-01 04:15:48.377167', 'step': 71, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:48.438485', 'step': 71, 'epoch': 1} {'type': 'loss', 'content': 0.018696613609790802, 'timestamp': '2025-10-01 04:15:48.474643', 'step': 72, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:48.519240', 'step': 72, 'epoch': 1} {'type': 'loss', 'content': 0.014112441800534725, 'timestamp': '2025-10-01 04:15:48.537867', 'step': 73, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:48.587382', 'step': 73, 'epoch': 1} {'type': 'loss', 'content': 0.012964009307324886, 'timestamp': '2025-10-01 04:15:48.618109', 'step': 74, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:48.670942', 'step': 74, 'epoch': 1} {'type': 'loss', 'content': 0.012404029257595539, 'timestamp': '2025-10-01 04:15:48.682410', 'step': 75, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:15:48.736801', 'step': 75, 'epoch': 1} {'type': 'loss', 'content': 0.01757497899234295, 'timestamp': '2025-10-01 04:15:48.765626', 'step': 76, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:48.816138', 'step': 76, 'epoch': 1} {'type': 'loss', 'content': 0.01670520380139351, 'timestamp': '2025-10-01 04:15:48.826097', 'step': 77, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:48.868228', 'step': 77, 'epoch': 1} {'type': 'loss', 'content': 0.019245807081460953, 'timestamp': '2025-10-01 04:15:48.880834', 'step': 78, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:48.935349', 'step': 78, 'epoch': 1} {'type': 'loss', 'content': 0.014431243762373924, 'timestamp': '2025-10-01 04:15:48.946771', 'step': 79, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:15:48.997696', 'step': 79, 'epoch': 1} {'type': 'loss', 'content': 0.021434934809803963, 'timestamp': '2025-10-01 04:15:49.026085', 'step': 80, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:49.076912', 'step': 80, 'epoch': 1} {'type': 'loss', 'content': 0.02286781184375286, 'timestamp': '2025-10-01 04:15:49.085305', 'step': 81, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:49.142670', 'step': 81, 'epoch': 1} {'type': 'loss', 'content': 0.01724059320986271, 'timestamp': '2025-10-01 04:15:49.151013', 'step': 82, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:49.203052', 'step': 82, 'epoch': 1} {'type': 'loss', 'content': 0.015399989672005177, 'timestamp': '2025-10-01 04:15:49.216623', 'step': 83, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:15:49.282644', 'step': 83, 'epoch': 1} {'type': 'loss', 'content': 0.008467916399240494, 'timestamp': '2025-10-01 04:15:49.317280', 'step': 84, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:49.368132', 'step': 84, 'epoch': 1} {'type': 'loss', 'content': 0.012758773751556873, 'timestamp': '2025-10-01 04:15:49.380967', 'step': 85, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:49.428870', 'step': 85, 'epoch': 1} {'type': 'loss', 'content': 0.017357897013425827, 'timestamp': '2025-10-01 04:15:49.442943', 'step': 86, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:49.495230', 'step': 86, 'epoch': 1} {'type': 'loss', 'content': 0.016277920454740524, 'timestamp': '2025-10-01 04:15:49.509161', 'step': 87, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:49.560802', 'step': 87, 'epoch': 1} {'type': 'loss', 'content': 0.013220068998634815, 'timestamp': '2025-10-01 04:15:49.595357', 'step': 88, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:49.651569', 'step': 88, 'epoch': 1} {'type': 'loss', 'content': 0.010602391324937344, 'timestamp': '2025-10-01 04:15:49.664810', 'step': 89, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:15:49.717461', 'step': 89, 'epoch': 1} {'type': 'loss', 'content': 0.03764573484659195, 'timestamp': '2025-10-01 04:15:49.725164', 'step': 90, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:49.766841', 'step': 90, 'epoch': 1} {'type': 'loss', 'content': 0.016209786757826805, 'timestamp': '2025-10-01 04:15:49.777672', 'step': 91, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:49.833223', 'step': 91, 'epoch': 1} {'type': 'loss', 'content': 0.01531579066067934, 'timestamp': '2025-10-01 04:15:49.862301', 'step': 92, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:49.911293', 'step': 92, 'epoch': 1} {'type': 'loss', 'content': 0.018722211942076683, 'timestamp': '2025-10-01 04:15:49.919038', 'step': 93, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:49.976304', 'step': 93, 'epoch': 1} {'type': 'loss', 'content': 0.017094120383262634, 'timestamp': '2025-10-01 04:15:49.987944', 'step': 94, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:50.043788', 'step': 94, 'epoch': 1} {'type': 'loss', 'content': 0.021558774635195732, 'timestamp': '2025-10-01 04:15:50.057776', 'step': 95, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:50.109968', 'step': 95, 'epoch': 1} {'type': 'loss', 'content': 0.018528137356042862, 'timestamp': '2025-10-01 04:15:50.143330', 'step': 96, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:50.187756', 'step': 96, 'epoch': 1} {'type': 'loss', 'content': 0.019974930211901665, 'timestamp': '2025-10-01 04:15:50.195978', 'step': 97, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:50.253233', 'step': 97, 'epoch': 1} {'type': 'loss', 'content': 0.03256944939494133, 'timestamp': '2025-10-01 04:15:50.266033', 'step': 98, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:50.324980', 'step': 98, 'epoch': 1} {'type': 'loss', 'content': 0.01754806749522686, 'timestamp': '2025-10-01 04:15:50.338871', 'step': 99, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:50.385962', 'step': 99, 'epoch': 1} {'type': 'loss', 'content': 0.012493299320340157, 'timestamp': '2025-10-01 04:15:50.420388', 'step': 100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:50.465839', 'step': 100, 'epoch': 1} {'type': 'loss', 'content': 0.01095268502831459, 'timestamp': '2025-10-01 04:15:50.479117', 'step': 101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:50.540285', 'step': 101, 'epoch': 1} {'type': 'loss', 'content': 0.020325329154729843, 'timestamp': '2025-10-01 04:15:50.552858', 'step': 102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:50.603470', 'step': 102, 'epoch': 1} {'type': 'loss', 'content': 0.012788294814527035, 'timestamp': '2025-10-01 04:15:50.615915', 'step': 103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:15:50.676017', 'step': 103, 'epoch': 1} {'type': 'loss', 'content': 0.023105181753635406, 'timestamp': '2025-10-01 04:15:50.704897', 'step': 104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:50.749167', 'step': 104, 'epoch': 1} {'type': 'loss', 'content': 0.023724466562271118, 'timestamp': '2025-10-01 04:15:50.757552', 'step': 105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:50.799506', 'step': 105, 'epoch': 1} {'type': 'loss', 'content': 0.020332131534814835, 'timestamp': '2025-10-01 04:15:50.813631', 'step': 106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:50.861851', 'step': 106, 'epoch': 1} {'type': 'loss', 'content': 0.014770078472793102, 'timestamp': '2025-10-01 04:15:50.874554', 'step': 107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:50.921007', 'step': 107, 'epoch': 1} {'type': 'loss', 'content': 0.01890796609222889, 'timestamp': '2025-10-01 04:15:50.952623', 'step': 108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:15:50.988966', 'step': 108, 'epoch': 1} {'type': 'loss', 'content': 0.01276029646396637, 'timestamp': '2025-10-01 04:15:50.994718', 'step': 109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:51.044902', 'step': 109, 'epoch': 1} {'type': 'loss', 'content': 0.026222240179777145, 'timestamp': '2025-10-01 04:15:51.057631', 'step': 110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:15:51.114761', 'step': 110, 'epoch': 1} {'type': 'loss', 'content': 0.013714265078306198, 'timestamp': '2025-10-01 04:15:51.131006', 'step': 111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:51.194905', 'step': 111, 'epoch': 1} {'type': 'loss', 'content': 0.018145931884646416, 'timestamp': '2025-10-01 04:15:51.228358', 'step': 112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:15:51.273948', 'step': 112, 'epoch': 1} {'type': 'loss', 'content': 0.010721199214458466, 'timestamp': '2025-10-01 04:15:51.287385', 'step': 113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:51.328533', 'step': 113, 'epoch': 1} {'type': 'loss', 'content': 0.019626762717962265, 'timestamp': '2025-10-01 04:15:51.336890', 'step': 114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:15:51.406199', 'step': 114, 'epoch': 1} {'type': 'loss', 'content': 0.03277111425995827, 'timestamp': '2025-10-01 04:15:51.418350', 'step': 115, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:15:54.535033', 'step': 115, 'epoch': 1} {'type': 'pplx', 'content': 5.652769097202767, 'timestamp': '2025-10-01 04:15:54.547739', 'step': 115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:15:54.596550', 'step': 115, 'epoch': 1} {'type': 'loss', 'content': 0.016940824687480927, 'timestamp': '2025-10-01 04:15:54.633512', 'step': 116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:54.675856', 'step': 116, 'epoch': 1} {'type': 'loss', 'content': 0.009923848323523998, 'timestamp': '2025-10-01 04:15:54.684087', 'step': 117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:54.734847', 'step': 117, 'epoch': 1} {'type': 'loss', 'content': 0.021037673577666283, 'timestamp': '2025-10-01 04:15:54.747538', 'step': 118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:54.799002', 'step': 118, 'epoch': 1} {'type': 'loss', 'content': 0.027497602626681328, 'timestamp': '2025-10-01 04:15:54.810398', 'step': 119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:54.864849', 'step': 119, 'epoch': 1} {'type': 'loss', 'content': 0.014379456639289856, 'timestamp': '2025-10-01 04:15:54.899799', 'step': 120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:54.957255', 'step': 120, 'epoch': 1} {'type': 'loss', 'content': 0.03203187137842178, 'timestamp': '2025-10-01 04:15:54.970577', 'step': 121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:55.020884', 'step': 121, 'epoch': 1} {'type': 'loss', 'content': 0.013633854687213898, 'timestamp': '2025-10-01 04:15:55.033718', 'step': 122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:55.096744', 'step': 122, 'epoch': 1} {'type': 'loss', 'content': 0.021394893527030945, 'timestamp': '2025-10-01 04:15:55.110785', 'step': 123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:15:55.177235', 'step': 123, 'epoch': 1} {'type': 'loss', 'content': 0.011156504973769188, 'timestamp': '2025-10-01 04:15:55.214308', 'step': 124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:15:55.270727', 'step': 124, 'epoch': 1} {'type': 'loss', 'content': 0.006865525618195534, 'timestamp': '2025-10-01 04:15:55.286364', 'step': 125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:15:55.352417', 'step': 125, 'epoch': 1} {'type': 'loss', 'content': 0.009875403717160225, 'timestamp': '2025-10-01 04:15:55.365321', 'step': 126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:55.416248', 'step': 126, 'epoch': 1} {'type': 'loss', 'content': 0.015166105702519417, 'timestamp': '2025-10-01 04:15:55.428001', 'step': 127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 16611393146432}, 'timestamp': '2025-10-01 04:15:55.480642', 'step': 127, 'epoch': 1} {'type': 'loss', 'content': 0.015555760823190212, 'timestamp': '2025-10-01 04:15:55.521066', 'step': 128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:15:55.576480', 'step': 128, 'epoch': 1} {'type': 'loss', 'content': 0.00854937732219696, 'timestamp': '2025-10-01 04:15:55.593359', 'step': 129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:15:55.655001', 'step': 129, 'epoch': 1} {'type': 'loss', 'content': 0.015342685393989086, 'timestamp': '2025-10-01 04:15:55.669230', 'step': 130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:55.714032', 'step': 130, 'epoch': 1} {'type': 'loss', 'content': 0.014570376835763454, 'timestamp': '2025-10-01 04:15:55.726647', 'step': 131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:55.775425', 'step': 131, 'epoch': 1} {'type': 'loss', 'content': 0.019256293773651123, 'timestamp': '2025-10-01 04:15:55.809883', 'step': 132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:15:55.867066', 'step': 132, 'epoch': 1} {'type': 'loss', 'content': 0.015043064020574093, 'timestamp': '2025-10-01 04:15:55.883927', 'step': 133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-10-01 04:15:55.951524', 'step': 133, 'epoch': 1} {'type': 'loss', 'content': 0.009730640798807144, 'timestamp': '2025-10-01 04:15:55.969277', 'step': 134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:56.030747', 'step': 134, 'epoch': 1} {'type': 'loss', 'content': 0.02242092415690422, 'timestamp': '2025-10-01 04:15:56.044818', 'step': 135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:15:56.096012', 'step': 135, 'epoch': 1} {'type': 'loss', 'content': 0.024790991097688675, 'timestamp': '2025-10-01 04:15:56.127450', 'step': 136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:56.181609', 'step': 136, 'epoch': 1} {'type': 'loss', 'content': 0.013523571193218231, 'timestamp': '2025-10-01 04:15:56.190011', 'step': 137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:56.247050', 'step': 137, 'epoch': 1} {'type': 'loss', 'content': 0.03349044546484947, 'timestamp': '2025-10-01 04:15:56.259842', 'step': 138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:56.304160', 'step': 138, 'epoch': 1} {'type': 'loss', 'content': 0.01887688599526882, 'timestamp': '2025-10-01 04:15:56.315782', 'step': 139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:56.362785', 'step': 139, 'epoch': 1} {'type': 'loss', 'content': 0.0222526416182518, 'timestamp': '2025-10-01 04:15:56.391775', 'step': 140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:56.447438', 'step': 140, 'epoch': 1} {'type': 'loss', 'content': 0.015390612185001373, 'timestamp': '2025-10-01 04:15:56.453566', 'step': 141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:56.501964', 'step': 141, 'epoch': 1} {'type': 'loss', 'content': 0.019168680533766747, 'timestamp': '2025-10-01 04:15:56.515518', 'step': 142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:15:56.575537', 'step': 142, 'epoch': 1} {'type': 'loss', 'content': 0.017232360318303108, 'timestamp': '2025-10-01 04:15:56.591789', 'step': 143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:15:56.632280', 'step': 143, 'epoch': 1} {'type': 'loss', 'content': 0.018397539854049683, 'timestamp': '2025-10-01 04:15:56.661173', 'step': 144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:56.708821', 'step': 144, 'epoch': 1} {'type': 'loss', 'content': 0.017362473532557487, 'timestamp': '2025-10-01 04:15:56.720007', 'step': 145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:56.764752', 'step': 145, 'epoch': 1} {'type': 'loss', 'content': 0.017278455197811127, 'timestamp': '2025-10-01 04:15:56.773279', 'step': 146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:15:56.832822', 'step': 146, 'epoch': 1} {'type': 'loss', 'content': 0.014997708611190319, 'timestamp': '2025-10-01 04:15:56.840761', 'step': 147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:15:56.896129', 'step': 147, 'epoch': 1} {'type': 'loss', 'content': 0.04334031045436859, 'timestamp': '2025-10-01 04:15:56.924446', 'step': 148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:56.986495', 'step': 148, 'epoch': 1} {'type': 'loss', 'content': 0.016128014773130417, 'timestamp': '2025-10-01 04:15:56.997446', 'step': 149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:57.067217', 'step': 149, 'epoch': 1} {'type': 'loss', 'content': 0.04830405116081238, 'timestamp': '2025-10-01 04:15:57.078849', 'step': 150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:57.131680', 'step': 150, 'epoch': 1} {'type': 'loss', 'content': 0.027345485985279083, 'timestamp': '2025-10-01 04:15:57.143395', 'step': 151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:15:57.200029', 'step': 151, 'epoch': 1} {'type': 'loss', 'content': 0.016594676300883293, 'timestamp': '2025-10-01 04:15:57.235206', 'step': 152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:57.286139', 'step': 152, 'epoch': 1} {'type': 'loss', 'content': 0.012336907908320427, 'timestamp': '2025-10-01 04:15:57.297109', 'step': 153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:57.346211', 'step': 153, 'epoch': 1} {'type': 'loss', 'content': 0.00920384842902422, 'timestamp': '2025-10-01 04:15:57.360212', 'step': 154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:57.422484', 'step': 154, 'epoch': 1} {'type': 'loss', 'content': 0.010826568119227886, 'timestamp': '2025-10-01 04:15:57.437292', 'step': 155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:15:57.499276', 'step': 155, 'epoch': 1} {'type': 'loss', 'content': 0.014185565523803234, 'timestamp': '2025-10-01 04:15:57.536116', 'step': 156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:57.591868', 'step': 156, 'epoch': 1} {'type': 'loss', 'content': 0.022365674376487732, 'timestamp': '2025-10-01 04:15:57.603026', 'step': 157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:57.671448', 'step': 157, 'epoch': 1} {'type': 'loss', 'content': 0.007380561903119087, 'timestamp': '2025-10-01 04:15:57.685507', 'step': 158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:57.740407', 'step': 158, 'epoch': 1} {'type': 'loss', 'content': 0.03495415672659874, 'timestamp': '2025-10-01 04:15:57.752990', 'step': 159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:15:57.796139', 'step': 159, 'epoch': 1} {'type': 'loss', 'content': 0.02510489709675312, 'timestamp': '2025-10-01 04:15:57.831331', 'step': 160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:57.881423', 'step': 160, 'epoch': 1} {'type': 'loss', 'content': 0.01745026372373104, 'timestamp': '2025-10-01 04:15:57.894358', 'step': 161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:57.961510', 'step': 161, 'epoch': 1} {'type': 'loss', 'content': 0.015378102660179138, 'timestamp': '2025-10-01 04:15:57.974240', 'step': 162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:58.034868', 'step': 162, 'epoch': 1} {'type': 'loss', 'content': 0.02042653039097786, 'timestamp': '2025-10-01 04:15:58.046440', 'step': 163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:58.096137', 'step': 163, 'epoch': 1} {'type': 'loss', 'content': 0.013080197386443615, 'timestamp': '2025-10-01 04:15:58.129894', 'step': 164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:58.184617', 'step': 164, 'epoch': 1} {'type': 'loss', 'content': 0.01232184562832117, 'timestamp': '2025-10-01 04:15:58.195713', 'step': 165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:58.259353', 'step': 165, 'epoch': 1} {'type': 'loss', 'content': 0.01344381831586361, 'timestamp': '2025-10-01 04:15:58.269664', 'step': 166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:58.333357', 'step': 166, 'epoch': 1} {'type': 'loss', 'content': 0.009826877154409885, 'timestamp': '2025-10-01 04:15:58.346028', 'step': 167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:58.402530', 'step': 167, 'epoch': 1} {'type': 'loss', 'content': 0.012489954009652138, 'timestamp': '2025-10-01 04:15:58.434321', 'step': 168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:58.484211', 'step': 168, 'epoch': 1} {'type': 'loss', 'content': 0.01958457939326763, 'timestamp': '2025-10-01 04:15:58.491524', 'step': 169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:58.552531', 'step': 169, 'epoch': 1} {'type': 'loss', 'content': 0.015150906518101692, 'timestamp': '2025-10-01 04:15:58.560902', 'step': 170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:58.621631', 'step': 170, 'epoch': 1} {'type': 'loss', 'content': 0.014022663235664368, 'timestamp': '2025-10-01 04:15:58.635778', 'step': 171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:15:58.689177', 'step': 171, 'epoch': 1} {'type': 'loss', 'content': 0.01281251385807991, 'timestamp': '2025-10-01 04:15:58.718038', 'step': 172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:58.768938', 'step': 172, 'epoch': 1} {'type': 'loss', 'content': 0.015419230796396732, 'timestamp': '2025-10-01 04:15:58.779254', 'step': 173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:58.828711', 'step': 173, 'epoch': 1} {'type': 'loss', 'content': 0.01556409802287817, 'timestamp': '2025-10-01 04:15:58.843176', 'step': 174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:15:58.904820', 'step': 174, 'epoch': 1} {'type': 'loss', 'content': 0.020101085305213928, 'timestamp': '2025-10-01 04:15:58.911934', 'step': 175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:15:58.970066', 'step': 175, 'epoch': 1} {'type': 'loss', 'content': 0.013700472190976143, 'timestamp': '2025-10-01 04:15:59.003552', 'step': 176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:15:59.065614', 'step': 176, 'epoch': 1} {'type': 'loss', 'content': 0.010425503365695477, 'timestamp': '2025-10-01 04:15:59.078900', 'step': 177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:59.137363', 'step': 177, 'epoch': 1} {'type': 'loss', 'content': 0.009766981936991215, 'timestamp': '2025-10-01 04:15:59.147852', 'step': 178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:15:59.216476', 'step': 178, 'epoch': 1} {'type': 'loss', 'content': 0.007230386603623629, 'timestamp': '2025-10-01 04:15:59.232932', 'step': 179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:59.291319', 'step': 179, 'epoch': 1} {'type': 'loss', 'content': 0.0189223550260067, 'timestamp': '2025-10-01 04:15:59.325807', 'step': 180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:59.380868', 'step': 180, 'epoch': 1} {'type': 'loss', 'content': 0.012651477009057999, 'timestamp': '2025-10-01 04:15:59.391556', 'step': 181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:15:59.447171', 'step': 181, 'epoch': 1} {'type': 'loss', 'content': 0.01200708094984293, 'timestamp': '2025-10-01 04:15:59.458720', 'step': 182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:15:59.518496', 'step': 182, 'epoch': 1} {'type': 'loss', 'content': 0.041099969297647476, 'timestamp': '2025-10-01 04:15:59.526800', 'step': 183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:15:59.580852', 'step': 183, 'epoch': 1} {'type': 'loss', 'content': 0.014940493740141392, 'timestamp': '2025-10-01 04:15:59.614496', 'step': 184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:15:59.686831', 'step': 184, 'epoch': 1} {'type': 'loss', 'content': 0.022617174312472343, 'timestamp': '2025-10-01 04:15:59.689411', 'step': 185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:15:59.741474', 'step': 185, 'epoch': 1} {'type': 'loss', 'content': 0.011522305198013783, 'timestamp': '2025-10-01 04:15:59.752890', 'step': 186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:15:59.812339', 'step': 186, 'epoch': 1} {'type': 'loss', 'content': 0.0218654815107584, 'timestamp': '2025-10-01 04:15:59.826360', 'step': 187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:15:59.876877', 'step': 187, 'epoch': 1} {'type': 'loss', 'content': 0.030659809708595276, 'timestamp': '2025-10-01 04:15:59.905910', 'step': 188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:15:59.956562', 'step': 188, 'epoch': 1} {'type': 'loss', 'content': 0.010193354450166225, 'timestamp': '2025-10-01 04:15:59.969401', 'step': 189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:00.024420', 'step': 189, 'epoch': 1} {'type': 'loss', 'content': 0.010961892083287239, 'timestamp': '2025-10-01 04:16:00.037235', 'step': 190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:00.100195', 'step': 190, 'epoch': 1} {'type': 'loss', 'content': 0.017373962327837944, 'timestamp': '2025-10-01 04:16:00.116132', 'step': 191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:00.173340', 'step': 191, 'epoch': 1} {'type': 'loss', 'content': 0.009168125689029694, 'timestamp': '2025-10-01 04:16:00.206911', 'step': 192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:00.258304', 'step': 192, 'epoch': 1} {'type': 'loss', 'content': 0.015117742121219635, 'timestamp': '2025-10-01 04:16:00.263298', 'step': 193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:00.314897', 'step': 193, 'epoch': 1} {'type': 'loss', 'content': 0.015156692825257778, 'timestamp': '2025-10-01 04:16:00.325860', 'step': 194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:00.384849', 'step': 194, 'epoch': 1} {'type': 'loss', 'content': 0.011178301647305489, 'timestamp': '2025-10-01 04:16:00.400874', 'step': 195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:00.454271', 'step': 195, 'epoch': 1} {'type': 'loss', 'content': 0.011227980256080627, 'timestamp': '2025-10-01 04:16:00.489195', 'step': 196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:00.544459', 'step': 196, 'epoch': 1} {'type': 'loss', 'content': 0.021653154864907265, 'timestamp': '2025-10-01 04:16:00.557864', 'step': 197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:00.619037', 'step': 197, 'epoch': 1} {'type': 'loss', 'content': 0.017506862059235573, 'timestamp': '2025-10-01 04:16:00.631463', 'step': 198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:00.692412', 'step': 198, 'epoch': 1} {'type': 'loss', 'content': 0.01950342021882534, 'timestamp': '2025-10-01 04:16:00.703239', 'step': 199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:00.760610', 'step': 199, 'epoch': 1} {'type': 'loss', 'content': 0.011990833096206188, 'timestamp': '2025-10-01 04:16:00.793009', 'step': 200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:00.857713', 'step': 200, 'epoch': 1} {'type': 'loss', 'content': 0.017169926315546036, 'timestamp': '2025-10-01 04:16:00.869002', 'step': 201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:00.920119', 'step': 201, 'epoch': 1} {'type': 'loss', 'content': 0.015315508469939232, 'timestamp': '2025-10-01 04:16:00.933871', 'step': 202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:00.999689', 'step': 202, 'epoch': 1} {'type': 'loss', 'content': 0.005204418674111366, 'timestamp': '2025-10-01 04:16:01.013774', 'step': 203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:01.071096', 'step': 203, 'epoch': 1} {'type': 'loss', 'content': 0.010469146072864532, 'timestamp': '2025-10-01 04:16:01.106082', 'step': 204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:01.154341', 'step': 204, 'epoch': 1} {'type': 'loss', 'content': 0.0175720676779747, 'timestamp': '2025-10-01 04:16:01.167152', 'step': 205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:01.226505', 'step': 205, 'epoch': 1} {'type': 'loss', 'content': 0.02216772362589836, 'timestamp': '2025-10-01 04:16:01.238979', 'step': 206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:01.298295', 'step': 206, 'epoch': 1} {'type': 'loss', 'content': 0.02042493410408497, 'timestamp': '2025-10-01 04:16:01.311861', 'step': 207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:01.356358', 'step': 207, 'epoch': 1} {'type': 'loss', 'content': 0.017883921042084694, 'timestamp': '2025-10-01 04:16:01.391364', 'step': 208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:01.442658', 'step': 208, 'epoch': 1} {'type': 'loss', 'content': 0.015674149617552757, 'timestamp': '2025-10-01 04:16:01.453655', 'step': 209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:01.508870', 'step': 209, 'epoch': 1} {'type': 'loss', 'content': 0.009745707735419273, 'timestamp': '2025-10-01 04:16:01.522842', 'step': 210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:01.566935', 'step': 210, 'epoch': 1} {'type': 'loss', 'content': 0.015926841646432877, 'timestamp': '2025-10-01 04:16:01.574282', 'step': 211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:01.620376', 'step': 211, 'epoch': 1} {'type': 'loss', 'content': 0.015715662389993668, 'timestamp': '2025-10-01 04:16:01.654930', 'step': 212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:01.704725', 'step': 212, 'epoch': 1} {'type': 'loss', 'content': 0.014021548442542553, 'timestamp': '2025-10-01 04:16:01.715201', 'step': 213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:01.753756', 'step': 213, 'epoch': 1} {'type': 'loss', 'content': 0.016658710315823555, 'timestamp': '2025-10-01 04:16:01.765334', 'step': 214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:01.814242', 'step': 214, 'epoch': 1} {'type': 'loss', 'content': 0.019720206037163734, 'timestamp': '2025-10-01 04:16:01.822941', 'step': 215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:01.870612', 'step': 215, 'epoch': 1} {'type': 'loss', 'content': 0.0145456213504076, 'timestamp': '2025-10-01 04:16:01.905450', 'step': 216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:01.953803', 'step': 216, 'epoch': 1} {'type': 'loss', 'content': 0.007101206108927727, 'timestamp': '2025-10-01 04:16:01.966658', 'step': 217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:02.016856', 'step': 217, 'epoch': 1} {'type': 'loss', 'content': 0.011377424001693726, 'timestamp': '2025-10-01 04:16:02.029703', 'step': 218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:02.086248', 'step': 218, 'epoch': 1} {'type': 'loss', 'content': 0.016751078888773918, 'timestamp': '2025-10-01 04:16:02.099812', 'step': 219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:02.141795', 'step': 219, 'epoch': 1} {'type': 'loss', 'content': 0.010861853137612343, 'timestamp': '2025-10-01 04:16:02.175271', 'step': 220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:02.221923', 'step': 220, 'epoch': 1} {'type': 'loss', 'content': 0.010152469389140606, 'timestamp': '2025-10-01 04:16:02.234722', 'step': 221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:02.284322', 'step': 221, 'epoch': 1} {'type': 'loss', 'content': 0.02063046023249626, 'timestamp': '2025-10-01 04:16:02.292428', 'step': 222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:02.339122', 'step': 222, 'epoch': 1} {'type': 'loss', 'content': 0.010324391536414623, 'timestamp': '2025-10-01 04:16:02.351805', 'step': 223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:02.405288', 'step': 223, 'epoch': 1} {'type': 'loss', 'content': 0.01861061155796051, 'timestamp': '2025-10-01 04:16:02.434314', 'step': 224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:02.480339', 'step': 224, 'epoch': 1} {'type': 'loss', 'content': 0.012313750572502613, 'timestamp': '2025-10-01 04:16:02.491283', 'step': 225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:02.539329', 'step': 225, 'epoch': 1} {'type': 'loss', 'content': 0.01361026894301176, 'timestamp': '2025-10-01 04:16:02.550924', 'step': 226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:02.591906', 'step': 226, 'epoch': 1} {'type': 'loss', 'content': 0.017629968002438545, 'timestamp': '2025-10-01 04:16:02.602576', 'step': 227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:16:02.665074', 'step': 227, 'epoch': 1} {'type': 'loss', 'content': 0.01168879121541977, 'timestamp': '2025-10-01 04:16:02.703301', 'step': 228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:02.752375', 'step': 228, 'epoch': 1} {'type': 'loss', 'content': 0.011294701136648655, 'timestamp': '2025-10-01 04:16:02.765262', 'step': 229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:02.811802', 'step': 229, 'epoch': 1} {'type': 'loss', 'content': 0.01239917241036892, 'timestamp': '2025-10-01 04:16:02.824390', 'step': 230, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:16:05.511001', 'step': 230, 'epoch': 1} {'type': 'pplx', 'content': 5.607253716383798, 'timestamp': '2025-10-01 04:16:05.515765', 'step': 230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:05.562438', 'step': 230, 'epoch': 1} {'type': 'loss', 'content': 0.01880328357219696, 'timestamp': '2025-10-01 04:16:05.569886', 'step': 231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:05.608427', 'step': 231, 'epoch': 1} {'type': 'loss', 'content': 0.02568168379366398, 'timestamp': '2025-10-01 04:16:05.641907', 'step': 232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:05.686213', 'step': 232, 'epoch': 1} {'type': 'loss', 'content': 0.018438123166561127, 'timestamp': '2025-10-01 04:16:05.694422', 'step': 233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:05.735197', 'step': 233, 'epoch': 1} {'type': 'loss', 'content': 0.013187897391617298, 'timestamp': '2025-10-01 04:16:05.747926', 'step': 234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:05.791686', 'step': 234, 'epoch': 1} {'type': 'loss', 'content': 0.017361653968691826, 'timestamp': '2025-10-01 04:16:05.799955', 'step': 235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:05.836939', 'step': 235, 'epoch': 1} {'type': 'loss', 'content': 0.010809950530529022, 'timestamp': '2025-10-01 04:16:05.871511', 'step': 236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:05.909331', 'step': 236, 'epoch': 1} {'type': 'loss', 'content': 0.016752883791923523, 'timestamp': '2025-10-01 04:16:05.918539', 'step': 237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:05.972462', 'step': 237, 'epoch': 1} {'type': 'loss', 'content': 0.018295761197805405, 'timestamp': '2025-10-01 04:16:05.985161', 'step': 238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:06.027138', 'step': 238, 'epoch': 1} {'type': 'loss', 'content': 0.014801901765167713, 'timestamp': '2025-10-01 04:16:06.039819', 'step': 239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:06.084513', 'step': 239, 'epoch': 1} {'type': 'loss', 'content': 0.013710476458072662, 'timestamp': '2025-10-01 04:16:06.119043', 'step': 240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:06.163817', 'step': 240, 'epoch': 1} {'type': 'loss', 'content': 0.015167715027928352, 'timestamp': '2025-10-01 04:16:06.177171', 'step': 241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:06.214010', 'step': 241, 'epoch': 1} {'type': 'loss', 'content': 0.011374469846487045, 'timestamp': '2025-10-01 04:16:06.225577', 'step': 242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:06.273475', 'step': 242, 'epoch': 1} {'type': 'loss', 'content': 0.021162409335374832, 'timestamp': '2025-10-01 04:16:06.285013', 'step': 243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:06.325602', 'step': 243, 'epoch': 1} {'type': 'loss', 'content': 0.008011997677385807, 'timestamp': '2025-10-01 04:16:06.360733', 'step': 244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:06.403745', 'step': 244, 'epoch': 1} {'type': 'loss', 'content': 0.018164141103625298, 'timestamp': '2025-10-01 04:16:06.417244', 'step': 245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:16:06.463634', 'step': 245, 'epoch': 1} {'type': 'loss', 'content': 0.011018065735697746, 'timestamp': '2025-10-01 04:16:06.479672', 'step': 246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:06.523485', 'step': 246, 'epoch': 1} {'type': 'loss', 'content': 0.009743544273078442, 'timestamp': '2025-10-01 04:16:06.537175', 'step': 247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:06.571974', 'step': 247, 'epoch': 1} {'type': 'loss', 'content': 0.02036970481276512, 'timestamp': '2025-10-01 04:16:06.605688', 'step': 248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:06.643173', 'step': 248, 'epoch': 1} {'type': 'loss', 'content': 0.011595949530601501, 'timestamp': '2025-10-01 04:16:06.656833', 'step': 249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:06.698722', 'step': 249, 'epoch': 1} {'type': 'loss', 'content': 0.011444943957030773, 'timestamp': '2025-10-01 04:16:06.714805', 'step': 250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:06.757570', 'step': 250, 'epoch': 1} {'type': 'loss', 'content': 0.012089982628822327, 'timestamp': '2025-10-01 04:16:06.771726', 'step': 251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:06.814417', 'step': 251, 'epoch': 1} {'type': 'loss', 'content': 0.005498992744833231, 'timestamp': '2025-10-01 04:16:06.851564', 'step': 252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:06.891525', 'step': 252, 'epoch': 1} {'type': 'loss', 'content': 0.010803326964378357, 'timestamp': '2025-10-01 04:16:06.901852', 'step': 253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:06.940857', 'step': 253, 'epoch': 1} {'type': 'loss', 'content': 0.01593819074332714, 'timestamp': '2025-10-01 04:16:06.954651', 'step': 254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:07.002801', 'step': 254, 'epoch': 1} {'type': 'loss', 'content': 0.01053207740187645, 'timestamp': '2025-10-01 04:16:07.016788', 'step': 255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:07.058237', 'step': 255, 'epoch': 1} {'type': 'loss', 'content': 0.021653834730386734, 'timestamp': '2025-10-01 04:16:07.090678', 'step': 256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:07.126954', 'step': 256, 'epoch': 1} {'type': 'loss', 'content': 0.01918930374085903, 'timestamp': '2025-10-01 04:16:07.140327', 'step': 257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:07.182395', 'step': 257, 'epoch': 1} {'type': 'loss', 'content': 0.013103567063808441, 'timestamp': '2025-10-01 04:16:07.194975', 'step': 258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:07.239906', 'step': 258, 'epoch': 1} {'type': 'loss', 'content': 0.018775828182697296, 'timestamp': '2025-10-01 04:16:07.253529', 'step': 259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:07.306904', 'step': 259, 'epoch': 1} {'type': 'loss', 'content': 0.015003686770796776, 'timestamp': '2025-10-01 04:16:07.342009', 'step': 260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:16:07.393917', 'step': 260, 'epoch': 1} {'type': 'loss', 'content': 0.011988172307610512, 'timestamp': '2025-10-01 04:16:07.411042', 'step': 261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:07.455583', 'step': 261, 'epoch': 1} {'type': 'loss', 'content': 0.024883845821022987, 'timestamp': '2025-10-01 04:16:07.469610', 'step': 262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:07.512689', 'step': 262, 'epoch': 1} {'type': 'loss', 'content': 0.028008200228214264, 'timestamp': '2025-10-01 04:16:07.523159', 'step': 263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:07.566349', 'step': 263, 'epoch': 1} {'type': 'loss', 'content': 0.03127381578087807, 'timestamp': '2025-10-01 04:16:07.599759', 'step': 264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:07.635045', 'step': 264, 'epoch': 1} {'type': 'loss', 'content': 0.0314021110534668, 'timestamp': '2025-10-01 04:16:07.641957', 'step': 265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:07.674405', 'step': 265, 'epoch': 1} {'type': 'loss', 'content': 0.014314349740743637, 'timestamp': '2025-10-01 04:16:07.687211', 'step': 266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:07.728765', 'step': 266, 'epoch': 1} {'type': 'loss', 'content': 0.017302215099334717, 'timestamp': '2025-10-01 04:16:07.742495', 'step': 267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:07.777901', 'step': 267, 'epoch': 1} {'type': 'loss', 'content': 0.022208889946341515, 'timestamp': '2025-10-01 04:16:07.811471', 'step': 268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:07.843831', 'step': 268, 'epoch': 1} {'type': 'loss', 'content': 0.023853404447436333, 'timestamp': '2025-10-01 04:16:07.854463', 'step': 269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:07.891465', 'step': 269, 'epoch': 1} {'type': 'loss', 'content': 0.014913097955286503, 'timestamp': '2025-10-01 04:16:07.905193', 'step': 270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:16:07.949143', 'step': 270, 'epoch': 1} {'type': 'loss', 'content': 0.007243603467941284, 'timestamp': '2025-10-01 04:16:07.965677', 'step': 271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:08.012228', 'step': 271, 'epoch': 1} {'type': 'loss', 'content': 0.01373897772282362, 'timestamp': '2025-10-01 04:16:08.049213', 'step': 272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:08.090824', 'step': 272, 'epoch': 1} {'type': 'loss', 'content': 0.011879388242959976, 'timestamp': '2025-10-01 04:16:08.104295', 'step': 273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:08.143354', 'step': 273, 'epoch': 1} {'type': 'loss', 'content': 0.024222249165177345, 'timestamp': '2025-10-01 04:16:08.154065', 'step': 274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:16:08.202374', 'step': 274, 'epoch': 1} {'type': 'loss', 'content': 0.007162329275161028, 'timestamp': '2025-10-01 04:16:08.218830', 'step': 275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:08.260168', 'step': 275, 'epoch': 1} {'type': 'loss', 'content': 0.01410145778208971, 'timestamp': '2025-10-01 04:16:08.295207', 'step': 276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:08.335178', 'step': 276, 'epoch': 1} {'type': 'loss', 'content': 0.017773054540157318, 'timestamp': '2025-10-01 04:16:08.345420', 'step': 277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:08.380900', 'step': 277, 'epoch': 1} {'type': 'loss', 'content': 0.0138253103941679, 'timestamp': '2025-10-01 04:16:08.388691', 'step': 278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:08.428847', 'step': 278, 'epoch': 1} {'type': 'loss', 'content': 0.01933428645133972, 'timestamp': '2025-10-01 04:16:08.440416', 'step': 279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:08.481006', 'step': 279, 'epoch': 1} {'type': 'loss', 'content': 0.022205237299203873, 'timestamp': '2025-10-01 04:16:08.512714', 'step': 280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:08.555415', 'step': 280, 'epoch': 1} {'type': 'loss', 'content': 0.03377882018685341, 'timestamp': '2025-10-01 04:16:08.568733', 'step': 281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:08.601911', 'step': 281, 'epoch': 1} {'type': 'loss', 'content': 0.014277351088821888, 'timestamp': '2025-10-01 04:16:08.612624', 'step': 282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:08.650829', 'step': 282, 'epoch': 1} {'type': 'loss', 'content': 0.016705619171261787, 'timestamp': '2025-10-01 04:16:08.665132', 'step': 283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:08.711691', 'step': 283, 'epoch': 1} {'type': 'loss', 'content': 0.013043038547039032, 'timestamp': '2025-10-01 04:16:08.745297', 'step': 284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:16:08.790076', 'step': 284, 'epoch': 1} {'type': 'loss', 'content': 0.011550795286893845, 'timestamp': '2025-10-01 04:16:08.805351', 'step': 285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:08.854726', 'step': 285, 'epoch': 1} {'type': 'loss', 'content': 0.019154729321599007, 'timestamp': '2025-10-01 04:16:08.870841', 'step': 286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:08.904607', 'step': 286, 'epoch': 1} {'type': 'loss', 'content': 0.023732976987957954, 'timestamp': '2025-10-01 04:16:08.916058', 'step': 287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:08.949099', 'step': 287, 'epoch': 1} {'type': 'loss', 'content': 0.010763418860733509, 'timestamp': '2025-10-01 04:16:08.980611', 'step': 288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:09.013299', 'step': 288, 'epoch': 1} {'type': 'loss', 'content': 0.02386975847184658, 'timestamp': '2025-10-01 04:16:09.023970', 'step': 289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:09.059447', 'step': 289, 'epoch': 1} {'type': 'loss', 'content': 0.01169043779373169, 'timestamp': '2025-10-01 04:16:09.073047', 'step': 290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:09.116529', 'step': 290, 'epoch': 1} {'type': 'loss', 'content': 0.009183173067867756, 'timestamp': '2025-10-01 04:16:09.130188', 'step': 291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:09.170326', 'step': 291, 'epoch': 1} {'type': 'loss', 'content': 0.01387426070868969, 'timestamp': '2025-10-01 04:16:09.203810', 'step': 292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:09.244405', 'step': 292, 'epoch': 1} {'type': 'loss', 'content': 0.02306823432445526, 'timestamp': '2025-10-01 04:16:09.253341', 'step': 293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:09.292534', 'step': 293, 'epoch': 1} {'type': 'loss', 'content': 0.014040554873645306, 'timestamp': '2025-10-01 04:16:09.305113', 'step': 294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:09.345701', 'step': 294, 'epoch': 1} {'type': 'loss', 'content': 0.022995956242084503, 'timestamp': '2025-10-01 04:16:09.353025', 'step': 295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:09.398358', 'step': 295, 'epoch': 1} {'type': 'loss', 'content': 0.009345492348074913, 'timestamp': '2025-10-01 04:16:09.435456', 'step': 296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:09.470494', 'step': 296, 'epoch': 1} {'type': 'loss', 'content': 0.022916002199053764, 'timestamp': '2025-10-01 04:16:09.477314', 'step': 297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:09.517532', 'step': 297, 'epoch': 1} {'type': 'loss', 'content': 0.016851814463734627, 'timestamp': '2025-10-01 04:16:09.524813', 'step': 298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:09.563034', 'step': 298, 'epoch': 1} {'type': 'loss', 'content': 0.016646744683384895, 'timestamp': '2025-10-01 04:16:09.573257', 'step': 299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:09.607689', 'step': 299, 'epoch': 1} {'type': 'loss', 'content': 0.017989274114370346, 'timestamp': '2025-10-01 04:16:09.640090', 'step': 300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:09.682148', 'step': 300, 'epoch': 1} {'type': 'loss', 'content': 0.011906052939593792, 'timestamp': '2025-10-01 04:16:09.695691', 'step': 301, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:09.727596', 'step': 301, 'epoch': 1} {'type': 'loss', 'content': 0.01962910406291485, 'timestamp': '2025-10-01 04:16:09.735930', 'step': 302, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:09.778770', 'step': 302, 'epoch': 1} {'type': 'loss', 'content': 0.03568243235349655, 'timestamp': '2025-10-01 04:16:09.786247', 'step': 303, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:09.817559', 'step': 303, 'epoch': 1} {'type': 'loss', 'content': 0.010100298561155796, 'timestamp': '2025-10-01 04:16:09.850076', 'step': 304, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:09.884862', 'step': 304, 'epoch': 1} {'type': 'loss', 'content': 0.010526360012590885, 'timestamp': '2025-10-01 04:16:09.897684', 'step': 305, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:09.939797', 'step': 305, 'epoch': 1} {'type': 'loss', 'content': 0.01768452487885952, 'timestamp': '2025-10-01 04:16:09.948143', 'step': 306, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:09.987169', 'step': 306, 'epoch': 1} {'type': 'loss', 'content': 0.011406570672988892, 'timestamp': '2025-10-01 04:16:09.995162', 'step': 307, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:10.038879', 'step': 307, 'epoch': 1} {'type': 'loss', 'content': 0.01714218407869339, 'timestamp': '2025-10-01 04:16:10.073828', 'step': 308, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:10.118021', 'step': 308, 'epoch': 1} {'type': 'loss', 'content': 0.007288444321602583, 'timestamp': '2025-10-01 04:16:10.133641', 'step': 309, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:16:10.181839', 'step': 309, 'epoch': 1} {'type': 'loss', 'content': 0.011914399452507496, 'timestamp': '2025-10-01 04:16:10.199126', 'step': 310, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:10.239466', 'step': 310, 'epoch': 1} {'type': 'loss', 'content': 0.018713096156716347, 'timestamp': '2025-10-01 04:16:10.253536', 'step': 311, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:10.293220', 'step': 311, 'epoch': 1} {'type': 'loss', 'content': 0.008557330816984177, 'timestamp': '2025-10-01 04:16:10.328391', 'step': 312, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:10.364280', 'step': 312, 'epoch': 1} {'type': 'loss', 'content': 0.025217026472091675, 'timestamp': '2025-10-01 04:16:10.372871', 'step': 313, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:10.422534', 'step': 313, 'epoch': 1} {'type': 'loss', 'content': 0.011356729082763195, 'timestamp': '2025-10-01 04:16:10.436545', 'step': 314, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:10.476427', 'step': 314, 'epoch': 1} {'type': 'loss', 'content': 0.011046732775866985, 'timestamp': '2025-10-01 04:16:10.489018', 'step': 315, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:10.531160', 'step': 315, 'epoch': 1} {'type': 'loss', 'content': 0.007812062744051218, 'timestamp': '2025-10-01 04:16:10.565830', 'step': 316, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:10.602699', 'step': 316, 'epoch': 1} {'type': 'loss', 'content': 0.008752225898206234, 'timestamp': '2025-10-01 04:16:10.616310', 'step': 317, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:10.660089', 'step': 317, 'epoch': 1} {'type': 'loss', 'content': 0.010346435010433197, 'timestamp': '2025-10-01 04:16:10.673696', 'step': 318, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:10.708336', 'step': 318, 'epoch': 1} {'type': 'loss', 'content': 0.012063035741448402, 'timestamp': '2025-10-01 04:16:10.721107', 'step': 319, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:10.760542', 'step': 319, 'epoch': 1} {'type': 'loss', 'content': 0.005755930207669735, 'timestamp': '2025-10-01 04:16:10.795503', 'step': 320, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:10.832844', 'step': 320, 'epoch': 1} {'type': 'loss', 'content': 0.022907188162207603, 'timestamp': '2025-10-01 04:16:10.841153', 'step': 321, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:10.875141', 'step': 321, 'epoch': 1} {'type': 'loss', 'content': 0.01874624378979206, 'timestamp': '2025-10-01 04:16:10.887897', 'step': 322, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:10.918904', 'step': 322, 'epoch': 1} {'type': 'loss', 'content': 0.021654993295669556, 'timestamp': '2025-10-01 04:16:10.926633', 'step': 323, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:10.960551', 'step': 323, 'epoch': 1} {'type': 'loss', 'content': 0.02306421473622322, 'timestamp': '2025-10-01 04:16:10.989007', 'step': 324, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:11.019621', 'step': 324, 'epoch': 1} {'type': 'loss', 'content': 0.01029556430876255, 'timestamp': '2025-10-01 04:16:11.024618', 'step': 325, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:11.057123', 'step': 325, 'epoch': 1} {'type': 'loss', 'content': 0.015253379940986633, 'timestamp': '2025-10-01 04:16:11.064841', 'step': 326, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:11.097954', 'step': 326, 'epoch': 1} {'type': 'loss', 'content': 0.0181264765560627, 'timestamp': '2025-10-01 04:16:11.110731', 'step': 327, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:16:11.154169', 'step': 327, 'epoch': 1} {'type': 'loss', 'content': 0.03436052426695824, 'timestamp': '2025-10-01 04:16:11.185228', 'step': 328, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:11.220690', 'step': 328, 'epoch': 1} {'type': 'loss', 'content': 0.006343089509755373, 'timestamp': '2025-10-01 04:16:11.234120', 'step': 329, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:11.267541', 'step': 329, 'epoch': 1} {'type': 'loss', 'content': 0.01778026856482029, 'timestamp': '2025-10-01 04:16:11.278582', 'step': 330, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:11.321186', 'step': 330, 'epoch': 1} {'type': 'loss', 'content': 0.008538925088942051, 'timestamp': '2025-10-01 04:16:11.335178', 'step': 331, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:16:11.383300', 'step': 331, 'epoch': 1} {'type': 'loss', 'content': 0.012420018203556538, 'timestamp': '2025-10-01 04:16:11.420133', 'step': 332, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:11.465477', 'step': 332, 'epoch': 1} {'type': 'loss', 'content': 0.014103807508945465, 'timestamp': '2025-10-01 04:16:11.478885', 'step': 333, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:11.521342', 'step': 333, 'epoch': 1} {'type': 'loss', 'content': 0.014106870628893375, 'timestamp': '2025-10-01 04:16:11.534944', 'step': 334, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:11.575149', 'step': 334, 'epoch': 1} {'type': 'loss', 'content': 0.007152016740292311, 'timestamp': '2025-10-01 04:16:11.589335', 'step': 335, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:11.629529', 'step': 335, 'epoch': 1} {'type': 'loss', 'content': 0.024127693846821785, 'timestamp': '2025-10-01 04:16:11.664054', 'step': 336, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:11.699178', 'step': 336, 'epoch': 1} {'type': 'loss', 'content': 0.017014382407069206, 'timestamp': '2025-10-01 04:16:11.708343', 'step': 337, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:11.743231', 'step': 337, 'epoch': 1} {'type': 'loss', 'content': 0.01286369375884533, 'timestamp': '2025-10-01 04:16:11.756888', 'step': 338, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:11.796968', 'step': 338, 'epoch': 1} {'type': 'loss', 'content': 0.01168111152946949, 'timestamp': '2025-10-01 04:16:11.804549', 'step': 339, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:11.839270', 'step': 339, 'epoch': 1} {'type': 'loss', 'content': 0.030921176075935364, 'timestamp': '2025-10-01 04:16:11.868245', 'step': 340, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:11.906796', 'step': 340, 'epoch': 1} {'type': 'loss', 'content': 0.09170614928007126, 'timestamp': '2025-10-01 04:16:11.917146', 'step': 341, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:11.955250', 'step': 341, 'epoch': 1} {'type': 'loss', 'content': 0.019493835046887398, 'timestamp': '2025-10-01 04:16:11.966365', 'step': 342, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:12.001590', 'step': 342, 'epoch': 1} {'type': 'loss', 'content': 0.0430162250995636, 'timestamp': '2025-10-01 04:16:12.008870', 'step': 343, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:12.044454', 'step': 343, 'epoch': 1} {'type': 'loss', 'content': 0.01926451548933983, 'timestamp': '2025-10-01 04:16:12.076056', 'step': 344, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:12.109598', 'step': 344, 'epoch': 1} {'type': 'loss', 'content': 0.02840515412390232, 'timestamp': '2025-10-01 04:16:12.115172', 'step': 345, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:16:14.548466', 'step': 345, 'epoch': 1} {'type': 'pplx', 'content': 5.549141753085327, 'timestamp': '2025-10-01 04:16:14.557946', 'step': 345, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:14.595095', 'step': 345, 'epoch': 1} {'type': 'loss', 'content': 0.02395755425095558, 'timestamp': '2025-10-01 04:16:14.604344', 'step': 346, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:14.642356', 'step': 346, 'epoch': 1} {'type': 'loss', 'content': 0.01985536515712738, 'timestamp': '2025-10-01 04:16:14.650426', 'step': 347, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:14.684910', 'step': 347, 'epoch': 1} {'type': 'loss', 'content': 0.015479464083909988, 'timestamp': '2025-10-01 04:16:14.717246', 'step': 348, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:14.752015', 'step': 348, 'epoch': 1} {'type': 'loss', 'content': 0.009653572924435139, 'timestamp': '2025-10-01 04:16:14.765391', 'step': 349, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:14.801983', 'step': 349, 'epoch': 1} {'type': 'loss', 'content': 0.01574530638754368, 'timestamp': '2025-10-01 04:16:14.813492', 'step': 350, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:14.848563', 'step': 350, 'epoch': 1} {'type': 'loss', 'content': 0.009675166569650173, 'timestamp': '2025-10-01 04:16:14.860105', 'step': 351, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:14.896150', 'step': 351, 'epoch': 1} {'type': 'loss', 'content': 0.02392168901860714, 'timestamp': '2025-10-01 04:16:14.930795', 'step': 352, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:14.964495', 'step': 352, 'epoch': 1} {'type': 'loss', 'content': 0.017834385856986046, 'timestamp': '2025-10-01 04:16:14.977356', 'step': 353, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:15.017111', 'step': 353, 'epoch': 1} {'type': 'loss', 'content': 0.009682202711701393, 'timestamp': '2025-10-01 04:16:15.031086', 'step': 354, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:15.070321', 'step': 354, 'epoch': 1} {'type': 'loss', 'content': 0.012593532912433147, 'timestamp': '2025-10-01 04:16:15.084555', 'step': 355, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:15.120799', 'step': 355, 'epoch': 1} {'type': 'loss', 'content': 0.01858651079237461, 'timestamp': '2025-10-01 04:16:15.155296', 'step': 356, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:15.192234', 'step': 356, 'epoch': 1} {'type': 'loss', 'content': 0.017077673226594925, 'timestamp': '2025-10-01 04:16:15.202425', 'step': 357, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:15.238169', 'step': 357, 'epoch': 1} {'type': 'loss', 'content': 0.013251079246401787, 'timestamp': '2025-10-01 04:16:15.250755', 'step': 358, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:15.283341', 'step': 358, 'epoch': 1} {'type': 'loss', 'content': 0.027262454852461815, 'timestamp': '2025-10-01 04:16:15.290648', 'step': 359, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:15.334600', 'step': 359, 'epoch': 1} {'type': 'loss', 'content': 0.0176805779337883, 'timestamp': '2025-10-01 04:16:15.371679', 'step': 360, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:15.410127', 'step': 360, 'epoch': 1} {'type': 'loss', 'content': 0.014092015102505684, 'timestamp': '2025-10-01 04:16:15.423464', 'step': 361, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:15.456244', 'step': 361, 'epoch': 1} {'type': 'loss', 'content': 0.012955495156347752, 'timestamp': '2025-10-01 04:16:15.464096', 'step': 362, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:15.504163', 'step': 362, 'epoch': 1} {'type': 'loss', 'content': 0.013094773516058922, 'timestamp': '2025-10-01 04:16:15.520234', 'step': 363, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:16:15.569367', 'step': 363, 'epoch': 1} {'type': 'loss', 'content': 0.0121398214250803, 'timestamp': '2025-10-01 04:16:15.607842', 'step': 364, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:15.646866', 'step': 364, 'epoch': 1} {'type': 'loss', 'content': 0.022744549438357353, 'timestamp': '2025-10-01 04:16:15.657040', 'step': 365, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:15.695620', 'step': 365, 'epoch': 1} {'type': 'loss', 'content': 0.015176698565483093, 'timestamp': '2025-10-01 04:16:15.708326', 'step': 366, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:15.745242', 'step': 366, 'epoch': 1} {'type': 'loss', 'content': 0.014952910132706165, 'timestamp': '2025-10-01 04:16:15.753225', 'step': 367, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:15.788690', 'step': 367, 'epoch': 1} {'type': 'loss', 'content': 0.013316516764461994, 'timestamp': '2025-10-01 04:16:15.822128', 'step': 368, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:15.858476', 'step': 368, 'epoch': 1} {'type': 'loss', 'content': 0.013045667670667171, 'timestamp': '2025-10-01 04:16:15.865969', 'step': 369, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:15.897744', 'step': 369, 'epoch': 1} {'type': 'loss', 'content': 0.01651594042778015, 'timestamp': '2025-10-01 04:16:15.905734', 'step': 370, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:15.941414', 'step': 370, 'epoch': 1} {'type': 'loss', 'content': 0.012774420902132988, 'timestamp': '2025-10-01 04:16:15.949719', 'step': 371, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:15.984686', 'step': 371, 'epoch': 1} {'type': 'loss', 'content': 0.01938713900744915, 'timestamp': '2025-10-01 04:16:16.016423', 'step': 372, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:16.049468', 'step': 372, 'epoch': 1} {'type': 'loss', 'content': 0.018018566071987152, 'timestamp': '2025-10-01 04:16:16.059928', 'step': 373, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:16.097997', 'step': 373, 'epoch': 1} {'type': 'loss', 'content': 0.013140740804374218, 'timestamp': '2025-10-01 04:16:16.110771', 'step': 374, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:16.147072', 'step': 374, 'epoch': 1} {'type': 'loss', 'content': 0.009986679069697857, 'timestamp': '2025-10-01 04:16:16.161081', 'step': 375, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:16.201960', 'step': 375, 'epoch': 1} {'type': 'loss', 'content': 0.023599032312631607, 'timestamp': '2025-10-01 04:16:16.231196', 'step': 376, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:16.264714', 'step': 376, 'epoch': 1} {'type': 'loss', 'content': 0.01528859231621027, 'timestamp': '2025-10-01 04:16:16.270957', 'step': 377, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:16.308527', 'step': 377, 'epoch': 1} {'type': 'loss', 'content': 0.018152805045247078, 'timestamp': '2025-10-01 04:16:16.321250', 'step': 378, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:16.354519', 'step': 378, 'epoch': 1} {'type': 'loss', 'content': 0.034699440002441406, 'timestamp': '2025-10-01 04:16:16.362610', 'step': 379, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:16.399153', 'step': 379, 'epoch': 1} {'type': 'loss', 'content': 0.028115659952163696, 'timestamp': '2025-10-01 04:16:16.432539', 'step': 380, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:16.465510', 'step': 380, 'epoch': 1} {'type': 'loss', 'content': 0.014396275393664837, 'timestamp': '2025-10-01 04:16:16.471140', 'step': 381, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:16.506564', 'step': 381, 'epoch': 1} {'type': 'loss', 'content': 0.026828931644558907, 'timestamp': '2025-10-01 04:16:16.519082', 'step': 382, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:16.554369', 'step': 382, 'epoch': 1} {'type': 'loss', 'content': 0.020584721118211746, 'timestamp': '2025-10-01 04:16:16.567189', 'step': 383, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:16.602925', 'step': 383, 'epoch': 1} {'type': 'loss', 'content': 0.009258688427507877, 'timestamp': '2025-10-01 04:16:16.637439', 'step': 384, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:16.677377', 'step': 384, 'epoch': 1} {'type': 'loss', 'content': 0.01084110327064991, 'timestamp': '2025-10-01 04:16:16.686378', 'step': 385, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:16.726860', 'step': 385, 'epoch': 1} {'type': 'loss', 'content': 0.016416052356362343, 'timestamp': '2025-10-01 04:16:16.734720', 'step': 386, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:16.780810', 'step': 386, 'epoch': 1} {'type': 'loss', 'content': 0.015982788056135178, 'timestamp': '2025-10-01 04:16:16.794826', 'step': 387, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:16.836941', 'step': 387, 'epoch': 1} {'type': 'loss', 'content': 0.019290953874588013, 'timestamp': '2025-10-01 04:16:16.869297', 'step': 388, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:16.909151', 'step': 388, 'epoch': 1} {'type': 'loss', 'content': 0.013801880180835724, 'timestamp': '2025-10-01 04:16:16.914972', 'step': 389, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:16.947148', 'step': 389, 'epoch': 1} {'type': 'loss', 'content': 0.028254201635718346, 'timestamp': '2025-10-01 04:16:16.959726', 'step': 390, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:16.995057', 'step': 390, 'epoch': 1} {'type': 'loss', 'content': 0.017436588183045387, 'timestamp': '2025-10-01 04:16:17.005596', 'step': 391, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:17.039389', 'step': 391, 'epoch': 1} {'type': 'loss', 'content': 0.01159089058637619, 'timestamp': '2025-10-01 04:16:17.072998', 'step': 392, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:17.112813', 'step': 392, 'epoch': 1} {'type': 'loss', 'content': 0.01592138595879078, 'timestamp': '2025-10-01 04:16:17.125629', 'step': 393, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:17.161161', 'step': 393, 'epoch': 1} {'type': 'loss', 'content': 0.013231460936367512, 'timestamp': '2025-10-01 04:16:17.173927', 'step': 394, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:17.207927', 'step': 394, 'epoch': 1} {'type': 'loss', 'content': 0.028956327587366104, 'timestamp': '2025-10-01 04:16:17.216090', 'step': 395, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:17.257627', 'step': 395, 'epoch': 1} {'type': 'loss', 'content': 0.01622656360268593, 'timestamp': '2025-10-01 04:16:17.292627', 'step': 396, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:17.324542', 'step': 396, 'epoch': 1} {'type': 'loss', 'content': 0.03022404946386814, 'timestamp': '2025-10-01 04:16:17.334784', 'step': 397, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:17.374180', 'step': 397, 'epoch': 1} {'type': 'loss', 'content': 0.015238631516695023, 'timestamp': '2025-10-01 04:16:17.387769', 'step': 398, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:17.427226', 'step': 398, 'epoch': 1} {'type': 'loss', 'content': 0.008990581147372723, 'timestamp': '2025-10-01 04:16:17.441428', 'step': 399, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:17.474917', 'step': 399, 'epoch': 1} {'type': 'loss', 'content': 0.018623104318976402, 'timestamp': '2025-10-01 04:16:17.508637', 'step': 400, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:17.542008', 'step': 400, 'epoch': 1} {'type': 'loss', 'content': 0.01643587276339531, 'timestamp': '2025-10-01 04:16:17.554851', 'step': 401, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:17.587416', 'step': 401, 'epoch': 1} {'type': 'loss', 'content': 0.021473292261362076, 'timestamp': '2025-10-01 04:16:17.598462', 'step': 402, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:17.628990', 'step': 402, 'epoch': 1} {'type': 'loss', 'content': 0.016137998551130295, 'timestamp': '2025-10-01 04:16:17.636667', 'step': 403, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:17.671927', 'step': 403, 'epoch': 1} {'type': 'loss', 'content': 0.015936408191919327, 'timestamp': '2025-10-01 04:16:17.700320', 'step': 404, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:17.731245', 'step': 404, 'epoch': 1} {'type': 'loss', 'content': 0.02445746399462223, 'timestamp': '2025-10-01 04:16:17.737032', 'step': 405, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:17.768856', 'step': 405, 'epoch': 1} {'type': 'loss', 'content': 0.019335979595780373, 'timestamp': '2025-10-01 04:16:17.777078', 'step': 406, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:16:17.807582', 'step': 406, 'epoch': 1} {'type': 'loss', 'content': 0.029689718037843704, 'timestamp': '2025-10-01 04:16:17.811903', 'step': 407, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:17.844717', 'step': 407, 'epoch': 1} {'type': 'loss', 'content': 0.02221430279314518, 'timestamp': '2025-10-01 04:16:17.872652', 'step': 408, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:16:17.914700', 'step': 408, 'epoch': 1} {'type': 'loss', 'content': 0.007874400354921818, 'timestamp': '2025-10-01 04:16:17.932230', 'step': 409, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:17.968109', 'step': 409, 'epoch': 1} {'type': 'loss', 'content': 0.015291288495063782, 'timestamp': '2025-10-01 04:16:17.982187', 'step': 410, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:16:18.016607', 'step': 410, 'epoch': 1} {'type': 'loss', 'content': 0.029950913041830063, 'timestamp': '2025-10-01 04:16:18.021521', 'step': 411, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:18.055643', 'step': 411, 'epoch': 1} {'type': 'loss', 'content': 0.009023243561387062, 'timestamp': '2025-10-01 04:16:18.088118', 'step': 412, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:18.122013', 'step': 412, 'epoch': 1} {'type': 'loss', 'content': 0.02289900742471218, 'timestamp': '2025-10-01 04:16:18.127872', 'step': 413, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:18.164946', 'step': 413, 'epoch': 1} {'type': 'loss', 'content': 0.026321103796362877, 'timestamp': '2025-10-01 04:16:18.173235', 'step': 414, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-10-01 04:16:18.227322', 'step': 414, 'epoch': 1} {'type': 'loss', 'content': 0.008057674393057823, 'timestamp': '2025-10-01 04:16:18.245105', 'step': 415, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:18.278354', 'step': 415, 'epoch': 1} {'type': 'loss', 'content': 0.01705600507557392, 'timestamp': '2025-10-01 04:16:18.307215', 'step': 416, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:18.341429', 'step': 416, 'epoch': 1} {'type': 'loss', 'content': 0.010497545823454857, 'timestamp': '2025-10-01 04:16:18.348143', 'step': 417, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:18.379803', 'step': 417, 'epoch': 1} {'type': 'loss', 'content': 0.01922953687608242, 'timestamp': '2025-10-01 04:16:18.388159', 'step': 418, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:18.421344', 'step': 418, 'epoch': 1} {'type': 'loss', 'content': 0.01180372666567564, 'timestamp': '2025-10-01 04:16:18.429884', 'step': 419, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:18.467883', 'step': 419, 'epoch': 1} {'type': 'loss', 'content': 0.015043324790894985, 'timestamp': '2025-10-01 04:16:18.502829', 'step': 420, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:18.536994', 'step': 420, 'epoch': 1} {'type': 'loss', 'content': 0.024084806442260742, 'timestamp': '2025-10-01 04:16:18.542683', 'step': 421, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:18.581195', 'step': 421, 'epoch': 1} {'type': 'loss', 'content': 0.009565691463649273, 'timestamp': '2025-10-01 04:16:18.595192', 'step': 422, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:18.631717', 'step': 422, 'epoch': 1} {'type': 'loss', 'content': 0.014175781980156898, 'timestamp': '2025-10-01 04:16:18.645238', 'step': 423, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:18.685031', 'step': 423, 'epoch': 1} {'type': 'loss', 'content': 0.011601341888308525, 'timestamp': '2025-10-01 04:16:18.720013', 'step': 424, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:16:18.761411', 'step': 424, 'epoch': 1} {'type': 'loss', 'content': 0.008249860256910324, 'timestamp': '2025-10-01 04:16:18.778367', 'step': 425, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:18.841825', 'step': 425, 'epoch': 1} {'type': 'loss', 'content': 0.019711390137672424, 'timestamp': '2025-10-01 04:16:18.852579', 'step': 426, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:18.938250', 'step': 426, 'epoch': 1} {'type': 'loss', 'content': 0.0370551161468029, 'timestamp': '2025-10-01 04:16:18.954433', 'step': 427, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:19.004372', 'step': 427, 'epoch': 1} {'type': 'loss', 'content': 0.018689412623643875, 'timestamp': '2025-10-01 04:16:19.036710', 'step': 428, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:19.084622', 'step': 428, 'epoch': 1} {'type': 'loss', 'content': 0.019718261435627937, 'timestamp': '2025-10-01 04:16:19.090861', 'step': 429, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:19.141561', 'step': 429, 'epoch': 1} {'type': 'loss', 'content': 0.009979892522096634, 'timestamp': '2025-10-01 04:16:19.157649', 'step': 430, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:19.208332', 'step': 430, 'epoch': 1} {'type': 'loss', 'content': 0.010781127028167248, 'timestamp': '2025-10-01 04:16:19.222531', 'step': 431, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:19.274496', 'step': 431, 'epoch': 1} {'type': 'loss', 'content': 0.017781859263777733, 'timestamp': '2025-10-01 04:16:19.309482', 'step': 432, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:19.363907', 'step': 432, 'epoch': 1} {'type': 'loss', 'content': 0.016139471903443336, 'timestamp': '2025-10-01 04:16:19.373610', 'step': 433, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:16:19.415900', 'step': 433, 'epoch': 1} {'type': 'loss', 'content': 0.026224790140986443, 'timestamp': '2025-10-01 04:16:19.427416', 'step': 434, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:19.482047', 'step': 434, 'epoch': 1} {'type': 'loss', 'content': 0.016444019973278046, 'timestamp': '2025-10-01 04:16:19.494653', 'step': 435, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:19.533552', 'step': 435, 'epoch': 1} {'type': 'loss', 'content': 0.016047537326812744, 'timestamp': '2025-10-01 04:16:19.565934', 'step': 436, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:19.611683', 'step': 436, 'epoch': 1} {'type': 'loss', 'content': 0.01915944740176201, 'timestamp': '2025-10-01 04:16:19.624540', 'step': 437, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:19.665418', 'step': 437, 'epoch': 1} {'type': 'loss', 'content': 0.014189903624355793, 'timestamp': '2025-10-01 04:16:19.673444', 'step': 438, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:19.714441', 'step': 438, 'epoch': 1} {'type': 'loss', 'content': 0.030241338536143303, 'timestamp': '2025-10-01 04:16:19.726083', 'step': 439, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:19.758611', 'step': 439, 'epoch': 1} {'type': 'loss', 'content': 0.011532990261912346, 'timestamp': '2025-10-01 04:16:19.791086', 'step': 440, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:19.828288', 'step': 440, 'epoch': 1} {'type': 'loss', 'content': 0.016996772959828377, 'timestamp': '2025-10-01 04:16:19.839293', 'step': 441, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:19.877356', 'step': 441, 'epoch': 1} {'type': 'loss', 'content': 0.022609945386648178, 'timestamp': '2025-10-01 04:16:19.885484', 'step': 442, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:19.924393', 'step': 442, 'epoch': 1} {'type': 'loss', 'content': 0.016120105981826782, 'timestamp': '2025-10-01 04:16:19.937164', 'step': 443, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:19.978682', 'step': 443, 'epoch': 1} {'type': 'loss', 'content': 0.012497787363827229, 'timestamp': '2025-10-01 04:16:20.013694', 'step': 444, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:20.065205', 'step': 444, 'epoch': 1} {'type': 'loss', 'content': 0.006934118922799826, 'timestamp': '2025-10-01 04:16:20.078523', 'step': 445, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:20.130683', 'step': 445, 'epoch': 1} {'type': 'loss', 'content': 0.011952010914683342, 'timestamp': '2025-10-01 04:16:20.144717', 'step': 446, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:16:20.193868', 'step': 446, 'epoch': 1} {'type': 'loss', 'content': 0.010065864771604538, 'timestamp': '2025-10-01 04:16:20.210323', 'step': 447, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:20.244285', 'step': 447, 'epoch': 1} {'type': 'loss', 'content': 0.011584152467548847, 'timestamp': '2025-10-01 04:16:20.277978', 'step': 448, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:20.312950', 'step': 448, 'epoch': 1} {'type': 'loss', 'content': 0.012550590559840202, 'timestamp': '2025-10-01 04:16:20.325743', 'step': 449, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:20.371006', 'step': 449, 'epoch': 1} {'type': 'loss', 'content': 0.007951286621391773, 'timestamp': '2025-10-01 04:16:20.387133', 'step': 450, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:20.423133', 'step': 450, 'epoch': 1} {'type': 'loss', 'content': 0.023190557956695557, 'timestamp': '2025-10-01 04:16:20.433835', 'step': 451, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:20.468499', 'step': 451, 'epoch': 1} {'type': 'loss', 'content': 0.008842021226882935, 'timestamp': '2025-10-01 04:16:20.501070', 'step': 452, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:20.531965', 'step': 452, 'epoch': 1} {'type': 'loss', 'content': 0.015318826772272587, 'timestamp': '2025-10-01 04:16:20.540499', 'step': 453, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:20.575080', 'step': 453, 'epoch': 1} {'type': 'loss', 'content': 0.022531012073159218, 'timestamp': '2025-10-01 04:16:20.588625', 'step': 454, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:20.624854', 'step': 454, 'epoch': 1} {'type': 'loss', 'content': 0.010974062606692314, 'timestamp': '2025-10-01 04:16:20.639012', 'step': 455, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:20.676163', 'step': 455, 'epoch': 1} {'type': 'loss', 'content': 0.014528082683682442, 'timestamp': '2025-10-01 04:16:20.709894', 'step': 456, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:20.746844', 'step': 456, 'epoch': 1} {'type': 'loss', 'content': 0.010905051603913307, 'timestamp': '2025-10-01 04:16:20.756921', 'step': 457, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:20.789346', 'step': 457, 'epoch': 1} {'type': 'loss', 'content': 0.017777007073163986, 'timestamp': '2025-10-01 04:16:20.802130', 'step': 458, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:20.834059', 'step': 458, 'epoch': 1} {'type': 'loss', 'content': 0.015021678991615772, 'timestamp': '2025-10-01 04:16:20.842155', 'step': 459, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:16:20.884130', 'step': 459, 'epoch': 1} {'type': 'loss', 'content': 0.01040259376168251, 'timestamp': '2025-10-01 04:16:20.920930', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:16:23.318711', 'step': 460, 'epoch': 1} {'type': 'pplx', 'content': 5.634935429598939, 'timestamp': '2025-10-01 04:16:23.324766', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:23.370048', 'step': 460, 'epoch': 1} {'type': 'loss', 'content': 0.011050896719098091, 'timestamp': '2025-10-01 04:16:23.385618', 'step': 461, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:23.424919', 'step': 461, 'epoch': 1} {'type': 'loss', 'content': 0.009396439418196678, 'timestamp': '2025-10-01 04:16:23.438889', 'step': 462, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:23.472665', 'step': 462, 'epoch': 1} {'type': 'loss', 'content': 0.01873224787414074, 'timestamp': '2025-10-01 04:16:23.485467', 'step': 463, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:23.523440', 'step': 463, 'epoch': 1} {'type': 'loss', 'content': 0.009660149924457073, 'timestamp': '2025-10-01 04:16:23.558638', 'step': 464, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:23.595883', 'step': 464, 'epoch': 1} {'type': 'loss', 'content': 0.0223027803003788, 'timestamp': '2025-10-01 04:16:23.600934', 'step': 465, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:23.640310', 'step': 465, 'epoch': 1} {'type': 'loss', 'content': 0.012049822136759758, 'timestamp': '2025-10-01 04:16:23.654321', 'step': 466, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:23.687881', 'step': 466, 'epoch': 1} {'type': 'loss', 'content': 0.03269190713763237, 'timestamp': '2025-10-01 04:16:23.696155', 'step': 467, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:23.730231', 'step': 467, 'epoch': 1} {'type': 'loss', 'content': 0.02621115930378437, 'timestamp': '2025-10-01 04:16:23.759017', 'step': 468, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:16:23.805244', 'step': 468, 'epoch': 1} {'type': 'loss', 'content': 0.008199824020266533, 'timestamp': '2025-10-01 04:16:23.822427', 'step': 469, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:23.861629', 'step': 469, 'epoch': 1} {'type': 'loss', 'content': 0.018999703228473663, 'timestamp': '2025-10-01 04:16:23.869965', 'step': 470, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:23.908720', 'step': 470, 'epoch': 1} {'type': 'loss', 'content': 0.017878664657473564, 'timestamp': '2025-10-01 04:16:23.916435', 'step': 471, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:23.954555', 'step': 471, 'epoch': 1} {'type': 'loss', 'content': 0.018295977264642715, 'timestamp': '2025-10-01 04:16:23.986301', 'step': 472, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:24.023006', 'step': 472, 'epoch': 1} {'type': 'loss', 'content': 0.01877659559249878, 'timestamp': '2025-10-01 04:16:24.033901', 'step': 473, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:16:24.076336', 'step': 473, 'epoch': 1} {'type': 'loss', 'content': 0.00944078341126442, 'timestamp': '2025-10-01 04:16:24.092706', 'step': 474, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:24.126324', 'step': 474, 'epoch': 1} {'type': 'loss', 'content': 0.014955062419176102, 'timestamp': '2025-10-01 04:16:24.138885', 'step': 475, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:24.172663', 'step': 475, 'epoch': 1} {'type': 'loss', 'content': 0.013251311145722866, 'timestamp': '2025-10-01 04:16:24.206217', 'step': 476, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:24.239740', 'step': 476, 'epoch': 1} {'type': 'loss', 'content': 0.026530764997005463, 'timestamp': '2025-10-01 04:16:24.248206', 'step': 477, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:24.282138', 'step': 477, 'epoch': 1} {'type': 'loss', 'content': 0.015789855271577835, 'timestamp': '2025-10-01 04:16:24.290469', 'step': 478, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:24.324253', 'step': 478, 'epoch': 1} {'type': 'loss', 'content': 0.023042850196361542, 'timestamp': '2025-10-01 04:16:24.332021', 'step': 479, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:24.368313', 'step': 479, 'epoch': 1} {'type': 'loss', 'content': 0.008028646931052208, 'timestamp': '2025-10-01 04:16:24.401858', 'step': 480, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:24.436315', 'step': 480, 'epoch': 1} {'type': 'loss', 'content': 0.0071920002810657024, 'timestamp': '2025-10-01 04:16:24.449170', 'step': 481, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:24.484425', 'step': 481, 'epoch': 1} {'type': 'loss', 'content': 0.015702953562140465, 'timestamp': '2025-10-01 04:16:24.497169', 'step': 482, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:24.538525', 'step': 482, 'epoch': 1} {'type': 'loss', 'content': 0.014195307157933712, 'timestamp': '2025-10-01 04:16:24.552530', 'step': 483, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:24.586109', 'step': 483, 'epoch': 1} {'type': 'loss', 'content': 0.012473647482693195, 'timestamp': '2025-10-01 04:16:24.618783', 'step': 484, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:24.653688', 'step': 484, 'epoch': 1} {'type': 'loss', 'content': 0.02003420516848564, 'timestamp': '2025-10-01 04:16:24.662735', 'step': 485, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:24.700849', 'step': 485, 'epoch': 1} {'type': 'loss', 'content': 0.026843110099434853, 'timestamp': '2025-10-01 04:16:24.712476', 'step': 486, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:24.746678', 'step': 486, 'epoch': 1} {'type': 'loss', 'content': 0.017934231087565422, 'timestamp': '2025-10-01 04:16:24.757540', 'step': 487, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:16:24.801326', 'step': 487, 'epoch': 1} {'type': 'loss', 'content': 0.01092158630490303, 'timestamp': '2025-10-01 04:16:24.838124', 'step': 488, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:24.873677', 'step': 488, 'epoch': 1} {'type': 'loss', 'content': 0.020836612209677696, 'timestamp': '2025-10-01 04:16:24.886528', 'step': 489, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:24.922116', 'step': 489, 'epoch': 1} {'type': 'loss', 'content': 0.011122267693281174, 'timestamp': '2025-10-01 04:16:24.936132', 'step': 490, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:24.968587', 'step': 490, 'epoch': 1} {'type': 'loss', 'content': 0.023144520819187164, 'timestamp': '2025-10-01 04:16:24.976939', 'step': 491, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:25.010360', 'step': 491, 'epoch': 1} {'type': 'loss', 'content': 0.015137414447963238, 'timestamp': '2025-10-01 04:16:25.042194', 'step': 492, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:25.075039', 'step': 492, 'epoch': 1} {'type': 'loss', 'content': 0.01186875719577074, 'timestamp': '2025-10-01 04:16:25.086215', 'step': 493, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:25.123548', 'step': 493, 'epoch': 1} {'type': 'loss', 'content': 0.012813642621040344, 'timestamp': '2025-10-01 04:16:25.137551', 'step': 494, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:25.174499', 'step': 494, 'epoch': 1} {'type': 'loss', 'content': 0.0062550934962928295, 'timestamp': '2025-10-01 04:16:25.188537', 'step': 495, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:25.226814', 'step': 495, 'epoch': 1} {'type': 'loss', 'content': 0.006602271925657988, 'timestamp': '2025-10-01 04:16:25.261713', 'step': 496, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:25.296256', 'step': 496, 'epoch': 1} {'type': 'loss', 'content': 0.009082534350454807, 'timestamp': '2025-10-01 04:16:25.309690', 'step': 497, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:25.342118', 'step': 497, 'epoch': 1} {'type': 'loss', 'content': 0.014635121449828148, 'timestamp': '2025-10-01 04:16:25.354659', 'step': 498, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:25.385848', 'step': 498, 'epoch': 1} {'type': 'loss', 'content': 0.018802709877490997, 'timestamp': '2025-10-01 04:16:25.394068', 'step': 499, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:25.426446', 'step': 499, 'epoch': 1} {'type': 'loss', 'content': 0.013009629212319851, 'timestamp': '2025-10-01 04:16:25.459967', 'step': 500, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 500', 'timestamp': '2025-10-01 04:16:30.586856', 'step': 500, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:30.629510', 'step': 500, 'epoch': 1} {'type': 'loss', 'content': 0.0157314520329237, 'timestamp': '2025-10-01 04:16:30.636878', 'step': 501, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:30.669598', 'step': 501, 'epoch': 1} {'type': 'loss', 'content': 0.01944146119058132, 'timestamp': '2025-10-01 04:16:30.677771', 'step': 502, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:16:30.716393', 'step': 502, 'epoch': 1} {'type': 'loss', 'content': 0.009394222870469093, 'timestamp': '2025-10-01 04:16:30.732095', 'step': 503, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:30.764004', 'step': 503, 'epoch': 1} {'type': 'loss', 'content': 0.015702074393630028, 'timestamp': '2025-10-01 04:16:30.792949', 'step': 504, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:30.827681', 'step': 504, 'epoch': 1} {'type': 'loss', 'content': 0.017846781760454178, 'timestamp': '2025-10-01 04:16:30.840502', 'step': 505, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:30.872826', 'step': 505, 'epoch': 1} {'type': 'loss', 'content': 0.014160624705255032, 'timestamp': '2025-10-01 04:16:30.884357', 'step': 506, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:30.924135', 'step': 506, 'epoch': 1} {'type': 'loss', 'content': 0.008405692875385284, 'timestamp': '2025-10-01 04:16:30.938186', 'step': 507, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:30.973100', 'step': 507, 'epoch': 1} {'type': 'loss', 'content': 0.017884839326143265, 'timestamp': '2025-10-01 04:16:31.007656', 'step': 508, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:31.039529', 'step': 508, 'epoch': 1} {'type': 'loss', 'content': 0.015724297612905502, 'timestamp': '2025-10-01 04:16:31.050637', 'step': 509, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:31.083446', 'step': 509, 'epoch': 1} {'type': 'loss', 'content': 0.016847331076860428, 'timestamp': '2025-10-01 04:16:31.095977', 'step': 510, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:31.136208', 'step': 510, 'epoch': 1} {'type': 'loss', 'content': 0.009716455824673176, 'timestamp': '2025-10-01 04:16:31.152263', 'step': 511, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:31.189932', 'step': 511, 'epoch': 1} {'type': 'loss', 'content': 0.010077309794723988, 'timestamp': '2025-10-01 04:16:31.225093', 'step': 512, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:31.257399', 'step': 512, 'epoch': 1} {'type': 'loss', 'content': 0.020557444542646408, 'timestamp': '2025-10-01 04:16:31.265728', 'step': 513, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:31.301865', 'step': 513, 'epoch': 1} {'type': 'loss', 'content': 0.011356167495250702, 'timestamp': '2025-10-01 04:16:31.315913', 'step': 514, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:31.347185', 'step': 514, 'epoch': 1} {'type': 'loss', 'content': 0.01906018704175949, 'timestamp': '2025-10-01 04:16:31.359706', 'step': 515, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:31.393691', 'step': 515, 'epoch': 1} {'type': 'loss', 'content': 0.018788890913128853, 'timestamp': '2025-10-01 04:16:31.426126', 'step': 516, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:31.457697', 'step': 516, 'epoch': 1} {'type': 'loss', 'content': 0.015265836380422115, 'timestamp': '2025-10-01 04:16:31.462820', 'step': 517, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:31.493668', 'step': 517, 'epoch': 1} {'type': 'loss', 'content': 0.016705531626939774, 'timestamp': '2025-10-01 04:16:31.504386', 'step': 518, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:31.556025', 'step': 518, 'epoch': 1} {'type': 'loss', 'content': 0.008585241623222828, 'timestamp': '2025-10-01 04:16:31.567659', 'step': 519, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:31.607155', 'step': 519, 'epoch': 1} {'type': 'loss', 'content': 0.01143752783536911, 'timestamp': '2025-10-01 04:16:31.635679', 'step': 520, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:31.695796', 'step': 520, 'epoch': 1} {'type': 'loss', 'content': 0.012304618954658508, 'timestamp': '2025-10-01 04:16:31.701830', 'step': 521, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:31.753084', 'step': 521, 'epoch': 1} {'type': 'loss', 'content': 0.016778891906142235, 'timestamp': '2025-10-01 04:16:31.760928', 'step': 522, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:31.797718', 'step': 522, 'epoch': 1} {'type': 'loss', 'content': 0.013098997995257378, 'timestamp': '2025-10-01 04:16:31.811700', 'step': 523, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:31.843424', 'step': 523, 'epoch': 1} {'type': 'loss', 'content': 0.009864656254649162, 'timestamp': '2025-10-01 04:16:31.872774', 'step': 524, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:31.908177', 'step': 524, 'epoch': 1} {'type': 'loss', 'content': 0.008583220653235912, 'timestamp': '2025-10-01 04:16:31.921699', 'step': 525, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:31.961379', 'step': 525, 'epoch': 1} {'type': 'loss', 'content': 0.012645641341805458, 'timestamp': '2025-10-01 04:16:31.974200', 'step': 526, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:32.008518', 'step': 526, 'epoch': 1} {'type': 'loss', 'content': 0.015280468389391899, 'timestamp': '2025-10-01 04:16:32.016763', 'step': 527, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:32.051176', 'step': 527, 'epoch': 1} {'type': 'loss', 'content': 0.02029372937977314, 'timestamp': '2025-10-01 04:16:32.084616', 'step': 528, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:32.120742', 'step': 528, 'epoch': 1} {'type': 'loss', 'content': 0.015497477725148201, 'timestamp': '2025-10-01 04:16:32.134213', 'step': 529, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:32.171158', 'step': 529, 'epoch': 1} {'type': 'loss', 'content': 0.008214034140110016, 'timestamp': '2025-10-01 04:16:32.183904', 'step': 530, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:32.224136', 'step': 530, 'epoch': 1} {'type': 'loss', 'content': 0.004829376470297575, 'timestamp': '2025-10-01 04:16:32.238145', 'step': 531, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:32.272218', 'step': 531, 'epoch': 1} {'type': 'loss', 'content': 0.032258838415145874, 'timestamp': '2025-10-01 04:16:32.300589', 'step': 532, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:32.335795', 'step': 532, 'epoch': 1} {'type': 'loss', 'content': 0.03910328447818756, 'timestamp': '2025-10-01 04:16:32.341005', 'step': 533, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:32.379107', 'step': 533, 'epoch': 1} {'type': 'loss', 'content': 0.01312317792326212, 'timestamp': '2025-10-01 04:16:32.387468', 'step': 534, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:32.424600', 'step': 534, 'epoch': 1} {'type': 'loss', 'content': 0.011629111133515835, 'timestamp': '2025-10-01 04:16:32.438160', 'step': 535, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:32.471341', 'step': 535, 'epoch': 1} {'type': 'loss', 'content': 0.011790932156145573, 'timestamp': '2025-10-01 04:16:32.500166', 'step': 536, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:32.533180', 'step': 536, 'epoch': 1} {'type': 'loss', 'content': 0.024120526388287544, 'timestamp': '2025-10-01 04:16:32.538354', 'step': 537, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:32.571754', 'step': 537, 'epoch': 1} {'type': 'loss', 'content': 0.01741904206573963, 'timestamp': '2025-10-01 04:16:32.579858', 'step': 538, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:32.618120', 'step': 538, 'epoch': 1} {'type': 'loss', 'content': 0.00925917737185955, 'timestamp': '2025-10-01 04:16:32.631576', 'step': 539, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:32.666155', 'step': 539, 'epoch': 1} {'type': 'loss', 'content': 0.028451379388570786, 'timestamp': '2025-10-01 04:16:32.695415', 'step': 540, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:32.733534', 'step': 540, 'epoch': 1} {'type': 'loss', 'content': 0.022103803232312202, 'timestamp': '2025-10-01 04:16:32.746773', 'step': 541, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:32.787152', 'step': 541, 'epoch': 1} {'type': 'loss', 'content': 0.014795825816690922, 'timestamp': '2025-10-01 04:16:32.801270', 'step': 542, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:32.839900', 'step': 542, 'epoch': 1} {'type': 'loss', 'content': 0.013649027794599533, 'timestamp': '2025-10-01 04:16:32.853457', 'step': 543, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:32.889508', 'step': 543, 'epoch': 1} {'type': 'loss', 'content': 0.014748976565897465, 'timestamp': '2025-10-01 04:16:32.924521', 'step': 544, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:32.964030', 'step': 544, 'epoch': 1} {'type': 'loss', 'content': 0.02524609863758087, 'timestamp': '2025-10-01 04:16:32.974339', 'step': 545, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:33.006430', 'step': 545, 'epoch': 1} {'type': 'loss', 'content': 0.01887904293835163, 'timestamp': '2025-10-01 04:16:33.018123', 'step': 546, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:33.053706', 'step': 546, 'epoch': 1} {'type': 'loss', 'content': 0.015603136271238327, 'timestamp': '2025-10-01 04:16:33.066498', 'step': 547, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:33.105249', 'step': 547, 'epoch': 1} {'type': 'loss', 'content': 0.018916502594947815, 'timestamp': '2025-10-01 04:16:33.138911', 'step': 548, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:33.171812', 'step': 548, 'epoch': 1} {'type': 'loss', 'content': 0.011366521008312702, 'timestamp': '2025-10-01 04:16:33.180149', 'step': 549, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:33.222330', 'step': 549, 'epoch': 1} {'type': 'loss', 'content': 0.021209245547652245, 'timestamp': '2025-10-01 04:16:33.229991', 'step': 550, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:33.262477', 'step': 550, 'epoch': 1} {'type': 'loss', 'content': 0.025289107114076614, 'timestamp': '2025-10-01 04:16:33.269749', 'step': 551, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:33.305124', 'step': 551, 'epoch': 1} {'type': 'loss', 'content': 0.018362894654273987, 'timestamp': '2025-10-01 04:16:33.333188', 'step': 552, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:33.393969', 'step': 552, 'epoch': 1} {'type': 'loss', 'content': 0.015405273996293545, 'timestamp': '2025-10-01 04:16:33.399854', 'step': 553, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:33.437057', 'step': 553, 'epoch': 1} {'type': 'loss', 'content': 0.02026311121881008, 'timestamp': '2025-10-01 04:16:33.445814', 'step': 554, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:16:33.500529', 'step': 554, 'epoch': 1} {'type': 'loss', 'content': 0.018493840470910072, 'timestamp': '2025-10-01 04:16:33.516392', 'step': 555, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:33.552994', 'step': 555, 'epoch': 1} {'type': 'loss', 'content': 0.013878155499696732, 'timestamp': '2025-10-01 04:16:33.587893', 'step': 556, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:33.622283', 'step': 556, 'epoch': 1} {'type': 'loss', 'content': 0.018180886283516884, 'timestamp': '2025-10-01 04:16:33.627839', 'step': 557, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:16:33.662569', 'step': 557, 'epoch': 1} {'type': 'loss', 'content': 0.03681842237710953, 'timestamp': '2025-10-01 04:16:33.667072', 'step': 558, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:33.704756', 'step': 558, 'epoch': 1} {'type': 'loss', 'content': 0.01799214817583561, 'timestamp': '2025-10-01 04:16:33.712773', 'step': 559, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:33.761923', 'step': 559, 'epoch': 1} {'type': 'loss', 'content': 0.01017005369067192, 'timestamp': '2025-10-01 04:16:33.805656', 'step': 560, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:33.842452', 'step': 560, 'epoch': 1} {'type': 'loss', 'content': 0.015979500487446785, 'timestamp': '2025-10-01 04:16:33.848240', 'step': 561, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:33.887002', 'step': 561, 'epoch': 1} {'type': 'loss', 'content': 0.016387928277254105, 'timestamp': '2025-10-01 04:16:33.899622', 'step': 562, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:33.931700', 'step': 562, 'epoch': 1} {'type': 'loss', 'content': 0.01384213287383318, 'timestamp': '2025-10-01 04:16:33.940170', 'step': 563, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:33.985339', 'step': 563, 'epoch': 1} {'type': 'loss', 'content': 0.02001025527715683, 'timestamp': '2025-10-01 04:16:34.014736', 'step': 564, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:34.049048', 'step': 564, 'epoch': 1} {'type': 'loss', 'content': 0.020732207223773003, 'timestamp': '2025-10-01 04:16:34.055017', 'step': 565, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:34.088366', 'step': 565, 'epoch': 1} {'type': 'loss', 'content': 0.00788446981459856, 'timestamp': '2025-10-01 04:16:34.099272', 'step': 566, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:34.135807', 'step': 566, 'epoch': 1} {'type': 'loss', 'content': 0.016988039016723633, 'timestamp': '2025-10-01 04:16:34.149777', 'step': 567, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:16:34.197784', 'step': 567, 'epoch': 1} {'type': 'loss', 'content': 0.044161681085824966, 'timestamp': '2025-10-01 04:16:34.221898', 'step': 568, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:34.253922', 'step': 568, 'epoch': 1} {'type': 'loss', 'content': 0.011900019831955433, 'timestamp': '2025-10-01 04:16:34.262196', 'step': 569, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:34.294101', 'step': 569, 'epoch': 1} {'type': 'loss', 'content': 0.017807448282837868, 'timestamp': '2025-10-01 04:16:34.301884', 'step': 570, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:34.337432', 'step': 570, 'epoch': 1} {'type': 'loss', 'content': 0.017624812200665474, 'timestamp': '2025-10-01 04:16:34.351435', 'step': 571, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:34.387892', 'step': 571, 'epoch': 1} {'type': 'loss', 'content': 0.012039892375469208, 'timestamp': '2025-10-01 04:16:34.422867', 'step': 572, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:34.463447', 'step': 572, 'epoch': 1} {'type': 'loss', 'content': 0.012084291316568851, 'timestamp': '2025-10-01 04:16:34.476318', 'step': 573, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:34.507321', 'step': 573, 'epoch': 1} {'type': 'loss', 'content': 0.032570064067840576, 'timestamp': '2025-10-01 04:16:34.515470', 'step': 574, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:34.547028', 'step': 574, 'epoch': 1} {'type': 'loss', 'content': 0.015443594194948673, 'timestamp': '2025-10-01 04:16:34.554963', 'step': 575, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:16:36.986555', 'step': 575, 'epoch': 1} {'type': 'pplx', 'content': 5.62872549668118, 'timestamp': '2025-10-01 04:16:36.989837', 'step': 575, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:37.021320', 'step': 575, 'epoch': 1} {'type': 'loss', 'content': 0.014985146000981331, 'timestamp': '2025-10-01 04:16:37.052221', 'step': 576, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:16:37.093020', 'step': 576, 'epoch': 1} {'type': 'loss', 'content': 0.01151040568947792, 'timestamp': '2025-10-01 04:16:37.109014', 'step': 577, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:37.143019', 'step': 577, 'epoch': 1} {'type': 'loss', 'content': 0.014620871283113956, 'timestamp': '2025-10-01 04:16:37.154683', 'step': 578, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:37.187312', 'step': 578, 'epoch': 1} {'type': 'loss', 'content': 0.025567810982465744, 'timestamp': '2025-10-01 04:16:37.199842', 'step': 579, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:37.233509', 'step': 579, 'epoch': 1} {'type': 'loss', 'content': 0.021836701780557632, 'timestamp': '2025-10-01 04:16:37.262695', 'step': 580, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 17085996872448}, 'timestamp': '2025-10-01 04:16:37.309095', 'step': 580, 'epoch': 1} {'type': 'loss', 'content': 0.013711890205740929, 'timestamp': '2025-10-01 04:16:37.328535', 'step': 581, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:37.364798', 'step': 581, 'epoch': 1} {'type': 'loss', 'content': 0.0083592738956213, 'timestamp': '2025-10-01 04:16:37.378945', 'step': 582, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:37.415645', 'step': 582, 'epoch': 1} {'type': 'loss', 'content': 0.02470059134066105, 'timestamp': '2025-10-01 04:16:37.426205', 'step': 583, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:37.458048', 'step': 583, 'epoch': 1} {'type': 'loss', 'content': 0.029120059683918953, 'timestamp': '2025-10-01 04:16:37.489820', 'step': 584, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:37.528627', 'step': 584, 'epoch': 1} {'type': 'loss', 'content': 0.015066621825098991, 'timestamp': '2025-10-01 04:16:37.539562', 'step': 585, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:37.575556', 'step': 585, 'epoch': 1} {'type': 'loss', 'content': 0.004456342663615942, 'timestamp': '2025-10-01 04:16:37.589612', 'step': 586, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:37.631817', 'step': 586, 'epoch': 1} {'type': 'loss', 'content': 0.009280052036046982, 'timestamp': '2025-10-01 04:16:37.645745', 'step': 587, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:37.679794', 'step': 587, 'epoch': 1} {'type': 'loss', 'content': 0.020181341096758842, 'timestamp': '2025-10-01 04:16:37.708134', 'step': 588, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:37.742515', 'step': 588, 'epoch': 1} {'type': 'loss', 'content': 0.02166646718978882, 'timestamp': '2025-10-01 04:16:37.751404', 'step': 589, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:37.784598', 'step': 589, 'epoch': 1} {'type': 'loss', 'content': 0.014675210230052471, 'timestamp': '2025-10-01 04:16:37.796112', 'step': 590, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:37.828586', 'step': 590, 'epoch': 1} {'type': 'loss', 'content': 0.022725818678736687, 'timestamp': '2025-10-01 04:16:37.840083', 'step': 591, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:37.872844', 'step': 591, 'epoch': 1} {'type': 'loss', 'content': 0.010646157898008823, 'timestamp': '2025-10-01 04:16:37.904654', 'step': 592, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:16:37.945030', 'step': 592, 'epoch': 1} {'type': 'loss', 'content': 0.0057301283814013, 'timestamp': '2025-10-01 04:16:37.961934', 'step': 593, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:38.001866', 'step': 593, 'epoch': 1} {'type': 'loss', 'content': 0.01853780820965767, 'timestamp': '2025-10-01 04:16:38.012881', 'step': 594, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:38.053748', 'step': 594, 'epoch': 1} {'type': 'loss', 'content': 0.01644781418144703, 'timestamp': '2025-10-01 04:16:38.064410', 'step': 595, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:38.101165', 'step': 595, 'epoch': 1} {'type': 'loss', 'content': 0.027854841202497482, 'timestamp': '2025-10-01 04:16:38.130368', 'step': 596, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:38.172022', 'step': 596, 'epoch': 1} {'type': 'loss', 'content': 0.016529506072402, 'timestamp': '2025-10-01 04:16:38.187332', 'step': 597, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:38.231160', 'step': 597, 'epoch': 1} {'type': 'loss', 'content': 0.015290052630007267, 'timestamp': '2025-10-01 04:16:38.242726', 'step': 598, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:38.278159', 'step': 598, 'epoch': 1} {'type': 'loss', 'content': 0.013877566903829575, 'timestamp': '2025-10-01 04:16:38.293304', 'step': 599, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:38.336152', 'step': 599, 'epoch': 1} {'type': 'loss', 'content': 0.022351175546646118, 'timestamp': '2025-10-01 04:16:38.372890', 'step': 600, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:38.418886', 'step': 600, 'epoch': 1} {'type': 'loss', 'content': 0.023492619395256042, 'timestamp': '2025-10-01 04:16:38.429139', 'step': 601, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:38.467418', 'step': 601, 'epoch': 1} {'type': 'loss', 'content': 0.014662696048617363, 'timestamp': '2025-10-01 04:16:38.478263', 'step': 602, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:38.514745', 'step': 602, 'epoch': 1} {'type': 'loss', 'content': 0.017407968640327454, 'timestamp': '2025-10-01 04:16:38.522594', 'step': 603, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:38.560498', 'step': 603, 'epoch': 1} {'type': 'loss', 'content': 0.0180354081094265, 'timestamp': '2025-10-01 04:16:38.592852', 'step': 604, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:38.631820', 'step': 604, 'epoch': 1} {'type': 'loss', 'content': 0.017368538305163383, 'timestamp': '2025-10-01 04:16:38.642499', 'step': 605, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:38.679887', 'step': 605, 'epoch': 1} {'type': 'loss', 'content': 0.024110915139317513, 'timestamp': '2025-10-01 04:16:38.690680', 'step': 606, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:38.732319', 'step': 606, 'epoch': 1} {'type': 'loss', 'content': 0.014784153550863266, 'timestamp': '2025-10-01 04:16:38.746356', 'step': 607, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:38.777903', 'step': 607, 'epoch': 1} {'type': 'loss', 'content': 0.030517838895320892, 'timestamp': '2025-10-01 04:16:38.809617', 'step': 608, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:38.842552', 'step': 608, 'epoch': 1} {'type': 'loss', 'content': 0.016505921259522438, 'timestamp': '2025-10-01 04:16:38.851522', 'step': 609, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:38.890571', 'step': 609, 'epoch': 1} {'type': 'loss', 'content': 0.026399411261081696, 'timestamp': '2025-10-01 04:16:38.901325', 'step': 610, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:38.933626', 'step': 610, 'epoch': 1} {'type': 'loss', 'content': 0.01915208250284195, 'timestamp': '2025-10-01 04:16:38.945040', 'step': 611, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:38.980472', 'step': 611, 'epoch': 1} {'type': 'loss', 'content': 0.013446131721138954, 'timestamp': '2025-10-01 04:16:39.013012', 'step': 612, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:39.048564', 'step': 612, 'epoch': 1} {'type': 'loss', 'content': 0.01642761006951332, 'timestamp': '2025-10-01 04:16:39.053730', 'step': 613, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:39.085888', 'step': 613, 'epoch': 1} {'type': 'loss', 'content': 0.01624266803264618, 'timestamp': '2025-10-01 04:16:39.100020', 'step': 614, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:39.140216', 'step': 614, 'epoch': 1} {'type': 'loss', 'content': 0.014057927764952183, 'timestamp': '2025-10-01 04:16:39.151820', 'step': 615, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:39.190435', 'step': 615, 'epoch': 1} {'type': 'loss', 'content': 0.010314121842384338, 'timestamp': '2025-10-01 04:16:39.224234', 'step': 616, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:39.258804', 'step': 616, 'epoch': 1} {'type': 'loss', 'content': 0.021248875185847282, 'timestamp': '2025-10-01 04:16:39.272082', 'step': 617, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:39.313563', 'step': 617, 'epoch': 1} {'type': 'loss', 'content': 0.019683245569467545, 'timestamp': '2025-10-01 04:16:39.324346', 'step': 618, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:39.365313', 'step': 618, 'epoch': 1} {'type': 'loss', 'content': 0.023884739726781845, 'timestamp': '2025-10-01 04:16:39.377080', 'step': 619, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:39.419119', 'step': 619, 'epoch': 1} {'type': 'loss', 'content': 0.018881382420659065, 'timestamp': '2025-10-01 04:16:39.451026', 'step': 620, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:39.492594', 'step': 620, 'epoch': 1} {'type': 'loss', 'content': 0.012647491879761219, 'timestamp': '2025-10-01 04:16:39.505985', 'step': 621, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:16:39.560192', 'step': 621, 'epoch': 1} {'type': 'loss', 'content': 0.01871722750365734, 'timestamp': '2025-10-01 04:16:39.577679', 'step': 622, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:39.621874', 'step': 622, 'epoch': 1} {'type': 'loss', 'content': 0.009179331362247467, 'timestamp': '2025-10-01 04:16:39.636177', 'step': 623, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:39.672855', 'step': 623, 'epoch': 1} {'type': 'loss', 'content': 0.015048723667860031, 'timestamp': '2025-10-01 04:16:39.707789', 'step': 624, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:39.741224', 'step': 624, 'epoch': 1} {'type': 'loss', 'content': 0.01513579674065113, 'timestamp': '2025-10-01 04:16:39.746779', 'step': 625, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:39.783075', 'step': 625, 'epoch': 1} {'type': 'loss', 'content': 0.014886002987623215, 'timestamp': '2025-10-01 04:16:39.797204', 'step': 626, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:16:39.846540', 'step': 626, 'epoch': 1} {'type': 'loss', 'content': 0.009812970645725727, 'timestamp': '2025-10-01 04:16:39.863746', 'step': 627, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:39.908171', 'step': 627, 'epoch': 1} {'type': 'loss', 'content': 0.011499045416712761, 'timestamp': '2025-10-01 04:16:39.943188', 'step': 628, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:39.978177', 'step': 628, 'epoch': 1} {'type': 'loss', 'content': 0.014034989289939404, 'timestamp': '2025-10-01 04:16:39.991715', 'step': 629, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:16:40.041781', 'step': 629, 'epoch': 1} {'type': 'loss', 'content': 0.015712713822722435, 'timestamp': '2025-10-01 04:16:40.058347', 'step': 630, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:16:40.102348', 'step': 630, 'epoch': 1} {'type': 'loss', 'content': 0.00798475369811058, 'timestamp': '2025-10-01 04:16:40.119820', 'step': 631, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:40.167354', 'step': 631, 'epoch': 1} {'type': 'loss', 'content': 0.01154161337763071, 'timestamp': '2025-10-01 04:16:40.202519', 'step': 632, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:16:40.248611', 'step': 632, 'epoch': 1} {'type': 'loss', 'content': 0.007479345425963402, 'timestamp': '2025-10-01 04:16:40.264585', 'step': 633, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:40.305122', 'step': 633, 'epoch': 1} {'type': 'loss', 'content': 0.027549872174859047, 'timestamp': '2025-10-01 04:16:40.314961', 'step': 634, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:40.348434', 'step': 634, 'epoch': 1} {'type': 'loss', 'content': 0.00849152822047472, 'timestamp': '2025-10-01 04:16:40.357568', 'step': 635, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:40.399476', 'step': 635, 'epoch': 1} {'type': 'loss', 'content': 0.013311270624399185, 'timestamp': '2025-10-01 04:16:40.432945', 'step': 636, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:40.473412', 'step': 636, 'epoch': 1} {'type': 'loss', 'content': 0.019073138013482094, 'timestamp': '2025-10-01 04:16:40.483770', 'step': 637, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:40.518382', 'step': 637, 'epoch': 1} {'type': 'loss', 'content': 0.014832253567874432, 'timestamp': '2025-10-01 04:16:40.530972', 'step': 638, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:40.574658', 'step': 638, 'epoch': 1} {'type': 'loss', 'content': 0.014259315095841885, 'timestamp': '2025-10-01 04:16:40.587418', 'step': 639, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:40.625594', 'step': 639, 'epoch': 1} {'type': 'loss', 'content': 0.013466636650264263, 'timestamp': '2025-10-01 04:16:40.657895', 'step': 640, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:40.697078', 'step': 640, 'epoch': 1} {'type': 'loss', 'content': 0.013605115003883839, 'timestamp': '2025-10-01 04:16:40.709967', 'step': 641, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:40.750831', 'step': 641, 'epoch': 1} {'type': 'loss', 'content': 0.01785687357187271, 'timestamp': '2025-10-01 04:16:40.764431', 'step': 642, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:40.803757', 'step': 642, 'epoch': 1} {'type': 'loss', 'content': 0.015827007591724396, 'timestamp': '2025-10-01 04:16:40.811651', 'step': 643, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:40.849249', 'step': 643, 'epoch': 1} {'type': 'loss', 'content': 0.02333347499370575, 'timestamp': '2025-10-01 04:16:40.877416', 'step': 644, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:40.909049', 'step': 644, 'epoch': 1} {'type': 'loss', 'content': 0.01258419081568718, 'timestamp': '2025-10-01 04:16:40.918239', 'step': 645, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:40.956138', 'step': 645, 'epoch': 1} {'type': 'loss', 'content': 0.009791082702577114, 'timestamp': '2025-10-01 04:16:40.968630', 'step': 646, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:41.004002', 'step': 646, 'epoch': 1} {'type': 'loss', 'content': 0.016495438292622566, 'timestamp': '2025-10-01 04:16:41.014846', 'step': 647, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:41.048512', 'step': 647, 'epoch': 1} {'type': 'loss', 'content': 0.015987880527973175, 'timestamp': '2025-10-01 04:16:41.081991', 'step': 648, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:41.123351', 'step': 648, 'epoch': 1} {'type': 'loss', 'content': 0.017873890697956085, 'timestamp': '2025-10-01 04:16:41.135340', 'step': 649, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:41.175355', 'step': 649, 'epoch': 1} {'type': 'loss', 'content': 0.017425892874598503, 'timestamp': '2025-10-01 04:16:41.187780', 'step': 650, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:41.227584', 'step': 650, 'epoch': 1} {'type': 'loss', 'content': 0.029081469401717186, 'timestamp': '2025-10-01 04:16:41.237911', 'step': 651, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:41.278025', 'step': 651, 'epoch': 1} {'type': 'loss', 'content': 0.020363079383969307, 'timestamp': '2025-10-01 04:16:41.309924', 'step': 652, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:41.343699', 'step': 652, 'epoch': 1} {'type': 'loss', 'content': 0.01601259596645832, 'timestamp': '2025-10-01 04:16:41.354396', 'step': 653, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:41.395456', 'step': 653, 'epoch': 1} {'type': 'loss', 'content': 0.020761186257004738, 'timestamp': '2025-10-01 04:16:41.408168', 'step': 654, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:41.448434', 'step': 654, 'epoch': 1} {'type': 'loss', 'content': 0.011018872261047363, 'timestamp': '2025-10-01 04:16:41.459309', 'step': 655, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:41.493384', 'step': 655, 'epoch': 1} {'type': 'loss', 'content': 0.012344557791948318, 'timestamp': '2025-10-01 04:16:41.525083', 'step': 656, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:41.558698', 'step': 656, 'epoch': 1} {'type': 'loss', 'content': 0.018967201933264732, 'timestamp': '2025-10-01 04:16:41.569084', 'step': 657, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:41.607742', 'step': 657, 'epoch': 1} {'type': 'loss', 'content': 0.016839321702718735, 'timestamp': '2025-10-01 04:16:41.620005', 'step': 658, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:41.658627', 'step': 658, 'epoch': 1} {'type': 'loss', 'content': 0.026782343164086342, 'timestamp': '2025-10-01 04:16:41.667883', 'step': 659, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:41.710340', 'step': 659, 'epoch': 1} {'type': 'loss', 'content': 0.016056621447205544, 'timestamp': '2025-10-01 04:16:41.741924', 'step': 660, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:41.776241', 'step': 660, 'epoch': 1} {'type': 'loss', 'content': 0.010630931705236435, 'timestamp': '2025-10-01 04:16:41.784693', 'step': 661, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:41.823307', 'step': 661, 'epoch': 1} {'type': 'loss', 'content': 0.009628698229789734, 'timestamp': '2025-10-01 04:16:41.836061', 'step': 662, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:16:41.876676', 'step': 662, 'epoch': 1} {'type': 'loss', 'content': 0.024163223803043365, 'timestamp': '2025-10-01 04:16:41.883845', 'step': 663, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:41.923207', 'step': 663, 'epoch': 1} {'type': 'loss', 'content': 0.037558022886514664, 'timestamp': '2025-10-01 04:16:41.956577', 'step': 664, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:41.998869', 'step': 664, 'epoch': 1} {'type': 'loss', 'content': 0.03183635696768761, 'timestamp': '2025-10-01 04:16:42.008746', 'step': 665, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:42.050003', 'step': 665, 'epoch': 1} {'type': 'loss', 'content': 0.00965924933552742, 'timestamp': '2025-10-01 04:16:42.061003', 'step': 666, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:42.109329', 'step': 666, 'epoch': 1} {'type': 'loss', 'content': 0.011774723418056965, 'timestamp': '2025-10-01 04:16:42.122906', 'step': 667, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:42.165431', 'step': 667, 'epoch': 1} {'type': 'loss', 'content': 0.023414263501763344, 'timestamp': '2025-10-01 04:16:42.196037', 'step': 668, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:42.238455', 'step': 668, 'epoch': 1} {'type': 'loss', 'content': 0.018099745735526085, 'timestamp': '2025-10-01 04:16:42.246761', 'step': 669, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:42.288115', 'step': 669, 'epoch': 1} {'type': 'loss', 'content': 0.010663410648703575, 'timestamp': '2025-10-01 04:16:42.304072', 'step': 670, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:42.346100', 'step': 670, 'epoch': 1} {'type': 'loss', 'content': 0.015877481549978256, 'timestamp': '2025-10-01 04:16:42.359608', 'step': 671, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:42.406337', 'step': 671, 'epoch': 1} {'type': 'loss', 'content': 0.010524841025471687, 'timestamp': '2025-10-01 04:16:42.441284', 'step': 672, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:42.479177', 'step': 672, 'epoch': 1} {'type': 'loss', 'content': 0.013478215783834457, 'timestamp': '2025-10-01 04:16:42.492461', 'step': 673, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:42.531172', 'step': 673, 'epoch': 1} {'type': 'loss', 'content': 0.009967412799596786, 'timestamp': '2025-10-01 04:16:42.544001', 'step': 674, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:42.577602', 'step': 674, 'epoch': 1} {'type': 'loss', 'content': 0.014554332010447979, 'timestamp': '2025-10-01 04:16:42.585773', 'step': 675, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:42.632224', 'step': 675, 'epoch': 1} {'type': 'loss', 'content': 0.011013897135853767, 'timestamp': '2025-10-01 04:16:42.667127', 'step': 676, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:42.709968', 'step': 676, 'epoch': 1} {'type': 'loss', 'content': 0.021102357655763626, 'timestamp': '2025-10-01 04:16:42.722813', 'step': 677, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:42.769325', 'step': 677, 'epoch': 1} {'type': 'loss', 'content': 0.013679328374564648, 'timestamp': '2025-10-01 04:16:42.783333', 'step': 678, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:42.816128', 'step': 678, 'epoch': 1} {'type': 'loss', 'content': 0.01489955373108387, 'timestamp': '2025-10-01 04:16:42.828527', 'step': 679, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:42.872973', 'step': 679, 'epoch': 1} {'type': 'loss', 'content': 0.016977690160274506, 'timestamp': '2025-10-01 04:16:42.901935', 'step': 680, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:16:42.940030', 'step': 680, 'epoch': 1} {'type': 'loss', 'content': 0.017208809033036232, 'timestamp': '2025-10-01 04:16:42.955294', 'step': 681, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:42.988565', 'step': 681, 'epoch': 1} {'type': 'loss', 'content': 0.01761534810066223, 'timestamp': '2025-10-01 04:16:42.999387', 'step': 682, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:43.039803', 'step': 682, 'epoch': 1} {'type': 'loss', 'content': 0.01491020992398262, 'timestamp': '2025-10-01 04:16:43.047403', 'step': 683, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:43.081454', 'step': 683, 'epoch': 1} {'type': 'loss', 'content': 0.019351888447999954, 'timestamp': '2025-10-01 04:16:43.114912', 'step': 684, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:43.149678', 'step': 684, 'epoch': 1} {'type': 'loss', 'content': 0.012286268174648285, 'timestamp': '2025-10-01 04:16:43.155605', 'step': 685, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:43.189605', 'step': 685, 'epoch': 1} {'type': 'loss', 'content': 0.020967518910765648, 'timestamp': '2025-10-01 04:16:43.202145', 'step': 686, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:43.244645', 'step': 686, 'epoch': 1} {'type': 'loss', 'content': 0.05052589625120163, 'timestamp': '2025-10-01 04:16:43.253711', 'step': 687, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:43.292630', 'step': 687, 'epoch': 1} {'type': 'loss', 'content': 0.0472000353038311, 'timestamp': '2025-10-01 04:16:43.326401', 'step': 688, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:43.369101', 'step': 688, 'epoch': 1} {'type': 'loss', 'content': 0.019349941983819008, 'timestamp': '2025-10-01 04:16:43.382442', 'step': 689, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:43.421117', 'step': 689, 'epoch': 1} {'type': 'loss', 'content': 0.018962422385811806, 'timestamp': '2025-10-01 04:16:43.432034', 'step': 690, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:16:46.021878', 'step': 690, 'epoch': 1} {'type': 'pplx', 'content': 5.590228915613709, 'timestamp': '2025-10-01 04:16:46.032467', 'step': 690, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:46.068456', 'step': 690, 'epoch': 1} {'type': 'loss', 'content': 0.02387959510087967, 'timestamp': '2025-10-01 04:16:46.078901', 'step': 691, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:46.128664', 'step': 691, 'epoch': 1} {'type': 'loss', 'content': 0.01408354938030243, 'timestamp': '2025-10-01 04:16:46.161268', 'step': 692, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:46.211395', 'step': 692, 'epoch': 1} {'type': 'loss', 'content': 0.010192803107202053, 'timestamp': '2025-10-01 04:16:46.219794', 'step': 693, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:46.263725', 'step': 693, 'epoch': 1} {'type': 'loss', 'content': 0.016929572448134422, 'timestamp': '2025-10-01 04:16:46.273748', 'step': 694, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:46.326238', 'step': 694, 'epoch': 1} {'type': 'loss', 'content': 0.013880823738873005, 'timestamp': '2025-10-01 04:16:46.337817', 'step': 695, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:16:46.396302', 'step': 695, 'epoch': 1} {'type': 'loss', 'content': 0.009154722094535828, 'timestamp': '2025-10-01 04:16:46.433616', 'step': 696, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:16:46.476627', 'step': 696, 'epoch': 1} {'type': 'loss', 'content': 0.03798308223485947, 'timestamp': '2025-10-01 04:16:46.481156', 'step': 697, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:46.530535', 'step': 697, 'epoch': 1} {'type': 'loss', 'content': 0.04562222585082054, 'timestamp': '2025-10-01 04:16:46.545418', 'step': 698, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:46.601179', 'step': 698, 'epoch': 1} {'type': 'loss', 'content': 0.01600504480302334, 'timestamp': '2025-10-01 04:16:46.609044', 'step': 699, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:46.665507', 'step': 699, 'epoch': 1} {'type': 'loss', 'content': 0.008479262702167034, 'timestamp': '2025-10-01 04:16:46.700220', 'step': 700, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:46.749052', 'step': 700, 'epoch': 1} {'type': 'loss', 'content': 0.020695393905043602, 'timestamp': '2025-10-01 04:16:46.754991', 'step': 701, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:46.801311', 'step': 701, 'epoch': 1} {'type': 'loss', 'content': 0.015564492903649807, 'timestamp': '2025-10-01 04:16:46.816024', 'step': 702, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:46.873816', 'step': 702, 'epoch': 1} {'type': 'loss', 'content': 0.011668744497001171, 'timestamp': '2025-10-01 04:16:46.886220', 'step': 703, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:46.935425', 'step': 703, 'epoch': 1} {'type': 'loss', 'content': 0.014316283166408539, 'timestamp': '2025-10-01 04:16:46.973269', 'step': 704, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:47.038620', 'step': 704, 'epoch': 1} {'type': 'loss', 'content': 0.026227593421936035, 'timestamp': '2025-10-01 04:16:47.054179', 'step': 705, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:47.108186', 'step': 705, 'epoch': 1} {'type': 'loss', 'content': 0.017259083688259125, 'timestamp': '2025-10-01 04:16:47.127678', 'step': 706, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:47.179934', 'step': 706, 'epoch': 1} {'type': 'loss', 'content': 0.011930634267628193, 'timestamp': '2025-10-01 04:16:47.187094', 'step': 707, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:47.240744', 'step': 707, 'epoch': 1} {'type': 'loss', 'content': 0.02400364726781845, 'timestamp': '2025-10-01 04:16:47.269708', 'step': 708, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:47.320004', 'step': 708, 'epoch': 1} {'type': 'loss', 'content': 0.01854260452091694, 'timestamp': '2025-10-01 04:16:47.330374', 'step': 709, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:47.377989', 'step': 709, 'epoch': 1} {'type': 'loss', 'content': 0.014384285546839237, 'timestamp': '2025-10-01 04:16:47.387931', 'step': 710, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:47.430349', 'step': 710, 'epoch': 1} {'type': 'loss', 'content': 0.02442542463541031, 'timestamp': '2025-10-01 04:16:47.438578', 'step': 711, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:47.491173', 'step': 711, 'epoch': 1} {'type': 'loss', 'content': 0.016675086691975594, 'timestamp': '2025-10-01 04:16:47.522991', 'step': 712, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:47.562276', 'step': 712, 'epoch': 1} {'type': 'loss', 'content': 0.019871501252055168, 'timestamp': '2025-10-01 04:16:47.571595', 'step': 713, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:47.623622', 'step': 713, 'epoch': 1} {'type': 'loss', 'content': 0.022324305027723312, 'timestamp': '2025-10-01 04:16:47.634368', 'step': 714, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:16:47.680209', 'step': 714, 'epoch': 1} {'type': 'loss', 'content': 0.02458564005792141, 'timestamp': '2025-10-01 04:16:47.686792', 'step': 715, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:16:47.731563', 'step': 715, 'epoch': 1} {'type': 'loss', 'content': 0.020005865022540092, 'timestamp': '2025-10-01 04:16:47.759725', 'step': 716, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:47.804235', 'step': 716, 'epoch': 1} {'type': 'loss', 'content': 0.02151688188314438, 'timestamp': '2025-10-01 04:16:47.811432', 'step': 717, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:47.856730', 'step': 717, 'epoch': 1} {'type': 'loss', 'content': 0.03326771780848503, 'timestamp': '2025-10-01 04:16:47.864129', 'step': 718, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:47.906462', 'step': 718, 'epoch': 1} {'type': 'loss', 'content': 0.03364454582333565, 'timestamp': '2025-10-01 04:16:47.914450', 'step': 719, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:47.956133', 'step': 719, 'epoch': 1} {'type': 'loss', 'content': 0.00644028652459383, 'timestamp': '2025-10-01 04:16:47.984804', 'step': 720, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:48.030631', 'step': 720, 'epoch': 1} {'type': 'loss', 'content': 0.01876862719655037, 'timestamp': '2025-10-01 04:16:48.041456', 'step': 721, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:48.080524', 'step': 721, 'epoch': 1} {'type': 'loss', 'content': 0.018277429044246674, 'timestamp': '2025-10-01 04:16:48.094222', 'step': 722, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:48.143437', 'step': 722, 'epoch': 1} {'type': 'loss', 'content': 0.020421523600816727, 'timestamp': '2025-10-01 04:16:48.156433', 'step': 723, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:48.211948', 'step': 723, 'epoch': 1} {'type': 'loss', 'content': 0.018195098266005516, 'timestamp': '2025-10-01 04:16:48.249018', 'step': 724, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:48.287307', 'step': 724, 'epoch': 1} {'type': 'loss', 'content': 0.02277027815580368, 'timestamp': '2025-10-01 04:16:48.292976', 'step': 725, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:48.337347', 'step': 725, 'epoch': 1} {'type': 'loss', 'content': 0.011157083325088024, 'timestamp': '2025-10-01 04:16:48.348525', 'step': 726, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:48.387494', 'step': 726, 'epoch': 1} {'type': 'loss', 'content': 0.011660732328891754, 'timestamp': '2025-10-01 04:16:48.399170', 'step': 727, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:48.448991', 'step': 727, 'epoch': 1} {'type': 'loss', 'content': 0.009424228221178055, 'timestamp': '2025-10-01 04:16:48.483925', 'step': 728, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:48.534080', 'step': 728, 'epoch': 1} {'type': 'loss', 'content': 0.018405731767416, 'timestamp': '2025-10-01 04:16:48.544191', 'step': 729, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:48.595956', 'step': 729, 'epoch': 1} {'type': 'loss', 'content': 0.01693171076476574, 'timestamp': '2025-10-01 04:16:48.603398', 'step': 730, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:48.650398', 'step': 730, 'epoch': 1} {'type': 'loss', 'content': 0.013470661826431751, 'timestamp': '2025-10-01 04:16:48.658265', 'step': 731, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:48.700251', 'step': 731, 'epoch': 1} {'type': 'loss', 'content': 0.026925494894385338, 'timestamp': '2025-10-01 04:16:48.732024', 'step': 732, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:48.787558', 'step': 732, 'epoch': 1} {'type': 'loss', 'content': 0.019588999450206757, 'timestamp': '2025-10-01 04:16:48.796236', 'step': 733, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:48.842583', 'step': 733, 'epoch': 1} {'type': 'loss', 'content': 0.028291838243603706, 'timestamp': '2025-10-01 04:16:48.855273', 'step': 734, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:48.902844', 'step': 734, 'epoch': 1} {'type': 'loss', 'content': 0.016895201057195663, 'timestamp': '2025-10-01 04:16:48.914569', 'step': 735, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:48.960728', 'step': 735, 'epoch': 1} {'type': 'loss', 'content': 0.01547302957624197, 'timestamp': '2025-10-01 04:16:48.989171', 'step': 736, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:16:49.040049', 'step': 736, 'epoch': 1} {'type': 'loss', 'content': 0.018005814403295517, 'timestamp': '2025-10-01 04:16:49.055381', 'step': 737, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:49.093846', 'step': 737, 'epoch': 1} {'type': 'loss', 'content': 0.013466088101267815, 'timestamp': '2025-10-01 04:16:49.102233', 'step': 738, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:49.147057', 'step': 738, 'epoch': 1} {'type': 'loss', 'content': 0.01404781173914671, 'timestamp': '2025-10-01 04:16:49.157996', 'step': 739, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:49.201692', 'step': 739, 'epoch': 1} {'type': 'loss', 'content': 0.018180960789322853, 'timestamp': '2025-10-01 04:16:49.233858', 'step': 740, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:49.272302', 'step': 740, 'epoch': 1} {'type': 'loss', 'content': 0.02159731648862362, 'timestamp': '2025-10-01 04:16:49.278207', 'step': 741, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:16:49.337013', 'step': 741, 'epoch': 1} {'type': 'loss', 'content': 0.010468915104866028, 'timestamp': '2025-10-01 04:16:49.354501', 'step': 742, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:49.394595', 'step': 742, 'epoch': 1} {'type': 'loss', 'content': 0.01538703590631485, 'timestamp': '2025-10-01 04:16:49.407118', 'step': 743, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:49.455973', 'step': 743, 'epoch': 1} {'type': 'loss', 'content': 0.019241638481616974, 'timestamp': '2025-10-01 04:16:49.485015', 'step': 744, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:49.524613', 'step': 744, 'epoch': 1} {'type': 'loss', 'content': 0.0115292863920331, 'timestamp': '2025-10-01 04:16:49.536465', 'step': 745, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:49.585545', 'step': 745, 'epoch': 1} {'type': 'loss', 'content': 0.023424144834280014, 'timestamp': '2025-10-01 04:16:49.596511', 'step': 746, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:49.644771', 'step': 746, 'epoch': 1} {'type': 'loss', 'content': 0.019222324714064598, 'timestamp': '2025-10-01 04:16:49.660002', 'step': 747, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:49.709728', 'step': 747, 'epoch': 1} {'type': 'loss', 'content': 0.013502438552677631, 'timestamp': '2025-10-01 04:16:49.738971', 'step': 748, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:49.789163', 'step': 748, 'epoch': 1} {'type': 'loss', 'content': 0.020029990002512932, 'timestamp': '2025-10-01 04:16:49.798412', 'step': 749, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:49.842375', 'step': 749, 'epoch': 1} {'type': 'loss', 'content': 0.02200963906943798, 'timestamp': '2025-10-01 04:16:49.853881', 'step': 750, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:49.895554', 'step': 750, 'epoch': 1} {'type': 'loss', 'content': 0.0070251948200166225, 'timestamp': '2025-10-01 04:16:49.909998', 'step': 751, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:49.959860', 'step': 751, 'epoch': 1} {'type': 'loss', 'content': 0.012174601666629314, 'timestamp': '2025-10-01 04:16:49.994767', 'step': 752, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:50.041496', 'step': 752, 'epoch': 1} {'type': 'loss', 'content': 0.014177221804857254, 'timestamp': '2025-10-01 04:16:50.052138', 'step': 753, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:50.091792', 'step': 753, 'epoch': 1} {'type': 'loss', 'content': 0.012754111550748348, 'timestamp': '2025-10-01 04:16:50.103525', 'step': 754, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:50.149295', 'step': 754, 'epoch': 1} {'type': 'loss', 'content': 0.01481959130614996, 'timestamp': '2025-10-01 04:16:50.160832', 'step': 755, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:50.203182', 'step': 755, 'epoch': 1} {'type': 'loss', 'content': 0.018928740173578262, 'timestamp': '2025-10-01 04:16:50.236878', 'step': 756, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:50.290444', 'step': 756, 'epoch': 1} {'type': 'loss', 'content': 0.027050744742155075, 'timestamp': '2025-10-01 04:16:50.302584', 'step': 757, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:50.361133', 'step': 757, 'epoch': 1} {'type': 'loss', 'content': 0.016713252291083336, 'timestamp': '2025-10-01 04:16:50.372014', 'step': 758, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:50.461976', 'step': 758, 'epoch': 1} {'type': 'loss', 'content': 0.006769175641238689, 'timestamp': '2025-10-01 04:16:50.476038', 'step': 759, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:50.541439', 'step': 759, 'epoch': 1} {'type': 'loss', 'content': 0.020681047812104225, 'timestamp': '2025-10-01 04:16:50.576442', 'step': 760, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:50.637511', 'step': 760, 'epoch': 1} {'type': 'loss', 'content': 0.01555036474019289, 'timestamp': '2025-10-01 04:16:50.650756', 'step': 761, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:50.701531', 'step': 761, 'epoch': 1} {'type': 'loss', 'content': 0.025080092251300812, 'timestamp': '2025-10-01 04:16:50.713065', 'step': 762, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:50.769715', 'step': 762, 'epoch': 1} {'type': 'loss', 'content': 0.008333074860274792, 'timestamp': '2025-10-01 04:16:50.783658', 'step': 763, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:50.835984', 'step': 763, 'epoch': 1} {'type': 'loss', 'content': 0.019820524379611015, 'timestamp': '2025-10-01 04:16:50.868280', 'step': 764, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:50.915608', 'step': 764, 'epoch': 1} {'type': 'loss', 'content': 0.015321627259254456, 'timestamp': '2025-10-01 04:16:50.925749', 'step': 765, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 18984411776512}, 'timestamp': '2025-10-01 04:16:50.993296', 'step': 765, 'epoch': 1} {'type': 'loss', 'content': 0.007602608762681484, 'timestamp': '2025-10-01 04:16:51.015214', 'step': 766, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:51.067961', 'step': 766, 'epoch': 1} {'type': 'loss', 'content': 0.010044293478131294, 'timestamp': '2025-10-01 04:16:51.078853', 'step': 767, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:16:51.121593', 'step': 767, 'epoch': 1} {'type': 'loss', 'content': 0.01247943565249443, 'timestamp': '2025-10-01 04:16:51.153925', 'step': 768, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:51.207693', 'step': 768, 'epoch': 1} {'type': 'loss', 'content': 0.043616537004709244, 'timestamp': '2025-10-01 04:16:51.212667', 'step': 769, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:51.258601', 'step': 769, 'epoch': 1} {'type': 'loss', 'content': 0.012262927368283272, 'timestamp': '2025-10-01 04:16:51.271651', 'step': 770, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:51.317527', 'step': 770, 'epoch': 1} {'type': 'loss', 'content': 0.022417211905121803, 'timestamp': '2025-10-01 04:16:51.325025', 'step': 771, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:51.384267', 'step': 771, 'epoch': 1} {'type': 'loss', 'content': 0.014431762509047985, 'timestamp': '2025-10-01 04:16:51.419271', 'step': 772, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:51.469547', 'step': 772, 'epoch': 1} {'type': 'loss', 'content': 0.015046104788780212, 'timestamp': '2025-10-01 04:16:51.477961', 'step': 773, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:51.539215', 'step': 773, 'epoch': 1} {'type': 'loss', 'content': 0.010180965065956116, 'timestamp': '2025-10-01 04:16:51.552833', 'step': 774, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:51.610234', 'step': 774, 'epoch': 1} {'type': 'loss', 'content': 0.010857229121029377, 'timestamp': '2025-10-01 04:16:51.623808', 'step': 775, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:51.686926', 'step': 775, 'epoch': 1} {'type': 'loss', 'content': 0.03315185382962227, 'timestamp': '2025-10-01 04:16:51.721880', 'step': 776, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:51.777469', 'step': 776, 'epoch': 1} {'type': 'loss', 'content': 0.01910868100821972, 'timestamp': '2025-10-01 04:16:51.791013', 'step': 777, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:51.841613', 'step': 777, 'epoch': 1} {'type': 'loss', 'content': 0.018048323690891266, 'timestamp': '2025-10-01 04:16:51.849944', 'step': 778, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:51.899101', 'step': 778, 'epoch': 1} {'type': 'loss', 'content': 0.020141078159213066, 'timestamp': '2025-10-01 04:16:51.910884', 'step': 779, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:51.965360', 'step': 779, 'epoch': 1} {'type': 'loss', 'content': 0.025138767436146736, 'timestamp': '2025-10-01 04:16:51.999082', 'step': 780, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:16:52.051163', 'step': 780, 'epoch': 1} {'type': 'loss', 'content': 0.010838349349796772, 'timestamp': '2025-10-01 04:16:52.066772', 'step': 781, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:52.112884', 'step': 781, 'epoch': 1} {'type': 'loss', 'content': 0.0244025569409132, 'timestamp': '2025-10-01 04:16:52.121181', 'step': 782, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:52.169650', 'step': 782, 'epoch': 1} {'type': 'loss', 'content': 0.015856366604566574, 'timestamp': '2025-10-01 04:16:52.183224', 'step': 783, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:52.243341', 'step': 783, 'epoch': 1} {'type': 'loss', 'content': 0.017230717465281487, 'timestamp': '2025-10-01 04:16:52.279799', 'step': 784, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:52.336079', 'step': 784, 'epoch': 1} {'type': 'loss', 'content': 0.010687578469514847, 'timestamp': '2025-10-01 04:16:52.353364', 'step': 785, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:52.411608', 'step': 785, 'epoch': 1} {'type': 'loss', 'content': 0.02089807391166687, 'timestamp': '2025-10-01 04:16:52.429236', 'step': 786, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-10-01 04:16:52.502687', 'step': 786, 'epoch': 1} {'type': 'loss', 'content': 0.006695413496345282, 'timestamp': '2025-10-01 04:16:52.521981', 'step': 787, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:52.566735', 'step': 787, 'epoch': 1} {'type': 'loss', 'content': 0.03935248777270317, 'timestamp': '2025-10-01 04:16:52.605832', 'step': 788, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:52.665618', 'step': 788, 'epoch': 1} {'type': 'loss', 'content': 0.012783926911652088, 'timestamp': '2025-10-01 04:16:52.683073', 'step': 789, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:52.744331', 'step': 789, 'epoch': 1} {'type': 'loss', 'content': 0.023558177053928375, 'timestamp': '2025-10-01 04:16:52.760007', 'step': 790, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:52.811346', 'step': 790, 'epoch': 1} {'type': 'loss', 'content': 0.022504406049847603, 'timestamp': '2025-10-01 04:16:52.825877', 'step': 791, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:52.879743', 'step': 791, 'epoch': 1} {'type': 'loss', 'content': 0.024039259180426598, 'timestamp': '2025-10-01 04:16:52.909129', 'step': 792, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:52.959127', 'step': 792, 'epoch': 1} {'type': 'loss', 'content': 0.015601268038153648, 'timestamp': '2025-10-01 04:16:52.970863', 'step': 793, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:53.019640', 'step': 793, 'epoch': 1} {'type': 'loss', 'content': 0.017442021518945694, 'timestamp': '2025-10-01 04:16:53.036907', 'step': 794, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:53.090804', 'step': 794, 'epoch': 1} {'type': 'loss', 'content': 0.030175672844052315, 'timestamp': '2025-10-01 04:16:53.104335', 'step': 795, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:53.162158', 'step': 795, 'epoch': 1} {'type': 'loss', 'content': 0.01763024739921093, 'timestamp': '2025-10-01 04:16:53.196884', 'step': 796, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:53.258307', 'step': 796, 'epoch': 1} {'type': 'loss', 'content': 0.02012578956782818, 'timestamp': '2025-10-01 04:16:53.273641', 'step': 797, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:16:53.334380', 'step': 797, 'epoch': 1} {'type': 'loss', 'content': 0.03616732731461525, 'timestamp': '2025-10-01 04:16:53.341774', 'step': 798, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:53.393400', 'step': 798, 'epoch': 1} {'type': 'loss', 'content': 0.03826550394296646, 'timestamp': '2025-10-01 04:16:53.401622', 'step': 799, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:53.437559', 'step': 799, 'epoch': 1} {'type': 'loss', 'content': 0.01905038207769394, 'timestamp': '2025-10-01 04:16:53.466176', 'step': 800, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:53.520059', 'step': 800, 'epoch': 1} {'type': 'loss', 'content': 0.02519439533352852, 'timestamp': '2025-10-01 04:16:53.532704', 'step': 801, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:53.588249', 'step': 801, 'epoch': 1} {'type': 'loss', 'content': 0.026725957170128822, 'timestamp': '2025-10-01 04:16:53.596497', 'step': 802, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:53.645263', 'step': 802, 'epoch': 1} {'type': 'loss', 'content': 0.03588542342185974, 'timestamp': '2025-10-01 04:16:53.657753', 'step': 803, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:53.701955', 'step': 803, 'epoch': 1} {'type': 'loss', 'content': 0.016685785725712776, 'timestamp': '2025-10-01 04:16:53.734763', 'step': 804, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:53.777890', 'step': 804, 'epoch': 1} {'type': 'loss', 'content': 0.020222829654812813, 'timestamp': '2025-10-01 04:16:53.786299', 'step': 805, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:16:56.628165', 'step': 805, 'epoch': 1} {'type': 'pplx', 'content': 5.506343331262317, 'timestamp': '2025-10-01 04:16:56.637718', 'step': 805, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:56.686580', 'step': 805, 'epoch': 1} {'type': 'loss', 'content': 0.013610522262752056, 'timestamp': '2025-10-01 04:16:56.693583', 'step': 806, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:16:56.758435', 'step': 806, 'epoch': 1} {'type': 'loss', 'content': 0.01297046709805727, 'timestamp': '2025-10-01 04:16:56.774122', 'step': 807, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:56.836687', 'step': 807, 'epoch': 1} {'type': 'loss', 'content': 0.02586032636463642, 'timestamp': '2025-10-01 04:16:56.865080', 'step': 808, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:56.919604', 'step': 808, 'epoch': 1} {'type': 'loss', 'content': 0.012200028635561466, 'timestamp': '2025-10-01 04:16:56.933057', 'step': 809, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:16:57.002392', 'step': 809, 'epoch': 1} {'type': 'loss', 'content': 0.006216016598045826, 'timestamp': '2025-10-01 04:16:57.019912', 'step': 810, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:57.073183', 'step': 810, 'epoch': 1} {'type': 'loss', 'content': 0.01763732358813286, 'timestamp': '2025-10-01 04:16:57.081412', 'step': 811, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:57.138179', 'step': 811, 'epoch': 1} {'type': 'loss', 'content': 0.008464297279715538, 'timestamp': '2025-10-01 04:16:57.169682', 'step': 812, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:57.227928', 'step': 812, 'epoch': 1} {'type': 'loss', 'content': 0.010054657235741615, 'timestamp': '2025-10-01 04:16:57.241191', 'step': 813, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:57.292434', 'step': 813, 'epoch': 1} {'type': 'loss', 'content': 0.01683088019490242, 'timestamp': '2025-10-01 04:16:57.303947', 'step': 814, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:57.369339', 'step': 814, 'epoch': 1} {'type': 'loss', 'content': 0.02320648543536663, 'timestamp': '2025-10-01 04:16:57.377587', 'step': 815, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:57.444015', 'step': 815, 'epoch': 1} {'type': 'loss', 'content': 0.014246467500925064, 'timestamp': '2025-10-01 04:16:57.479115', 'step': 816, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:57.544796', 'step': 816, 'epoch': 1} {'type': 'loss', 'content': 0.00997338816523552, 'timestamp': '2025-10-01 04:16:57.555726', 'step': 817, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:57.602361', 'step': 817, 'epoch': 1} {'type': 'loss', 'content': 0.015141251496970654, 'timestamp': '2025-10-01 04:16:57.613969', 'step': 818, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:16:57.664603', 'step': 818, 'epoch': 1} {'type': 'loss', 'content': 0.013046610169112682, 'timestamp': '2025-10-01 04:16:57.678809', 'step': 819, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:57.732154', 'step': 819, 'epoch': 1} {'type': 'loss', 'content': 0.013539626263082027, 'timestamp': '2025-10-01 04:16:57.760956', 'step': 820, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:57.796916', 'step': 820, 'epoch': 1} {'type': 'loss', 'content': 0.01382614765316248, 'timestamp': '2025-10-01 04:16:57.807654', 'step': 821, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:57.864689', 'step': 821, 'epoch': 1} {'type': 'loss', 'content': 0.024039553478360176, 'timestamp': '2025-10-01 04:16:57.872718', 'step': 822, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:57.922365', 'step': 822, 'epoch': 1} {'type': 'loss', 'content': 0.022881977260112762, 'timestamp': '2025-10-01 04:16:57.930206', 'step': 823, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:57.981997', 'step': 823, 'epoch': 1} {'type': 'loss', 'content': 0.010355522856116295, 'timestamp': '2025-10-01 04:16:58.013796', 'step': 824, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:58.058419', 'step': 824, 'epoch': 1} {'type': 'loss', 'content': 0.01690341904759407, 'timestamp': '2025-10-01 04:16:58.064285', 'step': 825, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:58.115617', 'step': 825, 'epoch': 1} {'type': 'loss', 'content': 0.010961581952869892, 'timestamp': '2025-10-01 04:16:58.128351', 'step': 826, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:58.183127', 'step': 826, 'epoch': 1} {'type': 'loss', 'content': 0.014490699395537376, 'timestamp': '2025-10-01 04:16:58.195892', 'step': 827, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:58.238275', 'step': 827, 'epoch': 1} {'type': 'loss', 'content': 0.00930944923311472, 'timestamp': '2025-10-01 04:16:58.270758', 'step': 828, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:58.315947', 'step': 828, 'epoch': 1} {'type': 'loss', 'content': 0.022879360243678093, 'timestamp': '2025-10-01 04:16:58.321654', 'step': 829, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:58.369404', 'step': 829, 'epoch': 1} {'type': 'loss', 'content': 0.018127089366316795, 'timestamp': '2025-10-01 04:16:58.380902', 'step': 830, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:58.450617', 'step': 830, 'epoch': 1} {'type': 'loss', 'content': 0.012494026683270931, 'timestamp': '2025-10-01 04:16:58.464606', 'step': 831, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:58.516594', 'step': 831, 'epoch': 1} {'type': 'loss', 'content': 0.014895457774400711, 'timestamp': '2025-10-01 04:16:58.548098', 'step': 832, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:58.591393', 'step': 832, 'epoch': 1} {'type': 'loss', 'content': 0.019058866426348686, 'timestamp': '2025-10-01 04:16:58.597347', 'step': 833, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:58.641232', 'step': 833, 'epoch': 1} {'type': 'loss', 'content': 0.009113918989896774, 'timestamp': '2025-10-01 04:16:58.652630', 'step': 834, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:58.695204', 'step': 834, 'epoch': 1} {'type': 'loss', 'content': 0.030856117606163025, 'timestamp': '2025-10-01 04:16:58.705829', 'step': 835, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:58.747057', 'step': 835, 'epoch': 1} {'type': 'loss', 'content': 0.010709251277148724, 'timestamp': '2025-10-01 04:16:58.780513', 'step': 836, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:58.826237', 'step': 836, 'epoch': 1} {'type': 'loss', 'content': 0.015633676201105118, 'timestamp': '2025-10-01 04:16:58.835521', 'step': 837, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:58.888643', 'step': 837, 'epoch': 1} {'type': 'loss', 'content': 0.02038966305553913, 'timestamp': '2025-10-01 04:16:58.896958', 'step': 838, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:16:58.951483', 'step': 838, 'epoch': 1} {'type': 'loss', 'content': 0.011192546226084232, 'timestamp': '2025-10-01 04:16:58.959094', 'step': 839, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:58.999646', 'step': 839, 'epoch': 1} {'type': 'loss', 'content': 0.012934654019773006, 'timestamp': '2025-10-01 04:16:59.028443', 'step': 840, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:59.065463', 'step': 840, 'epoch': 1} {'type': 'loss', 'content': 0.017935598269104958, 'timestamp': '2025-10-01 04:16:59.075675', 'step': 841, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:59.131698', 'step': 841, 'epoch': 1} {'type': 'loss', 'content': 0.009617184288799763, 'timestamp': '2025-10-01 04:16:59.144497', 'step': 842, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:59.184974', 'step': 842, 'epoch': 1} {'type': 'loss', 'content': 0.013499625958502293, 'timestamp': '2025-10-01 04:16:59.199046', 'step': 843, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:16:59.243195', 'step': 843, 'epoch': 1} {'type': 'loss', 'content': 0.023238593712449074, 'timestamp': '2025-10-01 04:16:59.272112', 'step': 844, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:59.323694', 'step': 844, 'epoch': 1} {'type': 'loss', 'content': 0.008817454800009727, 'timestamp': '2025-10-01 04:16:59.329558', 'step': 845, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:16:59.380849', 'step': 845, 'epoch': 1} {'type': 'loss', 'content': 0.021097274497151375, 'timestamp': '2025-10-01 04:16:59.391535', 'step': 846, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:16:59.436366', 'step': 846, 'epoch': 1} {'type': 'loss', 'content': 0.009904223494231701, 'timestamp': '2025-10-01 04:16:59.447777', 'step': 847, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:16:59.493859', 'step': 847, 'epoch': 1} {'type': 'loss', 'content': 0.022737471386790276, 'timestamp': '2025-10-01 04:16:59.523005', 'step': 848, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 15662185694400}, 'timestamp': '2025-10-01 04:16:59.586329', 'step': 848, 'epoch': 1} {'type': 'loss', 'content': 0.013106442987918854, 'timestamp': '2025-10-01 04:16:59.602347', 'step': 849, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:16:59.653626', 'step': 849, 'epoch': 1} {'type': 'loss', 'content': 0.008213101886212826, 'timestamp': '2025-10-01 04:16:59.666394', 'step': 850, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:16:59.723526', 'step': 850, 'epoch': 1} {'type': 'loss', 'content': 0.01002099085599184, 'timestamp': '2025-10-01 04:16:59.737571', 'step': 851, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:16:59.774629', 'step': 851, 'epoch': 1} {'type': 'loss', 'content': 0.011630872264504433, 'timestamp': '2025-10-01 04:16:59.809107', 'step': 852, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:59.860829', 'step': 852, 'epoch': 1} {'type': 'loss', 'content': 0.01773335039615631, 'timestamp': '2025-10-01 04:16:59.871095', 'step': 853, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:16:59.920431', 'step': 853, 'epoch': 1} {'type': 'loss', 'content': 0.02595398761332035, 'timestamp': '2025-10-01 04:16:59.934433', 'step': 854, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:16:59.979215', 'step': 854, 'epoch': 1} {'type': 'loss', 'content': 0.022645751014351845, 'timestamp': '2025-10-01 04:16:59.991751', 'step': 855, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:00.034793', 'step': 855, 'epoch': 1} {'type': 'loss', 'content': 0.016527969390153885, 'timestamp': '2025-10-01 04:17:00.068253', 'step': 856, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:00.113548', 'step': 856, 'epoch': 1} {'type': 'loss', 'content': 0.03069513663649559, 'timestamp': '2025-10-01 04:17:00.124023', 'step': 857, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:00.163248', 'step': 857, 'epoch': 1} {'type': 'loss', 'content': 0.01643683761358261, 'timestamp': '2025-10-01 04:17:00.175776', 'step': 858, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:00.225374', 'step': 858, 'epoch': 1} {'type': 'loss', 'content': 0.023538677021861076, 'timestamp': '2025-10-01 04:17:00.239546', 'step': 859, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:00.294037', 'step': 859, 'epoch': 1} {'type': 'loss', 'content': 0.013248346745967865, 'timestamp': '2025-10-01 04:17:00.323081', 'step': 860, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:00.373231', 'step': 860, 'epoch': 1} {'type': 'loss', 'content': 0.008704650215804577, 'timestamp': '2025-10-01 04:17:00.386739', 'step': 861, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:00.438838', 'step': 861, 'epoch': 1} {'type': 'loss', 'content': 0.02060210146009922, 'timestamp': '2025-10-01 04:17:00.451534', 'step': 862, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:00.506149', 'step': 862, 'epoch': 1} {'type': 'loss', 'content': 0.01663345843553543, 'timestamp': '2025-10-01 04:17:00.513897', 'step': 863, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:00.580383', 'step': 863, 'epoch': 1} {'type': 'loss', 'content': 0.022348545491695404, 'timestamp': '2025-10-01 04:17:00.609595', 'step': 864, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:00.665330', 'step': 864, 'epoch': 1} {'type': 'loss', 'content': 0.012999248690903187, 'timestamp': '2025-10-01 04:17:00.676098', 'step': 865, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:00.725164', 'step': 865, 'epoch': 1} {'type': 'loss', 'content': 0.01441412977874279, 'timestamp': '2025-10-01 04:17:00.739113', 'step': 866, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:00.785504', 'step': 866, 'epoch': 1} {'type': 'loss', 'content': 0.01226433739066124, 'timestamp': '2025-10-01 04:17:00.798075', 'step': 867, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:00.857220', 'step': 867, 'epoch': 1} {'type': 'loss', 'content': 0.013449556194245815, 'timestamp': '2025-10-01 04:17:00.890639', 'step': 868, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:00.954156', 'step': 868, 'epoch': 1} {'type': 'loss', 'content': 0.018676169216632843, 'timestamp': '2025-10-01 04:17:00.964424', 'step': 869, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:01.013305', 'step': 869, 'epoch': 1} {'type': 'loss', 'content': 0.016962604597210884, 'timestamp': '2025-10-01 04:17:01.025791', 'step': 870, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:01.076720', 'step': 870, 'epoch': 1} {'type': 'loss', 'content': 0.018597086891531944, 'timestamp': '2025-10-01 04:17:01.088050', 'step': 871, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:01.137628', 'step': 871, 'epoch': 1} {'type': 'loss', 'content': 0.023726342245936394, 'timestamp': '2025-10-01 04:17:01.166351', 'step': 872, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:01.217056', 'step': 872, 'epoch': 1} {'type': 'loss', 'content': 0.01320921815931797, 'timestamp': '2025-10-01 04:17:01.230598', 'step': 873, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:01.276518', 'step': 873, 'epoch': 1} {'type': 'loss', 'content': 0.015139179304242134, 'timestamp': '2025-10-01 04:17:01.290489', 'step': 874, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:01.333494', 'step': 874, 'epoch': 1} {'type': 'loss', 'content': 0.021889282390475273, 'timestamp': '2025-10-01 04:17:01.346208', 'step': 875, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:01.398257', 'step': 875, 'epoch': 1} {'type': 'loss', 'content': 0.019694596529006958, 'timestamp': '2025-10-01 04:17:01.432782', 'step': 876, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:01.480199', 'step': 876, 'epoch': 1} {'type': 'loss', 'content': 0.012034914456307888, 'timestamp': '2025-10-01 04:17:01.490306', 'step': 877, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:01.533801', 'step': 877, 'epoch': 1} {'type': 'loss', 'content': 0.018297376111149788, 'timestamp': '2025-10-01 04:17:01.544475', 'step': 878, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:01.591910', 'step': 878, 'epoch': 1} {'type': 'loss', 'content': 0.02078056149184704, 'timestamp': '2025-10-01 04:17:01.599494', 'step': 879, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:01.650929', 'step': 879, 'epoch': 1} {'type': 'loss', 'content': 0.008642885833978653, 'timestamp': '2025-10-01 04:17:01.686066', 'step': 880, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:01.733240', 'step': 880, 'epoch': 1} {'type': 'loss', 'content': 0.027024896815419197, 'timestamp': '2025-10-01 04:17:01.743408', 'step': 881, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:01.792443', 'step': 881, 'epoch': 1} {'type': 'loss', 'content': 0.014238250441849232, 'timestamp': '2025-10-01 04:17:01.806498', 'step': 882, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:01.860601', 'step': 882, 'epoch': 1} {'type': 'loss', 'content': 0.012963378801941872, 'timestamp': '2025-10-01 04:17:01.873051', 'step': 883, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:01.922458', 'step': 883, 'epoch': 1} {'type': 'loss', 'content': 0.01333408709615469, 'timestamp': '2025-10-01 04:17:01.954759', 'step': 884, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:02.015033', 'step': 884, 'epoch': 1} {'type': 'loss', 'content': 0.025041280314326286, 'timestamp': '2025-10-01 04:17:02.023282', 'step': 885, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:02.075051', 'step': 885, 'epoch': 1} {'type': 'loss', 'content': 0.011433732695877552, 'timestamp': '2025-10-01 04:17:02.089027', 'step': 886, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:02.126209', 'step': 886, 'epoch': 1} {'type': 'loss', 'content': 0.013294695876538754, 'timestamp': '2025-10-01 04:17:02.138692', 'step': 887, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:17:02.193778', 'step': 887, 'epoch': 1} {'type': 'loss', 'content': 0.012769946828484535, 'timestamp': '2025-10-01 04:17:02.230450', 'step': 888, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:02.277528', 'step': 888, 'epoch': 1} {'type': 'loss', 'content': 0.016544749960303307, 'timestamp': '2025-10-01 04:17:02.288317', 'step': 889, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:02.336120', 'step': 889, 'epoch': 1} {'type': 'loss', 'content': 0.017434030771255493, 'timestamp': '2025-10-01 04:17:02.348618', 'step': 890, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:02.411348', 'step': 890, 'epoch': 1} {'type': 'loss', 'content': 0.011312839575111866, 'timestamp': '2025-10-01 04:17:02.418916', 'step': 891, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:02.465544', 'step': 891, 'epoch': 1} {'type': 'loss', 'content': 0.014835567213594913, 'timestamp': '2025-10-01 04:17:02.500177', 'step': 892, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:02.560548', 'step': 892, 'epoch': 1} {'type': 'loss', 'content': 0.01765669323503971, 'timestamp': '2025-10-01 04:17:02.570972', 'step': 893, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:02.621715', 'step': 893, 'epoch': 1} {'type': 'loss', 'content': 0.011812472715973854, 'timestamp': '2025-10-01 04:17:02.639278', 'step': 894, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:02.685457', 'step': 894, 'epoch': 1} {'type': 'loss', 'content': 0.0198637954890728, 'timestamp': '2025-10-01 04:17:02.696119', 'step': 895, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:02.744363', 'step': 895, 'epoch': 1} {'type': 'loss', 'content': 0.012054824270308018, 'timestamp': '2025-10-01 04:17:02.776769', 'step': 896, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:02.852239', 'step': 896, 'epoch': 1} {'type': 'loss', 'content': 0.014130085706710815, 'timestamp': '2025-10-01 04:17:02.863296', 'step': 897, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:02.927178', 'step': 897, 'epoch': 1} {'type': 'loss', 'content': 0.011421571485698223, 'timestamp': '2025-10-01 04:17:02.940682', 'step': 898, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:02.997291', 'step': 898, 'epoch': 1} {'type': 'loss', 'content': 0.01976010575890541, 'timestamp': '2025-10-01 04:17:03.008424', 'step': 899, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:03.054085', 'step': 899, 'epoch': 1} {'type': 'loss', 'content': 0.01844518817961216, 'timestamp': '2025-10-01 04:17:03.088387', 'step': 900, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:03.131689', 'step': 900, 'epoch': 1} {'type': 'loss', 'content': 0.020613864064216614, 'timestamp': '2025-10-01 04:17:03.140643', 'step': 901, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:03.217610', 'step': 901, 'epoch': 1} {'type': 'loss', 'content': 0.021135471761226654, 'timestamp': '2025-10-01 04:17:03.234275', 'step': 902, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:03.287691', 'step': 902, 'epoch': 1} {'type': 'loss', 'content': 0.01764553040266037, 'timestamp': '2025-10-01 04:17:03.299816', 'step': 903, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:03.361013', 'step': 903, 'epoch': 1} {'type': 'loss', 'content': 0.016472624614834785, 'timestamp': '2025-10-01 04:17:03.395577', 'step': 904, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:03.462188', 'step': 904, 'epoch': 1} {'type': 'loss', 'content': 0.01109653152525425, 'timestamp': '2025-10-01 04:17:03.475408', 'step': 905, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:03.531966', 'step': 905, 'epoch': 1} {'type': 'loss', 'content': 0.01725088804960251, 'timestamp': '2025-10-01 04:17:03.548581', 'step': 906, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:03.601080', 'step': 906, 'epoch': 1} {'type': 'loss', 'content': 0.01417796965688467, 'timestamp': '2025-10-01 04:17:03.616965', 'step': 907, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:03.665270', 'step': 907, 'epoch': 1} {'type': 'loss', 'content': 0.014167236164212227, 'timestamp': '2025-10-01 04:17:03.689635', 'step': 908, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:03.734564', 'step': 908, 'epoch': 1} {'type': 'loss', 'content': 0.015330505557358265, 'timestamp': '2025-10-01 04:17:03.739814', 'step': 909, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:03.782773', 'step': 909, 'epoch': 1} {'type': 'loss', 'content': 0.026079654693603516, 'timestamp': '2025-10-01 04:17:03.790711', 'step': 910, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:03.831112', 'step': 910, 'epoch': 1} {'type': 'loss', 'content': 0.017369648441672325, 'timestamp': '2025-10-01 04:17:03.839328', 'step': 911, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:03.888409', 'step': 911, 'epoch': 1} {'type': 'loss', 'content': 0.019596360623836517, 'timestamp': '2025-10-01 04:17:03.920212', 'step': 912, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:03.962598', 'step': 912, 'epoch': 1} {'type': 'loss', 'content': 0.0161263570189476, 'timestamp': '2025-10-01 04:17:03.971014', 'step': 913, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:04.016668', 'step': 913, 'epoch': 1} {'type': 'loss', 'content': 0.016215099021792412, 'timestamp': '2025-10-01 04:17:04.024971', 'step': 914, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:17:04.085832', 'step': 914, 'epoch': 1} {'type': 'loss', 'content': 0.00881049782037735, 'timestamp': '2025-10-01 04:17:04.103755', 'step': 915, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:04.142198', 'step': 915, 'epoch': 1} {'type': 'loss', 'content': 0.01635187491774559, 'timestamp': '2025-10-01 04:17:04.175930', 'step': 916, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:04.226691', 'step': 916, 'epoch': 1} {'type': 'loss', 'content': 0.017244182527065277, 'timestamp': '2025-10-01 04:17:04.239463', 'step': 917, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:04.284030', 'step': 917, 'epoch': 1} {'type': 'loss', 'content': 0.015881482511758804, 'timestamp': '2025-10-01 04:17:04.294867', 'step': 918, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:04.335369', 'step': 918, 'epoch': 1} {'type': 'loss', 'content': 0.013053612783551216, 'timestamp': '2025-10-01 04:17:04.346254', 'step': 919, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:04.398902', 'step': 919, 'epoch': 1} {'type': 'loss', 'content': 0.008609047159552574, 'timestamp': '2025-10-01 04:17:04.428671', 'step': 920, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:17:07.467111', 'step': 920, 'epoch': 1} {'type': 'pplx', 'content': 5.5432861083425236, 'timestamp': '2025-10-01 04:17:07.473831', 'step': 920, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:07.512595', 'step': 920, 'epoch': 1} {'type': 'loss', 'content': 0.016901895403862, 'timestamp': '2025-10-01 04:17:07.519443', 'step': 921, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:07.576793', 'step': 921, 'epoch': 1} {'type': 'loss', 'content': 0.008426344953477383, 'timestamp': '2025-10-01 04:17:07.589320', 'step': 922, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:17:07.644218', 'step': 922, 'epoch': 1} {'type': 'loss', 'content': 0.007949614897370338, 'timestamp': '2025-10-01 04:17:07.661494', 'step': 923, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:07.708683', 'step': 923, 'epoch': 1} {'type': 'loss', 'content': 0.005497085861861706, 'timestamp': '2025-10-01 04:17:07.740427', 'step': 924, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:07.785847', 'step': 924, 'epoch': 1} {'type': 'loss', 'content': 0.01417266670614481, 'timestamp': '2025-10-01 04:17:07.791323', 'step': 925, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:07.835689', 'step': 925, 'epoch': 1} {'type': 'loss', 'content': 0.0099391620606184, 'timestamp': '2025-10-01 04:17:07.843213', 'step': 926, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:07.876083', 'step': 926, 'epoch': 1} {'type': 'loss', 'content': 0.020078355446457863, 'timestamp': '2025-10-01 04:17:07.886651', 'step': 927, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:07.933165', 'step': 927, 'epoch': 1} {'type': 'loss', 'content': 0.04026108607649803, 'timestamp': '2025-10-01 04:17:07.959081', 'step': 928, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:08.007057', 'step': 928, 'epoch': 1} {'type': 'loss', 'content': 0.017475688830018044, 'timestamp': '2025-10-01 04:17:08.012287', 'step': 929, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:08.062000', 'step': 929, 'epoch': 1} {'type': 'loss', 'content': 0.013841044157743454, 'timestamp': '2025-10-01 04:17:08.075949', 'step': 930, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:08.128162', 'step': 930, 'epoch': 1} {'type': 'loss', 'content': 0.005659683607518673, 'timestamp': '2025-10-01 04:17:08.135473', 'step': 931, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:08.176307', 'step': 931, 'epoch': 1} {'type': 'loss', 'content': 0.01997227594256401, 'timestamp': '2025-10-01 04:17:08.205512', 'step': 932, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:08.246782', 'step': 932, 'epoch': 1} {'type': 'loss', 'content': 0.007908342406153679, 'timestamp': '2025-10-01 04:17:08.257696', 'step': 933, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:08.302800', 'step': 933, 'epoch': 1} {'type': 'loss', 'content': 0.020705798640847206, 'timestamp': '2025-10-01 04:17:08.307750', 'step': 934, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:08.351793', 'step': 934, 'epoch': 1} {'type': 'loss', 'content': 0.009327217936515808, 'timestamp': '2025-10-01 04:17:08.359345', 'step': 935, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:08.406697', 'step': 935, 'epoch': 1} {'type': 'loss', 'content': 0.010983669199049473, 'timestamp': '2025-10-01 04:17:08.441215', 'step': 936, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:17:08.487841', 'step': 936, 'epoch': 1} {'type': 'loss', 'content': 0.008349429816007614, 'timestamp': '2025-10-01 04:17:08.503428', 'step': 937, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:08.544270', 'step': 937, 'epoch': 1} {'type': 'loss', 'content': 0.016281604766845703, 'timestamp': '2025-10-01 04:17:08.555067', 'step': 938, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:08.600379', 'step': 938, 'epoch': 1} {'type': 'loss', 'content': 0.013434935361146927, 'timestamp': '2025-10-01 04:17:08.614289', 'step': 939, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:17:08.669545', 'step': 939, 'epoch': 1} {'type': 'loss', 'content': 0.012434078380465508, 'timestamp': '2025-10-01 04:17:08.707761', 'step': 940, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:08.753383', 'step': 940, 'epoch': 1} {'type': 'loss', 'content': 0.020199837163090706, 'timestamp': '2025-10-01 04:17:08.763627', 'step': 941, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:08.812277', 'step': 941, 'epoch': 1} {'type': 'loss', 'content': 0.014172638766467571, 'timestamp': '2025-10-01 04:17:08.823009', 'step': 942, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:08.875626', 'step': 942, 'epoch': 1} {'type': 'loss', 'content': 0.012070458382368088, 'timestamp': '2025-10-01 04:17:08.889631', 'step': 943, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:08.933683', 'step': 943, 'epoch': 1} {'type': 'loss', 'content': 0.026408003643155098, 'timestamp': '2025-10-01 04:17:08.965595', 'step': 944, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:09.008811', 'step': 944, 'epoch': 1} {'type': 'loss', 'content': 0.015459176152944565, 'timestamp': '2025-10-01 04:17:09.017304', 'step': 945, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:17:09.070049', 'step': 945, 'epoch': 1} {'type': 'loss', 'content': 0.012429009191691875, 'timestamp': '2025-10-01 04:17:09.085836', 'step': 946, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:09.129131', 'step': 946, 'epoch': 1} {'type': 'loss', 'content': 0.015096650458872318, 'timestamp': '2025-10-01 04:17:09.141903', 'step': 947, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:09.192961', 'step': 947, 'epoch': 1} {'type': 'loss', 'content': 0.0163682010024786, 'timestamp': '2025-10-01 04:17:09.227931', 'step': 948, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:09.270571', 'step': 948, 'epoch': 1} {'type': 'loss', 'content': 0.028742358088493347, 'timestamp': '2025-10-01 04:17:09.280820', 'step': 949, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:09.331362', 'step': 949, 'epoch': 1} {'type': 'loss', 'content': 0.027111686766147614, 'timestamp': '2025-10-01 04:17:09.339438', 'step': 950, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:09.375012', 'step': 950, 'epoch': 1} {'type': 'loss', 'content': 0.0131864994764328, 'timestamp': '2025-10-01 04:17:09.382978', 'step': 951, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:09.435866', 'step': 951, 'epoch': 1} {'type': 'loss', 'content': 0.024152593687176704, 'timestamp': '2025-10-01 04:17:09.464092', 'step': 952, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:09.517920', 'step': 952, 'epoch': 1} {'type': 'loss', 'content': 0.01976967789232731, 'timestamp': '2025-10-01 04:17:09.523525', 'step': 953, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:09.570343', 'step': 953, 'epoch': 1} {'type': 'loss', 'content': 0.017132768407464027, 'timestamp': '2025-10-01 04:17:09.581887', 'step': 954, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:09.620908', 'step': 954, 'epoch': 1} {'type': 'loss', 'content': 0.023847542703151703, 'timestamp': '2025-10-01 04:17:09.628966', 'step': 955, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:09.671117', 'step': 955, 'epoch': 1} {'type': 'loss', 'content': 0.014603063464164734, 'timestamp': '2025-10-01 04:17:09.703322', 'step': 956, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:09.737620', 'step': 956, 'epoch': 1} {'type': 'loss', 'content': 0.010361293330788612, 'timestamp': '2025-10-01 04:17:09.748617', 'step': 957, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:09.788693', 'step': 957, 'epoch': 1} {'type': 'loss', 'content': 0.019059544429183006, 'timestamp': '2025-10-01 04:17:09.800217', 'step': 958, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:09.843499', 'step': 958, 'epoch': 1} {'type': 'loss', 'content': 0.015362321399152279, 'timestamp': '2025-10-01 04:17:09.856237', 'step': 959, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:09.914440', 'step': 959, 'epoch': 1} {'type': 'loss', 'content': 0.020894832909107208, 'timestamp': '2025-10-01 04:17:09.943160', 'step': 960, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:09.986626', 'step': 960, 'epoch': 1} {'type': 'loss', 'content': 0.0187650416046381, 'timestamp': '2025-10-01 04:17:09.995773', 'step': 961, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:10.033666', 'step': 961, 'epoch': 1} {'type': 'loss', 'content': 0.013322071172297001, 'timestamp': '2025-10-01 04:17:10.041272', 'step': 962, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:10.084238', 'step': 962, 'epoch': 1} {'type': 'loss', 'content': 0.025049332529306412, 'timestamp': '2025-10-01 04:17:10.095770', 'step': 963, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:10.141535', 'step': 963, 'epoch': 1} {'type': 'loss', 'content': 0.010547764599323273, 'timestamp': '2025-10-01 04:17:10.169821', 'step': 964, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:10.221677', 'step': 964, 'epoch': 1} {'type': 'loss', 'content': 0.014628770761191845, 'timestamp': '2025-10-01 04:17:10.235189', 'step': 965, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:10.281442', 'step': 965, 'epoch': 1} {'type': 'loss', 'content': 0.02251378260552883, 'timestamp': '2025-10-01 04:17:10.289771', 'step': 966, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:10.341042', 'step': 966, 'epoch': 1} {'type': 'loss', 'content': 0.023234965279698372, 'timestamp': '2025-10-01 04:17:10.348396', 'step': 967, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:10.395320', 'step': 967, 'epoch': 1} {'type': 'loss', 'content': 0.011769128032028675, 'timestamp': '2025-10-01 04:17:10.430319', 'step': 968, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:10.473526', 'step': 968, 'epoch': 1} {'type': 'loss', 'content': 0.014143574051558971, 'timestamp': '2025-10-01 04:17:10.486439', 'step': 969, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:10.530207', 'step': 969, 'epoch': 1} {'type': 'loss', 'content': 0.017799802124500275, 'timestamp': '2025-10-01 04:17:10.542709', 'step': 970, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:10.582847', 'step': 970, 'epoch': 1} {'type': 'loss', 'content': 0.029853766784071922, 'timestamp': '2025-10-01 04:17:10.595547', 'step': 971, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:10.648895', 'step': 971, 'epoch': 1} {'type': 'loss', 'content': 0.01440099161118269, 'timestamp': '2025-10-01 04:17:10.685237', 'step': 972, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 15662185694400}, 'timestamp': '2025-10-01 04:17:10.746800', 'step': 972, 'epoch': 1} {'type': 'loss', 'content': 0.00686343340203166, 'timestamp': '2025-10-01 04:17:10.765903', 'step': 973, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:17:10.831336', 'step': 973, 'epoch': 1} {'type': 'loss', 'content': 0.012691162526607513, 'timestamp': '2025-10-01 04:17:10.847152', 'step': 974, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:10.899383', 'step': 974, 'epoch': 1} {'type': 'loss', 'content': 0.014483325183391571, 'timestamp': '2025-10-01 04:17:10.910707', 'step': 975, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:10.962942', 'step': 975, 'epoch': 1} {'type': 'loss', 'content': 0.0181131474673748, 'timestamp': '2025-10-01 04:17:10.997938', 'step': 976, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:11.043919', 'step': 976, 'epoch': 1} {'type': 'loss', 'content': 0.013813386671245098, 'timestamp': '2025-10-01 04:17:11.050032', 'step': 977, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:11.100565', 'step': 977, 'epoch': 1} {'type': 'loss', 'content': 0.016023170202970505, 'timestamp': '2025-10-01 04:17:11.113278', 'step': 978, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:17:11.167707', 'step': 978, 'epoch': 1} {'type': 'loss', 'content': 0.010002211667597294, 'timestamp': '2025-10-01 04:17:11.183571', 'step': 979, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:11.236243', 'step': 979, 'epoch': 1} {'type': 'loss', 'content': 0.007833736017346382, 'timestamp': '2025-10-01 04:17:11.271259', 'step': 980, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:11.318369', 'step': 980, 'epoch': 1} {'type': 'loss', 'content': 0.03426284343004227, 'timestamp': '2025-10-01 04:17:11.327587', 'step': 981, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:11.377695', 'step': 981, 'epoch': 1} {'type': 'loss', 'content': 0.01682828553020954, 'timestamp': '2025-10-01 04:17:11.391234', 'step': 982, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:11.438176', 'step': 982, 'epoch': 1} {'type': 'loss', 'content': 0.0335991270840168, 'timestamp': '2025-10-01 04:17:11.447081', 'step': 983, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:11.493985', 'step': 983, 'epoch': 1} {'type': 'loss', 'content': 0.012295123189687729, 'timestamp': '2025-10-01 04:17:11.525881', 'step': 984, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:11.576600', 'step': 984, 'epoch': 1} {'type': 'loss', 'content': 0.008960021659731865, 'timestamp': '2025-10-01 04:17:11.587387', 'step': 985, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:11.626259', 'step': 985, 'epoch': 1} {'type': 'loss', 'content': 0.014460061676800251, 'timestamp': '2025-10-01 04:17:11.633894', 'step': 986, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:11.679973', 'step': 986, 'epoch': 1} {'type': 'loss', 'content': 0.026935024186968803, 'timestamp': '2025-10-01 04:17:11.691464', 'step': 987, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-10-01 04:17:11.755796', 'step': 987, 'epoch': 1} {'type': 'loss', 'content': 0.019129833206534386, 'timestamp': '2025-10-01 04:17:11.795952', 'step': 988, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:11.829563', 'step': 988, 'epoch': 1} {'type': 'loss', 'content': 0.01680721901357174, 'timestamp': '2025-10-01 04:17:11.838788', 'step': 989, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:11.885631', 'step': 989, 'epoch': 1} {'type': 'loss', 'content': 0.011667316779494286, 'timestamp': '2025-10-01 04:17:11.893136', 'step': 990, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:17:11.947723', 'step': 990, 'epoch': 1} {'type': 'loss', 'content': 0.00897667370736599, 'timestamp': '2025-10-01 04:17:11.964301', 'step': 991, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:12.009065', 'step': 991, 'epoch': 1} {'type': 'loss', 'content': 0.012527829967439175, 'timestamp': '2025-10-01 04:17:12.038252', 'step': 992, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:12.085558', 'step': 992, 'epoch': 1} {'type': 'loss', 'content': 0.011188849806785583, 'timestamp': '2025-10-01 04:17:12.095919', 'step': 993, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:12.143672', 'step': 993, 'epoch': 1} {'type': 'loss', 'content': 0.013121136464178562, 'timestamp': '2025-10-01 04:17:12.151898', 'step': 994, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:12.201583', 'step': 994, 'epoch': 1} {'type': 'loss', 'content': 0.01433892734348774, 'timestamp': '2025-10-01 04:17:12.209753', 'step': 995, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:12.260143', 'step': 995, 'epoch': 1} {'type': 'loss', 'content': 0.01560207549482584, 'timestamp': '2025-10-01 04:17:12.294623', 'step': 996, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:12.335119', 'step': 996, 'epoch': 1} {'type': 'loss', 'content': 0.015023935586214066, 'timestamp': '2025-10-01 04:17:12.340903', 'step': 997, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:12.390127', 'step': 997, 'epoch': 1} {'type': 'loss', 'content': 0.013590299524366856, 'timestamp': '2025-10-01 04:17:12.397254', 'step': 998, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:12.438925', 'step': 998, 'epoch': 1} {'type': 'loss', 'content': 0.018429504707455635, 'timestamp': '2025-10-01 04:17:12.446974', 'step': 999, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:12.486625', 'step': 999, 'epoch': 1} {'type': 'loss', 'content': 0.016587775200605392, 'timestamp': '2025-10-01 04:17:12.519153', 'step': 1000, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1000', 'timestamp': '2025-10-01 04:17:17.425141', 'step': 1000, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:17.459693', 'step': 1000, 'epoch': 1} {'type': 'loss', 'content': 0.029363514855504036, 'timestamp': '2025-10-01 04:17:17.462061', 'step': 1001, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:17.501664', 'step': 1001, 'epoch': 1} {'type': 'loss', 'content': 0.025521263480186462, 'timestamp': '2025-10-01 04:17:17.512334', 'step': 1002, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:17:17.557320', 'step': 1002, 'epoch': 1} {'type': 'loss', 'content': 0.022868966683745384, 'timestamp': '2025-10-01 04:17:17.561629', 'step': 1003, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:17.603517', 'step': 1003, 'epoch': 1} {'type': 'loss', 'content': 0.022702863439917564, 'timestamp': '2025-10-01 04:17:17.631841', 'step': 1004, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:17.683724', 'step': 1004, 'epoch': 1} {'type': 'loss', 'content': 0.015869054943323135, 'timestamp': '2025-10-01 04:17:17.688797', 'step': 1005, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:17.742823', 'step': 1005, 'epoch': 1} {'type': 'loss', 'content': 0.023160457611083984, 'timestamp': '2025-10-01 04:17:17.747783', 'step': 1006, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:17.794507', 'step': 1006, 'epoch': 1} {'type': 'loss', 'content': 0.01627097837626934, 'timestamp': '2025-10-01 04:17:17.805473', 'step': 1007, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:17.859505', 'step': 1007, 'epoch': 1} {'type': 'loss', 'content': 0.007242567837238312, 'timestamp': '2025-10-01 04:17:17.894480', 'step': 1008, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:17.936905', 'step': 1008, 'epoch': 1} {'type': 'loss', 'content': 0.009492221288383007, 'timestamp': '2025-10-01 04:17:17.950276', 'step': 1009, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:18.001021', 'step': 1009, 'epoch': 1} {'type': 'loss', 'content': 0.01652020402252674, 'timestamp': '2025-10-01 04:17:18.008603', 'step': 1010, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:18.058602', 'step': 1010, 'epoch': 1} {'type': 'loss', 'content': 0.024068601429462433, 'timestamp': '2025-10-01 04:17:18.066527', 'step': 1011, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:18.121864', 'step': 1011, 'epoch': 1} {'type': 'loss', 'content': 0.01947433315217495, 'timestamp': '2025-10-01 04:17:18.154626', 'step': 1012, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:18.203882', 'step': 1012, 'epoch': 1} {'type': 'loss', 'content': 0.009662994183599949, 'timestamp': '2025-10-01 04:17:18.213049', 'step': 1013, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:18.248813', 'step': 1013, 'epoch': 1} {'type': 'loss', 'content': 0.01760336384177208, 'timestamp': '2025-10-01 04:17:18.260374', 'step': 1014, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:18.321597', 'step': 1014, 'epoch': 1} {'type': 'loss', 'content': 0.016163619235157967, 'timestamp': '2025-10-01 04:17:18.335610', 'step': 1015, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:18.386435', 'step': 1015, 'epoch': 1} {'type': 'loss', 'content': 0.01403619721531868, 'timestamp': '2025-10-01 04:17:18.419105', 'step': 1016, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:18.460256', 'step': 1016, 'epoch': 1} {'type': 'loss', 'content': 0.03473404422402382, 'timestamp': '2025-10-01 04:17:18.465334', 'step': 1017, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:18.513103', 'step': 1017, 'epoch': 1} {'type': 'loss', 'content': 0.02103380300104618, 'timestamp': '2025-10-01 04:17:18.521087', 'step': 1018, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:18.565065', 'step': 1018, 'epoch': 1} {'type': 'loss', 'content': 0.011321358382701874, 'timestamp': '2025-10-01 04:17:18.579279', 'step': 1019, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 17560600598464}, 'timestamp': '2025-10-01 04:17:18.637346', 'step': 1019, 'epoch': 1} {'type': 'loss', 'content': 0.007383960299193859, 'timestamp': '2025-10-01 04:17:18.679611', 'step': 1020, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:18.727949', 'step': 1020, 'epoch': 1} {'type': 'loss', 'content': 0.024456460028886795, 'timestamp': '2025-10-01 04:17:18.733771', 'step': 1021, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:18.789968', 'step': 1021, 'epoch': 1} {'type': 'loss', 'content': 0.016897715628147125, 'timestamp': '2025-10-01 04:17:18.803933', 'step': 1022, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:18.847653', 'step': 1022, 'epoch': 1} {'type': 'loss', 'content': 0.019029518589377403, 'timestamp': '2025-10-01 04:17:18.854902', 'step': 1023, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:18.891358', 'step': 1023, 'epoch': 1} {'type': 'loss', 'content': 0.019498202949762344, 'timestamp': '2025-10-01 04:17:18.923117', 'step': 1024, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:18.985085', 'step': 1024, 'epoch': 1} {'type': 'loss', 'content': 0.01652212254703045, 'timestamp': '2025-10-01 04:17:18.990854', 'step': 1025, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:19.034182', 'step': 1025, 'epoch': 1} {'type': 'loss', 'content': 0.014433389529585838, 'timestamp': '2025-10-01 04:17:19.042319', 'step': 1026, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:19.096271', 'step': 1026, 'epoch': 1} {'type': 'loss', 'content': 0.021852705627679825, 'timestamp': '2025-10-01 04:17:19.104025', 'step': 1027, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:19.158270', 'step': 1027, 'epoch': 1} {'type': 'loss', 'content': 0.015619592741131783, 'timestamp': '2025-10-01 04:17:19.192001', 'step': 1028, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:19.244137', 'step': 1028, 'epoch': 1} {'type': 'loss', 'content': 0.030794495716691017, 'timestamp': '2025-10-01 04:17:19.253342', 'step': 1029, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:19.305957', 'step': 1029, 'epoch': 1} {'type': 'loss', 'content': 0.012157322838902473, 'timestamp': '2025-10-01 04:17:19.318721', 'step': 1030, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:19.368457', 'step': 1030, 'epoch': 1} {'type': 'loss', 'content': 0.01442454382777214, 'timestamp': '2025-10-01 04:17:19.376599', 'step': 1031, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:19.418795', 'step': 1031, 'epoch': 1} {'type': 'loss', 'content': 0.02153847925364971, 'timestamp': '2025-10-01 04:17:19.450411', 'step': 1032, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:17:19.499156', 'step': 1032, 'epoch': 1} {'type': 'loss', 'content': 0.02511165663599968, 'timestamp': '2025-10-01 04:17:19.501138', 'step': 1033, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:19.545107', 'step': 1033, 'epoch': 1} {'type': 'loss', 'content': 0.013348649255931377, 'timestamp': '2025-10-01 04:17:19.552998', 'step': 1034, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:19.601332', 'step': 1034, 'epoch': 1} {'type': 'loss', 'content': 0.008794315159320831, 'timestamp': '2025-10-01 04:17:19.614843', 'step': 1035, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:17:22.675290', 'step': 1035, 'epoch': 1} {'type': 'pplx', 'content': 5.589016328582297, 'timestamp': '2025-10-01 04:17:22.678040', 'step': 1035, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:22.718928', 'step': 1035, 'epoch': 1} {'type': 'loss', 'content': 0.01977011188864708, 'timestamp': '2025-10-01 04:17:22.747475', 'step': 1036, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:22.805391', 'step': 1036, 'epoch': 1} {'type': 'loss', 'content': 0.020062269642949104, 'timestamp': '2025-10-01 04:17:22.813510', 'step': 1037, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:22.869889', 'step': 1037, 'epoch': 1} {'type': 'loss', 'content': 0.018885672092437744, 'timestamp': '2025-10-01 04:17:22.878157', 'step': 1038, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:22.923617', 'step': 1038, 'epoch': 1} {'type': 'loss', 'content': 0.013230674900114536, 'timestamp': '2025-10-01 04:17:22.936147', 'step': 1039, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:22.991980', 'step': 1039, 'epoch': 1} {'type': 'loss', 'content': 0.013500719331204891, 'timestamp': '2025-10-01 04:17:23.026501', 'step': 1040, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:23.071948', 'step': 1040, 'epoch': 1} {'type': 'loss', 'content': 0.015912940725684166, 'timestamp': '2025-10-01 04:17:23.077755', 'step': 1041, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:23.127977', 'step': 1041, 'epoch': 1} {'type': 'loss', 'content': 0.016636217013001442, 'timestamp': '2025-10-01 04:17:23.136231', 'step': 1042, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:23.174422', 'step': 1042, 'epoch': 1} {'type': 'loss', 'content': 0.009479730390012264, 'timestamp': '2025-10-01 04:17:23.182793', 'step': 1043, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:23.228339', 'step': 1043, 'epoch': 1} {'type': 'loss', 'content': 0.010722165927290916, 'timestamp': '2025-10-01 04:17:23.262017', 'step': 1044, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:17:23.321593', 'step': 1044, 'epoch': 1} {'type': 'loss', 'content': 0.008445756509900093, 'timestamp': '2025-10-01 04:17:23.336896', 'step': 1045, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:23.379215', 'step': 1045, 'epoch': 1} {'type': 'loss', 'content': 0.012430555187165737, 'timestamp': '2025-10-01 04:17:23.393216', 'step': 1046, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:23.442798', 'step': 1046, 'epoch': 1} {'type': 'loss', 'content': 0.019145909696817398, 'timestamp': '2025-10-01 04:17:23.450035', 'step': 1047, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:17:23.511321', 'step': 1047, 'epoch': 1} {'type': 'loss', 'content': 0.00631029112264514, 'timestamp': '2025-10-01 04:17:23.548685', 'step': 1048, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:23.590980', 'step': 1048, 'epoch': 1} {'type': 'loss', 'content': 0.014578403905034065, 'timestamp': '2025-10-01 04:17:23.603735', 'step': 1049, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:23.648898', 'step': 1049, 'epoch': 1} {'type': 'loss', 'content': 0.008258086629211903, 'timestamp': '2025-10-01 04:17:23.662863', 'step': 1050, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:23.707675', 'step': 1050, 'epoch': 1} {'type': 'loss', 'content': 0.01694604568183422, 'timestamp': '2025-10-01 04:17:23.715940', 'step': 1051, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:23.768956', 'step': 1051, 'epoch': 1} {'type': 'loss', 'content': 0.012945438735187054, 'timestamp': '2025-10-01 04:17:23.804036', 'step': 1052, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:23.854511', 'step': 1052, 'epoch': 1} {'type': 'loss', 'content': 0.017242493107914925, 'timestamp': '2025-10-01 04:17:23.860033', 'step': 1053, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:23.903694', 'step': 1053, 'epoch': 1} {'type': 'loss', 'content': 0.006263604387640953, 'timestamp': '2025-10-01 04:17:23.911857', 'step': 1054, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:23.964164', 'step': 1054, 'epoch': 1} {'type': 'loss', 'content': 0.012724227271974087, 'timestamp': '2025-10-01 04:17:23.974925', 'step': 1055, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:24.011809', 'step': 1055, 'epoch': 1} {'type': 'loss', 'content': 0.02388584241271019, 'timestamp': '2025-10-01 04:17:24.045464', 'step': 1056, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:17:24.092953', 'step': 1056, 'epoch': 1} {'type': 'loss', 'content': 0.022655868902802467, 'timestamp': '2025-10-01 04:17:24.108941', 'step': 1057, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:24.149086', 'step': 1057, 'epoch': 1} {'type': 'loss', 'content': 0.02951449528336525, 'timestamp': '2025-10-01 04:17:24.159867', 'step': 1058, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:24.207989', 'step': 1058, 'epoch': 1} {'type': 'loss', 'content': 0.01995285600423813, 'timestamp': '2025-10-01 04:17:24.216237', 'step': 1059, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:24.251005', 'step': 1059, 'epoch': 1} {'type': 'loss', 'content': 0.009827209636569023, 'timestamp': '2025-10-01 04:17:24.279827', 'step': 1060, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:24.333890', 'step': 1060, 'epoch': 1} {'type': 'loss', 'content': 0.015267257578670979, 'timestamp': '2025-10-01 04:17:24.347203', 'step': 1061, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:24.404327', 'step': 1061, 'epoch': 1} {'type': 'loss', 'content': 0.019011657685041428, 'timestamp': '2025-10-01 04:17:24.411741', 'step': 1062, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:24.445988', 'step': 1062, 'epoch': 1} {'type': 'loss', 'content': 0.036112286150455475, 'timestamp': '2025-10-01 04:17:24.450960', 'step': 1063, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:24.501512', 'step': 1063, 'epoch': 1} {'type': 'loss', 'content': 0.01583307608962059, 'timestamp': '2025-10-01 04:17:24.533843', 'step': 1064, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:24.569204', 'step': 1064, 'epoch': 1} {'type': 'loss', 'content': 0.010830563493072987, 'timestamp': '2025-10-01 04:17:24.575180', 'step': 1065, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:24.627578', 'step': 1065, 'epoch': 1} {'type': 'loss', 'content': 0.02353152260184288, 'timestamp': '2025-10-01 04:17:24.638461', 'step': 1066, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:24.698157', 'step': 1066, 'epoch': 1} {'type': 'loss', 'content': 0.03244931623339653, 'timestamp': '2025-10-01 04:17:24.703221', 'step': 1067, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:24.748572', 'step': 1067, 'epoch': 1} {'type': 'loss', 'content': 0.0076681384816765785, 'timestamp': '2025-10-01 04:17:24.783122', 'step': 1068, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:24.821734', 'step': 1068, 'epoch': 1} {'type': 'loss', 'content': 0.02198783867061138, 'timestamp': '2025-10-01 04:17:24.830102', 'step': 1069, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:24.880866', 'step': 1069, 'epoch': 1} {'type': 'loss', 'content': 0.011638913303613663, 'timestamp': '2025-10-01 04:17:24.888815', 'step': 1070, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:24.943250', 'step': 1070, 'epoch': 1} {'type': 'loss', 'content': 0.014068571850657463, 'timestamp': '2025-10-01 04:17:24.948143', 'step': 1071, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:25.002616', 'step': 1071, 'epoch': 1} {'type': 'loss', 'content': 0.019632212817668915, 'timestamp': '2025-10-01 04:17:25.034496', 'step': 1072, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:25.071323', 'step': 1072, 'epoch': 1} {'type': 'loss', 'content': 0.03335629776120186, 'timestamp': '2025-10-01 04:17:25.075631', 'step': 1073, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:25.130802', 'step': 1073, 'epoch': 1} {'type': 'loss', 'content': 0.028185226023197174, 'timestamp': '2025-10-01 04:17:25.138230', 'step': 1074, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:17:25.173755', 'step': 1074, 'epoch': 1} {'type': 'loss', 'content': 0.07017187029123306, 'timestamp': '2025-10-01 04:17:25.178448', 'step': 1075, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:25.233600', 'step': 1075, 'epoch': 1} {'type': 'loss', 'content': 0.0075536505319178104, 'timestamp': '2025-10-01 04:17:25.267267', 'step': 1076, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:17:25.314629', 'step': 1076, 'epoch': 1} {'type': 'loss', 'content': 0.016657954081892967, 'timestamp': '2025-10-01 04:17:25.316849', 'step': 1077, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:25.369606', 'step': 1077, 'epoch': 1} {'type': 'loss', 'content': 0.01976245641708374, 'timestamp': '2025-10-01 04:17:25.374668', 'step': 1078, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:25.419207', 'step': 1078, 'epoch': 1} {'type': 'loss', 'content': 0.019535692408680916, 'timestamp': '2025-10-01 04:17:25.424301', 'step': 1079, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:25.474939', 'step': 1079, 'epoch': 1} {'type': 'loss', 'content': 0.03400663658976555, 'timestamp': '2025-10-01 04:17:25.504228', 'step': 1080, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:25.549837', 'step': 1080, 'epoch': 1} {'type': 'loss', 'content': 0.017138982191681862, 'timestamp': '2025-10-01 04:17:25.555247', 'step': 1081, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:25.602295', 'step': 1081, 'epoch': 1} {'type': 'loss', 'content': 0.013252847827970982, 'timestamp': '2025-10-01 04:17:25.609873', 'step': 1082, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:25.658853', 'step': 1082, 'epoch': 1} {'type': 'loss', 'content': 0.014612991362810135, 'timestamp': '2025-10-01 04:17:25.666808', 'step': 1083, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:25.711126', 'step': 1083, 'epoch': 1} {'type': 'loss', 'content': 0.02832266315817833, 'timestamp': '2025-10-01 04:17:25.739450', 'step': 1084, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:25.779389', 'step': 1084, 'epoch': 1} {'type': 'loss', 'content': 0.02650672011077404, 'timestamp': '2025-10-01 04:17:25.784562', 'step': 1085, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:25.831292', 'step': 1085, 'epoch': 1} {'type': 'loss', 'content': 0.03629383072257042, 'timestamp': '2025-10-01 04:17:25.842885', 'step': 1086, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:25.878167', 'step': 1086, 'epoch': 1} {'type': 'loss', 'content': 0.023915940895676613, 'timestamp': '2025-10-01 04:17:25.890700', 'step': 1087, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:25.948663', 'step': 1087, 'epoch': 1} {'type': 'loss', 'content': 0.015359682962298393, 'timestamp': '2025-10-01 04:17:25.983790', 'step': 1088, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:26.034179', 'step': 1088, 'epoch': 1} {'type': 'loss', 'content': 0.019221914932131767, 'timestamp': '2025-10-01 04:17:26.044647', 'step': 1089, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:26.096978', 'step': 1089, 'epoch': 1} {'type': 'loss', 'content': 0.01492657046765089, 'timestamp': '2025-10-01 04:17:26.108597', 'step': 1090, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:26.159250', 'step': 1090, 'epoch': 1} {'type': 'loss', 'content': 0.021398749202489853, 'timestamp': '2025-10-01 04:17:26.170056', 'step': 1091, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:26.211093', 'step': 1091, 'epoch': 1} {'type': 'loss', 'content': 0.04730100557208061, 'timestamp': '2025-10-01 04:17:26.239742', 'step': 1092, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:17:26.289944', 'step': 1092, 'epoch': 1} {'type': 'loss', 'content': 0.011430156417191029, 'timestamp': '2025-10-01 04:17:26.305232', 'step': 1093, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:26.351082', 'step': 1093, 'epoch': 1} {'type': 'loss', 'content': 0.00968689564615488, 'timestamp': '2025-10-01 04:17:26.365121', 'step': 1094, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:26.412189', 'step': 1094, 'epoch': 1} {'type': 'loss', 'content': 0.026927629485726357, 'timestamp': '2025-10-01 04:17:26.423013', 'step': 1095, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:26.478969', 'step': 1095, 'epoch': 1} {'type': 'loss', 'content': 0.01579202152788639, 'timestamp': '2025-10-01 04:17:26.508289', 'step': 1096, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:26.551359', 'step': 1096, 'epoch': 1} {'type': 'loss', 'content': 0.02208098955452442, 'timestamp': '2025-10-01 04:17:26.557031', 'step': 1097, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:26.611023', 'step': 1097, 'epoch': 1} {'type': 'loss', 'content': 0.025204770267009735, 'timestamp': '2025-10-01 04:17:26.618748', 'step': 1098, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:26.667509', 'step': 1098, 'epoch': 1} {'type': 'loss', 'content': 0.023734422400593758, 'timestamp': '2025-10-01 04:17:26.681685', 'step': 1099, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:26.732182', 'step': 1099, 'epoch': 1} {'type': 'loss', 'content': 0.016469068825244904, 'timestamp': '2025-10-01 04:17:26.761297', 'step': 1100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:17:26.822922', 'step': 1100, 'epoch': 1} {'type': 'loss', 'content': 0.011381858959794044, 'timestamp': '2025-10-01 04:17:26.840053', 'step': 1101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:26.891802', 'step': 1101, 'epoch': 1} {'type': 'loss', 'content': 0.007237431593239307, 'timestamp': '2025-10-01 04:17:26.900118', 'step': 1102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:26.940943', 'step': 1102, 'epoch': 1} {'type': 'loss', 'content': 0.02881944365799427, 'timestamp': '2025-10-01 04:17:26.948120', 'step': 1103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:27.027558', 'step': 1103, 'epoch': 1} {'type': 'loss', 'content': 0.014294079504907131, 'timestamp': '2025-10-01 04:17:27.056484', 'step': 1104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:27.118074', 'step': 1104, 'epoch': 1} {'type': 'loss', 'content': 0.0212942473590374, 'timestamp': '2025-10-01 04:17:27.124017', 'step': 1105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:27.201583', 'step': 1105, 'epoch': 1} {'type': 'loss', 'content': 0.01620054617524147, 'timestamp': '2025-10-01 04:17:27.209257', 'step': 1106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:27.250324', 'step': 1106, 'epoch': 1} {'type': 'loss', 'content': 0.014214823953807354, 'timestamp': '2025-10-01 04:17:27.264415', 'step': 1107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:27.315403', 'step': 1107, 'epoch': 1} {'type': 'loss', 'content': 0.017317280173301697, 'timestamp': '2025-10-01 04:17:27.347904', 'step': 1108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:27.415549', 'step': 1108, 'epoch': 1} {'type': 'loss', 'content': 0.025598622858524323, 'timestamp': '2025-10-01 04:17:27.431444', 'step': 1109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:27.486429', 'step': 1109, 'epoch': 1} {'type': 'loss', 'content': 0.010355249047279358, 'timestamp': '2025-10-01 04:17:27.499235', 'step': 1110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:27.560596', 'step': 1110, 'epoch': 1} {'type': 'loss', 'content': 0.02140643633902073, 'timestamp': '2025-10-01 04:17:27.571389', 'step': 1111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:27.616476', 'step': 1111, 'epoch': 1} {'type': 'loss', 'content': 0.019826127216219902, 'timestamp': '2025-10-01 04:17:27.654371', 'step': 1112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:27.702586', 'step': 1112, 'epoch': 1} {'type': 'loss', 'content': 0.016084056347608566, 'timestamp': '2025-10-01 04:17:27.721922', 'step': 1113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:27.777088', 'step': 1113, 'epoch': 1} {'type': 'loss', 'content': 0.018217677250504494, 'timestamp': '2025-10-01 04:17:27.784485', 'step': 1114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:27.833832', 'step': 1114, 'epoch': 1} {'type': 'loss', 'content': 0.03015352599322796, 'timestamp': '2025-10-01 04:17:27.844458', 'step': 1115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:27.901882', 'step': 1115, 'epoch': 1} {'type': 'loss', 'content': 0.03604268282651901, 'timestamp': '2025-10-01 04:17:27.939999', 'step': 1116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:17:28.001977', 'step': 1116, 'epoch': 1} {'type': 'loss', 'content': 0.0051101199351251125, 'timestamp': '2025-10-01 04:17:28.018219', 'step': 1117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:28.070408', 'step': 1117, 'epoch': 1} {'type': 'loss', 'content': 0.026685962453484535, 'timestamp': '2025-10-01 04:17:28.085398', 'step': 1118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:28.143330', 'step': 1118, 'epoch': 1} {'type': 'loss', 'content': 0.014942211098968983, 'timestamp': '2025-10-01 04:17:28.156048', 'step': 1119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:28.208051', 'step': 1119, 'epoch': 1} {'type': 'loss', 'content': 0.030991006642580032, 'timestamp': '2025-10-01 04:17:28.239610', 'step': 1120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:28.286595', 'step': 1120, 'epoch': 1} {'type': 'loss', 'content': 0.01682109385728836, 'timestamp': '2025-10-01 04:17:28.300166', 'step': 1121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:28.355098', 'step': 1121, 'epoch': 1} {'type': 'loss', 'content': 0.00787026435136795, 'timestamp': '2025-10-01 04:17:28.369277', 'step': 1122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:28.423143', 'step': 1122, 'epoch': 1} {'type': 'loss', 'content': 0.05004487559199333, 'timestamp': '2025-10-01 04:17:28.435676', 'step': 1123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:28.472160', 'step': 1123, 'epoch': 1} {'type': 'loss', 'content': 0.03538050875067711, 'timestamp': '2025-10-01 04:17:28.501121', 'step': 1124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:28.563963', 'step': 1124, 'epoch': 1} {'type': 'loss', 'content': 0.016451191157102585, 'timestamp': '2025-10-01 04:17:28.577505', 'step': 1125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:28.639556', 'step': 1125, 'epoch': 1} {'type': 'loss', 'content': 0.01214256975799799, 'timestamp': '2025-10-01 04:17:28.652361', 'step': 1126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:17:28.736214', 'step': 1126, 'epoch': 1} {'type': 'loss', 'content': 0.029804132878780365, 'timestamp': '2025-10-01 04:17:28.752007', 'step': 1127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:28.799239', 'step': 1127, 'epoch': 1} {'type': 'loss', 'content': 0.030172690749168396, 'timestamp': '2025-10-01 04:17:28.828526', 'step': 1128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:28.876605', 'step': 1128, 'epoch': 1} {'type': 'loss', 'content': 0.009061760269105434, 'timestamp': '2025-10-01 04:17:28.889969', 'step': 1129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:28.936280', 'step': 1129, 'epoch': 1} {'type': 'loss', 'content': 0.009687329642474651, 'timestamp': '2025-10-01 04:17:28.950254', 'step': 1130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:28.991938', 'step': 1130, 'epoch': 1} {'type': 'loss', 'content': 0.037153247743844986, 'timestamp': '2025-10-01 04:17:28.999546', 'step': 1131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:29.050652', 'step': 1131, 'epoch': 1} {'type': 'loss', 'content': 0.01489564310759306, 'timestamp': '2025-10-01 04:17:29.085149', 'step': 1132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:29.127433', 'step': 1132, 'epoch': 1} {'type': 'loss', 'content': 0.018879631534218788, 'timestamp': '2025-10-01 04:17:29.138350', 'step': 1133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:29.201914', 'step': 1133, 'epoch': 1} {'type': 'loss', 'content': 0.02617711015045643, 'timestamp': '2025-10-01 04:17:29.214478', 'step': 1134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:29.251164', 'step': 1134, 'epoch': 1} {'type': 'loss', 'content': 0.01058194600045681, 'timestamp': '2025-10-01 04:17:29.262550', 'step': 1135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:29.317341', 'step': 1135, 'epoch': 1} {'type': 'loss', 'content': 0.029065648093819618, 'timestamp': '2025-10-01 04:17:29.345921', 'step': 1136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:29.401134', 'step': 1136, 'epoch': 1} {'type': 'loss', 'content': 0.005697314161807299, 'timestamp': '2025-10-01 04:17:29.406263', 'step': 1137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:29.447527', 'step': 1137, 'epoch': 1} {'type': 'loss', 'content': 0.01511340495198965, 'timestamp': '2025-10-01 04:17:29.460049', 'step': 1138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:29.508541', 'step': 1138, 'epoch': 1} {'type': 'loss', 'content': 0.014119033701717854, 'timestamp': '2025-10-01 04:17:29.520232', 'step': 1139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:29.562998', 'step': 1139, 'epoch': 1} {'type': 'loss', 'content': 0.03635350614786148, 'timestamp': '2025-10-01 04:17:29.595496', 'step': 1140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:29.645824', 'step': 1140, 'epoch': 1} {'type': 'loss', 'content': 0.008033953607082367, 'timestamp': '2025-10-01 04:17:29.659217', 'step': 1141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:29.713942', 'step': 1141, 'epoch': 1} {'type': 'loss', 'content': 0.010798295959830284, 'timestamp': '2025-10-01 04:17:29.721940', 'step': 1142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:29.770831', 'step': 1142, 'epoch': 1} {'type': 'loss', 'content': 0.010950013063848019, 'timestamp': '2025-10-01 04:17:29.783343', 'step': 1143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:29.840313', 'step': 1143, 'epoch': 1} {'type': 'loss', 'content': 0.019146915525197983, 'timestamp': '2025-10-01 04:17:29.872672', 'step': 1144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:29.921617', 'step': 1144, 'epoch': 1} {'type': 'loss', 'content': 0.03491201624274254, 'timestamp': '2025-10-01 04:17:29.927468', 'step': 1145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:29.964622', 'step': 1145, 'epoch': 1} {'type': 'loss', 'content': 0.019177889451384544, 'timestamp': '2025-10-01 04:17:29.977139', 'step': 1146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:30.034406', 'step': 1146, 'epoch': 1} {'type': 'loss', 'content': 0.020844656974077225, 'timestamp': '2025-10-01 04:17:30.041884', 'step': 1147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:30.077580', 'step': 1147, 'epoch': 1} {'type': 'loss', 'content': 0.02545192278921604, 'timestamp': '2025-10-01 04:17:30.111041', 'step': 1148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:30.148389', 'step': 1148, 'epoch': 1} {'type': 'loss', 'content': 0.007937092334032059, 'timestamp': '2025-10-01 04:17:30.157380', 'step': 1149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:30.196411', 'step': 1149, 'epoch': 1} {'type': 'loss', 'content': 0.01861865632236004, 'timestamp': '2025-10-01 04:17:30.204629', 'step': 1150, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:17:33.099818', 'step': 1150, 'epoch': 1} {'type': 'pplx', 'content': 5.663423561900352, 'timestamp': '2025-10-01 04:17:33.101914', 'step': 1150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:33.140483', 'step': 1150, 'epoch': 1} {'type': 'loss', 'content': 0.022721409797668457, 'timestamp': '2025-10-01 04:17:33.151774', 'step': 1151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:33.186690', 'step': 1151, 'epoch': 1} {'type': 'loss', 'content': 0.030405888333916664, 'timestamp': '2025-10-01 04:17:33.219081', 'step': 1152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:17:33.270804', 'step': 1152, 'epoch': 1} {'type': 'loss', 'content': 0.006386559922248125, 'timestamp': '2025-10-01 04:17:33.287615', 'step': 1153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:33.335689', 'step': 1153, 'epoch': 1} {'type': 'loss', 'content': 0.007791842333972454, 'timestamp': '2025-10-01 04:17:33.349262', 'step': 1154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:33.399859', 'step': 1154, 'epoch': 1} {'type': 'loss', 'content': 0.013099238276481628, 'timestamp': '2025-10-01 04:17:33.413830', 'step': 1155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:33.453678', 'step': 1155, 'epoch': 1} {'type': 'loss', 'content': 0.014108271338045597, 'timestamp': '2025-10-01 04:17:33.488115', 'step': 1156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:33.528380', 'step': 1156, 'epoch': 1} {'type': 'loss', 'content': 0.007845171727240086, 'timestamp': '2025-10-01 04:17:33.541757', 'step': 1157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:33.584213', 'step': 1157, 'epoch': 1} {'type': 'loss', 'content': 0.014060585759580135, 'timestamp': '2025-10-01 04:17:33.596939', 'step': 1158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:33.647091', 'step': 1158, 'epoch': 1} {'type': 'loss', 'content': 0.017174696549773216, 'timestamp': '2025-10-01 04:17:33.659605', 'step': 1159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:33.705814', 'step': 1159, 'epoch': 1} {'type': 'loss', 'content': 0.010423105210065842, 'timestamp': '2025-10-01 04:17:33.740700', 'step': 1160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:17:33.791679', 'step': 1160, 'epoch': 1} {'type': 'loss', 'content': 0.011193135753273964, 'timestamp': '2025-10-01 04:17:33.807573', 'step': 1161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:33.852849', 'step': 1161, 'epoch': 1} {'type': 'loss', 'content': 0.02807903103530407, 'timestamp': '2025-10-01 04:17:33.864514', 'step': 1162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:33.920046', 'step': 1162, 'epoch': 1} {'type': 'loss', 'content': 0.008977462537586689, 'timestamp': '2025-10-01 04:17:33.932846', 'step': 1163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:33.993739', 'step': 1163, 'epoch': 1} {'type': 'loss', 'content': 0.011503579095005989, 'timestamp': '2025-10-01 04:17:34.028680', 'step': 1164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:34.081535', 'step': 1164, 'epoch': 1} {'type': 'loss', 'content': 0.01877966895699501, 'timestamp': '2025-10-01 04:17:34.092691', 'step': 1165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:34.130133', 'step': 1165, 'epoch': 1} {'type': 'loss', 'content': 0.015073059126734734, 'timestamp': '2025-10-01 04:17:34.142891', 'step': 1166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:34.192080', 'step': 1166, 'epoch': 1} {'type': 'loss', 'content': 0.01644594967365265, 'timestamp': '2025-10-01 04:17:34.204855', 'step': 1167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:34.254968', 'step': 1167, 'epoch': 1} {'type': 'loss', 'content': 0.011638459749519825, 'timestamp': '2025-10-01 04:17:34.288727', 'step': 1168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:34.343005', 'step': 1168, 'epoch': 1} {'type': 'loss', 'content': 0.012958639301359653, 'timestamp': '2025-10-01 04:17:34.354155', 'step': 1169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:34.418272', 'step': 1169, 'epoch': 1} {'type': 'loss', 'content': 0.018649686127901077, 'timestamp': '2025-10-01 04:17:34.432217', 'step': 1170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:17:34.476904', 'step': 1170, 'epoch': 1} {'type': 'loss', 'content': 0.010125924833118916, 'timestamp': '2025-10-01 04:17:34.492740', 'step': 1171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:34.547439', 'step': 1171, 'epoch': 1} {'type': 'loss', 'content': 0.010869583114981651, 'timestamp': '2025-10-01 04:17:34.582422', 'step': 1172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:17:34.629976', 'step': 1172, 'epoch': 1} {'type': 'loss', 'content': 0.010262774303555489, 'timestamp': '2025-10-01 04:17:34.647541', 'step': 1173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:34.691291', 'step': 1173, 'epoch': 1} {'type': 'loss', 'content': 0.017048321664333344, 'timestamp': '2025-10-01 04:17:34.704864', 'step': 1174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:34.741276', 'step': 1174, 'epoch': 1} {'type': 'loss', 'content': 0.014908799901604652, 'timestamp': '2025-10-01 04:17:34.752673', 'step': 1175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:34.801453', 'step': 1175, 'epoch': 1} {'type': 'loss', 'content': 0.016088491305708885, 'timestamp': '2025-10-01 04:17:34.834947', 'step': 1176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:34.885615', 'step': 1176, 'epoch': 1} {'type': 'loss', 'content': 0.013978206552565098, 'timestamp': '2025-10-01 04:17:34.898442', 'step': 1177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:34.950594', 'step': 1177, 'epoch': 1} {'type': 'loss', 'content': 0.010303840972483158, 'timestamp': '2025-10-01 04:17:34.964560', 'step': 1178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:35.007260', 'step': 1178, 'epoch': 1} {'type': 'loss', 'content': 0.011007364839315414, 'timestamp': '2025-10-01 04:17:35.021232', 'step': 1179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:35.068547', 'step': 1179, 'epoch': 1} {'type': 'loss', 'content': 0.013410693034529686, 'timestamp': '2025-10-01 04:17:35.103056', 'step': 1180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:35.151897', 'step': 1180, 'epoch': 1} {'type': 'loss', 'content': 0.01630300097167492, 'timestamp': '2025-10-01 04:17:35.162179', 'step': 1181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:35.214879', 'step': 1181, 'epoch': 1} {'type': 'loss', 'content': 0.011947453953325748, 'timestamp': '2025-10-01 04:17:35.226704', 'step': 1182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:35.273200', 'step': 1182, 'epoch': 1} {'type': 'loss', 'content': 0.020029593259096146, 'timestamp': '2025-10-01 04:17:35.284593', 'step': 1183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:35.322193', 'step': 1183, 'epoch': 1} {'type': 'loss', 'content': 0.020637547597289085, 'timestamp': '2025-10-01 04:17:35.355714', 'step': 1184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:35.414735', 'step': 1184, 'epoch': 1} {'type': 'loss', 'content': 0.012878896668553352, 'timestamp': '2025-10-01 04:17:35.419920', 'step': 1185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:35.466086', 'step': 1185, 'epoch': 1} {'type': 'loss', 'content': 0.015593130141496658, 'timestamp': '2025-10-01 04:17:35.477729', 'step': 1186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:17:35.545562', 'step': 1186, 'epoch': 1} {'type': 'loss', 'content': 0.020625686272978783, 'timestamp': '2025-10-01 04:17:35.562774', 'step': 1187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:35.615709', 'step': 1187, 'epoch': 1} {'type': 'loss', 'content': 0.01320985984057188, 'timestamp': '2025-10-01 04:17:35.649191', 'step': 1188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:35.691579', 'step': 1188, 'epoch': 1} {'type': 'loss', 'content': 0.015200401656329632, 'timestamp': '2025-10-01 04:17:35.704441', 'step': 1189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:17:35.757945', 'step': 1189, 'epoch': 1} {'type': 'loss', 'content': 0.006745077669620514, 'timestamp': '2025-10-01 04:17:35.773758', 'step': 1190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-10-01 04:17:35.835483', 'step': 1190, 'epoch': 1} {'type': 'loss', 'content': 0.006565955001860857, 'timestamp': '2025-10-01 04:17:35.854805', 'step': 1191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:35.914857', 'step': 1191, 'epoch': 1} {'type': 'loss', 'content': 0.028700105845928192, 'timestamp': '2025-10-01 04:17:35.949999', 'step': 1192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:35.987471', 'step': 1192, 'epoch': 1} {'type': 'loss', 'content': 0.029990121722221375, 'timestamp': '2025-10-01 04:17:35.997850', 'step': 1193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:36.060231', 'step': 1193, 'epoch': 1} {'type': 'loss', 'content': 0.01301293633878231, 'timestamp': '2025-10-01 04:17:36.076114', 'step': 1194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:36.131412', 'step': 1194, 'epoch': 1} {'type': 'loss', 'content': 0.008108424954116344, 'timestamp': '2025-10-01 04:17:36.147441', 'step': 1195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:36.202997', 'step': 1195, 'epoch': 1} {'type': 'loss', 'content': 0.01209845021367073, 'timestamp': '2025-10-01 04:17:36.238039', 'step': 1196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:17:36.297471', 'step': 1196, 'epoch': 1} {'type': 'loss', 'content': 0.006514541804790497, 'timestamp': '2025-10-01 04:17:36.313032', 'step': 1197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:36.361682', 'step': 1197, 'epoch': 1} {'type': 'loss', 'content': 0.015631964430212975, 'timestamp': '2025-10-01 04:17:36.375694', 'step': 1198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:36.425339', 'step': 1198, 'epoch': 1} {'type': 'loss', 'content': 0.01423635333776474, 'timestamp': '2025-10-01 04:17:36.436919', 'step': 1199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:36.483466', 'step': 1199, 'epoch': 1} {'type': 'loss', 'content': 0.015068672597408295, 'timestamp': '2025-10-01 04:17:36.517167', 'step': 1200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:17:36.568269', 'step': 1200, 'epoch': 1} {'type': 'loss', 'content': 0.021520573645830154, 'timestamp': '2025-10-01 04:17:36.584280', 'step': 1201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:36.626926', 'step': 1201, 'epoch': 1} {'type': 'loss', 'content': 0.026878008618950844, 'timestamp': '2025-10-01 04:17:36.638577', 'step': 1202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:36.675899', 'step': 1202, 'epoch': 1} {'type': 'loss', 'content': 0.01471509039402008, 'timestamp': '2025-10-01 04:17:36.689470', 'step': 1203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:17:36.742794', 'step': 1203, 'epoch': 1} {'type': 'loss', 'content': 0.005375762470066547, 'timestamp': '2025-10-01 04:17:36.780071', 'step': 1204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:36.829314', 'step': 1204, 'epoch': 1} {'type': 'loss', 'content': 0.011437565088272095, 'timestamp': '2025-10-01 04:17:36.842650', 'step': 1205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:36.891927', 'step': 1205, 'epoch': 1} {'type': 'loss', 'content': 0.012570335529744625, 'timestamp': '2025-10-01 04:17:36.904697', 'step': 1206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:17:36.959278', 'step': 1206, 'epoch': 1} {'type': 'loss', 'content': 0.010444839484989643, 'timestamp': '2025-10-01 04:17:36.976784', 'step': 1207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:17:37.027236', 'step': 1207, 'epoch': 1} {'type': 'loss', 'content': 0.007274944335222244, 'timestamp': '2025-10-01 04:17:37.062409', 'step': 1208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:37.120114', 'step': 1208, 'epoch': 1} {'type': 'loss', 'content': 0.029986314475536346, 'timestamp': '2025-10-01 04:17:37.126222', 'step': 1209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:37.171731', 'step': 1209, 'epoch': 1} {'type': 'loss', 'content': 0.026890775188803673, 'timestamp': '2025-10-01 04:17:37.179088', 'step': 1210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:37.229646', 'step': 1210, 'epoch': 1} {'type': 'loss', 'content': 0.027737146243453026, 'timestamp': '2025-10-01 04:17:37.238038', 'step': 1211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:37.286051', 'step': 1211, 'epoch': 1} {'type': 'loss', 'content': 0.027179617434740067, 'timestamp': '2025-10-01 04:17:37.315197', 'step': 1212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:37.358696', 'step': 1212, 'epoch': 1} {'type': 'loss', 'content': 0.024390676990151405, 'timestamp': '2025-10-01 04:17:37.364535', 'step': 1213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:37.410354', 'step': 1213, 'epoch': 1} {'type': 'loss', 'content': 0.020427120849490166, 'timestamp': '2025-10-01 04:17:37.418605', 'step': 1214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:37.466459', 'step': 1214, 'epoch': 1} {'type': 'loss', 'content': 0.02609584666788578, 'timestamp': '2025-10-01 04:17:37.474217', 'step': 1215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:37.518638', 'step': 1215, 'epoch': 1} {'type': 'loss', 'content': 0.02734307199716568, 'timestamp': '2025-10-01 04:17:37.550245', 'step': 1216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:37.588501', 'step': 1216, 'epoch': 1} {'type': 'loss', 'content': 0.03118213266134262, 'timestamp': '2025-10-01 04:17:37.596001', 'step': 1217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:37.641053', 'step': 1217, 'epoch': 1} {'type': 'loss', 'content': 0.027074245736002922, 'timestamp': '2025-10-01 04:17:37.651837', 'step': 1218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:37.696414', 'step': 1218, 'epoch': 1} {'type': 'loss', 'content': 0.03415197506546974, 'timestamp': '2025-10-01 04:17:37.709164', 'step': 1219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:37.752311', 'step': 1219, 'epoch': 1} {'type': 'loss', 'content': 0.0326828807592392, 'timestamp': '2025-10-01 04:17:37.781500', 'step': 1220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:37.826461', 'step': 1220, 'epoch': 1} {'type': 'loss', 'content': 0.03238120675086975, 'timestamp': '2025-10-01 04:17:37.832131', 'step': 1221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:37.882318', 'step': 1221, 'epoch': 1} {'type': 'loss', 'content': 0.027889519929885864, 'timestamp': '2025-10-01 04:17:37.893116', 'step': 1222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:37.937624', 'step': 1222, 'epoch': 1} {'type': 'loss', 'content': 0.026949942111968994, 'timestamp': '2025-10-01 04:17:37.945152', 'step': 1223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:37.989027', 'step': 1223, 'epoch': 1} {'type': 'loss', 'content': 0.020087607204914093, 'timestamp': '2025-10-01 04:17:38.017360', 'step': 1224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:38.059196', 'step': 1224, 'epoch': 1} {'type': 'loss', 'content': 0.014230186119675636, 'timestamp': '2025-10-01 04:17:38.063821', 'step': 1225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:38.107702', 'step': 1225, 'epoch': 1} {'type': 'loss', 'content': 0.012527287937700748, 'timestamp': '2025-10-01 04:17:38.115769', 'step': 1226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:38.159597', 'step': 1226, 'epoch': 1} {'type': 'loss', 'content': 0.0270552821457386, 'timestamp': '2025-10-01 04:17:38.172115', 'step': 1227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:38.229493', 'step': 1227, 'epoch': 1} {'type': 'loss', 'content': 0.021428698673844337, 'timestamp': '2025-10-01 04:17:38.258636', 'step': 1228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:38.297707', 'step': 1228, 'epoch': 1} {'type': 'loss', 'content': 0.026954419910907745, 'timestamp': '2025-10-01 04:17:38.303208', 'step': 1229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:38.349099', 'step': 1229, 'epoch': 1} {'type': 'loss', 'content': 0.012688005343079567, 'timestamp': '2025-10-01 04:17:38.361645', 'step': 1230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:38.408561', 'step': 1230, 'epoch': 1} {'type': 'loss', 'content': 0.02250838465988636, 'timestamp': '2025-10-01 04:17:38.422134', 'step': 1231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:38.462695', 'step': 1231, 'epoch': 1} {'type': 'loss', 'content': 0.03216647356748581, 'timestamp': '2025-10-01 04:17:38.492383', 'step': 1232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:38.537979', 'step': 1232, 'epoch': 1} {'type': 'loss', 'content': 0.018679702654480934, 'timestamp': '2025-10-01 04:17:38.547768', 'step': 1233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:38.587138', 'step': 1233, 'epoch': 1} {'type': 'loss', 'content': 0.02766079641878605, 'timestamp': '2025-10-01 04:17:38.595514', 'step': 1234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:38.636798', 'step': 1234, 'epoch': 1} {'type': 'loss', 'content': 0.022592492401599884, 'timestamp': '2025-10-01 04:17:38.648292', 'step': 1235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:38.688220', 'step': 1235, 'epoch': 1} {'type': 'loss', 'content': 0.023507826030254364, 'timestamp': '2025-10-01 04:17:38.719786', 'step': 1236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:38.768386', 'step': 1236, 'epoch': 1} {'type': 'loss', 'content': 0.03685879707336426, 'timestamp': '2025-10-01 04:17:38.778479', 'step': 1237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:38.829383', 'step': 1237, 'epoch': 1} {'type': 'loss', 'content': 0.031159665435552597, 'timestamp': '2025-10-01 04:17:38.837231', 'step': 1238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:38.901911', 'step': 1238, 'epoch': 1} {'type': 'loss', 'content': 0.016343016177415848, 'timestamp': '2025-10-01 04:17:38.908492', 'step': 1239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:38.960684', 'step': 1239, 'epoch': 1} {'type': 'loss', 'content': 0.024465572088956833, 'timestamp': '2025-10-01 04:17:38.994312', 'step': 1240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:39.034335', 'step': 1240, 'epoch': 1} {'type': 'loss', 'content': 0.015612705610692501, 'timestamp': '2025-10-01 04:17:39.042372', 'step': 1241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:39.100895', 'step': 1241, 'epoch': 1} {'type': 'loss', 'content': 0.014011003077030182, 'timestamp': '2025-10-01 04:17:39.108612', 'step': 1242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:39.172128', 'step': 1242, 'epoch': 1} {'type': 'loss', 'content': 0.01962081715464592, 'timestamp': '2025-10-01 04:17:39.180090', 'step': 1243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:39.256667', 'step': 1243, 'epoch': 1} {'type': 'loss', 'content': 0.02356959879398346, 'timestamp': '2025-10-01 04:17:39.285596', 'step': 1244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:39.357242', 'step': 1244, 'epoch': 1} {'type': 'loss', 'content': 0.02147674560546875, 'timestamp': '2025-10-01 04:17:39.362464', 'step': 1245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:39.404180', 'step': 1245, 'epoch': 1} {'type': 'loss', 'content': 0.030123447999358177, 'timestamp': '2025-10-01 04:17:39.412290', 'step': 1246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:39.457872', 'step': 1246, 'epoch': 1} {'type': 'loss', 'content': 0.031157352030277252, 'timestamp': '2025-10-01 04:17:39.469127', 'step': 1247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:39.536344', 'step': 1247, 'epoch': 1} {'type': 'loss', 'content': 0.029356516897678375, 'timestamp': '2025-10-01 04:17:39.567947', 'step': 1248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:39.638799', 'step': 1248, 'epoch': 1} {'type': 'loss', 'content': 0.027411503717303276, 'timestamp': '2025-10-01 04:17:39.647742', 'step': 1249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:39.704819', 'step': 1249, 'epoch': 1} {'type': 'loss', 'content': 0.020227093249559402, 'timestamp': '2025-10-01 04:17:39.713083', 'step': 1250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:39.759333', 'step': 1250, 'epoch': 1} {'type': 'loss', 'content': 0.031126294285058975, 'timestamp': '2025-10-01 04:17:39.767320', 'step': 1251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:39.830352', 'step': 1251, 'epoch': 1} {'type': 'loss', 'content': 0.015709269791841507, 'timestamp': '2025-10-01 04:17:39.859324', 'step': 1252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:39.918276', 'step': 1252, 'epoch': 1} {'type': 'loss', 'content': 0.03319138288497925, 'timestamp': '2025-10-01 04:17:39.925850', 'step': 1253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:39.989088', 'step': 1253, 'epoch': 1} {'type': 'loss', 'content': 0.019776884466409683, 'timestamp': '2025-10-01 04:17:39.999858', 'step': 1254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:40.051599', 'step': 1254, 'epoch': 1} {'type': 'loss', 'content': 0.013148770667612553, 'timestamp': '2025-10-01 04:17:40.062622', 'step': 1255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:40.098146', 'step': 1255, 'epoch': 1} {'type': 'loss', 'content': 0.015216666273772717, 'timestamp': '2025-10-01 04:17:40.130620', 'step': 1256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:40.163260', 'step': 1256, 'epoch': 1} {'type': 'loss', 'content': 0.04050387814640999, 'timestamp': '2025-10-01 04:17:40.171521', 'step': 1257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:40.209416', 'step': 1257, 'epoch': 1} {'type': 'loss', 'content': 0.02167111448943615, 'timestamp': '2025-10-01 04:17:40.221045', 'step': 1258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:40.259408', 'step': 1258, 'epoch': 1} {'type': 'loss', 'content': 0.015278061851859093, 'timestamp': '2025-10-01 04:17:40.274715', 'step': 1259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:40.308694', 'step': 1259, 'epoch': 1} {'type': 'loss', 'content': 0.019215909764170647, 'timestamp': '2025-10-01 04:17:40.341127', 'step': 1260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:40.373221', 'step': 1260, 'epoch': 1} {'type': 'loss', 'content': 0.016482410952448845, 'timestamp': '2025-10-01 04:17:40.382353', 'step': 1261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:40.415713', 'step': 1261, 'epoch': 1} {'type': 'loss', 'content': 0.0244333166629076, 'timestamp': '2025-10-01 04:17:40.428432', 'step': 1262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:40.464400', 'step': 1262, 'epoch': 1} {'type': 'loss', 'content': 0.022354763001203537, 'timestamp': '2025-10-01 04:17:40.477199', 'step': 1263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:40.511796', 'step': 1263, 'epoch': 1} {'type': 'loss', 'content': 0.013279612176120281, 'timestamp': '2025-10-01 04:17:40.544410', 'step': 1264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:40.576723', 'step': 1264, 'epoch': 1} {'type': 'loss', 'content': 0.021984396502375603, 'timestamp': '2025-10-01 04:17:40.585842', 'step': 1265, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:17:43.190213', 'step': 1265, 'epoch': 1} {'type': 'pplx', 'content': 5.492337728141367, 'timestamp': '2025-10-01 04:17:43.194415', 'step': 1265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:43.240449', 'step': 1265, 'epoch': 1} {'type': 'loss', 'content': 0.01686171442270279, 'timestamp': '2025-10-01 04:17:43.249800', 'step': 1266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:43.299484', 'step': 1266, 'epoch': 1} {'type': 'loss', 'content': 0.016604052856564522, 'timestamp': '2025-10-01 04:17:43.312055', 'step': 1267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:43.348521', 'step': 1267, 'epoch': 1} {'type': 'loss', 'content': 0.027909575030207634, 'timestamp': '2025-10-01 04:17:43.377701', 'step': 1268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:43.413099', 'step': 1268, 'epoch': 1} {'type': 'loss', 'content': 0.02259516716003418, 'timestamp': '2025-10-01 04:17:43.425952', 'step': 1269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:43.458857', 'step': 1269, 'epoch': 1} {'type': 'loss', 'content': 0.02021465264260769, 'timestamp': '2025-10-01 04:17:43.470338', 'step': 1270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:43.508148', 'step': 1270, 'epoch': 1} {'type': 'loss', 'content': 0.023183852434158325, 'timestamp': '2025-10-01 04:17:43.520696', 'step': 1271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:43.559096', 'step': 1271, 'epoch': 1} {'type': 'loss', 'content': 0.015795599669218063, 'timestamp': '2025-10-01 04:17:43.593632', 'step': 1272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:43.631455', 'step': 1272, 'epoch': 1} {'type': 'loss', 'content': 0.028308458626270294, 'timestamp': '2025-10-01 04:17:43.639758', 'step': 1273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:43.680216', 'step': 1273, 'epoch': 1} {'type': 'loss', 'content': 0.01798139326274395, 'timestamp': '2025-10-01 04:17:43.692982', 'step': 1274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:43.729383', 'step': 1274, 'epoch': 1} {'type': 'loss', 'content': 0.03902094066143036, 'timestamp': '2025-10-01 04:17:43.741919', 'step': 1275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:43.775767', 'step': 1275, 'epoch': 1} {'type': 'loss', 'content': 0.02079535648226738, 'timestamp': '2025-10-01 04:17:43.809445', 'step': 1276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:43.850339', 'step': 1276, 'epoch': 1} {'type': 'loss', 'content': 0.021634308621287346, 'timestamp': '2025-10-01 04:17:43.863181', 'step': 1277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:43.897041', 'step': 1277, 'epoch': 1} {'type': 'loss', 'content': 0.010929740965366364, 'timestamp': '2025-10-01 04:17:43.908447', 'step': 1278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:43.946135', 'step': 1278, 'epoch': 1} {'type': 'loss', 'content': 0.021371889859437943, 'timestamp': '2025-10-01 04:17:43.957375', 'step': 1279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:44.000956', 'step': 1279, 'epoch': 1} {'type': 'loss', 'content': 0.042651500552892685, 'timestamp': '2025-10-01 04:17:44.034496', 'step': 1280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:44.070950', 'step': 1280, 'epoch': 1} {'type': 'loss', 'content': 0.026062199845910072, 'timestamp': '2025-10-01 04:17:44.080270', 'step': 1281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:44.117714', 'step': 1281, 'epoch': 1} {'type': 'loss', 'content': 0.03550971671938896, 'timestamp': '2025-10-01 04:17:44.129220', 'step': 1282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:44.165417', 'step': 1282, 'epoch': 1} {'type': 'loss', 'content': 0.02891443856060505, 'timestamp': '2025-10-01 04:17:44.178158', 'step': 1283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:44.224271', 'step': 1283, 'epoch': 1} {'type': 'loss', 'content': 0.025692131370306015, 'timestamp': '2025-10-01 04:17:44.256536', 'step': 1284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:44.302684', 'step': 1284, 'epoch': 1} {'type': 'loss', 'content': 0.021801907569169998, 'timestamp': '2025-10-01 04:17:44.313501', 'step': 1285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:44.352096', 'step': 1285, 'epoch': 1} {'type': 'loss', 'content': 0.03619185835123062, 'timestamp': '2025-10-01 04:17:44.362780', 'step': 1286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:44.400088', 'step': 1286, 'epoch': 1} {'type': 'loss', 'content': 0.033847156912088394, 'timestamp': '2025-10-01 04:17:44.407905', 'step': 1287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:44.442837', 'step': 1287, 'epoch': 1} {'type': 'loss', 'content': 0.018534669652581215, 'timestamp': '2025-10-01 04:17:44.476249', 'step': 1288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:44.517370', 'step': 1288, 'epoch': 1} {'type': 'loss', 'content': 0.03822058066725731, 'timestamp': '2025-10-01 04:17:44.526410', 'step': 1289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:44.559528', 'step': 1289, 'epoch': 1} {'type': 'loss', 'content': 0.016333594918251038, 'timestamp': '2025-10-01 04:17:44.567741', 'step': 1290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:44.602300', 'step': 1290, 'epoch': 1} {'type': 'loss', 'content': 0.024304017424583435, 'timestamp': '2025-10-01 04:17:44.614770', 'step': 1291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:44.647711', 'step': 1291, 'epoch': 1} {'type': 'loss', 'content': 0.022015796974301338, 'timestamp': '2025-10-01 04:17:44.676862', 'step': 1292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:44.715580', 'step': 1292, 'epoch': 1} {'type': 'loss', 'content': 0.023391572758555412, 'timestamp': '2025-10-01 04:17:44.726401', 'step': 1293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:44.760959', 'step': 1293, 'epoch': 1} {'type': 'loss', 'content': 0.018955737352371216, 'timestamp': '2025-10-01 04:17:44.773499', 'step': 1294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:44.808390', 'step': 1294, 'epoch': 1} {'type': 'loss', 'content': 0.020337548106908798, 'timestamp': '2025-10-01 04:17:44.820862', 'step': 1295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:44.856231', 'step': 1295, 'epoch': 1} {'type': 'loss', 'content': 0.02703406848013401, 'timestamp': '2025-10-01 04:17:44.889910', 'step': 1296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:44.924901', 'step': 1296, 'epoch': 1} {'type': 'loss', 'content': 0.018195156008005142, 'timestamp': '2025-10-01 04:17:44.937787', 'step': 1297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:44.980018', 'step': 1297, 'epoch': 1} {'type': 'loss', 'content': 0.018607541918754578, 'timestamp': '2025-10-01 04:17:44.992783', 'step': 1298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:45.027098', 'step': 1298, 'epoch': 1} {'type': 'loss', 'content': 0.01542731188237667, 'timestamp': '2025-10-01 04:17:45.038739', 'step': 1299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:45.072906', 'step': 1299, 'epoch': 1} {'type': 'loss', 'content': 0.01874428242444992, 'timestamp': '2025-10-01 04:17:45.106445', 'step': 1300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:45.142923', 'step': 1300, 'epoch': 1} {'type': 'loss', 'content': 0.020559163764119148, 'timestamp': '2025-10-01 04:17:45.155796', 'step': 1301, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:45.194939', 'step': 1301, 'epoch': 1} {'type': 'loss', 'content': 0.05335977301001549, 'timestamp': '2025-10-01 04:17:45.208935', 'step': 1302, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:45.244515', 'step': 1302, 'epoch': 1} {'type': 'loss', 'content': 0.019412174820899963, 'timestamp': '2025-10-01 04:17:45.257285', 'step': 1303, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:45.297579', 'step': 1303, 'epoch': 1} {'type': 'loss', 'content': 0.024255063384771347, 'timestamp': '2025-10-01 04:17:45.331255', 'step': 1304, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:45.367816', 'step': 1304, 'epoch': 1} {'type': 'loss', 'content': 0.033310417085886, 'timestamp': '2025-10-01 04:17:45.376853', 'step': 1305, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:45.415181', 'step': 1305, 'epoch': 1} {'type': 'loss', 'content': 0.03630458936095238, 'timestamp': '2025-10-01 04:17:45.427888', 'step': 1306, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:45.462515', 'step': 1306, 'epoch': 1} {'type': 'loss', 'content': 0.022850660607218742, 'timestamp': '2025-10-01 04:17:45.475182', 'step': 1307, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:45.509602', 'step': 1307, 'epoch': 1} {'type': 'loss', 'content': 0.024639906361699104, 'timestamp': '2025-10-01 04:17:45.543101', 'step': 1308, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:45.578393', 'step': 1308, 'epoch': 1} {'type': 'loss', 'content': 0.013606730848550797, 'timestamp': '2025-10-01 04:17:45.583956', 'step': 1309, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:45.620523', 'step': 1309, 'epoch': 1} {'type': 'loss', 'content': 0.023541061207652092, 'timestamp': '2025-10-01 04:17:45.632982', 'step': 1310, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:45.667004', 'step': 1310, 'epoch': 1} {'type': 'loss', 'content': 0.02182057872414589, 'timestamp': '2025-10-01 04:17:45.679729', 'step': 1311, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:45.715097', 'step': 1311, 'epoch': 1} {'type': 'loss', 'content': 0.025034120306372643, 'timestamp': '2025-10-01 04:17:45.746759', 'step': 1312, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:45.780570', 'step': 1312, 'epoch': 1} {'type': 'loss', 'content': 0.022200141102075577, 'timestamp': '2025-10-01 04:17:45.788964', 'step': 1313, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:45.823835', 'step': 1313, 'epoch': 1} {'type': 'loss', 'content': 0.025101641193032265, 'timestamp': '2025-10-01 04:17:45.834701', 'step': 1314, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:45.869441', 'step': 1314, 'epoch': 1} {'type': 'loss', 'content': 0.026965370401740074, 'timestamp': '2025-10-01 04:17:45.877160', 'step': 1315, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:45.909149', 'step': 1315, 'epoch': 1} {'type': 'loss', 'content': 0.013870656490325928, 'timestamp': '2025-10-01 04:17:45.937993', 'step': 1316, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:45.971627', 'step': 1316, 'epoch': 1} {'type': 'loss', 'content': 0.018582269549369812, 'timestamp': '2025-10-01 04:17:45.977334', 'step': 1317, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:46.012247', 'step': 1317, 'epoch': 1} {'type': 'loss', 'content': 0.0073986416682600975, 'timestamp': '2025-10-01 04:17:46.020115', 'step': 1318, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:46.053079', 'step': 1318, 'epoch': 1} {'type': 'loss', 'content': 0.025576066225767136, 'timestamp': '2025-10-01 04:17:46.061018', 'step': 1319, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:46.096317', 'step': 1319, 'epoch': 1} {'type': 'loss', 'content': 0.035522423684597015, 'timestamp': '2025-10-01 04:17:46.125294', 'step': 1320, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:46.158305', 'step': 1320, 'epoch': 1} {'type': 'loss', 'content': 0.019052352756261826, 'timestamp': '2025-10-01 04:17:46.164165', 'step': 1321, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:46.200052', 'step': 1321, 'epoch': 1} {'type': 'loss', 'content': 0.015524309128522873, 'timestamp': '2025-10-01 04:17:46.210770', 'step': 1322, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:46.246199', 'step': 1322, 'epoch': 1} {'type': 'loss', 'content': 0.012862238101661205, 'timestamp': '2025-10-01 04:17:46.253661', 'step': 1323, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:46.293811', 'step': 1323, 'epoch': 1} {'type': 'loss', 'content': 0.017116829752922058, 'timestamp': '2025-10-01 04:17:46.322719', 'step': 1324, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:46.361362', 'step': 1324, 'epoch': 1} {'type': 'loss', 'content': 0.023478763177990913, 'timestamp': '2025-10-01 04:17:46.367020', 'step': 1325, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:17:46.405144', 'step': 1325, 'epoch': 1} {'type': 'loss', 'content': 0.03501209244132042, 'timestamp': '2025-10-01 04:17:46.409456', 'step': 1326, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:46.445310', 'step': 1326, 'epoch': 1} {'type': 'loss', 'content': 0.027715127915143967, 'timestamp': '2025-10-01 04:17:46.456020', 'step': 1327, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:46.496280', 'step': 1327, 'epoch': 1} {'type': 'loss', 'content': 0.016849998384714127, 'timestamp': '2025-10-01 04:17:46.529693', 'step': 1328, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:46.568027', 'step': 1328, 'epoch': 1} {'type': 'loss', 'content': 0.028173206374049187, 'timestamp': '2025-10-01 04:17:46.581388', 'step': 1329, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:46.621371', 'step': 1329, 'epoch': 1} {'type': 'loss', 'content': 0.023412570357322693, 'timestamp': '2025-10-01 04:17:46.629073', 'step': 1330, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:46.669753', 'step': 1330, 'epoch': 1} {'type': 'loss', 'content': 0.0318622924387455, 'timestamp': '2025-10-01 04:17:46.682231', 'step': 1331, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:46.722135', 'step': 1331, 'epoch': 1} {'type': 'loss', 'content': 0.012228555977344513, 'timestamp': '2025-10-01 04:17:46.756631', 'step': 1332, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:46.792315', 'step': 1332, 'epoch': 1} {'type': 'loss', 'content': 0.02357361651957035, 'timestamp': '2025-10-01 04:17:46.801526', 'step': 1333, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:46.852327', 'step': 1333, 'epoch': 1} {'type': 'loss', 'content': 0.05098006874322891, 'timestamp': '2025-10-01 04:17:46.865855', 'step': 1334, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:46.903419', 'step': 1334, 'epoch': 1} {'type': 'loss', 'content': 0.05719012767076492, 'timestamp': '2025-10-01 04:17:46.915966', 'step': 1335, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:46.967160', 'step': 1335, 'epoch': 1} {'type': 'loss', 'content': 0.022550299763679504, 'timestamp': '2025-10-01 04:17:46.999718', 'step': 1336, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:47.038218', 'step': 1336, 'epoch': 1} {'type': 'loss', 'content': 0.039951570332050323, 'timestamp': '2025-10-01 04:17:47.046691', 'step': 1337, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:47.083639', 'step': 1337, 'epoch': 1} {'type': 'loss', 'content': 0.010665753856301308, 'timestamp': '2025-10-01 04:17:47.096275', 'step': 1338, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:47.130389', 'step': 1338, 'epoch': 1} {'type': 'loss', 'content': 0.00990741141140461, 'timestamp': '2025-10-01 04:17:47.142942', 'step': 1339, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:47.186460', 'step': 1339, 'epoch': 1} {'type': 'loss', 'content': 0.01759345270693302, 'timestamp': '2025-10-01 04:17:47.219925', 'step': 1340, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:47.253220', 'step': 1340, 'epoch': 1} {'type': 'loss', 'content': 0.014484407380223274, 'timestamp': '2025-10-01 04:17:47.261592', 'step': 1341, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:47.305417', 'step': 1341, 'epoch': 1} {'type': 'loss', 'content': 0.030333848670125008, 'timestamp': '2025-10-01 04:17:47.316986', 'step': 1342, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:47.351278', 'step': 1342, 'epoch': 1} {'type': 'loss', 'content': 0.030901635065674782, 'timestamp': '2025-10-01 04:17:47.362047', 'step': 1343, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:47.405437', 'step': 1343, 'epoch': 1} {'type': 'loss', 'content': 0.03119835816323757, 'timestamp': '2025-10-01 04:17:47.437025', 'step': 1344, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:47.480927', 'step': 1344, 'epoch': 1} {'type': 'loss', 'content': 0.022074224427342415, 'timestamp': '2025-10-01 04:17:47.491241', 'step': 1345, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:47.529135', 'step': 1345, 'epoch': 1} {'type': 'loss', 'content': 0.025745537132024765, 'timestamp': '2025-10-01 04:17:47.541647', 'step': 1346, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:47.585192', 'step': 1346, 'epoch': 1} {'type': 'loss', 'content': 0.024026384577155113, 'timestamp': '2025-10-01 04:17:47.596453', 'step': 1347, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:47.637209', 'step': 1347, 'epoch': 1} {'type': 'loss', 'content': 0.039784472435712814, 'timestamp': '2025-10-01 04:17:47.672165', 'step': 1348, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:47.708901', 'step': 1348, 'epoch': 1} {'type': 'loss', 'content': 0.01853860355913639, 'timestamp': '2025-10-01 04:17:47.719978', 'step': 1349, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:47.758015', 'step': 1349, 'epoch': 1} {'type': 'loss', 'content': 0.015193779952824116, 'timestamp': '2025-10-01 04:17:47.766217', 'step': 1350, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:47.802585', 'step': 1350, 'epoch': 1} {'type': 'loss', 'content': 0.018944907933473587, 'timestamp': '2025-10-01 04:17:47.815310', 'step': 1351, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:47.862846', 'step': 1351, 'epoch': 1} {'type': 'loss', 'content': 0.02555946819484234, 'timestamp': '2025-10-01 04:17:47.891812', 'step': 1352, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:47.926629', 'step': 1352, 'epoch': 1} {'type': 'loss', 'content': 0.021566983312368393, 'timestamp': '2025-10-01 04:17:47.935659', 'step': 1353, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:47.974977', 'step': 1353, 'epoch': 1} {'type': 'loss', 'content': 0.022174106910824776, 'timestamp': '2025-10-01 04:17:47.986609', 'step': 1354, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:48.027246', 'step': 1354, 'epoch': 1} {'type': 'loss', 'content': 0.01684454083442688, 'timestamp': '2025-10-01 04:17:48.034753', 'step': 1355, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:48.070544', 'step': 1355, 'epoch': 1} {'type': 'loss', 'content': 0.02601834386587143, 'timestamp': '2025-10-01 04:17:48.099461', 'step': 1356, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:48.137482', 'step': 1356, 'epoch': 1} {'type': 'loss', 'content': 0.02316879853606224, 'timestamp': '2025-10-01 04:17:48.142564', 'step': 1357, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:48.181538', 'step': 1357, 'epoch': 1} {'type': 'loss', 'content': 0.022926362231373787, 'timestamp': '2025-10-01 04:17:48.186545', 'step': 1358, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:48.228997', 'step': 1358, 'epoch': 1} {'type': 'loss', 'content': 0.02760382555425167, 'timestamp': '2025-10-01 04:17:48.236566', 'step': 1359, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:48.276887', 'step': 1359, 'epoch': 1} {'type': 'loss', 'content': 0.019699225202202797, 'timestamp': '2025-10-01 04:17:48.308755', 'step': 1360, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:48.351685', 'step': 1360, 'epoch': 1} {'type': 'loss', 'content': 0.01297005359083414, 'timestamp': '2025-10-01 04:17:48.360119', 'step': 1361, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:48.407732', 'step': 1361, 'epoch': 1} {'type': 'loss', 'content': 0.026044223457574844, 'timestamp': '2025-10-01 04:17:48.420440', 'step': 1362, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:48.458794', 'step': 1362, 'epoch': 1} {'type': 'loss', 'content': 0.02130778506398201, 'timestamp': '2025-10-01 04:17:48.472783', 'step': 1363, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:48.517629', 'step': 1363, 'epoch': 1} {'type': 'loss', 'content': 0.011479970999062061, 'timestamp': '2025-10-01 04:17:48.546800', 'step': 1364, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:48.590687', 'step': 1364, 'epoch': 1} {'type': 'loss', 'content': 0.021426871418952942, 'timestamp': '2025-10-01 04:17:48.596302', 'step': 1365, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:48.635708', 'step': 1365, 'epoch': 1} {'type': 'loss', 'content': 0.015869390219449997, 'timestamp': '2025-10-01 04:17:48.643762', 'step': 1366, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:48.682481', 'step': 1366, 'epoch': 1} {'type': 'loss', 'content': 0.010812297463417053, 'timestamp': '2025-10-01 04:17:48.693891', 'step': 1367, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:48.735684', 'step': 1367, 'epoch': 1} {'type': 'loss', 'content': 0.02029472216963768, 'timestamp': '2025-10-01 04:17:48.764240', 'step': 1368, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:48.803743', 'step': 1368, 'epoch': 1} {'type': 'loss', 'content': 0.014588592574000359, 'timestamp': '2025-10-01 04:17:48.812143', 'step': 1369, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:48.857628', 'step': 1369, 'epoch': 1} {'type': 'loss', 'content': 0.02048553340137005, 'timestamp': '2025-10-01 04:17:48.871592', 'step': 1370, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:48.916410', 'step': 1370, 'epoch': 1} {'type': 'loss', 'content': 0.026746736839413643, 'timestamp': '2025-10-01 04:17:48.923669', 'step': 1371, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:48.962525', 'step': 1371, 'epoch': 1} {'type': 'loss', 'content': 0.022588498890399933, 'timestamp': '2025-10-01 04:17:48.994559', 'step': 1372, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:49.035190', 'step': 1372, 'epoch': 1} {'type': 'loss', 'content': 0.031310636550188065, 'timestamp': '2025-10-01 04:17:49.043383', 'step': 1373, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:49.083605', 'step': 1373, 'epoch': 1} {'type': 'loss', 'content': 0.020347315818071365, 'timestamp': '2025-10-01 04:17:49.096114', 'step': 1374, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:49.142343', 'step': 1374, 'epoch': 1} {'type': 'loss', 'content': 0.0178179070353508, 'timestamp': '2025-10-01 04:17:49.151671', 'step': 1375, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:49.201251', 'step': 1375, 'epoch': 1} {'type': 'loss', 'content': 0.009956605732440948, 'timestamp': '2025-10-01 04:17:49.229830', 'step': 1376, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:49.270998', 'step': 1376, 'epoch': 1} {'type': 'loss', 'content': 0.01747898757457733, 'timestamp': '2025-10-01 04:17:49.279641', 'step': 1377, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:49.327691', 'step': 1377, 'epoch': 1} {'type': 'loss', 'content': 0.01432023849338293, 'timestamp': '2025-10-01 04:17:49.337185', 'step': 1378, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:49.383389', 'step': 1378, 'epoch': 1} {'type': 'loss', 'content': 0.018536902964115143, 'timestamp': '2025-10-01 04:17:49.390022', 'step': 1379, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:49.445345', 'step': 1379, 'epoch': 1} {'type': 'loss', 'content': 0.03330438211560249, 'timestamp': '2025-10-01 04:17:49.475890', 'step': 1380, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:17:52.291907', 'step': 1380, 'epoch': 1} {'type': 'pplx', 'content': 5.332488542738945, 'timestamp': '2025-10-01 04:17:52.293602', 'step': 1380, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:52.328038', 'step': 1380, 'epoch': 1} {'type': 'loss', 'content': 0.03699210658669472, 'timestamp': '2025-10-01 04:17:52.341345', 'step': 1381, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:52.379802', 'step': 1381, 'epoch': 1} {'type': 'loss', 'content': 0.013190505094826221, 'timestamp': '2025-10-01 04:17:52.387972', 'step': 1382, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:52.430217', 'step': 1382, 'epoch': 1} {'type': 'loss', 'content': 0.0301478523761034, 'timestamp': '2025-10-01 04:17:52.444178', 'step': 1383, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:52.483880', 'step': 1383, 'epoch': 1} {'type': 'loss', 'content': 0.02687494270503521, 'timestamp': '2025-10-01 04:17:52.518315', 'step': 1384, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:52.551609', 'step': 1384, 'epoch': 1} {'type': 'loss', 'content': 0.0200329702347517, 'timestamp': '2025-10-01 04:17:52.562074', 'step': 1385, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:52.596622', 'step': 1385, 'epoch': 1} {'type': 'loss', 'content': 0.02248782105743885, 'timestamp': '2025-10-01 04:17:52.610236', 'step': 1386, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:52.643436', 'step': 1386, 'epoch': 1} {'type': 'loss', 'content': 0.015516207553446293, 'timestamp': '2025-10-01 04:17:52.654229', 'step': 1387, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:52.690087', 'step': 1387, 'epoch': 1} {'type': 'loss', 'content': 0.017798220738768578, 'timestamp': '2025-10-01 04:17:52.722645', 'step': 1388, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:52.762569', 'step': 1388, 'epoch': 1} {'type': 'loss', 'content': 0.01432851329445839, 'timestamp': '2025-10-01 04:17:52.767832', 'step': 1389, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:52.809677', 'step': 1389, 'epoch': 1} {'type': 'loss', 'content': 0.014311430975794792, 'timestamp': '2025-10-01 04:17:52.817299', 'step': 1390, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:52.855850', 'step': 1390, 'epoch': 1} {'type': 'loss', 'content': 0.03865724802017212, 'timestamp': '2025-10-01 04:17:52.863746', 'step': 1391, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:52.907422', 'step': 1391, 'epoch': 1} {'type': 'loss', 'content': 0.009136580862104893, 'timestamp': '2025-10-01 04:17:52.935927', 'step': 1392, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:52.975238', 'step': 1392, 'epoch': 1} {'type': 'loss', 'content': 0.011726299300789833, 'timestamp': '2025-10-01 04:17:52.988052', 'step': 1393, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:53.024517', 'step': 1393, 'epoch': 1} {'type': 'loss', 'content': 0.017422828823328018, 'timestamp': '2025-10-01 04:17:53.035582', 'step': 1394, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:53.071942', 'step': 1394, 'epoch': 1} {'type': 'loss', 'content': 0.015002214349806309, 'timestamp': '2025-10-01 04:17:53.083277', 'step': 1395, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:17:53.127161', 'step': 1395, 'epoch': 1} {'type': 'loss', 'content': 0.01298123225569725, 'timestamp': '2025-10-01 04:17:53.162087', 'step': 1396, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:53.202099', 'step': 1396, 'epoch': 1} {'type': 'loss', 'content': 0.022806180641055107, 'timestamp': '2025-10-01 04:17:53.211118', 'step': 1397, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:53.249554', 'step': 1397, 'epoch': 1} {'type': 'loss', 'content': 0.018283938989043236, 'timestamp': '2025-10-01 04:17:53.261091', 'step': 1398, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:53.296499', 'step': 1398, 'epoch': 1} {'type': 'loss', 'content': 0.028261395171284676, 'timestamp': '2025-10-01 04:17:53.307168', 'step': 1399, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:53.346551', 'step': 1399, 'epoch': 1} {'type': 'loss', 'content': 0.017059974372386932, 'timestamp': '2025-10-01 04:17:53.378186', 'step': 1400, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:53.411010', 'step': 1400, 'epoch': 1} {'type': 'loss', 'content': 0.014436691999435425, 'timestamp': '2025-10-01 04:17:53.419039', 'step': 1401, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:53.460935', 'step': 1401, 'epoch': 1} {'type': 'loss', 'content': 0.02647019736468792, 'timestamp': '2025-10-01 04:17:53.468249', 'step': 1402, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:53.505439', 'step': 1402, 'epoch': 1} {'type': 'loss', 'content': 0.017433462664484978, 'timestamp': '2025-10-01 04:17:53.515616', 'step': 1403, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:53.554050', 'step': 1403, 'epoch': 1} {'type': 'loss', 'content': 0.015735285356640816, 'timestamp': '2025-10-01 04:17:53.583296', 'step': 1404, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:53.627783', 'step': 1404, 'epoch': 1} {'type': 'loss', 'content': 0.02397422306239605, 'timestamp': '2025-10-01 04:17:53.640603', 'step': 1405, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:53.676193', 'step': 1405, 'epoch': 1} {'type': 'loss', 'content': 0.014264235273003578, 'timestamp': '2025-10-01 04:17:53.684593', 'step': 1406, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:53.726960', 'step': 1406, 'epoch': 1} {'type': 'loss', 'content': 0.01394173689186573, 'timestamp': '2025-10-01 04:17:53.738366', 'step': 1407, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:53.778258', 'step': 1407, 'epoch': 1} {'type': 'loss', 'content': 0.023644041270017624, 'timestamp': '2025-10-01 04:17:53.812032', 'step': 1408, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:53.847483', 'step': 1408, 'epoch': 1} {'type': 'loss', 'content': 0.041283827275037766, 'timestamp': '2025-10-01 04:17:53.856769', 'step': 1409, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:53.894820', 'step': 1409, 'epoch': 1} {'type': 'loss', 'content': 0.01421945821493864, 'timestamp': '2025-10-01 04:17:53.907376', 'step': 1410, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:53.943538', 'step': 1410, 'epoch': 1} {'type': 'loss', 'content': 0.045332930982112885, 'timestamp': '2025-10-01 04:17:53.951804', 'step': 1411, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:53.990423', 'step': 1411, 'epoch': 1} {'type': 'loss', 'content': 0.030463675037026405, 'timestamp': '2025-10-01 04:17:54.021883', 'step': 1412, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:54.054959', 'step': 1412, 'epoch': 1} {'type': 'loss', 'content': 0.017262602224946022, 'timestamp': '2025-10-01 04:17:54.065979', 'step': 1413, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:54.101936', 'step': 1413, 'epoch': 1} {'type': 'loss', 'content': 0.019965918734669685, 'timestamp': '2025-10-01 04:17:54.114470', 'step': 1414, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:54.150685', 'step': 1414, 'epoch': 1} {'type': 'loss', 'content': 0.01880469173192978, 'timestamp': '2025-10-01 04:17:54.164124', 'step': 1415, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:54.198953', 'step': 1415, 'epoch': 1} {'type': 'loss', 'content': 0.01885572075843811, 'timestamp': '2025-10-01 04:17:54.231129', 'step': 1416, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:54.268865', 'step': 1416, 'epoch': 1} {'type': 'loss', 'content': 0.013193568214774132, 'timestamp': '2025-10-01 04:17:54.274032', 'step': 1417, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:54.312863', 'step': 1417, 'epoch': 1} {'type': 'loss', 'content': 0.01069488376379013, 'timestamp': '2025-10-01 04:17:54.324184', 'step': 1418, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:54.362053', 'step': 1418, 'epoch': 1} {'type': 'loss', 'content': 0.028652342036366463, 'timestamp': '2025-10-01 04:17:54.369902', 'step': 1419, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:54.403706', 'step': 1419, 'epoch': 1} {'type': 'loss', 'content': 0.024117540568113327, 'timestamp': '2025-10-01 04:17:54.435044', 'step': 1420, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:54.473774', 'step': 1420, 'epoch': 1} {'type': 'loss', 'content': 0.020331203937530518, 'timestamp': '2025-10-01 04:17:54.482526', 'step': 1421, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:54.518179', 'step': 1421, 'epoch': 1} {'type': 'loss', 'content': 0.017535358667373657, 'timestamp': '2025-10-01 04:17:54.526141', 'step': 1422, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:54.569072', 'step': 1422, 'epoch': 1} {'type': 'loss', 'content': 0.0198888648301363, 'timestamp': '2025-10-01 04:17:54.580376', 'step': 1423, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:54.615042', 'step': 1423, 'epoch': 1} {'type': 'loss', 'content': 0.026057172566652298, 'timestamp': '2025-10-01 04:17:54.642984', 'step': 1424, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:54.688795', 'step': 1424, 'epoch': 1} {'type': 'loss', 'content': 0.020001044496893883, 'timestamp': '2025-10-01 04:17:54.693840', 'step': 1425, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:54.749295', 'step': 1425, 'epoch': 1} {'type': 'loss', 'content': 0.019602738320827484, 'timestamp': '2025-10-01 04:17:54.756754', 'step': 1426, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:17:54.795160', 'step': 1426, 'epoch': 1} {'type': 'loss', 'content': 0.010403119027614594, 'timestamp': '2025-10-01 04:17:54.799817', 'step': 1427, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:54.850509', 'step': 1427, 'epoch': 1} {'type': 'loss', 'content': 0.013352718204259872, 'timestamp': '2025-10-01 04:17:54.882206', 'step': 1428, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:54.922450', 'step': 1428, 'epoch': 1} {'type': 'loss', 'content': 0.010968288406729698, 'timestamp': '2025-10-01 04:17:54.933348', 'step': 1429, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:54.973450', 'step': 1429, 'epoch': 1} {'type': 'loss', 'content': 0.024545131251215935, 'timestamp': '2025-10-01 04:17:54.981715', 'step': 1430, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:17:55.025393', 'step': 1430, 'epoch': 1} {'type': 'loss', 'content': 0.017408376559615135, 'timestamp': '2025-10-01 04:17:55.029951', 'step': 1431, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:55.075635', 'step': 1431, 'epoch': 1} {'type': 'loss', 'content': 0.015040339902043343, 'timestamp': '2025-10-01 04:17:55.104242', 'step': 1432, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:17:55.149238', 'step': 1432, 'epoch': 1} {'type': 'loss', 'content': 0.01711038313806057, 'timestamp': '2025-10-01 04:17:55.151581', 'step': 1433, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:55.190786', 'step': 1433, 'epoch': 1} {'type': 'loss', 'content': 0.01482602022588253, 'timestamp': '2025-10-01 04:17:55.198395', 'step': 1434, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:55.245455', 'step': 1434, 'epoch': 1} {'type': 'loss', 'content': 0.017927950248122215, 'timestamp': '2025-10-01 04:17:55.253726', 'step': 1435, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:55.289552', 'step': 1435, 'epoch': 1} {'type': 'loss', 'content': 0.017121005803346634, 'timestamp': '2025-10-01 04:17:55.321181', 'step': 1436, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:55.359475', 'step': 1436, 'epoch': 1} {'type': 'loss', 'content': 0.014498970471322536, 'timestamp': '2025-10-01 04:17:55.365508', 'step': 1437, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:55.407903', 'step': 1437, 'epoch': 1} {'type': 'loss', 'content': 0.01076497882604599, 'timestamp': '2025-10-01 04:17:55.418355', 'step': 1438, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:55.461399', 'step': 1438, 'epoch': 1} {'type': 'loss', 'content': 0.006214038003236055, 'timestamp': '2025-10-01 04:17:55.472828', 'step': 1439, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:55.521031', 'step': 1439, 'epoch': 1} {'type': 'loss', 'content': 0.019860219210386276, 'timestamp': '2025-10-01 04:17:55.550510', 'step': 1440, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:55.596852', 'step': 1440, 'epoch': 1} {'type': 'loss', 'content': 0.022825557738542557, 'timestamp': '2025-10-01 04:17:55.602466', 'step': 1441, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:55.642916', 'step': 1441, 'epoch': 1} {'type': 'loss', 'content': 0.023838376626372337, 'timestamp': '2025-10-01 04:17:55.651268', 'step': 1442, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:55.695643', 'step': 1442, 'epoch': 1} {'type': 'loss', 'content': 0.018519744277000427, 'timestamp': '2025-10-01 04:17:55.703808', 'step': 1443, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:55.752681', 'step': 1443, 'epoch': 1} {'type': 'loss', 'content': 0.018157457932829857, 'timestamp': '2025-10-01 04:17:55.786107', 'step': 1444, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:55.820631', 'step': 1444, 'epoch': 1} {'type': 'loss', 'content': 0.027135591953992844, 'timestamp': '2025-10-01 04:17:55.826663', 'step': 1445, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:55.859066', 'step': 1445, 'epoch': 1} {'type': 'loss', 'content': 0.018489615991711617, 'timestamp': '2025-10-01 04:17:55.867332', 'step': 1446, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:55.909209', 'step': 1446, 'epoch': 1} {'type': 'loss', 'content': 0.02455466240644455, 'timestamp': '2025-10-01 04:17:55.922732', 'step': 1447, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:55.956634', 'step': 1447, 'epoch': 1} {'type': 'loss', 'content': 0.01674746163189411, 'timestamp': '2025-10-01 04:17:55.986008', 'step': 1448, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:56.017573', 'step': 1448, 'epoch': 1} {'type': 'loss', 'content': 0.025684408843517303, 'timestamp': '2025-10-01 04:17:56.026443', 'step': 1449, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:56.061646', 'step': 1449, 'epoch': 1} {'type': 'loss', 'content': 0.012956513091921806, 'timestamp': '2025-10-01 04:17:56.074362', 'step': 1450, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:17:56.112594', 'step': 1450, 'epoch': 1} {'type': 'loss', 'content': 0.011197579093277454, 'timestamp': '2025-10-01 04:17:56.126106', 'step': 1451, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:56.163852', 'step': 1451, 'epoch': 1} {'type': 'loss', 'content': 0.01490304060280323, 'timestamp': '2025-10-01 04:17:56.192298', 'step': 1452, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:17:56.233495', 'step': 1452, 'epoch': 1} {'type': 'loss', 'content': 0.022594623267650604, 'timestamp': '2025-10-01 04:17:56.235853', 'step': 1453, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:56.274014', 'step': 1453, 'epoch': 1} {'type': 'loss', 'content': 0.018350785598158836, 'timestamp': '2025-10-01 04:17:56.281450', 'step': 1454, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:56.318269', 'step': 1454, 'epoch': 1} {'type': 'loss', 'content': 0.02252149023115635, 'timestamp': '2025-10-01 04:17:56.326156', 'step': 1455, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:56.364425', 'step': 1455, 'epoch': 1} {'type': 'loss', 'content': 0.015543840825557709, 'timestamp': '2025-10-01 04:17:56.393073', 'step': 1456, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:56.428442', 'step': 1456, 'epoch': 1} {'type': 'loss', 'content': 0.017735548317432404, 'timestamp': '2025-10-01 04:17:56.436778', 'step': 1457, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:56.475920', 'step': 1457, 'epoch': 1} {'type': 'loss', 'content': 0.03465959057211876, 'timestamp': '2025-10-01 04:17:56.486720', 'step': 1458, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:17:56.528384', 'step': 1458, 'epoch': 1} {'type': 'loss', 'content': 0.05344873294234276, 'timestamp': '2025-10-01 04:17:56.531504', 'step': 1459, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:56.572586', 'step': 1459, 'epoch': 1} {'type': 'loss', 'content': 0.027159608900547028, 'timestamp': '2025-10-01 04:17:56.601044', 'step': 1460, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:56.644606', 'step': 1460, 'epoch': 1} {'type': 'loss', 'content': 0.016680359840393066, 'timestamp': '2025-10-01 04:17:56.650512', 'step': 1461, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:56.697415', 'step': 1461, 'epoch': 1} {'type': 'loss', 'content': 0.02154964953660965, 'timestamp': '2025-10-01 04:17:56.705784', 'step': 1462, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:56.757201', 'step': 1462, 'epoch': 1} {'type': 'loss', 'content': 0.016181670129299164, 'timestamp': '2025-10-01 04:17:56.768769', 'step': 1463, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:56.805313', 'step': 1463, 'epoch': 1} {'type': 'loss', 'content': 0.03245038166642189, 'timestamp': '2025-10-01 04:17:56.834706', 'step': 1464, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:56.881963', 'step': 1464, 'epoch': 1} {'type': 'loss', 'content': 0.016212454065680504, 'timestamp': '2025-10-01 04:17:56.892344', 'step': 1465, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:56.935736', 'step': 1465, 'epoch': 1} {'type': 'loss', 'content': 0.02356822043657303, 'timestamp': '2025-10-01 04:17:56.943688', 'step': 1466, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:56.983875', 'step': 1466, 'epoch': 1} {'type': 'loss', 'content': 0.013604934327304363, 'timestamp': '2025-10-01 04:17:56.996556', 'step': 1467, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:57.035696', 'step': 1467, 'epoch': 1} {'type': 'loss', 'content': 0.028578083962202072, 'timestamp': '2025-10-01 04:17:57.069173', 'step': 1468, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:57.105979', 'step': 1468, 'epoch': 1} {'type': 'loss', 'content': 0.031199678778648376, 'timestamp': '2025-10-01 04:17:57.115316', 'step': 1469, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:57.159345', 'step': 1469, 'epoch': 1} {'type': 'loss', 'content': 0.018763314932584763, 'timestamp': '2025-10-01 04:17:57.170965', 'step': 1470, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:57.203689', 'step': 1470, 'epoch': 1} {'type': 'loss', 'content': 0.017443107441067696, 'timestamp': '2025-10-01 04:17:57.211996', 'step': 1471, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:57.256063', 'step': 1471, 'epoch': 1} {'type': 'loss', 'content': 0.01511184498667717, 'timestamp': '2025-10-01 04:17:57.284652', 'step': 1472, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:57.329201', 'step': 1472, 'epoch': 1} {'type': 'loss', 'content': 0.02664830908179283, 'timestamp': '2025-10-01 04:17:57.334600', 'step': 1473, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:57.372840', 'step': 1473, 'epoch': 1} {'type': 'loss', 'content': 0.016942927613854408, 'timestamp': '2025-10-01 04:17:57.384316', 'step': 1474, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:57.439558', 'step': 1474, 'epoch': 1} {'type': 'loss', 'content': 0.01934242621064186, 'timestamp': '2025-10-01 04:17:57.450252', 'step': 1475, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:57.487038', 'step': 1475, 'epoch': 1} {'type': 'loss', 'content': 0.030840331688523293, 'timestamp': '2025-10-01 04:17:57.520520', 'step': 1476, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:57.579630', 'step': 1476, 'epoch': 1} {'type': 'loss', 'content': 0.023219402879476547, 'timestamp': '2025-10-01 04:17:57.587955', 'step': 1477, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:17:57.624562', 'step': 1477, 'epoch': 1} {'type': 'loss', 'content': 0.033144544810056686, 'timestamp': '2025-10-01 04:17:57.632490', 'step': 1478, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:57.690427', 'step': 1478, 'epoch': 1} {'type': 'loss', 'content': 0.025627050548791885, 'timestamp': '2025-10-01 04:17:57.698784', 'step': 1479, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:57.735925', 'step': 1479, 'epoch': 1} {'type': 'loss', 'content': 0.019381066784262657, 'timestamp': '2025-10-01 04:17:57.767566', 'step': 1480, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:57.806119', 'step': 1480, 'epoch': 1} {'type': 'loss', 'content': 0.03029000759124756, 'timestamp': '2025-10-01 04:17:57.815466', 'step': 1481, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:57.861892', 'step': 1481, 'epoch': 1} {'type': 'loss', 'content': 0.03448953106999397, 'timestamp': '2025-10-01 04:17:57.873566', 'step': 1482, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:57.914459', 'step': 1482, 'epoch': 1} {'type': 'loss', 'content': 0.01334314327687025, 'timestamp': '2025-10-01 04:17:57.925572', 'step': 1483, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:57.980092', 'step': 1483, 'epoch': 1} {'type': 'loss', 'content': 0.034625034779310226, 'timestamp': '2025-10-01 04:17:58.015090', 'step': 1484, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:17:58.057449', 'step': 1484, 'epoch': 1} {'type': 'loss', 'content': 0.024145063012838364, 'timestamp': '2025-10-01 04:17:58.070794', 'step': 1485, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:17:58.104027', 'step': 1485, 'epoch': 1} {'type': 'loss', 'content': 0.02899235486984253, 'timestamp': '2025-10-01 04:17:58.111656', 'step': 1486, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:17:58.164829', 'step': 1486, 'epoch': 1} {'type': 'loss', 'content': 0.013184099458158016, 'timestamp': '2025-10-01 04:17:58.177544', 'step': 1487, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:17:58.228873', 'step': 1487, 'epoch': 1} {'type': 'loss', 'content': 0.013117182068526745, 'timestamp': '2025-10-01 04:17:58.267682', 'step': 1488, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:58.317038', 'step': 1488, 'epoch': 1} {'type': 'loss', 'content': 0.01810397394001484, 'timestamp': '2025-10-01 04:17:58.326386', 'step': 1489, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:17:58.372713', 'step': 1489, 'epoch': 1} {'type': 'loss', 'content': 0.018638476729393005, 'timestamp': '2025-10-01 04:17:58.385218', 'step': 1490, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:58.427851', 'step': 1490, 'epoch': 1} {'type': 'loss', 'content': 0.016684915870428085, 'timestamp': '2025-10-01 04:17:58.438754', 'step': 1491, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:17:58.487529', 'step': 1491, 'epoch': 1} {'type': 'loss', 'content': 0.01958133466541767, 'timestamp': '2025-10-01 04:17:58.519329', 'step': 1492, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:17:58.560598', 'step': 1492, 'epoch': 1} {'type': 'loss', 'content': 0.04190078005194664, 'timestamp': '2025-10-01 04:17:58.566567', 'step': 1493, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:17:58.615071', 'step': 1493, 'epoch': 1} {'type': 'loss', 'content': 0.011535041965544224, 'timestamp': '2025-10-01 04:17:58.622270', 'step': 1494, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:17:58.668276', 'step': 1494, 'epoch': 1} {'type': 'loss', 'content': 0.018958117812871933, 'timestamp': '2025-10-01 04:17:58.679770', 'step': 1495, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:18:01.628829', 'step': 1495, 'epoch': 1} {'type': 'pplx', 'content': 5.448813704574382, 'timestamp': '2025-10-01 04:18:01.630334', 'step': 1495, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:01.663342', 'step': 1495, 'epoch': 1} {'type': 'loss', 'content': 0.017586395144462585, 'timestamp': '2025-10-01 04:18:01.694715', 'step': 1496, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:01.733967', 'step': 1496, 'epoch': 1} {'type': 'loss', 'content': 0.01691913791000843, 'timestamp': '2025-10-01 04:18:01.747332', 'step': 1497, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:01.791314', 'step': 1497, 'epoch': 1} {'type': 'loss', 'content': 0.022938566282391548, 'timestamp': '2025-10-01 04:18:01.799427', 'step': 1498, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:01.867891', 'step': 1498, 'epoch': 1} {'type': 'loss', 'content': 0.0243848729878664, 'timestamp': '2025-10-01 04:18:01.875855', 'step': 1499, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:01.912811', 'step': 1499, 'epoch': 1} {'type': 'loss', 'content': 0.015246634371578693, 'timestamp': '2025-10-01 04:18:01.945090', 'step': 1500, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1500', 'timestamp': '2025-10-01 04:18:07.210260', 'step': 1500, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:07.267481', 'step': 1500, 'epoch': 1} {'type': 'loss', 'content': 0.017364447936415672, 'timestamp': '2025-10-01 04:18:07.274897', 'step': 1501, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:07.331784', 'step': 1501, 'epoch': 1} {'type': 'loss', 'content': 0.006571675185114145, 'timestamp': '2025-10-01 04:18:07.344133', 'step': 1502, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:07.403711', 'step': 1502, 'epoch': 1} {'type': 'loss', 'content': 0.009253782220184803, 'timestamp': '2025-10-01 04:18:07.417278', 'step': 1503, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:07.480762', 'step': 1503, 'epoch': 1} {'type': 'loss', 'content': 0.009325696155428886, 'timestamp': '2025-10-01 04:18:07.512553', 'step': 1504, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:07.572632', 'step': 1504, 'epoch': 1} {'type': 'loss', 'content': 0.01157869677990675, 'timestamp': '2025-10-01 04:18:07.581876', 'step': 1505, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:07.629379', 'step': 1505, 'epoch': 1} {'type': 'loss', 'content': 0.014158712700009346, 'timestamp': '2025-10-01 04:18:07.640956', 'step': 1506, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:07.696959', 'step': 1506, 'epoch': 1} {'type': 'loss', 'content': 0.02838299609720707, 'timestamp': '2025-10-01 04:18:07.704419', 'step': 1507, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:07.757433', 'step': 1507, 'epoch': 1} {'type': 'loss', 'content': 0.012078782543540001, 'timestamp': '2025-10-01 04:18:07.786611', 'step': 1508, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:07.844241', 'step': 1508, 'epoch': 1} {'type': 'loss', 'content': 0.007396257482469082, 'timestamp': '2025-10-01 04:18:07.857648', 'step': 1509, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:07.909514', 'step': 1509, 'epoch': 1} {'type': 'loss', 'content': 0.015254822559654713, 'timestamp': '2025-10-01 04:18:07.921387', 'step': 1510, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:07.979173', 'step': 1510, 'epoch': 1} {'type': 'loss', 'content': 0.02308519557118416, 'timestamp': '2025-10-01 04:18:07.990690', 'step': 1511, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:08.031862', 'step': 1511, 'epoch': 1} {'type': 'loss', 'content': 0.01683235354721546, 'timestamp': '2025-10-01 04:18:08.063357', 'step': 1512, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:08.117954', 'step': 1512, 'epoch': 1} {'type': 'loss', 'content': 0.006957236677408218, 'timestamp': '2025-10-01 04:18:08.128465', 'step': 1513, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:08.180659', 'step': 1513, 'epoch': 1} {'type': 'loss', 'content': 0.02949751727283001, 'timestamp': '2025-10-01 04:18:08.193184', 'step': 1514, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:08.240425', 'step': 1514, 'epoch': 1} {'type': 'loss', 'content': 0.02741338312625885, 'timestamp': '2025-10-01 04:18:08.248453', 'step': 1515, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:08.298981', 'step': 1515, 'epoch': 1} {'type': 'loss', 'content': 0.027072006836533546, 'timestamp': '2025-10-01 04:18:08.331586', 'step': 1516, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:08.378345', 'step': 1516, 'epoch': 1} {'type': 'loss', 'content': 0.018121542409062386, 'timestamp': '2025-10-01 04:18:08.386767', 'step': 1517, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:08.429545', 'step': 1517, 'epoch': 1} {'type': 'loss', 'content': 0.03576618805527687, 'timestamp': '2025-10-01 04:18:08.437272', 'step': 1518, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:08.481392', 'step': 1518, 'epoch': 1} {'type': 'loss', 'content': 0.025120655074715614, 'timestamp': '2025-10-01 04:18:08.489018', 'step': 1519, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:08.539991', 'step': 1519, 'epoch': 1} {'type': 'loss', 'content': 0.03449385613203049, 'timestamp': '2025-10-01 04:18:08.569070', 'step': 1520, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:08.614599', 'step': 1520, 'epoch': 1} {'type': 'loss', 'content': 0.026891129091382027, 'timestamp': '2025-10-01 04:18:08.620243', 'step': 1521, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:08.657186', 'step': 1521, 'epoch': 1} {'type': 'loss', 'content': 0.029606394469738007, 'timestamp': '2025-10-01 04:18:08.669754', 'step': 1522, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:08.721576', 'step': 1522, 'epoch': 1} {'type': 'loss', 'content': 0.014822524040937424, 'timestamp': '2025-10-01 04:18:08.735036', 'step': 1523, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:08.790686', 'step': 1523, 'epoch': 1} {'type': 'loss', 'content': 0.02269207127392292, 'timestamp': '2025-10-01 04:18:08.820185', 'step': 1524, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:08.880750', 'step': 1524, 'epoch': 1} {'type': 'loss', 'content': 0.034071117639541626, 'timestamp': '2025-10-01 04:18:08.886776', 'step': 1525, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:08.936285', 'step': 1525, 'epoch': 1} {'type': 'loss', 'content': 0.017200184985995293, 'timestamp': '2025-10-01 04:18:08.943673', 'step': 1526, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:08.984560', 'step': 1526, 'epoch': 1} {'type': 'loss', 'content': 0.020306864753365517, 'timestamp': '2025-10-01 04:18:08.996215', 'step': 1527, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:09.051914', 'step': 1527, 'epoch': 1} {'type': 'loss', 'content': 0.05050971359014511, 'timestamp': '2025-10-01 04:18:09.086377', 'step': 1528, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:09.137359', 'step': 1528, 'epoch': 1} {'type': 'loss', 'content': 0.026865525171160698, 'timestamp': '2025-10-01 04:18:09.145617', 'step': 1529, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:09.195778', 'step': 1529, 'epoch': 1} {'type': 'loss', 'content': 0.03304493799805641, 'timestamp': '2025-10-01 04:18:09.207362', 'step': 1530, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:09.257146', 'step': 1530, 'epoch': 1} {'type': 'loss', 'content': 0.023649422451853752, 'timestamp': '2025-10-01 04:18:09.271175', 'step': 1531, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:09.314334', 'step': 1531, 'epoch': 1} {'type': 'loss', 'content': 0.021493330597877502, 'timestamp': '2025-10-01 04:18:09.348079', 'step': 1532, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:09.400783', 'step': 1532, 'epoch': 1} {'type': 'loss', 'content': 0.011504357680678368, 'timestamp': '2025-10-01 04:18:09.409946', 'step': 1533, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:09.461233', 'step': 1533, 'epoch': 1} {'type': 'loss', 'content': 0.04498176649212837, 'timestamp': '2025-10-01 04:18:09.474076', 'step': 1534, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:09.515041', 'step': 1534, 'epoch': 1} {'type': 'loss', 'content': 0.022841744124889374, 'timestamp': '2025-10-01 04:18:09.526565', 'step': 1535, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:09.561333', 'step': 1535, 'epoch': 1} {'type': 'loss', 'content': 0.01160109881311655, 'timestamp': '2025-10-01 04:18:09.595073', 'step': 1536, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:09.641307', 'step': 1536, 'epoch': 1} {'type': 'loss', 'content': 0.012231712229549885, 'timestamp': '2025-10-01 04:18:09.654185', 'step': 1537, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:09.706083', 'step': 1537, 'epoch': 1} {'type': 'loss', 'content': 0.03560103848576546, 'timestamp': '2025-10-01 04:18:09.718604', 'step': 1538, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:09.772822', 'step': 1538, 'epoch': 1} {'type': 'loss', 'content': 0.014405665919184685, 'timestamp': '2025-10-01 04:18:09.785551', 'step': 1539, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:09.822981', 'step': 1539, 'epoch': 1} {'type': 'loss', 'content': 0.04839467629790306, 'timestamp': '2025-10-01 04:18:09.856503', 'step': 1540, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:09.908247', 'step': 1540, 'epoch': 1} {'type': 'loss', 'content': 0.023640353232622147, 'timestamp': '2025-10-01 04:18:09.916751', 'step': 1541, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:09.975158', 'step': 1541, 'epoch': 1} {'type': 'loss', 'content': 0.008837074041366577, 'timestamp': '2025-10-01 04:18:09.989211', 'step': 1542, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:10.022601', 'step': 1542, 'epoch': 1} {'type': 'loss', 'content': 0.023689191788434982, 'timestamp': '2025-10-01 04:18:10.036538', 'step': 1543, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:10.083981', 'step': 1543, 'epoch': 1} {'type': 'loss', 'content': 0.021525302901864052, 'timestamp': '2025-10-01 04:18:10.117521', 'step': 1544, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:10.169159', 'step': 1544, 'epoch': 1} {'type': 'loss', 'content': 0.006888038478791714, 'timestamp': '2025-10-01 04:18:10.179749', 'step': 1545, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:10.227033', 'step': 1545, 'epoch': 1} {'type': 'loss', 'content': 0.02512364089488983, 'timestamp': '2025-10-01 04:18:10.240634', 'step': 1546, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:10.276423', 'step': 1546, 'epoch': 1} {'type': 'loss', 'content': 0.01647218130528927, 'timestamp': '2025-10-01 04:18:10.288955', 'step': 1547, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:10.346510', 'step': 1547, 'epoch': 1} {'type': 'loss', 'content': 0.02184182219207287, 'timestamp': '2025-10-01 04:18:10.380038', 'step': 1548, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:10.421792', 'step': 1548, 'epoch': 1} {'type': 'loss', 'content': 0.007580046076327562, 'timestamp': '2025-10-01 04:18:10.434670', 'step': 1549, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:10.485723', 'step': 1549, 'epoch': 1} {'type': 'loss', 'content': 0.012353477999567986, 'timestamp': '2025-10-01 04:18:10.498504', 'step': 1550, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:10.535748', 'step': 1550, 'epoch': 1} {'type': 'loss', 'content': 0.013606374152004719, 'timestamp': '2025-10-01 04:18:10.547170', 'step': 1551, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:10.598383', 'step': 1551, 'epoch': 1} {'type': 'loss', 'content': 0.021855227649211884, 'timestamp': '2025-10-01 04:18:10.632946', 'step': 1552, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:10.675689', 'step': 1552, 'epoch': 1} {'type': 'loss', 'content': 0.014832005836069584, 'timestamp': '2025-10-01 04:18:10.686985', 'step': 1553, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:10.734448', 'step': 1553, 'epoch': 1} {'type': 'loss', 'content': 0.02415865659713745, 'timestamp': '2025-10-01 04:18:10.745886', 'step': 1554, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:10.801845', 'step': 1554, 'epoch': 1} {'type': 'loss', 'content': 0.028419308364391327, 'timestamp': '2025-10-01 04:18:10.815405', 'step': 1555, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:10.853136', 'step': 1555, 'epoch': 1} {'type': 'loss', 'content': 0.021399280056357384, 'timestamp': '2025-10-01 04:18:10.882535', 'step': 1556, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:10.924922', 'step': 1556, 'epoch': 1} {'type': 'loss', 'content': 0.020992500707507133, 'timestamp': '2025-10-01 04:18:10.940482', 'step': 1557, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:10.990369', 'step': 1557, 'epoch': 1} {'type': 'loss', 'content': 0.010936364531517029, 'timestamp': '2025-10-01 04:18:11.002019', 'step': 1558, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:11.050810', 'step': 1558, 'epoch': 1} {'type': 'loss', 'content': 0.016816353425383568, 'timestamp': '2025-10-01 04:18:11.064908', 'step': 1559, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:11.104551', 'step': 1559, 'epoch': 1} {'type': 'loss', 'content': 0.013380827382206917, 'timestamp': '2025-10-01 04:18:11.133779', 'step': 1560, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:11.177006', 'step': 1560, 'epoch': 1} {'type': 'loss', 'content': 0.02546640858054161, 'timestamp': '2025-10-01 04:18:11.185538', 'step': 1561, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:18:11.236139', 'step': 1561, 'epoch': 1} {'type': 'loss', 'content': 0.011112100444734097, 'timestamp': '2025-10-01 04:18:11.250385', 'step': 1562, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:11.293416', 'step': 1562, 'epoch': 1} {'type': 'loss', 'content': 0.02321033366024494, 'timestamp': '2025-10-01 04:18:11.305087', 'step': 1563, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:11.343945', 'step': 1563, 'epoch': 1} {'type': 'loss', 'content': 0.03194960951805115, 'timestamp': '2025-10-01 04:18:11.375508', 'step': 1564, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:11.413685', 'step': 1564, 'epoch': 1} {'type': 'loss', 'content': 0.03183642402291298, 'timestamp': '2025-10-01 04:18:11.423032', 'step': 1565, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:11.476094', 'step': 1565, 'epoch': 1} {'type': 'loss', 'content': 0.026108121499419212, 'timestamp': '2025-10-01 04:18:11.488653', 'step': 1566, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:11.525814', 'step': 1566, 'epoch': 1} {'type': 'loss', 'content': 0.020046010613441467, 'timestamp': '2025-10-01 04:18:11.539321', 'step': 1567, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:11.573599', 'step': 1567, 'epoch': 1} {'type': 'loss', 'content': 0.04007160663604736, 'timestamp': '2025-10-01 04:18:11.605496', 'step': 1568, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:11.648191', 'step': 1568, 'epoch': 1} {'type': 'loss', 'content': 0.017696283757686615, 'timestamp': '2025-10-01 04:18:11.656842', 'step': 1569, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:11.699091', 'step': 1569, 'epoch': 1} {'type': 'loss', 'content': 0.012454686686396599, 'timestamp': '2025-10-01 04:18:11.711806', 'step': 1570, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:11.762411', 'step': 1570, 'epoch': 1} {'type': 'loss', 'content': 0.013571929186582565, 'timestamp': '2025-10-01 04:18:11.774250', 'step': 1571, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:11.820087', 'step': 1571, 'epoch': 1} {'type': 'loss', 'content': 0.024686938151717186, 'timestamp': '2025-10-01 04:18:11.853806', 'step': 1572, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:11.889438', 'step': 1572, 'epoch': 1} {'type': 'loss', 'content': 0.020217042416334152, 'timestamp': '2025-10-01 04:18:11.900544', 'step': 1573, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:11.937948', 'step': 1573, 'epoch': 1} {'type': 'loss', 'content': 0.01481956522911787, 'timestamp': '2025-10-01 04:18:11.950446', 'step': 1574, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:11.993577', 'step': 1574, 'epoch': 1} {'type': 'loss', 'content': 0.020689889788627625, 'timestamp': '2025-10-01 04:18:12.006113', 'step': 1575, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:12.062027', 'step': 1575, 'epoch': 1} {'type': 'loss', 'content': 0.007984258234500885, 'timestamp': '2025-10-01 04:18:12.097003', 'step': 1576, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:12.134604', 'step': 1576, 'epoch': 1} {'type': 'loss', 'content': 0.012333019636571407, 'timestamp': '2025-10-01 04:18:12.147383', 'step': 1577, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:18:12.190631', 'step': 1577, 'epoch': 1} {'type': 'loss', 'content': 0.01109802071005106, 'timestamp': '2025-10-01 04:18:12.206721', 'step': 1578, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:18:12.248799', 'step': 1578, 'epoch': 1} {'type': 'loss', 'content': 0.007729155011475086, 'timestamp': '2025-10-01 04:18:12.264614', 'step': 1579, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:12.309555', 'step': 1579, 'epoch': 1} {'type': 'loss', 'content': 0.029064537957310677, 'timestamp': '2025-10-01 04:18:12.344485', 'step': 1580, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:12.395841', 'step': 1580, 'epoch': 1} {'type': 'loss', 'content': 0.02199581079185009, 'timestamp': '2025-10-01 04:18:12.405034', 'step': 1581, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:12.450111', 'step': 1581, 'epoch': 1} {'type': 'loss', 'content': 0.03018105961382389, 'timestamp': '2025-10-01 04:18:12.461677', 'step': 1582, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:12.507781', 'step': 1582, 'epoch': 1} {'type': 'loss', 'content': 0.01891062781214714, 'timestamp': '2025-10-01 04:18:12.518008', 'step': 1583, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:12.563089', 'step': 1583, 'epoch': 1} {'type': 'loss', 'content': 0.013830526731908321, 'timestamp': '2025-10-01 04:18:12.596804', 'step': 1584, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:12.643085', 'step': 1584, 'epoch': 1} {'type': 'loss', 'content': 0.019199766218662262, 'timestamp': '2025-10-01 04:18:12.651158', 'step': 1585, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:12.706558', 'step': 1585, 'epoch': 1} {'type': 'loss', 'content': 0.014671191573143005, 'timestamp': '2025-10-01 04:18:12.719111', 'step': 1586, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:12.765117', 'step': 1586, 'epoch': 1} {'type': 'loss', 'content': 0.04207087680697441, 'timestamp': '2025-10-01 04:18:12.775951', 'step': 1587, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:12.824996', 'step': 1587, 'epoch': 1} {'type': 'loss', 'content': 0.01165616326034069, 'timestamp': '2025-10-01 04:18:12.858680', 'step': 1588, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:12.926546', 'step': 1588, 'epoch': 1} {'type': 'loss', 'content': 0.038383204489946365, 'timestamp': '2025-10-01 04:18:12.939391', 'step': 1589, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:13.013548', 'step': 1589, 'epoch': 1} {'type': 'loss', 'content': 0.020439956337213516, 'timestamp': '2025-10-01 04:18:13.025247', 'step': 1590, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:13.095381', 'step': 1590, 'epoch': 1} {'type': 'loss', 'content': 0.011590687558054924, 'timestamp': '2025-10-01 04:18:13.103285', 'step': 1591, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:13.160115', 'step': 1591, 'epoch': 1} {'type': 'loss', 'content': 0.01732492633163929, 'timestamp': '2025-10-01 04:18:13.191773', 'step': 1592, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:13.229875', 'step': 1592, 'epoch': 1} {'type': 'loss', 'content': 0.03131359443068504, 'timestamp': '2025-10-01 04:18:13.239316', 'step': 1593, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:13.298679', 'step': 1593, 'epoch': 1} {'type': 'loss', 'content': 0.012945168651640415, 'timestamp': '2025-10-01 04:18:13.311186', 'step': 1594, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:13.362676', 'step': 1594, 'epoch': 1} {'type': 'loss', 'content': 0.019916623830795288, 'timestamp': '2025-10-01 04:18:13.376658', 'step': 1595, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:13.415817', 'step': 1595, 'epoch': 1} {'type': 'loss', 'content': 0.019979530945420265, 'timestamp': '2025-10-01 04:18:13.444521', 'step': 1596, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:13.488847', 'step': 1596, 'epoch': 1} {'type': 'loss', 'content': 0.010706532746553421, 'timestamp': '2025-10-01 04:18:13.499337', 'step': 1597, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:13.540917', 'step': 1597, 'epoch': 1} {'type': 'loss', 'content': 0.00958202313631773, 'timestamp': '2025-10-01 04:18:13.554993', 'step': 1598, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:13.599402', 'step': 1598, 'epoch': 1} {'type': 'loss', 'content': 0.023142145946621895, 'timestamp': '2025-10-01 04:18:13.610291', 'step': 1599, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:13.655405', 'step': 1599, 'epoch': 1} {'type': 'loss', 'content': 0.023094169795513153, 'timestamp': '2025-10-01 04:18:13.689027', 'step': 1600, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:18:13.728385', 'step': 1600, 'epoch': 1} {'type': 'loss', 'content': 0.01491971779614687, 'timestamp': '2025-10-01 04:18:13.741953', 'step': 1601, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:13.794838', 'step': 1601, 'epoch': 1} {'type': 'loss', 'content': 0.018602842465043068, 'timestamp': '2025-10-01 04:18:13.807603', 'step': 1602, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:13.851856', 'step': 1602, 'epoch': 1} {'type': 'loss', 'content': 0.01623660884797573, 'timestamp': '2025-10-01 04:18:13.865394', 'step': 1603, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:13.905085', 'step': 1603, 'epoch': 1} {'type': 'loss', 'content': 0.016775593161582947, 'timestamp': '2025-10-01 04:18:13.939624', 'step': 1604, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:13.981263', 'step': 1604, 'epoch': 1} {'type': 'loss', 'content': 0.0342467799782753, 'timestamp': '2025-10-01 04:18:13.992295', 'step': 1605, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:14.031844', 'step': 1605, 'epoch': 1} {'type': 'loss', 'content': 0.013792905956506729, 'timestamp': '2025-10-01 04:18:14.045828', 'step': 1606, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:14.087145', 'step': 1606, 'epoch': 1} {'type': 'loss', 'content': 0.017715562134981155, 'timestamp': '2025-10-01 04:18:14.100660', 'step': 1607, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:14.135082', 'step': 1607, 'epoch': 1} {'type': 'loss', 'content': 0.012390793301165104, 'timestamp': '2025-10-01 04:18:14.168566', 'step': 1608, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:14.200675', 'step': 1608, 'epoch': 1} {'type': 'loss', 'content': 0.028751736506819725, 'timestamp': '2025-10-01 04:18:14.206464', 'step': 1609, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:14.239051', 'step': 1609, 'epoch': 1} {'type': 'loss', 'content': 0.02901051566004753, 'timestamp': '2025-10-01 04:18:14.250522', 'step': 1610, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:18:17.227985', 'step': 1610, 'epoch': 1} {'type': 'pplx', 'content': 5.593979122038321, 'timestamp': '2025-10-01 04:18:17.230261', 'step': 1610, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:17.267722', 'step': 1610, 'epoch': 1} {'type': 'loss', 'content': 0.025198616087436676, 'timestamp': '2025-10-01 04:18:17.279888', 'step': 1611, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:18:17.342539', 'step': 1611, 'epoch': 1} {'type': 'loss', 'content': 0.018780868500471115, 'timestamp': '2025-10-01 04:18:17.379799', 'step': 1612, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:17.427536', 'step': 1612, 'epoch': 1} {'type': 'loss', 'content': 0.011398532427847385, 'timestamp': '2025-10-01 04:18:17.440356', 'step': 1613, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:18:17.504348', 'step': 1613, 'epoch': 1} {'type': 'loss', 'content': 0.01256561279296875, 'timestamp': '2025-10-01 04:18:17.520155', 'step': 1614, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:17.566173', 'step': 1614, 'epoch': 1} {'type': 'loss', 'content': 0.016573766246438026, 'timestamp': '2025-10-01 04:18:17.578652', 'step': 1615, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:17.619716', 'step': 1615, 'epoch': 1} {'type': 'loss', 'content': 0.010477817617356777, 'timestamp': '2025-10-01 04:18:17.653518', 'step': 1616, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:17.696108', 'step': 1616, 'epoch': 1} {'type': 'loss', 'content': 0.045246005058288574, 'timestamp': '2025-10-01 04:18:17.704483', 'step': 1617, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:17.752839', 'step': 1617, 'epoch': 1} {'type': 'loss', 'content': 0.03001241199672222, 'timestamp': '2025-10-01 04:18:17.765261', 'step': 1618, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:17.830762', 'step': 1618, 'epoch': 1} {'type': 'loss', 'content': 0.023897085338830948, 'timestamp': '2025-10-01 04:18:17.844808', 'step': 1619, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:18:17.894507', 'step': 1619, 'epoch': 1} {'type': 'loss', 'content': 0.040546610951423645, 'timestamp': '2025-10-01 04:18:17.920201', 'step': 1620, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:18:17.970692', 'step': 1620, 'epoch': 1} {'type': 'loss', 'content': 0.04108528792858124, 'timestamp': '2025-10-01 04:18:17.985902', 'step': 1621, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:18:18.033853', 'step': 1621, 'epoch': 1} {'type': 'loss', 'content': 0.04425758495926857, 'timestamp': '2025-10-01 04:18:18.038707', 'step': 1622, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:18:18.088526', 'step': 1622, 'epoch': 1} {'type': 'loss', 'content': 0.017232973128557205, 'timestamp': '2025-10-01 04:18:18.104803', 'step': 1623, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:18.148798', 'step': 1623, 'epoch': 1} {'type': 'loss', 'content': 0.017971260473132133, 'timestamp': '2025-10-01 04:18:18.176988', 'step': 1624, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:18.229010', 'step': 1624, 'epoch': 1} {'type': 'loss', 'content': 0.01961800642311573, 'timestamp': '2025-10-01 04:18:18.238249', 'step': 1625, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:18.276692', 'step': 1625, 'epoch': 1} {'type': 'loss', 'content': 0.029848063364624977, 'timestamp': '2025-10-01 04:18:18.288388', 'step': 1626, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:18.335970', 'step': 1626, 'epoch': 1} {'type': 'loss', 'content': 0.025408850982785225, 'timestamp': '2025-10-01 04:18:18.346751', 'step': 1627, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:18.400572', 'step': 1627, 'epoch': 1} {'type': 'loss', 'content': 0.01796720176935196, 'timestamp': '2025-10-01 04:18:18.434097', 'step': 1628, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:18.492561', 'step': 1628, 'epoch': 1} {'type': 'loss', 'content': 0.03473173826932907, 'timestamp': '2025-10-01 04:18:18.497852', 'step': 1629, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:18.545599', 'step': 1629, 'epoch': 1} {'type': 'loss', 'content': 0.017034264281392097, 'timestamp': '2025-10-01 04:18:18.558189', 'step': 1630, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:18.608047', 'step': 1630, 'epoch': 1} {'type': 'loss', 'content': 0.013170353136956692, 'timestamp': '2025-10-01 04:18:18.619642', 'step': 1631, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:18.674963', 'step': 1631, 'epoch': 1} {'type': 'loss', 'content': 0.015764208510518074, 'timestamp': '2025-10-01 04:18:18.704072', 'step': 1632, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:18:18.748187', 'step': 1632, 'epoch': 1} {'type': 'loss', 'content': 0.05028000846505165, 'timestamp': '2025-10-01 04:18:18.750729', 'step': 1633, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:18.804428', 'step': 1633, 'epoch': 1} {'type': 'loss', 'content': 0.01453358307480812, 'timestamp': '2025-10-01 04:18:18.816943', 'step': 1634, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:18:18.862044', 'step': 1634, 'epoch': 1} {'type': 'loss', 'content': 0.015392385423183441, 'timestamp': '2025-10-01 04:18:18.878062', 'step': 1635, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:18.911825', 'step': 1635, 'epoch': 1} {'type': 'loss', 'content': 0.013445505872368813, 'timestamp': '2025-10-01 04:18:18.940187', 'step': 1636, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:18.999214', 'step': 1636, 'epoch': 1} {'type': 'loss', 'content': 0.016553904861211777, 'timestamp': '2025-10-01 04:18:19.010276', 'step': 1637, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:19.055359', 'step': 1637, 'epoch': 1} {'type': 'loss', 'content': 0.03215773403644562, 'timestamp': '2025-10-01 04:18:19.067912', 'step': 1638, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:19.110619', 'step': 1638, 'epoch': 1} {'type': 'loss', 'content': 0.02450038306415081, 'timestamp': '2025-10-01 04:18:19.121404', 'step': 1639, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:19.177356', 'step': 1639, 'epoch': 1} {'type': 'loss', 'content': 0.032134588807821274, 'timestamp': '2025-10-01 04:18:19.212287', 'step': 1640, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:19.257095', 'step': 1640, 'epoch': 1} {'type': 'loss', 'content': 0.02503615990281105, 'timestamp': '2025-10-01 04:18:19.266107', 'step': 1641, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:19.311274', 'step': 1641, 'epoch': 1} {'type': 'loss', 'content': 0.018451670184731483, 'timestamp': '2025-10-01 04:18:19.323904', 'step': 1642, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:19.370129', 'step': 1642, 'epoch': 1} {'type': 'loss', 'content': 0.02114991284906864, 'timestamp': '2025-10-01 04:18:19.382860', 'step': 1643, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:19.429875', 'step': 1643, 'epoch': 1} {'type': 'loss', 'content': 0.010146068409085274, 'timestamp': '2025-10-01 04:18:19.463320', 'step': 1644, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:19.503631', 'step': 1644, 'epoch': 1} {'type': 'loss', 'content': 0.017030686140060425, 'timestamp': '2025-10-01 04:18:19.517110', 'step': 1645, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:19.560710', 'step': 1645, 'epoch': 1} {'type': 'loss', 'content': 0.03314747288823128, 'timestamp': '2025-10-01 04:18:19.574267', 'step': 1646, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:19.619309', 'step': 1646, 'epoch': 1} {'type': 'loss', 'content': 0.011789940297603607, 'timestamp': '2025-10-01 04:18:19.632813', 'step': 1647, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:19.676678', 'step': 1647, 'epoch': 1} {'type': 'loss', 'content': 0.008658136241137981, 'timestamp': '2025-10-01 04:18:19.710182', 'step': 1648, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:19.755605', 'step': 1648, 'epoch': 1} {'type': 'loss', 'content': 0.01645156554877758, 'timestamp': '2025-10-01 04:18:19.765688', 'step': 1649, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:19.806994', 'step': 1649, 'epoch': 1} {'type': 'loss', 'content': 0.007588530424982309, 'timestamp': '2025-10-01 04:18:19.817916', 'step': 1650, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:19.864346', 'step': 1650, 'epoch': 1} {'type': 'loss', 'content': 0.02591988444328308, 'timestamp': '2025-10-01 04:18:19.871988', 'step': 1651, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:19.906265', 'step': 1651, 'epoch': 1} {'type': 'loss', 'content': 0.016371173784136772, 'timestamp': '2025-10-01 04:18:19.938145', 'step': 1652, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:19.983499', 'step': 1652, 'epoch': 1} {'type': 'loss', 'content': 0.008740334771573544, 'timestamp': '2025-10-01 04:18:19.996867', 'step': 1653, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:20.041499', 'step': 1653, 'epoch': 1} {'type': 'loss', 'content': 0.02610090747475624, 'timestamp': '2025-10-01 04:18:20.049021', 'step': 1654, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:20.108679', 'step': 1654, 'epoch': 1} {'type': 'loss', 'content': 0.027114836499094963, 'timestamp': '2025-10-01 04:18:20.122697', 'step': 1655, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:20.170131', 'step': 1655, 'epoch': 1} {'type': 'loss', 'content': 0.020500676706433296, 'timestamp': '2025-10-01 04:18:20.199300', 'step': 1656, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:20.239648', 'step': 1656, 'epoch': 1} {'type': 'loss', 'content': 0.018444588407874107, 'timestamp': '2025-10-01 04:18:20.247985', 'step': 1657, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:20.299217', 'step': 1657, 'epoch': 1} {'type': 'loss', 'content': 0.016284937039017677, 'timestamp': '2025-10-01 04:18:20.311936', 'step': 1658, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:20.360145', 'step': 1658, 'epoch': 1} {'type': 'loss', 'content': 0.011339101009070873, 'timestamp': '2025-10-01 04:18:20.367861', 'step': 1659, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:20.406392', 'step': 1659, 'epoch': 1} {'type': 'loss', 'content': 0.010276601649820805, 'timestamp': '2025-10-01 04:18:20.440885', 'step': 1660, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:20.492464', 'step': 1660, 'epoch': 1} {'type': 'loss', 'content': 0.023340856656432152, 'timestamp': '2025-10-01 04:18:20.501679', 'step': 1661, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:20.535068', 'step': 1661, 'epoch': 1} {'type': 'loss', 'content': 0.02421562373638153, 'timestamp': '2025-10-01 04:18:20.543345', 'step': 1662, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:20.593207', 'step': 1662, 'epoch': 1} {'type': 'loss', 'content': 0.021397819742560387, 'timestamp': '2025-10-01 04:18:20.604046', 'step': 1663, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:20.644428', 'step': 1663, 'epoch': 1} {'type': 'loss', 'content': 0.032195936888456345, 'timestamp': '2025-10-01 04:18:20.673779', 'step': 1664, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:20.723507', 'step': 1664, 'epoch': 1} {'type': 'loss', 'content': 0.016440225765109062, 'timestamp': '2025-10-01 04:18:20.734359', 'step': 1665, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:20.783233', 'step': 1665, 'epoch': 1} {'type': 'loss', 'content': 0.006940478924661875, 'timestamp': '2025-10-01 04:18:20.796818', 'step': 1666, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:20.846167', 'step': 1666, 'epoch': 1} {'type': 'loss', 'content': 0.020685816183686256, 'timestamp': '2025-10-01 04:18:20.860087', 'step': 1667, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:18:20.917597', 'step': 1667, 'epoch': 1} {'type': 'loss', 'content': 0.008538699708878994, 'timestamp': '2025-10-01 04:18:20.954885', 'step': 1668, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:20.999790', 'step': 1668, 'epoch': 1} {'type': 'loss', 'content': 0.013117249123752117, 'timestamp': '2025-10-01 04:18:21.012634', 'step': 1669, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:21.062484', 'step': 1669, 'epoch': 1} {'type': 'loss', 'content': 0.020457638427615166, 'timestamp': '2025-10-01 04:18:21.075234', 'step': 1670, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:21.109976', 'step': 1670, 'epoch': 1} {'type': 'loss', 'content': 0.007725948002189398, 'timestamp': '2025-10-01 04:18:21.121388', 'step': 1671, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:21.171296', 'step': 1671, 'epoch': 1} {'type': 'loss', 'content': 0.00982472114264965, 'timestamp': '2025-10-01 04:18:21.206263', 'step': 1672, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:18:21.247917', 'step': 1672, 'epoch': 1} {'type': 'loss', 'content': 0.019432347267866135, 'timestamp': '2025-10-01 04:18:21.261437', 'step': 1673, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:21.311123', 'step': 1673, 'epoch': 1} {'type': 'loss', 'content': 0.009162181057035923, 'timestamp': '2025-10-01 04:18:21.319294', 'step': 1674, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:21.355610', 'step': 1674, 'epoch': 1} {'type': 'loss', 'content': 0.012204865925014019, 'timestamp': '2025-10-01 04:18:21.368333', 'step': 1675, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:21.410239', 'step': 1675, 'epoch': 1} {'type': 'loss', 'content': 0.01912628300487995, 'timestamp': '2025-10-01 04:18:21.439444', 'step': 1676, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:21.479813', 'step': 1676, 'epoch': 1} {'type': 'loss', 'content': 0.008903730660676956, 'timestamp': '2025-10-01 04:18:21.485018', 'step': 1677, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:21.520709', 'step': 1677, 'epoch': 1} {'type': 'loss', 'content': 0.055759236216545105, 'timestamp': '2025-10-01 04:18:21.528896', 'step': 1678, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:21.564200', 'step': 1678, 'epoch': 1} {'type': 'loss', 'content': 0.018104534596204758, 'timestamp': '2025-10-01 04:18:21.574600', 'step': 1679, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:21.608565', 'step': 1679, 'epoch': 1} {'type': 'loss', 'content': 0.016116252169013023, 'timestamp': '2025-10-01 04:18:21.637456', 'step': 1680, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:21.676514', 'step': 1680, 'epoch': 1} {'type': 'loss', 'content': 0.017669932916760445, 'timestamp': '2025-10-01 04:18:21.686675', 'step': 1681, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:18:21.744329', 'step': 1681, 'epoch': 1} {'type': 'loss', 'content': 0.009513936936855316, 'timestamp': '2025-10-01 04:18:21.761663', 'step': 1682, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:18:21.812019', 'step': 1682, 'epoch': 1} {'type': 'loss', 'content': 0.015163535252213478, 'timestamp': '2025-10-01 04:18:21.828343', 'step': 1683, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:21.869121', 'step': 1683, 'epoch': 1} {'type': 'loss', 'content': 0.028424466028809547, 'timestamp': '2025-10-01 04:18:21.903619', 'step': 1684, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:21.941052', 'step': 1684, 'epoch': 1} {'type': 'loss', 'content': 0.0115863848477602, 'timestamp': '2025-10-01 04:18:21.951471', 'step': 1685, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:21.990978', 'step': 1685, 'epoch': 1} {'type': 'loss', 'content': 0.024395732209086418, 'timestamp': '2025-10-01 04:18:22.004545', 'step': 1686, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:22.050243', 'step': 1686, 'epoch': 1} {'type': 'loss', 'content': 0.027058042585849762, 'timestamp': '2025-10-01 04:18:22.063740', 'step': 1687, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:22.102270', 'step': 1687, 'epoch': 1} {'type': 'loss', 'content': 0.013326586224138737, 'timestamp': '2025-10-01 04:18:22.136708', 'step': 1688, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:22.171311', 'step': 1688, 'epoch': 1} {'type': 'loss', 'content': 0.01646624505519867, 'timestamp': '2025-10-01 04:18:22.184648', 'step': 1689, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:22.225566', 'step': 1689, 'epoch': 1} {'type': 'loss', 'content': 0.029357250779867172, 'timestamp': '2025-10-01 04:18:22.236283', 'step': 1690, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:22.276597', 'step': 1690, 'epoch': 1} {'type': 'loss', 'content': 0.014977855607867241, 'timestamp': '2025-10-01 04:18:22.290601', 'step': 1691, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:22.326372', 'step': 1691, 'epoch': 1} {'type': 'loss', 'content': 0.013059538789093494, 'timestamp': '2025-10-01 04:18:22.359926', 'step': 1692, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:22.408590', 'step': 1692, 'epoch': 1} {'type': 'loss', 'content': 0.013435641303658485, 'timestamp': '2025-10-01 04:18:22.421960', 'step': 1693, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:22.459281', 'step': 1693, 'epoch': 1} {'type': 'loss', 'content': 0.02670745924115181, 'timestamp': '2025-10-01 04:18:22.472061', 'step': 1694, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:22.508457', 'step': 1694, 'epoch': 1} {'type': 'loss', 'content': 0.031223785132169724, 'timestamp': '2025-10-01 04:18:22.516690', 'step': 1695, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:22.552974', 'step': 1695, 'epoch': 1} {'type': 'loss', 'content': 0.038206346333026886, 'timestamp': '2025-10-01 04:18:22.587495', 'step': 1696, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:18:22.633584', 'step': 1696, 'epoch': 1} {'type': 'loss', 'content': 0.030020413920283318, 'timestamp': '2025-10-01 04:18:22.648903', 'step': 1697, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:22.693495', 'step': 1697, 'epoch': 1} {'type': 'loss', 'content': 0.029286161065101624, 'timestamp': '2025-10-01 04:18:22.706074', 'step': 1698, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:22.745092', 'step': 1698, 'epoch': 1} {'type': 'loss', 'content': 0.025705287232995033, 'timestamp': '2025-10-01 04:18:22.757895', 'step': 1699, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:22.795470', 'step': 1699, 'epoch': 1} {'type': 'loss', 'content': 0.045115936547517776, 'timestamp': '2025-10-01 04:18:22.828954', 'step': 1700, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:22.862248', 'step': 1700, 'epoch': 1} {'type': 'loss', 'content': 0.019663488492369652, 'timestamp': '2025-10-01 04:18:22.871125', 'step': 1701, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:22.909284', 'step': 1701, 'epoch': 1} {'type': 'loss', 'content': 0.016010113060474396, 'timestamp': '2025-10-01 04:18:22.920935', 'step': 1702, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:22.953027', 'step': 1702, 'epoch': 1} {'type': 'loss', 'content': 0.015191620215773582, 'timestamp': '2025-10-01 04:18:22.963660', 'step': 1703, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:23.002566', 'step': 1703, 'epoch': 1} {'type': 'loss', 'content': 0.013926830142736435, 'timestamp': '2025-10-01 04:18:23.034208', 'step': 1704, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:23.071004', 'step': 1704, 'epoch': 1} {'type': 'loss', 'content': 0.020394135266542435, 'timestamp': '2025-10-01 04:18:23.080040', 'step': 1705, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:23.119621', 'step': 1705, 'epoch': 1} {'type': 'loss', 'content': 0.012026087380945683, 'timestamp': '2025-10-01 04:18:23.130966', 'step': 1706, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:23.167052', 'step': 1706, 'epoch': 1} {'type': 'loss', 'content': 0.02041672170162201, 'timestamp': '2025-10-01 04:18:23.174739', 'step': 1707, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:23.210336', 'step': 1707, 'epoch': 1} {'type': 'loss', 'content': 0.032481852918863297, 'timestamp': '2025-10-01 04:18:23.245308', 'step': 1708, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:23.285534', 'step': 1708, 'epoch': 1} {'type': 'loss', 'content': 0.019367672502994537, 'timestamp': '2025-10-01 04:18:23.298921', 'step': 1709, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:23.335420', 'step': 1709, 'epoch': 1} {'type': 'loss', 'content': 0.014696183614432812, 'timestamp': '2025-10-01 04:18:23.348213', 'step': 1710, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:23.391030', 'step': 1710, 'epoch': 1} {'type': 'loss', 'content': 0.017389249056577682, 'timestamp': '2025-10-01 04:18:23.401662', 'step': 1711, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:23.436147', 'step': 1711, 'epoch': 1} {'type': 'loss', 'content': 0.020116174593567848, 'timestamp': '2025-10-01 04:18:23.468583', 'step': 1712, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:23.504021', 'step': 1712, 'epoch': 1} {'type': 'loss', 'content': 0.022103119641542435, 'timestamp': '2025-10-01 04:18:23.517352', 'step': 1713, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:23.557432', 'step': 1713, 'epoch': 1} {'type': 'loss', 'content': 0.03199003264307976, 'timestamp': '2025-10-01 04:18:23.571437', 'step': 1714, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:18:23.614862', 'step': 1714, 'epoch': 1} {'type': 'loss', 'content': 0.020290348678827286, 'timestamp': '2025-10-01 04:18:23.631352', 'step': 1715, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:23.665863', 'step': 1715, 'epoch': 1} {'type': 'loss', 'content': 0.014079631306231022, 'timestamp': '2025-10-01 04:18:23.698092', 'step': 1716, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:23.744847', 'step': 1716, 'epoch': 1} {'type': 'loss', 'content': 0.01670772023499012, 'timestamp': '2025-10-01 04:18:23.755104', 'step': 1717, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:23.795497', 'step': 1717, 'epoch': 1} {'type': 'loss', 'content': 0.013140379451215267, 'timestamp': '2025-10-01 04:18:23.808249', 'step': 1718, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:23.840439', 'step': 1718, 'epoch': 1} {'type': 'loss', 'content': 0.020490651950240135, 'timestamp': '2025-10-01 04:18:23.851138', 'step': 1719, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:23.884305', 'step': 1719, 'epoch': 1} {'type': 'loss', 'content': 0.030255138874053955, 'timestamp': '2025-10-01 04:18:23.918030', 'step': 1720, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:23.950787', 'step': 1720, 'epoch': 1} {'type': 'loss', 'content': 0.030911199748516083, 'timestamp': '2025-10-01 04:18:23.959844', 'step': 1721, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:23.995202', 'step': 1721, 'epoch': 1} {'type': 'loss', 'content': 0.01870807446539402, 'timestamp': '2025-10-01 04:18:24.006700', 'step': 1722, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:24.039555', 'step': 1722, 'epoch': 1} {'type': 'loss', 'content': 0.00911593809723854, 'timestamp': '2025-10-01 04:18:24.052271', 'step': 1723, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:24.084088', 'step': 1723, 'epoch': 1} {'type': 'loss', 'content': 0.03782375901937485, 'timestamp': '2025-10-01 04:18:24.117545', 'step': 1724, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:24.149087', 'step': 1724, 'epoch': 1} {'type': 'loss', 'content': 0.01561465859413147, 'timestamp': '2025-10-01 04:18:24.155002', 'step': 1725, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:18:26.527144', 'step': 1725, 'epoch': 1} {'type': 'pplx', 'content': 5.601864715132421, 'timestamp': '2025-10-01 04:18:26.528808', 'step': 1725, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:26.560581', 'step': 1725, 'epoch': 1} {'type': 'loss', 'content': 0.010027130134403706, 'timestamp': '2025-10-01 04:18:26.573984', 'step': 1726, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:26.606201', 'step': 1726, 'epoch': 1} {'type': 'loss', 'content': 0.034467846155166626, 'timestamp': '2025-10-01 04:18:26.613217', 'step': 1727, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:26.647869', 'step': 1727, 'epoch': 1} {'type': 'loss', 'content': 0.007656212896108627, 'timestamp': '2025-10-01 04:18:26.682775', 'step': 1728, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:26.713890', 'step': 1728, 'epoch': 1} {'type': 'loss', 'content': 0.03145366907119751, 'timestamp': '2025-10-01 04:18:26.722024', 'step': 1729, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:26.753033', 'step': 1729, 'epoch': 1} {'type': 'loss', 'content': 0.02027307264506817, 'timestamp': '2025-10-01 04:18:26.760261', 'step': 1730, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:26.791317', 'step': 1730, 'epoch': 1} {'type': 'loss', 'content': 0.03065544366836548, 'timestamp': '2025-10-01 04:18:26.802606', 'step': 1731, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:26.833582', 'step': 1731, 'epoch': 1} {'type': 'loss', 'content': 0.022751335054636, 'timestamp': '2025-10-01 04:18:26.865192', 'step': 1732, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:26.896603', 'step': 1732, 'epoch': 1} {'type': 'loss', 'content': 0.026806462556123734, 'timestamp': '2025-10-01 04:18:26.902420', 'step': 1733, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:26.933900', 'step': 1733, 'epoch': 1} {'type': 'loss', 'content': 0.020350703969597816, 'timestamp': '2025-10-01 04:18:26.946429', 'step': 1734, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:26.977571', 'step': 1734, 'epoch': 1} {'type': 'loss', 'content': 0.010938882827758789, 'timestamp': '2025-10-01 04:18:26.990048', 'step': 1735, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:27.026799', 'step': 1735, 'epoch': 1} {'type': 'loss', 'content': 0.02501179836690426, 'timestamp': '2025-10-01 04:18:27.061721', 'step': 1736, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:27.096207', 'step': 1736, 'epoch': 1} {'type': 'loss', 'content': 0.03313678875565529, 'timestamp': '2025-10-01 04:18:27.109532', 'step': 1737, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:27.139380', 'step': 1737, 'epoch': 1} {'type': 'loss', 'content': 0.008483263663947582, 'timestamp': '2025-10-01 04:18:27.146861', 'step': 1738, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:18:27.176866', 'step': 1738, 'epoch': 1} {'type': 'loss', 'content': 0.025176139548420906, 'timestamp': '2025-10-01 04:18:27.181466', 'step': 1739, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:18:27.213789', 'step': 1739, 'epoch': 1} {'type': 'loss', 'content': 0.010088310576975346, 'timestamp': '2025-10-01 04:18:27.239124', 'step': 1740, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:27.270570', 'step': 1740, 'epoch': 1} {'type': 'loss', 'content': 0.01300918497145176, 'timestamp': '2025-10-01 04:18:27.280339', 'step': 1741, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:27.312146', 'step': 1741, 'epoch': 1} {'type': 'loss', 'content': 0.010754897259175777, 'timestamp': '2025-10-01 04:18:27.322761', 'step': 1742, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:27.354604', 'step': 1742, 'epoch': 1} {'type': 'loss', 'content': 0.00481828860938549, 'timestamp': '2025-10-01 04:18:27.364607', 'step': 1743, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:27.397285', 'step': 1743, 'epoch': 1} {'type': 'loss', 'content': 0.02174891158938408, 'timestamp': '2025-10-01 04:18:27.426020', 'step': 1744, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:27.457438', 'step': 1744, 'epoch': 1} {'type': 'loss', 'content': 0.010435568168759346, 'timestamp': '2025-10-01 04:18:27.463036', 'step': 1745, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:27.493317', 'step': 1745, 'epoch': 1} {'type': 'loss', 'content': 0.017720995470881462, 'timestamp': '2025-10-01 04:18:27.501526', 'step': 1746, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:27.532812', 'step': 1746, 'epoch': 1} {'type': 'loss', 'content': 0.006690440699458122, 'timestamp': '2025-10-01 04:18:27.545411', 'step': 1747, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:27.581867', 'step': 1747, 'epoch': 1} {'type': 'loss', 'content': 0.014519999735057354, 'timestamp': '2025-10-01 04:18:27.615550', 'step': 1748, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:27.647223', 'step': 1748, 'epoch': 1} {'type': 'loss', 'content': 0.0317423976957798, 'timestamp': '2025-10-01 04:18:27.655400', 'step': 1749, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:27.689381', 'step': 1749, 'epoch': 1} {'type': 'loss', 'content': 0.017313024029135704, 'timestamp': '2025-10-01 04:18:27.702977', 'step': 1750, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:27.733365', 'step': 1750, 'epoch': 1} {'type': 'loss', 'content': 0.024189934134483337, 'timestamp': '2025-10-01 04:18:27.741627', 'step': 1751, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:27.772304', 'step': 1751, 'epoch': 1} {'type': 'loss', 'content': 0.01774529553949833, 'timestamp': '2025-10-01 04:18:27.803624', 'step': 1752, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:27.834732', 'step': 1752, 'epoch': 1} {'type': 'loss', 'content': 0.014753354713320732, 'timestamp': '2025-10-01 04:18:27.840049', 'step': 1753, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:27.874820', 'step': 1753, 'epoch': 1} {'type': 'loss', 'content': 0.024173537269234657, 'timestamp': '2025-10-01 04:18:27.882746', 'step': 1754, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:27.917476', 'step': 1754, 'epoch': 1} {'type': 'loss', 'content': 0.017047399654984474, 'timestamp': '2025-10-01 04:18:27.931051', 'step': 1755, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:27.962997', 'step': 1755, 'epoch': 1} {'type': 'loss', 'content': 0.015936287119984627, 'timestamp': '2025-10-01 04:18:27.995819', 'step': 1756, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:28.026203', 'step': 1756, 'epoch': 1} {'type': 'loss', 'content': 0.028364410623908043, 'timestamp': '2025-10-01 04:18:28.032040', 'step': 1757, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:28.065851', 'step': 1757, 'epoch': 1} {'type': 'loss', 'content': 0.01169276237487793, 'timestamp': '2025-10-01 04:18:28.079317', 'step': 1758, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-10-01 04:18:28.122083', 'step': 1758, 'epoch': 1} {'type': 'loss', 'content': 0.009476525709033012, 'timestamp': '2025-10-01 04:18:28.139847', 'step': 1759, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:28.174785', 'step': 1759, 'epoch': 1} {'type': 'loss', 'content': 0.008177477866411209, 'timestamp': '2025-10-01 04:18:28.209708', 'step': 1760, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:18:28.245544', 'step': 1760, 'epoch': 1} {'type': 'loss', 'content': 0.013267424888908863, 'timestamp': '2025-10-01 04:18:28.260840', 'step': 1761, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:28.291848', 'step': 1761, 'epoch': 1} {'type': 'loss', 'content': 0.03363882750272751, 'timestamp': '2025-10-01 04:18:28.304611', 'step': 1762, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:28.336439', 'step': 1762, 'epoch': 1} {'type': 'loss', 'content': 0.008160397410392761, 'timestamp': '2025-10-01 04:18:28.349235', 'step': 1763, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:28.384665', 'step': 1763, 'epoch': 1} {'type': 'loss', 'content': 0.01128003653138876, 'timestamp': '2025-10-01 04:18:28.419580', 'step': 1764, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:28.450100', 'step': 1764, 'epoch': 1} {'type': 'loss', 'content': 0.01587626338005066, 'timestamp': '2025-10-01 04:18:28.461091', 'step': 1765, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:28.512578', 'step': 1765, 'epoch': 1} {'type': 'loss', 'content': 0.019184308126568794, 'timestamp': '2025-10-01 04:18:28.525136', 'step': 1766, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:28.559755', 'step': 1766, 'epoch': 1} {'type': 'loss', 'content': 0.014177127741277218, 'timestamp': '2025-10-01 04:18:28.573739', 'step': 1767, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:28.605542', 'step': 1767, 'epoch': 1} {'type': 'loss', 'content': 0.010413894429802895, 'timestamp': '2025-10-01 04:18:28.637233', 'step': 1768, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:28.667880', 'step': 1768, 'epoch': 1} {'type': 'loss', 'content': 0.019515879452228546, 'timestamp': '2025-10-01 04:18:28.677973', 'step': 1769, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:28.712645', 'step': 1769, 'epoch': 1} {'type': 'loss', 'content': 0.012564033269882202, 'timestamp': '2025-10-01 04:18:28.726610', 'step': 1770, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:28.760369', 'step': 1770, 'epoch': 1} {'type': 'loss', 'content': 0.015752892941236496, 'timestamp': '2025-10-01 04:18:28.773125', 'step': 1771, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:28.805183', 'step': 1771, 'epoch': 1} {'type': 'loss', 'content': 0.01215983647853136, 'timestamp': '2025-10-01 04:18:28.837066', 'step': 1772, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:28.867985', 'step': 1772, 'epoch': 1} {'type': 'loss', 'content': 0.02222033590078354, 'timestamp': '2025-10-01 04:18:28.876753', 'step': 1773, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:28.907502', 'step': 1773, 'epoch': 1} {'type': 'loss', 'content': 0.011051365174353123, 'timestamp': '2025-10-01 04:18:28.919091', 'step': 1774, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:28.949940', 'step': 1774, 'epoch': 1} {'type': 'loss', 'content': 0.021608304232358932, 'timestamp': '2025-10-01 04:18:28.958214', 'step': 1775, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:28.993872', 'step': 1775, 'epoch': 1} {'type': 'loss', 'content': 0.005096945911645889, 'timestamp': '2025-10-01 04:18:29.028842', 'step': 1776, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:18:29.074482', 'step': 1776, 'epoch': 1} {'type': 'loss', 'content': 0.007277989760041237, 'timestamp': '2025-10-01 04:18:29.089965', 'step': 1777, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:29.127815', 'step': 1777, 'epoch': 1} {'type': 'loss', 'content': 0.006591422483325005, 'timestamp': '2025-10-01 04:18:29.141845', 'step': 1778, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:29.179071', 'step': 1778, 'epoch': 1} {'type': 'loss', 'content': 0.006580004468560219, 'timestamp': '2025-10-01 04:18:29.193060', 'step': 1779, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:29.225135', 'step': 1779, 'epoch': 1} {'type': 'loss', 'content': 0.011600742116570473, 'timestamp': '2025-10-01 04:18:29.258734', 'step': 1780, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:29.290722', 'step': 1780, 'epoch': 1} {'type': 'loss', 'content': 0.019471243023872375, 'timestamp': '2025-10-01 04:18:29.301351', 'step': 1781, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:29.334284', 'step': 1781, 'epoch': 1} {'type': 'loss', 'content': 0.006876435596495867, 'timestamp': '2025-10-01 04:18:29.347016', 'step': 1782, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:18:29.386998', 'step': 1782, 'epoch': 1} {'type': 'loss', 'content': 0.0062797786667943, 'timestamp': '2025-10-01 04:18:29.403052', 'step': 1783, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:29.439482', 'step': 1783, 'epoch': 1} {'type': 'loss', 'content': 0.016312038525938988, 'timestamp': '2025-10-01 04:18:29.468381', 'step': 1784, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:18:29.506037', 'step': 1784, 'epoch': 1} {'type': 'loss', 'content': 0.0161957498639822, 'timestamp': '2025-10-01 04:18:29.521830', 'step': 1785, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:29.554026', 'step': 1785, 'epoch': 1} {'type': 'loss', 'content': 0.03529829531908035, 'timestamp': '2025-10-01 04:18:29.565556', 'step': 1786, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:29.596839', 'step': 1786, 'epoch': 1} {'type': 'loss', 'content': 0.013611998409032822, 'timestamp': '2025-10-01 04:18:29.608402', 'step': 1787, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:29.639320', 'step': 1787, 'epoch': 1} {'type': 'loss', 'content': 0.009845256805419922, 'timestamp': '2025-10-01 04:18:29.670969', 'step': 1788, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:18:29.707953', 'step': 1788, 'epoch': 1} {'type': 'loss', 'content': 0.008966210298240185, 'timestamp': '2025-10-01 04:18:29.723805', 'step': 1789, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:29.759049', 'step': 1789, 'epoch': 1} {'type': 'loss', 'content': 0.013007842935621738, 'timestamp': '2025-10-01 04:18:29.772989', 'step': 1790, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:29.809432', 'step': 1790, 'epoch': 1} {'type': 'loss', 'content': 0.007810547947883606, 'timestamp': '2025-10-01 04:18:29.823456', 'step': 1791, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:18:29.869650', 'step': 1791, 'epoch': 1} {'type': 'loss', 'content': 0.008561071008443832, 'timestamp': '2025-10-01 04:18:29.907990', 'step': 1792, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:29.938959', 'step': 1792, 'epoch': 1} {'type': 'loss', 'content': 0.01668546348810196, 'timestamp': '2025-10-01 04:18:29.948100', 'step': 1793, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:29.980448', 'step': 1793, 'epoch': 1} {'type': 'loss', 'content': 0.019076699391007423, 'timestamp': '2025-10-01 04:18:29.992878', 'step': 1794, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:30.028805', 'step': 1794, 'epoch': 1} {'type': 'loss', 'content': 0.01214452926069498, 'timestamp': '2025-10-01 04:18:30.042760', 'step': 1795, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:18:30.078992', 'step': 1795, 'epoch': 1} {'type': 'loss', 'content': 0.005245069041848183, 'timestamp': '2025-10-01 04:18:30.114196', 'step': 1796, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:30.145451', 'step': 1796, 'epoch': 1} {'type': 'loss', 'content': 0.024406544864177704, 'timestamp': '2025-10-01 04:18:30.154293', 'step': 1797, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:30.188486', 'step': 1797, 'epoch': 1} {'type': 'loss', 'content': 0.008885449729859829, 'timestamp': '2025-10-01 04:18:30.202066', 'step': 1798, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:30.236584', 'step': 1798, 'epoch': 1} {'type': 'loss', 'content': 0.018838804215192795, 'timestamp': '2025-10-01 04:18:30.250627', 'step': 1799, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:30.283412', 'step': 1799, 'epoch': 1} {'type': 'loss', 'content': 0.024760624393820763, 'timestamp': '2025-10-01 04:18:30.316850', 'step': 1800, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:30.347909', 'step': 1800, 'epoch': 1} {'type': 'loss', 'content': 0.010942873544991016, 'timestamp': '2025-10-01 04:18:30.358383', 'step': 1801, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:30.390520', 'step': 1801, 'epoch': 1} {'type': 'loss', 'content': 0.021728897467255592, 'timestamp': '2025-10-01 04:18:30.401981', 'step': 1802, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:30.433261', 'step': 1802, 'epoch': 1} {'type': 'loss', 'content': 0.023533660918474197, 'timestamp': '2025-10-01 04:18:30.444003', 'step': 1803, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:30.475237', 'step': 1803, 'epoch': 1} {'type': 'loss', 'content': 0.02223568968474865, 'timestamp': '2025-10-01 04:18:30.507507', 'step': 1804, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:30.538200', 'step': 1804, 'epoch': 1} {'type': 'loss', 'content': 0.010113216936588287, 'timestamp': '2025-10-01 04:18:30.544229', 'step': 1805, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:30.575203', 'step': 1805, 'epoch': 1} {'type': 'loss', 'content': 0.011264489963650703, 'timestamp': '2025-10-01 04:18:30.582148', 'step': 1806, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:30.616163', 'step': 1806, 'epoch': 1} {'type': 'loss', 'content': 0.008940476924180984, 'timestamp': '2025-10-01 04:18:30.629678', 'step': 1807, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:30.661255', 'step': 1807, 'epoch': 1} {'type': 'loss', 'content': 0.02408371865749359, 'timestamp': '2025-10-01 04:18:30.694947', 'step': 1808, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:30.726530', 'step': 1808, 'epoch': 1} {'type': 'loss', 'content': 0.018335795029997826, 'timestamp': '2025-10-01 04:18:30.736691', 'step': 1809, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:30.772893', 'step': 1809, 'epoch': 1} {'type': 'loss', 'content': 0.02496935799717903, 'timestamp': '2025-10-01 04:18:30.785668', 'step': 1810, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:30.820541', 'step': 1810, 'epoch': 1} {'type': 'loss', 'content': 0.03186821937561035, 'timestamp': '2025-10-01 04:18:30.834126', 'step': 1811, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:30.865958', 'step': 1811, 'epoch': 1} {'type': 'loss', 'content': 0.029807399958372116, 'timestamp': '2025-10-01 04:18:30.899414', 'step': 1812, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:18:30.933749', 'step': 1812, 'epoch': 1} {'type': 'loss', 'content': 0.01161856297403574, 'timestamp': '2025-10-01 04:18:30.947261', 'step': 1813, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:30.979101', 'step': 1813, 'epoch': 1} {'type': 'loss', 'content': 0.00798034481704235, 'timestamp': '2025-10-01 04:18:30.991397', 'step': 1814, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 17560600598464}, 'timestamp': '2025-10-01 04:18:31.042654', 'step': 1814, 'epoch': 1} {'type': 'loss', 'content': 0.0058685410767793655, 'timestamp': '2025-10-01 04:18:31.063827', 'step': 1815, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:31.098820', 'step': 1815, 'epoch': 1} {'type': 'loss', 'content': 0.012516863644123077, 'timestamp': '2025-10-01 04:18:31.133245', 'step': 1816, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:31.165062', 'step': 1816, 'epoch': 1} {'type': 'loss', 'content': 0.010818073526024818, 'timestamp': '2025-10-01 04:18:31.170061', 'step': 1817, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:31.201974', 'step': 1817, 'epoch': 1} {'type': 'loss', 'content': 0.0059486934915184975, 'timestamp': '2025-10-01 04:18:31.214489', 'step': 1818, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:31.248824', 'step': 1818, 'epoch': 1} {'type': 'loss', 'content': 0.007898926734924316, 'timestamp': '2025-10-01 04:18:31.262397', 'step': 1819, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:31.293886', 'step': 1819, 'epoch': 1} {'type': 'loss', 'content': 0.009081286378204823, 'timestamp': '2025-10-01 04:18:31.327671', 'step': 1820, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:31.361219', 'step': 1820, 'epoch': 1} {'type': 'loss', 'content': 0.02388526313006878, 'timestamp': '2025-10-01 04:18:31.374032', 'step': 1821, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:31.404308', 'step': 1821, 'epoch': 1} {'type': 'loss', 'content': 0.033021844923496246, 'timestamp': '2025-10-01 04:18:31.415765', 'step': 1822, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:31.447093', 'step': 1822, 'epoch': 1} {'type': 'loss', 'content': 0.024497490376234055, 'timestamp': '2025-10-01 04:18:31.459639', 'step': 1823, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:31.500508', 'step': 1823, 'epoch': 1} {'type': 'loss', 'content': 0.02912803180515766, 'timestamp': '2025-10-01 04:18:31.535501', 'step': 1824, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:31.569441', 'step': 1824, 'epoch': 1} {'type': 'loss', 'content': 0.018693383783102036, 'timestamp': '2025-10-01 04:18:31.582274', 'step': 1825, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:31.614614', 'step': 1825, 'epoch': 1} {'type': 'loss', 'content': 0.01734699122607708, 'timestamp': '2025-10-01 04:18:31.627362', 'step': 1826, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:31.663926', 'step': 1826, 'epoch': 1} {'type': 'loss', 'content': 0.030496900901198387, 'timestamp': '2025-10-01 04:18:31.677943', 'step': 1827, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:31.708962', 'step': 1827, 'epoch': 1} {'type': 'loss', 'content': 0.02643936313688755, 'timestamp': '2025-10-01 04:18:31.742440', 'step': 1828, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:31.773627', 'step': 1828, 'epoch': 1} {'type': 'loss', 'content': 0.03282654285430908, 'timestamp': '2025-10-01 04:18:31.783507', 'step': 1829, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:31.813996', 'step': 1829, 'epoch': 1} {'type': 'loss', 'content': 0.030992023646831512, 'timestamp': '2025-10-01 04:18:31.825587', 'step': 1830, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:31.856781', 'step': 1830, 'epoch': 1} {'type': 'loss', 'content': 0.0277473833411932, 'timestamp': '2025-10-01 04:18:31.867437', 'step': 1831, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:18:31.902859', 'step': 1831, 'epoch': 1} {'type': 'loss', 'content': 0.010192503221333027, 'timestamp': '2025-10-01 04:18:31.938001', 'step': 1832, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:31.971809', 'step': 1832, 'epoch': 1} {'type': 'loss', 'content': 0.027514657005667686, 'timestamp': '2025-10-01 04:18:31.985213', 'step': 1833, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:18:32.029169', 'step': 1833, 'epoch': 1} {'type': 'loss', 'content': 0.013618717901408672, 'timestamp': '2025-10-01 04:18:32.047119', 'step': 1834, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:18:32.087798', 'step': 1834, 'epoch': 1} {'type': 'loss', 'content': 0.021984675899147987, 'timestamp': '2025-10-01 04:18:32.103848', 'step': 1835, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:32.138380', 'step': 1835, 'epoch': 1} {'type': 'loss', 'content': 0.00422345707193017, 'timestamp': '2025-10-01 04:18:32.172938', 'step': 1836, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:32.205038', 'step': 1836, 'epoch': 1} {'type': 'loss', 'content': 0.010084973648190498, 'timestamp': '2025-10-01 04:18:32.215384', 'step': 1837, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:32.249514', 'step': 1837, 'epoch': 1} {'type': 'loss', 'content': 0.01134247425943613, 'timestamp': '2025-10-01 04:18:32.263075', 'step': 1838, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:32.300518', 'step': 1838, 'epoch': 1} {'type': 'loss', 'content': 0.012564203701913357, 'timestamp': '2025-10-01 04:18:32.314060', 'step': 1839, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:32.349787', 'step': 1839, 'epoch': 1} {'type': 'loss', 'content': 0.012390137650072575, 'timestamp': '2025-10-01 04:18:32.384826', 'step': 1840, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:18:34.749627', 'step': 1840, 'epoch': 1} {'type': 'pplx', 'content': 5.568335479998755, 'timestamp': '2025-10-01 04:18:34.751817', 'step': 1840, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:34.783185', 'step': 1840, 'epoch': 1} {'type': 'loss', 'content': 0.005450346972793341, 'timestamp': '2025-10-01 04:18:34.795030', 'step': 1841, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:34.832925', 'step': 1841, 'epoch': 1} {'type': 'loss', 'content': 0.015193846076726913, 'timestamp': '2025-10-01 04:18:34.845544', 'step': 1842, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:34.880412', 'step': 1842, 'epoch': 1} {'type': 'loss', 'content': 0.01000229176133871, 'timestamp': '2025-10-01 04:18:34.894298', 'step': 1843, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:34.925461', 'step': 1843, 'epoch': 1} {'type': 'loss', 'content': 0.018862273544073105, 'timestamp': '2025-10-01 04:18:34.954008', 'step': 1844, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:34.986137', 'step': 1844, 'epoch': 2} {'type': 'loss', 'content': 0.03820132464170456, 'timestamp': '2025-10-01 04:18:34.991292', 'step': 1845, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:35.025459', 'step': 1845, 'epoch': 2} {'type': 'loss', 'content': 0.007865171879529953, 'timestamp': '2025-10-01 04:18:35.038980', 'step': 1846, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:35.069967', 'step': 1846, 'epoch': 2} {'type': 'loss', 'content': 0.013688954524695873, 'timestamp': '2025-10-01 04:18:35.078027', 'step': 1847, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:35.109982', 'step': 1847, 'epoch': 2} {'type': 'loss', 'content': 0.014721650630235672, 'timestamp': '2025-10-01 04:18:35.142340', 'step': 1848, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:35.173824', 'step': 1848, 'epoch': 2} {'type': 'loss', 'content': 0.011716598644852638, 'timestamp': '2025-10-01 04:18:35.183916', 'step': 1849, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:35.215320', 'step': 1849, 'epoch': 2} {'type': 'loss', 'content': 0.011819160543382168, 'timestamp': '2025-10-01 04:18:35.223227', 'step': 1850, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:35.258010', 'step': 1850, 'epoch': 2} {'type': 'loss', 'content': 0.014694464392960072, 'timestamp': '2025-10-01 04:18:35.271595', 'step': 1851, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:35.302747', 'step': 1851, 'epoch': 2} {'type': 'loss', 'content': 0.010577446781098843, 'timestamp': '2025-10-01 04:18:35.336391', 'step': 1852, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:18:35.375813', 'step': 1852, 'epoch': 2} {'type': 'loss', 'content': 0.011161639355123043, 'timestamp': '2025-10-01 04:18:35.392598', 'step': 1853, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:35.423706', 'step': 1853, 'epoch': 2} {'type': 'loss', 'content': 0.010226949118077755, 'timestamp': '2025-10-01 04:18:35.435192', 'step': 1854, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:35.469847', 'step': 1854, 'epoch': 2} {'type': 'loss', 'content': 0.005998927168548107, 'timestamp': '2025-10-01 04:18:35.483429', 'step': 1855, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:35.518572', 'step': 1855, 'epoch': 2} {'type': 'loss', 'content': 0.011409330181777477, 'timestamp': '2025-10-01 04:18:35.553107', 'step': 1856, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:35.584289', 'step': 1856, 'epoch': 2} {'type': 'loss', 'content': 0.013351772911846638, 'timestamp': '2025-10-01 04:18:35.589727', 'step': 1857, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:35.620414', 'step': 1857, 'epoch': 2} {'type': 'loss', 'content': 0.013659109361469746, 'timestamp': '2025-10-01 04:18:35.628519', 'step': 1858, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:35.659680', 'step': 1858, 'epoch': 2} {'type': 'loss', 'content': 0.01279975101351738, 'timestamp': '2025-10-01 04:18:35.671169', 'step': 1859, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:35.705695', 'step': 1859, 'epoch': 2} {'type': 'loss', 'content': 0.013204109854996204, 'timestamp': '2025-10-01 04:18:35.740176', 'step': 1860, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:35.770396', 'step': 1860, 'epoch': 2} {'type': 'loss', 'content': 0.018839897587895393, 'timestamp': '2025-10-01 04:18:35.778849', 'step': 1861, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:35.811200', 'step': 1861, 'epoch': 2} {'type': 'loss', 'content': 0.016438620164990425, 'timestamp': '2025-10-01 04:18:35.823967', 'step': 1862, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:35.857534', 'step': 1862, 'epoch': 2} {'type': 'loss', 'content': 0.018327197059988976, 'timestamp': '2025-10-01 04:18:35.865014', 'step': 1863, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:35.895999', 'step': 1863, 'epoch': 2} {'type': 'loss', 'content': 0.02078166976571083, 'timestamp': '2025-10-01 04:18:35.925142', 'step': 1864, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:35.956204', 'step': 1864, 'epoch': 2} {'type': 'loss', 'content': 0.0168403722345829, 'timestamp': '2025-10-01 04:18:35.962101', 'step': 1865, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:35.995381', 'step': 1865, 'epoch': 2} {'type': 'loss', 'content': 0.006144505459815264, 'timestamp': '2025-10-01 04:18:36.003754', 'step': 1866, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:36.034500', 'step': 1866, 'epoch': 2} {'type': 'loss', 'content': 0.013867932371795177, 'timestamp': '2025-10-01 04:18:36.041931', 'step': 1867, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:36.074231', 'step': 1867, 'epoch': 2} {'type': 'loss', 'content': 0.014335421845316887, 'timestamp': '2025-10-01 04:18:36.102550', 'step': 1868, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:36.135612', 'step': 1868, 'epoch': 2} {'type': 'loss', 'content': 0.007680200506001711, 'timestamp': '2025-10-01 04:18:36.148919', 'step': 1869, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:36.183268', 'step': 1869, 'epoch': 2} {'type': 'loss', 'content': 0.022508980706334114, 'timestamp': '2025-10-01 04:18:36.190587', 'step': 1870, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:36.226702', 'step': 1870, 'epoch': 2} {'type': 'loss', 'content': 0.011306891217827797, 'timestamp': '2025-10-01 04:18:36.240597', 'step': 1871, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:36.273586', 'step': 1871, 'epoch': 2} {'type': 'loss', 'content': 0.017875203862786293, 'timestamp': '2025-10-01 04:18:36.307283', 'step': 1872, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:36.339096', 'step': 1872, 'epoch': 2} {'type': 'loss', 'content': 0.00794854387640953, 'timestamp': '2025-10-01 04:18:36.350120', 'step': 1873, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:36.384273', 'step': 1873, 'epoch': 2} {'type': 'loss', 'content': 0.009827898815274239, 'timestamp': '2025-10-01 04:18:36.397816', 'step': 1874, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:36.429253', 'step': 1874, 'epoch': 2} {'type': 'loss', 'content': 0.010488813742995262, 'timestamp': '2025-10-01 04:18:36.440520', 'step': 1875, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:36.471550', 'step': 1875, 'epoch': 2} {'type': 'loss', 'content': 0.008571452461183071, 'timestamp': '2025-10-01 04:18:36.500724', 'step': 1876, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:36.533876', 'step': 1876, 'epoch': 2} {'type': 'loss', 'content': 0.008217232301831245, 'timestamp': '2025-10-01 04:18:36.547213', 'step': 1877, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:36.578909', 'step': 1877, 'epoch': 2} {'type': 'loss', 'content': 0.018934834748506546, 'timestamp': '2025-10-01 04:18:36.590361', 'step': 1878, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:36.626783', 'step': 1878, 'epoch': 2} {'type': 'loss', 'content': 0.015348241664469242, 'timestamp': '2025-10-01 04:18:36.640760', 'step': 1879, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:36.694350', 'step': 1879, 'epoch': 2} {'type': 'loss', 'content': 0.017238156870007515, 'timestamp': '2025-10-01 04:18:36.729264', 'step': 1880, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:36.774692', 'step': 1880, 'epoch': 2} {'type': 'loss', 'content': 0.0052963364869356155, 'timestamp': '2025-10-01 04:18:36.780592', 'step': 1881, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:36.842825', 'step': 1881, 'epoch': 2} {'type': 'loss', 'content': 0.009239593520760536, 'timestamp': '2025-10-01 04:18:36.853831', 'step': 1882, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:36.899838', 'step': 1882, 'epoch': 2} {'type': 'loss', 'content': 0.020877879112958908, 'timestamp': '2025-10-01 04:18:36.907173', 'step': 1883, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:36.963770', 'step': 1883, 'epoch': 2} {'type': 'loss', 'content': 0.01545753888785839, 'timestamp': '2025-10-01 04:18:36.998465', 'step': 1884, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:37.047790', 'step': 1884, 'epoch': 2} {'type': 'loss', 'content': 0.017146632075309753, 'timestamp': '2025-10-01 04:18:37.053631', 'step': 1885, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:37.097321', 'step': 1885, 'epoch': 2} {'type': 'loss', 'content': 0.01256302185356617, 'timestamp': '2025-10-01 04:18:37.107887', 'step': 1886, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:18:37.158951', 'step': 1886, 'epoch': 2} {'type': 'loss', 'content': 0.010217920877039433, 'timestamp': '2025-10-01 04:18:37.176835', 'step': 1887, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:37.227355', 'step': 1887, 'epoch': 2} {'type': 'loss', 'content': 0.00902167335152626, 'timestamp': '2025-10-01 04:18:37.262265', 'step': 1888, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:18:37.313137', 'step': 1888, 'epoch': 2} {'type': 'loss', 'content': 0.007561711594462395, 'timestamp': '2025-10-01 04:18:37.328766', 'step': 1889, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:37.367641', 'step': 1889, 'epoch': 2} {'type': 'loss', 'content': 0.01449101883918047, 'timestamp': '2025-10-01 04:18:37.380230', 'step': 1890, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:37.452061', 'step': 1890, 'epoch': 2} {'type': 'loss', 'content': 0.010136093012988567, 'timestamp': '2025-10-01 04:18:37.460214', 'step': 1891, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:37.512046', 'step': 1891, 'epoch': 2} {'type': 'loss', 'content': 0.013669551350176334, 'timestamp': '2025-10-01 04:18:37.544480', 'step': 1892, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:37.596975', 'step': 1892, 'epoch': 2} {'type': 'loss', 'content': 0.01569577306509018, 'timestamp': '2025-10-01 04:18:37.603811', 'step': 1893, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:37.654921', 'step': 1893, 'epoch': 2} {'type': 'loss', 'content': 0.02122773602604866, 'timestamp': '2025-10-01 04:18:37.666352', 'step': 1894, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:37.731457', 'step': 1894, 'epoch': 2} {'type': 'loss', 'content': 0.013736285269260406, 'timestamp': '2025-10-01 04:18:37.739294', 'step': 1895, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:37.804420', 'step': 1895, 'epoch': 2} {'type': 'loss', 'content': 0.009090851061046124, 'timestamp': '2025-10-01 04:18:37.838903', 'step': 1896, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:37.890529', 'step': 1896, 'epoch': 2} {'type': 'loss', 'content': 0.008909045718610287, 'timestamp': '2025-10-01 04:18:37.903830', 'step': 1897, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:37.956083', 'step': 1897, 'epoch': 2} {'type': 'loss', 'content': 0.010444003157317638, 'timestamp': '2025-10-01 04:18:37.967683', 'step': 1898, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:38.017806', 'step': 1898, 'epoch': 2} {'type': 'loss', 'content': 0.010843073949217796, 'timestamp': '2025-10-01 04:18:38.029236', 'step': 1899, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:38.090144', 'step': 1899, 'epoch': 2} {'type': 'loss', 'content': 0.012226540595293045, 'timestamp': '2025-10-01 04:18:38.123610', 'step': 1900, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:38.195118', 'step': 1900, 'epoch': 2} {'type': 'loss', 'content': 0.016123108565807343, 'timestamp': '2025-10-01 04:18:38.205641', 'step': 1901, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:38.275370', 'step': 1901, 'epoch': 2} {'type': 'loss', 'content': 0.010011312551796436, 'timestamp': '2025-10-01 04:18:38.286850', 'step': 1902, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:38.353873', 'step': 1902, 'epoch': 2} {'type': 'loss', 'content': 0.009070523083209991, 'timestamp': '2025-10-01 04:18:38.361286', 'step': 1903, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:38.410915', 'step': 1903, 'epoch': 2} {'type': 'loss', 'content': 0.021322788670659065, 'timestamp': '2025-10-01 04:18:38.440308', 'step': 1904, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:38.487614', 'step': 1904, 'epoch': 2} {'type': 'loss', 'content': 0.00696581369265914, 'timestamp': '2025-10-01 04:18:38.492510', 'step': 1905, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:38.535916', 'step': 1905, 'epoch': 2} {'type': 'loss', 'content': 0.011080571450293064, 'timestamp': '2025-10-01 04:18:38.543662', 'step': 1906, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:38.593038', 'step': 1906, 'epoch': 2} {'type': 'loss', 'content': 0.010627690702676773, 'timestamp': '2025-10-01 04:18:38.605513', 'step': 1907, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:38.647319', 'step': 1907, 'epoch': 2} {'type': 'loss', 'content': 0.0067788828164339066, 'timestamp': '2025-10-01 04:18:38.682387', 'step': 1908, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:18:38.732199', 'step': 1908, 'epoch': 2} {'type': 'loss', 'content': 0.007359156385064125, 'timestamp': '2025-10-01 04:18:38.745753', 'step': 1909, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:38.791268', 'step': 1909, 'epoch': 2} {'type': 'loss', 'content': 0.012397399172186852, 'timestamp': '2025-10-01 04:18:38.798523', 'step': 1910, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:38.834396', 'step': 1910, 'epoch': 2} {'type': 'loss', 'content': 0.012921800836920738, 'timestamp': '2025-10-01 04:18:38.846935', 'step': 1911, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:38.887210', 'step': 1911, 'epoch': 2} {'type': 'loss', 'content': 0.008307450450956821, 'timestamp': '2025-10-01 04:18:38.920656', 'step': 1912, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:38.964003', 'step': 1912, 'epoch': 2} {'type': 'loss', 'content': 0.010192888788878918, 'timestamp': '2025-10-01 04:18:38.973286', 'step': 1913, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:39.038175', 'step': 1913, 'epoch': 2} {'type': 'loss', 'content': 0.011736606247723103, 'timestamp': '2025-10-01 04:18:39.052163', 'step': 1914, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:39.089452', 'step': 1914, 'epoch': 2} {'type': 'loss', 'content': 0.012741741724312305, 'timestamp': '2025-10-01 04:18:39.097260', 'step': 1915, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:39.137095', 'step': 1915, 'epoch': 2} {'type': 'loss', 'content': 0.007002673111855984, 'timestamp': '2025-10-01 04:18:39.170551', 'step': 1916, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:39.210716', 'step': 1916, 'epoch': 2} {'type': 'loss', 'content': 0.013662534765899181, 'timestamp': '2025-10-01 04:18:39.219233', 'step': 1917, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:39.254598', 'step': 1917, 'epoch': 2} {'type': 'loss', 'content': 0.008733103983104229, 'timestamp': '2025-10-01 04:18:39.267396', 'step': 1918, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:39.312932', 'step': 1918, 'epoch': 2} {'type': 'loss', 'content': 0.009519064798951149, 'timestamp': '2025-10-01 04:18:39.324521', 'step': 1919, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:39.372450', 'step': 1919, 'epoch': 2} {'type': 'loss', 'content': 0.011130514554679394, 'timestamp': '2025-10-01 04:18:39.401459', 'step': 1920, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:39.444301', 'step': 1920, 'epoch': 2} {'type': 'loss', 'content': 0.011831437237560749, 'timestamp': '2025-10-01 04:18:39.454645', 'step': 1921, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:39.499128', 'step': 1921, 'epoch': 2} {'type': 'loss', 'content': 0.013504189439117908, 'timestamp': '2025-10-01 04:18:39.511817', 'step': 1922, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:39.555447', 'step': 1922, 'epoch': 2} {'type': 'loss', 'content': 0.012575997970998287, 'timestamp': '2025-10-01 04:18:39.567120', 'step': 1923, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:39.618151', 'step': 1923, 'epoch': 2} {'type': 'loss', 'content': 0.015461028553545475, 'timestamp': '2025-10-01 04:18:39.646610', 'step': 1924, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:39.697378', 'step': 1924, 'epoch': 2} {'type': 'loss', 'content': 0.014925253577530384, 'timestamp': '2025-10-01 04:18:39.710359', 'step': 1925, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:39.753569', 'step': 1925, 'epoch': 2} {'type': 'loss', 'content': 0.00820508599281311, 'timestamp': '2025-10-01 04:18:39.761794', 'step': 1926, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:39.798541', 'step': 1926, 'epoch': 2} {'type': 'loss', 'content': 0.008297908119857311, 'timestamp': '2025-10-01 04:18:39.812096', 'step': 1927, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:18:39.855934', 'step': 1927, 'epoch': 2} {'type': 'loss', 'content': 0.004497934132814407, 'timestamp': '2025-10-01 04:18:39.893276', 'step': 1928, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:39.934079', 'step': 1928, 'epoch': 2} {'type': 'loss', 'content': 0.017110373824834824, 'timestamp': '2025-10-01 04:18:39.946884', 'step': 1929, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:39.989548', 'step': 1929, 'epoch': 2} {'type': 'loss', 'content': 0.00907861441373825, 'timestamp': '2025-10-01 04:18:40.003542', 'step': 1930, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:40.046330', 'step': 1930, 'epoch': 2} {'type': 'loss', 'content': 0.007920607924461365, 'timestamp': '2025-10-01 04:18:40.060372', 'step': 1931, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:40.103977', 'step': 1931, 'epoch': 2} {'type': 'loss', 'content': 0.0067981877364218235, 'timestamp': '2025-10-01 04:18:40.138482', 'step': 1932, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:40.182429', 'step': 1932, 'epoch': 2} {'type': 'loss', 'content': 0.01214128639549017, 'timestamp': '2025-10-01 04:18:40.195799', 'step': 1933, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:40.231187', 'step': 1933, 'epoch': 2} {'type': 'loss', 'content': 0.014207003638148308, 'timestamp': '2025-10-01 04:18:40.244658', 'step': 1934, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:40.282086', 'step': 1934, 'epoch': 2} {'type': 'loss', 'content': 0.007230380550026894, 'timestamp': '2025-10-01 04:18:40.292782', 'step': 1935, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:40.334817', 'step': 1935, 'epoch': 2} {'type': 'loss', 'content': 0.01176377758383751, 'timestamp': '2025-10-01 04:18:40.364014', 'step': 1936, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:40.400482', 'step': 1936, 'epoch': 2} {'type': 'loss', 'content': 0.013041955418884754, 'timestamp': '2025-10-01 04:18:40.408544', 'step': 1937, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:40.444598', 'step': 1937, 'epoch': 2} {'type': 'loss', 'content': 0.010989395901560783, 'timestamp': '2025-10-01 04:18:40.456381', 'step': 1938, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:40.496803', 'step': 1938, 'epoch': 2} {'type': 'loss', 'content': 0.012487567961215973, 'timestamp': '2025-10-01 04:18:40.510844', 'step': 1939, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:40.543257', 'step': 1939, 'epoch': 2} {'type': 'loss', 'content': 0.010269319638609886, 'timestamp': '2025-10-01 04:18:40.576684', 'step': 1940, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:40.618053', 'step': 1940, 'epoch': 2} {'type': 'loss', 'content': 0.02301398664712906, 'timestamp': '2025-10-01 04:18:40.626523', 'step': 1941, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:40.663414', 'step': 1941, 'epoch': 2} {'type': 'loss', 'content': 0.015543151646852493, 'timestamp': '2025-10-01 04:18:40.676198', 'step': 1942, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:40.715233', 'step': 1942, 'epoch': 2} {'type': 'loss', 'content': 0.013862814754247665, 'timestamp': '2025-10-01 04:18:40.729207', 'step': 1943, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:40.769223', 'step': 1943, 'epoch': 2} {'type': 'loss', 'content': 0.01791265979409218, 'timestamp': '2025-10-01 04:18:40.803714', 'step': 1944, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:40.841871', 'step': 1944, 'epoch': 2} {'type': 'loss', 'content': 0.004067541100084782, 'timestamp': '2025-10-01 04:18:40.855233', 'step': 1945, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:40.890000', 'step': 1945, 'epoch': 2} {'type': 'loss', 'content': 0.009141439571976662, 'timestamp': '2025-10-01 04:18:40.902550', 'step': 1946, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:40.936931', 'step': 1946, 'epoch': 2} {'type': 'loss', 'content': 0.00973354373127222, 'timestamp': '2025-10-01 04:18:40.949498', 'step': 1947, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:40.988924', 'step': 1947, 'epoch': 2} {'type': 'loss', 'content': 0.008560257032513618, 'timestamp': '2025-10-01 04:18:41.017731', 'step': 1948, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:41.055610', 'step': 1948, 'epoch': 2} {'type': 'loss', 'content': 0.005396606400609016, 'timestamp': '2025-10-01 04:18:41.064101', 'step': 1949, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:41.105825', 'step': 1949, 'epoch': 2} {'type': 'loss', 'content': 0.00528886029496789, 'timestamp': '2025-10-01 04:18:41.119864', 'step': 1950, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:41.155730', 'step': 1950, 'epoch': 2} {'type': 'loss', 'content': 0.0046656192280352116, 'timestamp': '2025-10-01 04:18:41.168521', 'step': 1951, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:41.208421', 'step': 1951, 'epoch': 2} {'type': 'loss', 'content': 0.012598524801433086, 'timestamp': '2025-10-01 04:18:41.240003', 'step': 1952, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:41.276225', 'step': 1952, 'epoch': 2} {'type': 'loss', 'content': 0.007788372226059437, 'timestamp': '2025-10-01 04:18:41.281831', 'step': 1953, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:41.328584', 'step': 1953, 'epoch': 2} {'type': 'loss', 'content': 0.01929563283920288, 'timestamp': '2025-10-01 04:18:41.341306', 'step': 1954, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:18:41.385967', 'step': 1954, 'epoch': 2} {'type': 'loss', 'content': 0.0093774925917387, 'timestamp': '2025-10-01 04:18:41.402268', 'step': 1955, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:18:44.117089', 'step': 1955, 'epoch': 2} {'type': 'pplx', 'content': 5.539104913839287, 'timestamp': '2025-10-01 04:18:44.120208', 'step': 1955, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:44.155841', 'step': 1955, 'epoch': 2} {'type': 'loss', 'content': 0.0128665491938591, 'timestamp': '2025-10-01 04:18:44.188257', 'step': 1956, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:18:44.233853', 'step': 1956, 'epoch': 2} {'type': 'loss', 'content': 0.008603084832429886, 'timestamp': '2025-10-01 04:18:44.247351', 'step': 1957, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:44.288344', 'step': 1957, 'epoch': 2} {'type': 'loss', 'content': 0.013736980967223644, 'timestamp': '2025-10-01 04:18:44.296545', 'step': 1958, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:44.336877', 'step': 1958, 'epoch': 2} {'type': 'loss', 'content': 0.022138552740216255, 'timestamp': '2025-10-01 04:18:44.344568', 'step': 1959, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:18:44.391789', 'step': 1959, 'epoch': 2} {'type': 'loss', 'content': 0.018873469904065132, 'timestamp': '2025-10-01 04:18:44.428697', 'step': 1960, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:44.471022', 'step': 1960, 'epoch': 2} {'type': 'loss', 'content': 0.008743165992200375, 'timestamp': '2025-10-01 04:18:44.479249', 'step': 1961, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:44.515826', 'step': 1961, 'epoch': 2} {'type': 'loss', 'content': 0.01437905989587307, 'timestamp': '2025-10-01 04:18:44.528331', 'step': 1962, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:44.567032', 'step': 1962, 'epoch': 2} {'type': 'loss', 'content': 0.020304564386606216, 'timestamp': '2025-10-01 04:18:44.578565', 'step': 1963, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:44.620061', 'step': 1963, 'epoch': 2} {'type': 'loss', 'content': 0.008796967566013336, 'timestamp': '2025-10-01 04:18:44.655015', 'step': 1964, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:44.695806', 'step': 1964, 'epoch': 2} {'type': 'loss', 'content': 0.015902109444141388, 'timestamp': '2025-10-01 04:18:44.709065', 'step': 1965, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:44.749564', 'step': 1965, 'epoch': 2} {'type': 'loss', 'content': 0.010493443347513676, 'timestamp': '2025-10-01 04:18:44.762251', 'step': 1966, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:44.798077', 'step': 1966, 'epoch': 2} {'type': 'loss', 'content': 0.01428085658699274, 'timestamp': '2025-10-01 04:18:44.812028', 'step': 1967, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:18:44.855080', 'step': 1967, 'epoch': 2} {'type': 'loss', 'content': 0.0065879602916538715, 'timestamp': '2025-10-01 04:18:44.892096', 'step': 1968, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:18:44.937146', 'step': 1968, 'epoch': 2} {'type': 'loss', 'content': 0.005495098419487476, 'timestamp': '2025-10-01 04:18:44.952736', 'step': 1969, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:18:44.996318', 'step': 1969, 'epoch': 2} {'type': 'loss', 'content': 0.007976599968969822, 'timestamp': '2025-10-01 04:18:45.012127', 'step': 1970, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:45.047932', 'step': 1970, 'epoch': 2} {'type': 'loss', 'content': 0.009238004684448242, 'timestamp': '2025-10-01 04:18:45.059554', 'step': 1971, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 16611393146432}, 'timestamp': '2025-10-01 04:18:45.114114', 'step': 1971, 'epoch': 2} {'type': 'loss', 'content': 0.013294457457959652, 'timestamp': '2025-10-01 04:18:45.154492', 'step': 1972, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:18:45.209329', 'step': 1972, 'epoch': 2} {'type': 'loss', 'content': 0.008463277481496334, 'timestamp': '2025-10-01 04:18:45.226169', 'step': 1973, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:18:45.277324', 'step': 1973, 'epoch': 2} {'type': 'loss', 'content': 0.006849885452538729, 'timestamp': '2025-10-01 04:18:45.291463', 'step': 1974, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:45.327351', 'step': 1974, 'epoch': 2} {'type': 'loss', 'content': 0.010126440785825253, 'timestamp': '2025-10-01 04:18:45.339898', 'step': 1975, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:45.384622', 'step': 1975, 'epoch': 2} {'type': 'loss', 'content': 0.0123395761474967, 'timestamp': '2025-10-01 04:18:45.419083', 'step': 1976, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:18:45.465026', 'step': 1976, 'epoch': 2} {'type': 'loss', 'content': 0.00779641093686223, 'timestamp': '2025-10-01 04:18:45.481849', 'step': 1977, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-10-01 04:18:45.529059', 'step': 1977, 'epoch': 2} {'type': 'loss', 'content': 0.006612794473767281, 'timestamp': '2025-10-01 04:18:45.546817', 'step': 1978, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:45.587700', 'step': 1978, 'epoch': 2} {'type': 'loss', 'content': 0.014660404995083809, 'timestamp': '2025-10-01 04:18:45.601768', 'step': 1979, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:45.636795', 'step': 1979, 'epoch': 2} {'type': 'loss', 'content': 0.01383479218930006, 'timestamp': '2025-10-01 04:18:45.666742', 'step': 1980, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:45.703032', 'step': 1980, 'epoch': 2} {'type': 'loss', 'content': 0.009216244332492352, 'timestamp': '2025-10-01 04:18:45.711151', 'step': 1981, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:45.759000', 'step': 1981, 'epoch': 2} {'type': 'loss', 'content': 0.012158205732703209, 'timestamp': '2025-10-01 04:18:45.771827', 'step': 1982, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:45.823825', 'step': 1982, 'epoch': 2} {'type': 'loss', 'content': 0.013594024814665318, 'timestamp': '2025-10-01 04:18:45.837460', 'step': 1983, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:45.895029', 'step': 1983, 'epoch': 2} {'type': 'loss', 'content': 0.019531631842255592, 'timestamp': '2025-10-01 04:18:45.925981', 'step': 1984, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:45.968308', 'step': 1984, 'epoch': 2} {'type': 'loss', 'content': 0.008424998261034489, 'timestamp': '2025-10-01 04:18:45.974307', 'step': 1985, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:46.024509', 'step': 1985, 'epoch': 2} {'type': 'loss', 'content': 0.02545682154595852, 'timestamp': '2025-10-01 04:18:46.038014', 'step': 1986, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:18:46.089469', 'step': 1986, 'epoch': 2} {'type': 'loss', 'content': 0.01235184259712696, 'timestamp': '2025-10-01 04:18:46.105808', 'step': 1987, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:46.143449', 'step': 1987, 'epoch': 2} {'type': 'loss', 'content': 0.011035177856683731, 'timestamp': '2025-10-01 04:18:46.173781', 'step': 1988, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:46.218658', 'step': 1988, 'epoch': 2} {'type': 'loss', 'content': 0.016446422785520554, 'timestamp': '2025-10-01 04:18:46.227294', 'step': 1989, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:46.263216', 'step': 1989, 'epoch': 2} {'type': 'loss', 'content': 0.009289894253015518, 'timestamp': '2025-10-01 04:18:46.271459', 'step': 1990, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:46.308754', 'step': 1990, 'epoch': 2} {'type': 'loss', 'content': 0.014242404140532017, 'timestamp': '2025-10-01 04:18:46.316381', 'step': 1991, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:46.351734', 'step': 1991, 'epoch': 2} {'type': 'loss', 'content': 0.021860193461179733, 'timestamp': '2025-10-01 04:18:46.381633', 'step': 1992, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:46.417029', 'step': 1992, 'epoch': 2} {'type': 'loss', 'content': 0.010540030896663666, 'timestamp': '2025-10-01 04:18:46.428076', 'step': 1993, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:46.474154', 'step': 1993, 'epoch': 2} {'type': 'loss', 'content': 0.02374594658613205, 'timestamp': '2025-10-01 04:18:46.485803', 'step': 1994, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:46.524617', 'step': 1994, 'epoch': 2} {'type': 'loss', 'content': 0.011393887922167778, 'timestamp': '2025-10-01 04:18:46.536320', 'step': 1995, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:18:46.578685', 'step': 1995, 'epoch': 2} {'type': 'loss', 'content': 0.012068143114447594, 'timestamp': '2025-10-01 04:18:46.613813', 'step': 1996, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:46.648231', 'step': 1996, 'epoch': 2} {'type': 'loss', 'content': 0.011276407167315483, 'timestamp': '2025-10-01 04:18:46.659256', 'step': 1997, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:46.697664', 'step': 1997, 'epoch': 2} {'type': 'loss', 'content': 0.00781253818422556, 'timestamp': '2025-10-01 04:18:46.711645', 'step': 1998, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:46.752467', 'step': 1998, 'epoch': 2} {'type': 'loss', 'content': 0.01256867591291666, 'timestamp': '2025-10-01 04:18:46.766449', 'step': 1999, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:18:46.824479', 'step': 1999, 'epoch': 2} {'type': 'loss', 'content': 0.01071880105882883, 'timestamp': '2025-10-01 04:18:46.861245', 'step': 2000, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2000', 'timestamp': '2025-10-01 04:18:52.436474', 'step': 2000, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:52.471522', 'step': 2000, 'epoch': 2} {'type': 'loss', 'content': 0.01053350418806076, 'timestamp': '2025-10-01 04:18:52.479422', 'step': 2001, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:52.522755', 'step': 2001, 'epoch': 2} {'type': 'loss', 'content': 0.006631781812757254, 'timestamp': '2025-10-01 04:18:52.536729', 'step': 2002, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:52.572730', 'step': 2002, 'epoch': 2} {'type': 'loss', 'content': 0.02082558535039425, 'timestamp': '2025-10-01 04:18:52.585218', 'step': 2003, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:18:52.623593', 'step': 2003, 'epoch': 2} {'type': 'loss', 'content': 0.011814413592219353, 'timestamp': '2025-10-01 04:18:52.658633', 'step': 2004, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:52.701075', 'step': 2004, 'epoch': 2} {'type': 'loss', 'content': 0.011002382263541222, 'timestamp': '2025-10-01 04:18:52.707062', 'step': 2005, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:52.742332', 'step': 2005, 'epoch': 2} {'type': 'loss', 'content': 0.01049763336777687, 'timestamp': '2025-10-01 04:18:52.755120', 'step': 2006, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:52.794415', 'step': 2006, 'epoch': 2} {'type': 'loss', 'content': 0.011580907739698887, 'timestamp': '2025-10-01 04:18:52.806030', 'step': 2007, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:52.840402', 'step': 2007, 'epoch': 2} {'type': 'loss', 'content': 0.011779814027249813, 'timestamp': '2025-10-01 04:18:52.874122', 'step': 2008, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:52.912411', 'step': 2008, 'epoch': 2} {'type': 'loss', 'content': 0.007202190812677145, 'timestamp': '2025-10-01 04:18:52.923323', 'step': 2009, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:52.963249', 'step': 2009, 'epoch': 2} {'type': 'loss', 'content': 0.011819681152701378, 'timestamp': '2025-10-01 04:18:52.973979', 'step': 2010, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:53.011424', 'step': 2010, 'epoch': 2} {'type': 'loss', 'content': 0.0089492779225111, 'timestamp': '2025-10-01 04:18:53.023911', 'step': 2011, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:53.065173', 'step': 2011, 'epoch': 2} {'type': 'loss', 'content': 0.012455213814973831, 'timestamp': '2025-10-01 04:18:53.094321', 'step': 2012, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:53.131336', 'step': 2012, 'epoch': 2} {'type': 'loss', 'content': 0.012302165850996971, 'timestamp': '2025-10-01 04:18:53.137264', 'step': 2013, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:53.174902', 'step': 2013, 'epoch': 2} {'type': 'loss', 'content': 0.011665032245218754, 'timestamp': '2025-10-01 04:18:53.183118', 'step': 2014, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:53.226609', 'step': 2014, 'epoch': 2} {'type': 'loss', 'content': 0.007117496337741613, 'timestamp': '2025-10-01 04:18:53.240644', 'step': 2015, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:53.279066', 'step': 2015, 'epoch': 2} {'type': 'loss', 'content': 0.00924930814653635, 'timestamp': '2025-10-01 04:18:53.308054', 'step': 2016, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:53.342068', 'step': 2016, 'epoch': 2} {'type': 'loss', 'content': 0.011233747936785221, 'timestamp': '2025-10-01 04:18:53.352439', 'step': 2017, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:53.389280', 'step': 2017, 'epoch': 2} {'type': 'loss', 'content': 0.012122174724936485, 'timestamp': '2025-10-01 04:18:53.399994', 'step': 2018, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:53.433124', 'step': 2018, 'epoch': 2} {'type': 'loss', 'content': 0.01288899127393961, 'timestamp': '2025-10-01 04:18:53.440719', 'step': 2019, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:53.481196', 'step': 2019, 'epoch': 2} {'type': 'loss', 'content': 0.008778244256973267, 'timestamp': '2025-10-01 04:18:53.514625', 'step': 2020, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:53.556998', 'step': 2020, 'epoch': 2} {'type': 'loss', 'content': 0.006542769726365805, 'timestamp': '2025-10-01 04:18:53.570297', 'step': 2021, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:53.606862', 'step': 2021, 'epoch': 2} {'type': 'loss', 'content': 0.006875595543533564, 'timestamp': '2025-10-01 04:18:53.615163', 'step': 2022, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:18:53.662040', 'step': 2022, 'epoch': 2} {'type': 'loss', 'content': 0.0034512504935264587, 'timestamp': '2025-10-01 04:18:53.678376', 'step': 2023, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:53.718114', 'step': 2023, 'epoch': 2} {'type': 'loss', 'content': 0.010408591479063034, 'timestamp': '2025-10-01 04:18:53.752630', 'step': 2024, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:53.790993', 'step': 2024, 'epoch': 2} {'type': 'loss', 'content': 0.009249560534954071, 'timestamp': '2025-10-01 04:18:53.801717', 'step': 2025, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:53.835226', 'step': 2025, 'epoch': 2} {'type': 'loss', 'content': 0.011620047502219677, 'timestamp': '2025-10-01 04:18:53.846825', 'step': 2026, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:53.883737', 'step': 2026, 'epoch': 2} {'type': 'loss', 'content': 0.017058631405234337, 'timestamp': '2025-10-01 04:18:53.891994', 'step': 2027, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:53.926406', 'step': 2027, 'epoch': 2} {'type': 'loss', 'content': 0.01148415356874466, 'timestamp': '2025-10-01 04:18:53.960172', 'step': 2028, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:18:53.992685', 'step': 2028, 'epoch': 2} {'type': 'loss', 'content': 0.01248354185372591, 'timestamp': '2025-10-01 04:18:53.995398', 'step': 2029, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:54.033794', 'step': 2029, 'epoch': 2} {'type': 'loss', 'content': 0.006514685228466988, 'timestamp': '2025-10-01 04:18:54.044332', 'step': 2030, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:54.088436', 'step': 2030, 'epoch': 2} {'type': 'loss', 'content': 0.013804574497044086, 'timestamp': '2025-10-01 04:18:54.102453', 'step': 2031, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:54.142124', 'step': 2031, 'epoch': 2} {'type': 'loss', 'content': 0.019239069893956184, 'timestamp': '2025-10-01 04:18:54.170898', 'step': 2032, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:54.208307', 'step': 2032, 'epoch': 2} {'type': 'loss', 'content': 0.007814605720341206, 'timestamp': '2025-10-01 04:18:54.221196', 'step': 2033, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:54.254172', 'step': 2033, 'epoch': 2} {'type': 'loss', 'content': 0.006031763274222612, 'timestamp': '2025-10-01 04:18:54.266889', 'step': 2034, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:18:54.311743', 'step': 2034, 'epoch': 2} {'type': 'loss', 'content': 0.004242385271936655, 'timestamp': '2025-10-01 04:18:54.327755', 'step': 2035, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:54.366299', 'step': 2035, 'epoch': 2} {'type': 'loss', 'content': 0.004580293782055378, 'timestamp': '2025-10-01 04:18:54.399978', 'step': 2036, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:18:54.431611', 'step': 2036, 'epoch': 2} {'type': 'loss', 'content': 0.013194718398153782, 'timestamp': '2025-10-01 04:18:54.438867', 'step': 2037, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:54.475858', 'step': 2037, 'epoch': 2} {'type': 'loss', 'content': 0.010610926896333694, 'timestamp': '2025-10-01 04:18:54.486463', 'step': 2038, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:18:54.530708', 'step': 2038, 'epoch': 2} {'type': 'loss', 'content': 0.009930195286870003, 'timestamp': '2025-10-01 04:18:54.546769', 'step': 2039, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:54.591193', 'step': 2039, 'epoch': 2} {'type': 'loss', 'content': 0.009586768224835396, 'timestamp': '2025-10-01 04:18:54.626112', 'step': 2040, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:54.659876', 'step': 2040, 'epoch': 2} {'type': 'loss', 'content': 0.010077967308461666, 'timestamp': '2025-10-01 04:18:54.667320', 'step': 2041, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:54.705350', 'step': 2041, 'epoch': 2} {'type': 'loss', 'content': 0.01475665345788002, 'timestamp': '2025-10-01 04:18:54.716002', 'step': 2042, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:54.748127', 'step': 2042, 'epoch': 2} {'type': 'loss', 'content': 0.016530966386198997, 'timestamp': '2025-10-01 04:18:54.758716', 'step': 2043, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:54.794203', 'step': 2043, 'epoch': 2} {'type': 'loss', 'content': 0.010064169764518738, 'timestamp': '2025-10-01 04:18:54.825860', 'step': 2044, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:54.861093', 'step': 2044, 'epoch': 2} {'type': 'loss', 'content': 0.014578712172806263, 'timestamp': '2025-10-01 04:18:54.867034', 'step': 2045, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:54.905112', 'step': 2045, 'epoch': 2} {'type': 'loss', 'content': 0.006539412774145603, 'timestamp': '2025-10-01 04:18:54.918652', 'step': 2046, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:54.956500', 'step': 2046, 'epoch': 2} {'type': 'loss', 'content': 0.005309057887643576, 'timestamp': '2025-10-01 04:18:54.970556', 'step': 2047, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:18:55.009855', 'step': 2047, 'epoch': 2} {'type': 'loss', 'content': 0.007415627595037222, 'timestamp': '2025-10-01 04:18:55.044833', 'step': 2048, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:55.084719', 'step': 2048, 'epoch': 2} {'type': 'loss', 'content': 0.014014936052262783, 'timestamp': '2025-10-01 04:18:55.097473', 'step': 2049, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:55.136025', 'step': 2049, 'epoch': 2} {'type': 'loss', 'content': 0.012029731646180153, 'timestamp': '2025-10-01 04:18:55.148589', 'step': 2050, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:55.189010', 'step': 2050, 'epoch': 2} {'type': 'loss', 'content': 0.009125737473368645, 'timestamp': '2025-10-01 04:18:55.202566', 'step': 2051, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:55.240349', 'step': 2051, 'epoch': 2} {'type': 'loss', 'content': 0.0084026288241148, 'timestamp': '2025-10-01 04:18:55.275356', 'step': 2052, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:55.312240', 'step': 2052, 'epoch': 2} {'type': 'loss', 'content': 0.012727833352982998, 'timestamp': '2025-10-01 04:18:55.320727', 'step': 2053, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:55.359435', 'step': 2053, 'epoch': 2} {'type': 'loss', 'content': 0.008404972031712532, 'timestamp': '2025-10-01 04:18:55.373400', 'step': 2054, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:55.409131', 'step': 2054, 'epoch': 2} {'type': 'loss', 'content': 0.011271034367382526, 'timestamp': '2025-10-01 04:18:55.417433', 'step': 2055, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:55.456704', 'step': 2055, 'epoch': 2} {'type': 'loss', 'content': 0.010025057010352612, 'timestamp': '2025-10-01 04:18:55.491192', 'step': 2056, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:55.525797', 'step': 2056, 'epoch': 2} {'type': 'loss', 'content': 0.012525882571935654, 'timestamp': '2025-10-01 04:18:55.536072', 'step': 2057, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:55.574230', 'step': 2057, 'epoch': 2} {'type': 'loss', 'content': 0.015204235911369324, 'timestamp': '2025-10-01 04:18:55.585778', 'step': 2058, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:55.620858', 'step': 2058, 'epoch': 2} {'type': 'loss', 'content': 0.009213750250637531, 'timestamp': '2025-10-01 04:18:55.633357', 'step': 2059, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:55.670927', 'step': 2059, 'epoch': 2} {'type': 'loss', 'content': 0.010274983942508698, 'timestamp': '2025-10-01 04:18:55.705918', 'step': 2060, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:55.752443', 'step': 2060, 'epoch': 2} {'type': 'loss', 'content': 0.006509967613965273, 'timestamp': '2025-10-01 04:18:55.765395', 'step': 2061, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:55.800563', 'step': 2061, 'epoch': 2} {'type': 'loss', 'content': 0.0070640090852975845, 'timestamp': '2025-10-01 04:18:55.813338', 'step': 2062, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:55.857803', 'step': 2062, 'epoch': 2} {'type': 'loss', 'content': 0.007656324654817581, 'timestamp': '2025-10-01 04:18:55.871361', 'step': 2063, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:55.908385', 'step': 2063, 'epoch': 2} {'type': 'loss', 'content': 0.006804482080042362, 'timestamp': '2025-10-01 04:18:55.941905', 'step': 2064, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:55.980825', 'step': 2064, 'epoch': 2} {'type': 'loss', 'content': 0.006550224032253027, 'timestamp': '2025-10-01 04:18:55.993657', 'step': 2065, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:18:56.030624', 'step': 2065, 'epoch': 2} {'type': 'loss', 'content': 0.009134102612733841, 'timestamp': '2025-10-01 04:18:56.039507', 'step': 2066, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:56.076910', 'step': 2066, 'epoch': 2} {'type': 'loss', 'content': 0.006942349951714277, 'timestamp': '2025-10-01 04:18:56.089675', 'step': 2067, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:56.131148', 'step': 2067, 'epoch': 2} {'type': 'loss', 'content': 0.009811093099415302, 'timestamp': '2025-10-01 04:18:56.160181', 'step': 2068, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:56.200374', 'step': 2068, 'epoch': 2} {'type': 'loss', 'content': 0.0116462716832757, 'timestamp': '2025-10-01 04:18:56.211374', 'step': 2069, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:56.243106', 'step': 2069, 'epoch': 2} {'type': 'loss', 'content': 0.01142897643148899, 'timestamp': '2025-10-01 04:18:56.254648', 'step': 2070, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:18:58.662897', 'step': 2070, 'epoch': 2} {'type': 'pplx', 'content': 5.613089806722828, 'timestamp': '2025-10-01 04:18:58.675501', 'step': 2070, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:18:58.715247', 'step': 2070, 'epoch': 2} {'type': 'loss', 'content': 0.012708108872175217, 'timestamp': '2025-10-01 04:18:58.729816', 'step': 2071, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:18:58.783844', 'step': 2071, 'epoch': 2} {'type': 'loss', 'content': 0.007048464380204678, 'timestamp': '2025-10-01 04:18:58.821987', 'step': 2072, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:58.858567', 'step': 2072, 'epoch': 2} {'type': 'loss', 'content': 0.006396925542503595, 'timestamp': '2025-10-01 04:18:58.871357', 'step': 2073, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:58.904012', 'step': 2073, 'epoch': 2} {'type': 'loss', 'content': 0.01588745042681694, 'timestamp': '2025-10-01 04:18:58.916569', 'step': 2074, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:58.964188', 'step': 2074, 'epoch': 2} {'type': 'loss', 'content': 0.007528386544436216, 'timestamp': '2025-10-01 04:18:58.972343', 'step': 2075, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:59.014465', 'step': 2075, 'epoch': 2} {'type': 'loss', 'content': 0.014374079182744026, 'timestamp': '2025-10-01 04:18:59.047927', 'step': 2076, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:18:59.091119', 'step': 2076, 'epoch': 2} {'type': 'loss', 'content': 0.019365297630429268, 'timestamp': '2025-10-01 04:18:59.099537', 'step': 2077, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:59.140400', 'step': 2077, 'epoch': 2} {'type': 'loss', 'content': 0.007113221567124128, 'timestamp': '2025-10-01 04:18:59.153262', 'step': 2078, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:18:59.196059', 'step': 2078, 'epoch': 2} {'type': 'loss', 'content': 0.00717134727165103, 'timestamp': '2025-10-01 04:18:59.204375', 'step': 2079, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:59.249438', 'step': 2079, 'epoch': 2} {'type': 'loss', 'content': 0.006013211328536272, 'timestamp': '2025-10-01 04:18:59.284008', 'step': 2080, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:59.316156', 'step': 2080, 'epoch': 2} {'type': 'loss', 'content': 0.015247929841279984, 'timestamp': '2025-10-01 04:18:59.325245', 'step': 2081, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:59.370044', 'step': 2081, 'epoch': 2} {'type': 'loss', 'content': 0.013365598395466805, 'timestamp': '2025-10-01 04:18:59.382605', 'step': 2082, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:18:59.420748', 'step': 2082, 'epoch': 2} {'type': 'loss', 'content': 0.012016217224299908, 'timestamp': '2025-10-01 04:18:59.433259', 'step': 2083, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:59.470930', 'step': 2083, 'epoch': 2} {'type': 'loss', 'content': 0.012486659921705723, 'timestamp': '2025-10-01 04:18:59.505425', 'step': 2084, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:59.543546', 'step': 2084, 'epoch': 2} {'type': 'loss', 'content': 0.012935977429151535, 'timestamp': '2025-10-01 04:18:59.556931', 'step': 2085, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:59.590403', 'step': 2085, 'epoch': 2} {'type': 'loss', 'content': 0.008486568927764893, 'timestamp': '2025-10-01 04:18:59.602022', 'step': 2086, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:18:59.647011', 'step': 2086, 'epoch': 2} {'type': 'loss', 'content': 0.012267296202480793, 'timestamp': '2025-10-01 04:18:59.658515', 'step': 2087, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:59.704271', 'step': 2087, 'epoch': 2} {'type': 'loss', 'content': 0.006491878069937229, 'timestamp': '2025-10-01 04:18:59.739253', 'step': 2088, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:18:59.787596', 'step': 2088, 'epoch': 2} {'type': 'loss', 'content': 0.01540594082325697, 'timestamp': '2025-10-01 04:18:59.800888', 'step': 2089, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:18:59.842957', 'step': 2089, 'epoch': 2} {'type': 'loss', 'content': 0.006828084122389555, 'timestamp': '2025-10-01 04:18:59.859033', 'step': 2090, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:18:59.910393', 'step': 2090, 'epoch': 2} {'type': 'loss', 'content': 0.005875085014849901, 'timestamp': '2025-10-01 04:18:59.924007', 'step': 2091, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:18:59.957095', 'step': 2091, 'epoch': 2} {'type': 'loss', 'content': 0.00929328240454197, 'timestamp': '2025-10-01 04:18:59.990743', 'step': 2092, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:00.025821', 'step': 2092, 'epoch': 2} {'type': 'loss', 'content': 0.007730370853096247, 'timestamp': '2025-10-01 04:19:00.039342', 'step': 2093, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:00.094811', 'step': 2093, 'epoch': 2} {'type': 'loss', 'content': 0.010209760628640652, 'timestamp': '2025-10-01 04:19:00.110864', 'step': 2094, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:00.159348', 'step': 2094, 'epoch': 2} {'type': 'loss', 'content': 0.005793020129203796, 'timestamp': '2025-10-01 04:19:00.174408', 'step': 2095, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:00.225872', 'step': 2095, 'epoch': 2} {'type': 'loss', 'content': 0.003753918455913663, 'timestamp': '2025-10-01 04:19:00.262971', 'step': 2096, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:00.298262', 'step': 2096, 'epoch': 2} {'type': 'loss', 'content': 0.009094505570828915, 'timestamp': '2025-10-01 04:19:00.312483', 'step': 2097, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:00.358129', 'step': 2097, 'epoch': 2} {'type': 'loss', 'content': 0.011716114357113838, 'timestamp': '2025-10-01 04:19:00.371641', 'step': 2098, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:00.409528', 'step': 2098, 'epoch': 2} {'type': 'loss', 'content': 0.011924310587346554, 'timestamp': '2025-10-01 04:19:00.423544', 'step': 2099, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:00.469523', 'step': 2099, 'epoch': 2} {'type': 'loss', 'content': 0.014485645107924938, 'timestamp': '2025-10-01 04:19:00.505660', 'step': 2100, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:00.541351', 'step': 2100, 'epoch': 2} {'type': 'loss', 'content': 0.011004070751369, 'timestamp': '2025-10-01 04:19:00.554701', 'step': 2101, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:00.588880', 'step': 2101, 'epoch': 2} {'type': 'loss', 'content': 0.010235278867185116, 'timestamp': '2025-10-01 04:19:00.601396', 'step': 2102, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:00.648052', 'step': 2102, 'epoch': 2} {'type': 'loss', 'content': 0.012883284129202366, 'timestamp': '2025-10-01 04:19:00.661616', 'step': 2103, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:00.709582', 'step': 2103, 'epoch': 2} {'type': 'loss', 'content': 0.008026556111872196, 'timestamp': '2025-10-01 04:19:00.744682', 'step': 2104, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:19:00.796009', 'step': 2104, 'epoch': 2} {'type': 'loss', 'content': 0.007295748684555292, 'timestamp': '2025-10-01 04:19:00.813154', 'step': 2105, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:00.851883', 'step': 2105, 'epoch': 2} {'type': 'loss', 'content': 0.009701365604996681, 'timestamp': '2025-10-01 04:19:00.865391', 'step': 2106, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:00.906694', 'step': 2106, 'epoch': 2} {'type': 'loss', 'content': 0.009323429316282272, 'timestamp': '2025-10-01 04:19:00.914921', 'step': 2107, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:00.948693', 'step': 2107, 'epoch': 2} {'type': 'loss', 'content': 0.020818162709474564, 'timestamp': '2025-10-01 04:19:00.980076', 'step': 2108, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:01.013874', 'step': 2108, 'epoch': 2} {'type': 'loss', 'content': 0.02208242379128933, 'timestamp': '2025-10-01 04:19:01.019686', 'step': 2109, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:01.052724', 'step': 2109, 'epoch': 2} {'type': 'loss', 'content': 0.009645006619393826, 'timestamp': '2025-10-01 04:19:01.065502', 'step': 2110, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:01.107084', 'step': 2110, 'epoch': 2} {'type': 'loss', 'content': 0.015802545472979546, 'timestamp': '2025-10-01 04:19:01.120570', 'step': 2111, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:01.164751', 'step': 2111, 'epoch': 2} {'type': 'loss', 'content': 0.01765204221010208, 'timestamp': '2025-10-01 04:19:01.198352', 'step': 2112, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:01.233198', 'step': 2112, 'epoch': 2} {'type': 'loss', 'content': 0.008515198715031147, 'timestamp': '2025-10-01 04:19:01.244269', 'step': 2113, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:01.280912', 'step': 2113, 'epoch': 2} {'type': 'loss', 'content': 0.011204243637621403, 'timestamp': '2025-10-01 04:19:01.295399', 'step': 2114, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:19:01.337042', 'step': 2114, 'epoch': 2} {'type': 'loss', 'content': 0.007484538480639458, 'timestamp': '2025-10-01 04:19:01.353585', 'step': 2115, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:01.404028', 'step': 2115, 'epoch': 2} {'type': 'loss', 'content': 0.008002368733286858, 'timestamp': '2025-10-01 04:19:01.441038', 'step': 2116, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:01.483720', 'step': 2116, 'epoch': 2} {'type': 'loss', 'content': 0.007711250800639391, 'timestamp': '2025-10-01 04:19:01.497151', 'step': 2117, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:01.538447', 'step': 2117, 'epoch': 2} {'type': 'loss', 'content': 0.01008161436766386, 'timestamp': '2025-10-01 04:19:01.550769', 'step': 2118, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:19:01.592927', 'step': 2118, 'epoch': 2} {'type': 'loss', 'content': 0.004567032679915428, 'timestamp': '2025-10-01 04:19:01.609201', 'step': 2119, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:01.645949', 'step': 2119, 'epoch': 2} {'type': 'loss', 'content': 0.008573171682655811, 'timestamp': '2025-10-01 04:19:01.680948', 'step': 2120, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:01.717798', 'step': 2120, 'epoch': 2} {'type': 'loss', 'content': 0.00963873416185379, 'timestamp': '2025-10-01 04:19:01.728070', 'step': 2121, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:01.762194', 'step': 2121, 'epoch': 2} {'type': 'loss', 'content': 0.015014849603176117, 'timestamp': '2025-10-01 04:19:01.769845', 'step': 2122, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:01.812516', 'step': 2122, 'epoch': 2} {'type': 'loss', 'content': 0.01500563882291317, 'timestamp': '2025-10-01 04:19:01.824814', 'step': 2123, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:01.866182', 'step': 2123, 'epoch': 2} {'type': 'loss', 'content': 0.007117177359759808, 'timestamp': '2025-10-01 04:19:01.897765', 'step': 2124, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:01.940867', 'step': 2124, 'epoch': 2} {'type': 'loss', 'content': 0.025839336216449738, 'timestamp': '2025-10-01 04:19:01.954137', 'step': 2125, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:01.990625', 'step': 2125, 'epoch': 2} {'type': 'loss', 'content': 0.0040946584194898605, 'timestamp': '2025-10-01 04:19:02.001374', 'step': 2126, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:02.043022', 'step': 2126, 'epoch': 2} {'type': 'loss', 'content': 0.01242857612669468, 'timestamp': '2025-10-01 04:19:02.057139', 'step': 2127, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:02.097966', 'step': 2127, 'epoch': 2} {'type': 'loss', 'content': 0.011060532182455063, 'timestamp': '2025-10-01 04:19:02.131416', 'step': 2128, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:19:02.176880', 'step': 2128, 'epoch': 2} {'type': 'loss', 'content': 0.009997732006013393, 'timestamp': '2025-10-01 04:19:02.192190', 'step': 2129, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:02.251449', 'step': 2129, 'epoch': 2} {'type': 'loss', 'content': 0.01913614198565483, 'timestamp': '2025-10-01 04:19:02.267497', 'step': 2130, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:02.299309', 'step': 2130, 'epoch': 2} {'type': 'loss', 'content': 0.010258916765451431, 'timestamp': '2025-10-01 04:19:02.310850', 'step': 2131, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:02.354375', 'step': 2131, 'epoch': 2} {'type': 'loss', 'content': 0.008846418000757694, 'timestamp': '2025-10-01 04:19:02.386059', 'step': 2132, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:02.429018', 'step': 2132, 'epoch': 2} {'type': 'loss', 'content': 0.014166883192956448, 'timestamp': '2025-10-01 04:19:02.438086', 'step': 2133, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:02.474285', 'step': 2133, 'epoch': 2} {'type': 'loss', 'content': 0.006876503117382526, 'timestamp': '2025-10-01 04:19:02.487891', 'step': 2134, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:02.528581', 'step': 2134, 'epoch': 2} {'type': 'loss', 'content': 0.009603909216821194, 'timestamp': '2025-10-01 04:19:02.542137', 'step': 2135, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:02.584776', 'step': 2135, 'epoch': 2} {'type': 'loss', 'content': 0.011837280355393887, 'timestamp': '2025-10-01 04:19:02.618226', 'step': 2136, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:02.650519', 'step': 2136, 'epoch': 2} {'type': 'loss', 'content': 0.008356200531125069, 'timestamp': '2025-10-01 04:19:02.662318', 'step': 2137, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:02.696782', 'step': 2137, 'epoch': 2} {'type': 'loss', 'content': 0.008852719329297543, 'timestamp': '2025-10-01 04:19:02.709332', 'step': 2138, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:02.752380', 'step': 2138, 'epoch': 2} {'type': 'loss', 'content': 0.008716036565601826, 'timestamp': '2025-10-01 04:19:02.764339', 'step': 2139, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:02.807568', 'step': 2139, 'epoch': 2} {'type': 'loss', 'content': 0.007436427753418684, 'timestamp': '2025-10-01 04:19:02.844606', 'step': 2140, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:02.877609', 'step': 2140, 'epoch': 2} {'type': 'loss', 'content': 0.011678795330226421, 'timestamp': '2025-10-01 04:19:02.882703', 'step': 2141, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:02.915507', 'step': 2141, 'epoch': 2} {'type': 'loss', 'content': 0.015152360312640667, 'timestamp': '2025-10-01 04:19:02.928631', 'step': 2142, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:02.961830', 'step': 2142, 'epoch': 2} {'type': 'loss', 'content': 0.0052658976055681705, 'timestamp': '2025-10-01 04:19:02.972461', 'step': 2143, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:03.005580', 'step': 2143, 'epoch': 2} {'type': 'loss', 'content': 0.011507282964885235, 'timestamp': '2025-10-01 04:19:03.038141', 'step': 2144, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:03.085183', 'step': 2144, 'epoch': 2} {'type': 'loss', 'content': 0.010375902988016605, 'timestamp': '2025-10-01 04:19:03.098675', 'step': 2145, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:03.141108', 'step': 2145, 'epoch': 2} {'type': 'loss', 'content': 0.009601197205483913, 'timestamp': '2025-10-01 04:19:03.149411', 'step': 2146, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:03.192379', 'step': 2146, 'epoch': 2} {'type': 'loss', 'content': 0.027258029207587242, 'timestamp': '2025-10-01 04:19:03.199978', 'step': 2147, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:03.233744', 'step': 2147, 'epoch': 2} {'type': 'loss', 'content': 0.008714809082448483, 'timestamp': '2025-10-01 04:19:03.266306', 'step': 2148, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:03.309798', 'step': 2148, 'epoch': 2} {'type': 'loss', 'content': 0.008165842853486538, 'timestamp': '2025-10-01 04:19:03.326186', 'step': 2149, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:03.359458', 'step': 2149, 'epoch': 2} {'type': 'loss', 'content': 0.013743998482823372, 'timestamp': '2025-10-01 04:19:03.367664', 'step': 2150, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:03.401398', 'step': 2150, 'epoch': 2} {'type': 'loss', 'content': 0.00993258785456419, 'timestamp': '2025-10-01 04:19:03.416176', 'step': 2151, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:03.462219', 'step': 2151, 'epoch': 2} {'type': 'loss', 'content': 0.01015244610607624, 'timestamp': '2025-10-01 04:19:03.497071', 'step': 2152, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:03.546219', 'step': 2152, 'epoch': 2} {'type': 'loss', 'content': 0.007464682683348656, 'timestamp': '2025-10-01 04:19:03.561819', 'step': 2153, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:19:03.605363', 'step': 2153, 'epoch': 2} {'type': 'loss', 'content': 0.010694925673305988, 'timestamp': '2025-10-01 04:19:03.632549', 'step': 2154, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:03.676944', 'step': 2154, 'epoch': 2} {'type': 'loss', 'content': 0.012157081626355648, 'timestamp': '2025-10-01 04:19:03.690959', 'step': 2155, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:03.729971', 'step': 2155, 'epoch': 2} {'type': 'loss', 'content': 0.006802540272474289, 'timestamp': '2025-10-01 04:19:03.764951', 'step': 2156, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:03.796793', 'step': 2156, 'epoch': 2} {'type': 'loss', 'content': 0.009896165691316128, 'timestamp': '2025-10-01 04:19:03.807621', 'step': 2157, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:03.852020', 'step': 2157, 'epoch': 2} {'type': 'loss', 'content': 0.012829391285777092, 'timestamp': '2025-10-01 04:19:03.866022', 'step': 2158, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:03.899419', 'step': 2158, 'epoch': 2} {'type': 'loss', 'content': 0.008480077609419823, 'timestamp': '2025-10-01 04:19:03.912017', 'step': 2159, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:03.949438', 'step': 2159, 'epoch': 2} {'type': 'loss', 'content': 0.004896396305412054, 'timestamp': '2025-10-01 04:19:03.983997', 'step': 2160, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:04.033982', 'step': 2160, 'epoch': 2} {'type': 'loss', 'content': 0.004578308202326298, 'timestamp': '2025-10-01 04:19:04.047561', 'step': 2161, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:04.088762', 'step': 2161, 'epoch': 2} {'type': 'loss', 'content': 0.007314469665288925, 'timestamp': '2025-10-01 04:19:04.102374', 'step': 2162, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:04.147677', 'step': 2162, 'epoch': 2} {'type': 'loss', 'content': 0.011029335670173168, 'timestamp': '2025-10-01 04:19:04.160433', 'step': 2163, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:04.207013', 'step': 2163, 'epoch': 2} {'type': 'loss', 'content': 0.006489933934062719, 'timestamp': '2025-10-01 04:19:04.241993', 'step': 2164, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:04.274100', 'step': 2164, 'epoch': 2} {'type': 'loss', 'content': 0.01199716329574585, 'timestamp': '2025-10-01 04:19:04.282537', 'step': 2165, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:04.314780', 'step': 2165, 'epoch': 2} {'type': 'loss', 'content': 0.0077391574159264565, 'timestamp': '2025-10-01 04:19:04.327469', 'step': 2166, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:04.366792', 'step': 2166, 'epoch': 2} {'type': 'loss', 'content': 0.013269958086311817, 'timestamp': '2025-10-01 04:19:04.374095', 'step': 2167, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:04.405364', 'step': 2167, 'epoch': 2} {'type': 'loss', 'content': 0.006828898563981056, 'timestamp': '2025-10-01 04:19:04.436501', 'step': 2168, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:04.469831', 'step': 2168, 'epoch': 2} {'type': 'loss', 'content': 0.004970960319042206, 'timestamp': '2025-10-01 04:19:04.481080', 'step': 2169, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:04.520300', 'step': 2169, 'epoch': 2} {'type': 'loss', 'content': 0.01199716329574585, 'timestamp': '2025-10-01 04:19:04.529217', 'step': 2170, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:04.569047', 'step': 2170, 'epoch': 2} {'type': 'loss', 'content': 0.011250588111579418, 'timestamp': '2025-10-01 04:19:04.581737', 'step': 2171, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:19:04.613341', 'step': 2171, 'epoch': 2} {'type': 'loss', 'content': 0.010763965547084808, 'timestamp': '2025-10-01 04:19:04.644546', 'step': 2172, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:04.688916', 'step': 2172, 'epoch': 2} {'type': 'loss', 'content': 0.004710193257778883, 'timestamp': '2025-10-01 04:19:04.702822', 'step': 2173, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:04.739180', 'step': 2173, 'epoch': 2} {'type': 'loss', 'content': 0.008146033622324467, 'timestamp': '2025-10-01 04:19:04.749929', 'step': 2174, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:04.795097', 'step': 2174, 'epoch': 2} {'type': 'loss', 'content': 0.007303517311811447, 'timestamp': '2025-10-01 04:19:04.809055', 'step': 2175, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:19:04.848978', 'step': 2175, 'epoch': 2} {'type': 'loss', 'content': 0.020675325766205788, 'timestamp': '2025-10-01 04:19:04.885796', 'step': 2176, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:04.926535', 'step': 2176, 'epoch': 2} {'type': 'loss', 'content': 0.011910445988178253, 'timestamp': '2025-10-01 04:19:04.939884', 'step': 2177, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:04.974557', 'step': 2177, 'epoch': 2} {'type': 'loss', 'content': 0.015800071880221367, 'timestamp': '2025-10-01 04:19:04.988150', 'step': 2178, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:05.031928', 'step': 2178, 'epoch': 2} {'type': 'loss', 'content': 0.008765121921896935, 'timestamp': '2025-10-01 04:19:05.046057', 'step': 2179, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:05.091930', 'step': 2179, 'epoch': 2} {'type': 'loss', 'content': 0.013155746273696423, 'timestamp': '2025-10-01 04:19:05.126408', 'step': 2180, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:05.165557', 'step': 2180, 'epoch': 2} {'type': 'loss', 'content': 0.01393271703273058, 'timestamp': '2025-10-01 04:19:05.176485', 'step': 2181, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:05.216518', 'step': 2181, 'epoch': 2} {'type': 'loss', 'content': 0.00969043280929327, 'timestamp': '2025-10-01 04:19:05.230093', 'step': 2182, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:05.264188', 'step': 2182, 'epoch': 2} {'type': 'loss', 'content': 0.006643644068390131, 'timestamp': '2025-10-01 04:19:05.271798', 'step': 2183, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:05.316291', 'step': 2183, 'epoch': 2} {'type': 'loss', 'content': 0.012567729689180851, 'timestamp': '2025-10-01 04:19:05.350378', 'step': 2184, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:05.383928', 'step': 2184, 'epoch': 2} {'type': 'loss', 'content': 0.03909774869680405, 'timestamp': '2025-10-01 04:19:05.395539', 'step': 2185, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:19:07.906908', 'step': 2185, 'epoch': 2} {'type': 'pplx', 'content': 5.7155398429432225, 'timestamp': '2025-10-01 04:19:07.909934', 'step': 2185, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:07.944861', 'step': 2185, 'epoch': 2} {'type': 'loss', 'content': 0.008955067954957485, 'timestamp': '2025-10-01 04:19:07.956289', 'step': 2186, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:07.994613', 'step': 2186, 'epoch': 2} {'type': 'loss', 'content': 0.009595559909939766, 'timestamp': '2025-10-01 04:19:08.002032', 'step': 2187, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:08.053866', 'step': 2187, 'epoch': 2} {'type': 'loss', 'content': 0.015223323367536068, 'timestamp': '2025-10-01 04:19:08.085471', 'step': 2188, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:08.124682', 'step': 2188, 'epoch': 2} {'type': 'loss', 'content': 0.013352875597774982, 'timestamp': '2025-10-01 04:19:08.132767', 'step': 2189, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:08.206869', 'step': 2189, 'epoch': 2} {'type': 'loss', 'content': 0.012816013768315315, 'timestamp': '2025-10-01 04:19:08.214885', 'step': 2190, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:08.267824', 'step': 2190, 'epoch': 2} {'type': 'loss', 'content': 0.014680095948278904, 'timestamp': '2025-10-01 04:19:08.278477', 'step': 2191, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:08.319142', 'step': 2191, 'epoch': 2} {'type': 'loss', 'content': 0.010051066055893898, 'timestamp': '2025-10-01 04:19:08.351712', 'step': 2192, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:08.391935', 'step': 2192, 'epoch': 2} {'type': 'loss', 'content': 0.00946446880698204, 'timestamp': '2025-10-01 04:19:08.405322', 'step': 2193, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:08.443824', 'step': 2193, 'epoch': 2} {'type': 'loss', 'content': 0.01290104165673256, 'timestamp': '2025-10-01 04:19:08.455560', 'step': 2194, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:08.493811', 'step': 2194, 'epoch': 2} {'type': 'loss', 'content': 0.00854999665170908, 'timestamp': '2025-10-01 04:19:08.505476', 'step': 2195, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:08.542910', 'step': 2195, 'epoch': 2} {'type': 'loss', 'content': 0.013319321908056736, 'timestamp': '2025-10-01 04:19:08.577392', 'step': 2196, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:08.611879', 'step': 2196, 'epoch': 2} {'type': 'loss', 'content': 0.01354017574340105, 'timestamp': '2025-10-01 04:19:08.624723', 'step': 2197, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:08.662848', 'step': 2197, 'epoch': 2} {'type': 'loss', 'content': 0.00793538149446249, 'timestamp': '2025-10-01 04:19:08.676877', 'step': 2198, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:08.717055', 'step': 2198, 'epoch': 2} {'type': 'loss', 'content': 0.017000969499349594, 'timestamp': '2025-10-01 04:19:08.731298', 'step': 2199, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:08.768251', 'step': 2199, 'epoch': 2} {'type': 'loss', 'content': 0.0132242850959301, 'timestamp': '2025-10-01 04:19:08.802726', 'step': 2200, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:08.837211', 'step': 2200, 'epoch': 2} {'type': 'loss', 'content': 0.005933412350714207, 'timestamp': '2025-10-01 04:19:08.847887', 'step': 2201, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:08.888028', 'step': 2201, 'epoch': 2} {'type': 'loss', 'content': 0.011517650447785854, 'timestamp': '2025-10-01 04:19:08.900643', 'step': 2202, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:08.954172', 'step': 2202, 'epoch': 2} {'type': 'loss', 'content': 0.013464934192597866, 'timestamp': '2025-10-01 04:19:08.963416', 'step': 2203, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:09.027066', 'step': 2203, 'epoch': 2} {'type': 'loss', 'content': 0.00912579894065857, 'timestamp': '2025-10-01 04:19:09.064060', 'step': 2204, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:09.122104', 'step': 2204, 'epoch': 2} {'type': 'loss', 'content': 0.01325265970081091, 'timestamp': '2025-10-01 04:19:09.135462', 'step': 2205, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:09.177368', 'step': 2205, 'epoch': 2} {'type': 'loss', 'content': 0.008780271746218204, 'timestamp': '2025-10-01 04:19:09.185767', 'step': 2206, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:09.232843', 'step': 2206, 'epoch': 2} {'type': 'loss', 'content': 0.010828737169504166, 'timestamp': '2025-10-01 04:19:09.248893', 'step': 2207, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:19:09.324083', 'step': 2207, 'epoch': 2} {'type': 'loss', 'content': 0.007853636518120766, 'timestamp': '2025-10-01 04:19:09.362480', 'step': 2208, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:09.423582', 'step': 2208, 'epoch': 2} {'type': 'loss', 'content': 0.01724216900765896, 'timestamp': '2025-10-01 04:19:09.434006', 'step': 2209, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:09.479880', 'step': 2209, 'epoch': 2} {'type': 'loss', 'content': 0.013094434514641762, 'timestamp': '2025-10-01 04:19:09.492576', 'step': 2210, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:09.547309', 'step': 2210, 'epoch': 2} {'type': 'loss', 'content': 0.009966771118342876, 'timestamp': '2025-10-01 04:19:09.555565', 'step': 2211, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:09.625134', 'step': 2211, 'epoch': 2} {'type': 'loss', 'content': 0.014813040383160114, 'timestamp': '2025-10-01 04:19:09.658636', 'step': 2212, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:09.735644', 'step': 2212, 'epoch': 2} {'type': 'loss', 'content': 0.015496751293540001, 'timestamp': '2025-10-01 04:19:09.746148', 'step': 2213, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:09.785918', 'step': 2213, 'epoch': 2} {'type': 'loss', 'content': 0.014114034362137318, 'timestamp': '2025-10-01 04:19:09.793955', 'step': 2214, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:09.837352', 'step': 2214, 'epoch': 2} {'type': 'loss', 'content': 0.008162938989698887, 'timestamp': '2025-10-01 04:19:09.845693', 'step': 2215, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:09.910356', 'step': 2215, 'epoch': 2} {'type': 'loss', 'content': 0.009955370798707008, 'timestamp': '2025-10-01 04:19:09.942263', 'step': 2216, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:09.992995', 'step': 2216, 'epoch': 2} {'type': 'loss', 'content': 0.010535430163145065, 'timestamp': '2025-10-01 04:19:10.003480', 'step': 2217, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:10.062242', 'step': 2217, 'epoch': 2} {'type': 'loss', 'content': 0.009204904548823833, 'timestamp': '2025-10-01 04:19:10.075026', 'step': 2218, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:10.133199', 'step': 2218, 'epoch': 2} {'type': 'loss', 'content': 0.010671501979231834, 'timestamp': '2025-10-01 04:19:10.147246', 'step': 2219, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:10.233062', 'step': 2219, 'epoch': 2} {'type': 'loss', 'content': 0.00754348561167717, 'timestamp': '2025-10-01 04:19:10.262431', 'step': 2220, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:10.309353', 'step': 2220, 'epoch': 2} {'type': 'loss', 'content': 0.011910923756659031, 'timestamp': '2025-10-01 04:19:10.316087', 'step': 2221, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:10.351418', 'step': 2221, 'epoch': 2} {'type': 'loss', 'content': 0.008378557860851288, 'timestamp': '2025-10-01 04:19:10.364169', 'step': 2222, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:10.404044', 'step': 2222, 'epoch': 2} {'type': 'loss', 'content': 0.01048757042735815, 'timestamp': '2025-10-01 04:19:10.412125', 'step': 2223, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:10.445432', 'step': 2223, 'epoch': 2} {'type': 'loss', 'content': 0.010335358791053295, 'timestamp': '2025-10-01 04:19:10.478838', 'step': 2224, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:10.517188', 'step': 2224, 'epoch': 2} {'type': 'loss', 'content': 0.006379018537700176, 'timestamp': '2025-10-01 04:19:10.529422', 'step': 2225, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:10.562966', 'step': 2225, 'epoch': 2} {'type': 'loss', 'content': 0.0131736621260643, 'timestamp': '2025-10-01 04:19:10.575508', 'step': 2226, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:10.617153', 'step': 2226, 'epoch': 2} {'type': 'loss', 'content': 0.009965633042156696, 'timestamp': '2025-10-01 04:19:10.629972', 'step': 2227, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:10.672935', 'step': 2227, 'epoch': 2} {'type': 'loss', 'content': 0.007053043227642775, 'timestamp': '2025-10-01 04:19:10.707384', 'step': 2228, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:10.751972', 'step': 2228, 'epoch': 2} {'type': 'loss', 'content': 0.0067025222815573215, 'timestamp': '2025-10-01 04:19:10.761228', 'step': 2229, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:10.805758', 'step': 2229, 'epoch': 2} {'type': 'loss', 'content': 0.012638087384402752, 'timestamp': '2025-10-01 04:19:10.813746', 'step': 2230, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:10.859031', 'step': 2230, 'epoch': 2} {'type': 'loss', 'content': 0.012631523422896862, 'timestamp': '2025-10-01 04:19:10.873094', 'step': 2231, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:10.917262', 'step': 2231, 'epoch': 2} {'type': 'loss', 'content': 0.006544933188706636, 'timestamp': '2025-10-01 04:19:10.945713', 'step': 2232, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:10.978839', 'step': 2232, 'epoch': 2} {'type': 'loss', 'content': 0.010851316154003143, 'timestamp': '2025-10-01 04:19:10.996048', 'step': 2233, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:11.037960', 'step': 2233, 'epoch': 2} {'type': 'loss', 'content': 0.005908276420086622, 'timestamp': '2025-10-01 04:19:11.050453', 'step': 2234, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:11.091833', 'step': 2234, 'epoch': 2} {'type': 'loss', 'content': 0.011478250846266747, 'timestamp': '2025-10-01 04:19:11.102788', 'step': 2235, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:11.138366', 'step': 2235, 'epoch': 2} {'type': 'loss', 'content': 0.005575316492468119, 'timestamp': '2025-10-01 04:19:11.172076', 'step': 2236, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:11.212896', 'step': 2236, 'epoch': 2} {'type': 'loss', 'content': 0.010587864555418491, 'timestamp': '2025-10-01 04:19:11.225748', 'step': 2237, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:11.258379', 'step': 2237, 'epoch': 2} {'type': 'loss', 'content': 0.01007275190204382, 'timestamp': '2025-10-01 04:19:11.271128', 'step': 2238, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:11.307679', 'step': 2238, 'epoch': 2} {'type': 'loss', 'content': 0.01369172427803278, 'timestamp': '2025-10-01 04:19:11.316026', 'step': 2239, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:11.355896', 'step': 2239, 'epoch': 2} {'type': 'loss', 'content': 0.013469669036567211, 'timestamp': '2025-10-01 04:19:11.390812', 'step': 2240, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:11.429566', 'step': 2240, 'epoch': 2} {'type': 'loss', 'content': 0.008363660424947739, 'timestamp': '2025-10-01 04:19:11.441675', 'step': 2241, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:11.493479', 'step': 2241, 'epoch': 2} {'type': 'loss', 'content': 0.007540098857134581, 'timestamp': '2025-10-01 04:19:11.508788', 'step': 2242, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:11.550829', 'step': 2242, 'epoch': 2} {'type': 'loss', 'content': 0.007505703717470169, 'timestamp': '2025-10-01 04:19:11.565011', 'step': 2243, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:11.597715', 'step': 2243, 'epoch': 2} {'type': 'loss', 'content': 0.01010338868945837, 'timestamp': '2025-10-01 04:19:11.631410', 'step': 2244, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:11.669725', 'step': 2244, 'epoch': 2} {'type': 'loss', 'content': 0.012346346862614155, 'timestamp': '2025-10-01 04:19:11.682521', 'step': 2245, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:11.714083', 'step': 2245, 'epoch': 2} {'type': 'loss', 'content': 0.013032950460910797, 'timestamp': '2025-10-01 04:19:11.725080', 'step': 2246, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:11.756595', 'step': 2246, 'epoch': 2} {'type': 'loss', 'content': 0.012330540455877781, 'timestamp': '2025-10-01 04:19:11.764301', 'step': 2247, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:11.795688', 'step': 2247, 'epoch': 2} {'type': 'loss', 'content': 0.00840420089662075, 'timestamp': '2025-10-01 04:19:11.824594', 'step': 2248, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:11.855562', 'step': 2248, 'epoch': 2} {'type': 'loss', 'content': 0.013033434748649597, 'timestamp': '2025-10-01 04:19:11.861480', 'step': 2249, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:11.901089', 'step': 2249, 'epoch': 2} {'type': 'loss', 'content': 0.01028637308627367, 'timestamp': '2025-10-01 04:19:11.909467', 'step': 2250, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:19:11.944155', 'step': 2250, 'epoch': 2} {'type': 'loss', 'content': 0.046644553542137146, 'timestamp': '2025-10-01 04:19:11.949017', 'step': 2251, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:11.981877', 'step': 2251, 'epoch': 2} {'type': 'loss', 'content': 0.01544640026986599, 'timestamp': '2025-10-01 04:19:12.009989', 'step': 2252, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:19:12.055351', 'step': 2252, 'epoch': 2} {'type': 'loss', 'content': 0.006525931879878044, 'timestamp': '2025-10-01 04:19:12.072865', 'step': 2253, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:12.110286', 'step': 2253, 'epoch': 2} {'type': 'loss', 'content': 0.007533328142017126, 'timestamp': '2025-10-01 04:19:12.124255', 'step': 2254, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:19:12.160445', 'step': 2254, 'epoch': 2} {'type': 'loss', 'content': 0.012910621240735054, 'timestamp': '2025-10-01 04:19:12.165351', 'step': 2255, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:12.201543', 'step': 2255, 'epoch': 2} {'type': 'loss', 'content': 0.009871935471892357, 'timestamp': '2025-10-01 04:19:12.233990', 'step': 2256, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:12.268589', 'step': 2256, 'epoch': 2} {'type': 'loss', 'content': 0.01182785164564848, 'timestamp': '2025-10-01 04:19:12.274304', 'step': 2257, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:12.306141', 'step': 2257, 'epoch': 2} {'type': 'loss', 'content': 0.01862550899386406, 'timestamp': '2025-10-01 04:19:12.314394', 'step': 2258, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-10-01 04:19:12.365024', 'step': 2258, 'epoch': 2} {'type': 'loss', 'content': 0.008045617491006851, 'timestamp': '2025-10-01 04:19:12.382717', 'step': 2259, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:12.421451', 'step': 2259, 'epoch': 2} {'type': 'loss', 'content': 0.011755536310374737, 'timestamp': '2025-10-01 04:19:12.450259', 'step': 2260, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:12.487822', 'step': 2260, 'epoch': 2} {'type': 'loss', 'content': 0.01154033187776804, 'timestamp': '2025-10-01 04:19:12.496185', 'step': 2261, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:12.534911', 'step': 2261, 'epoch': 2} {'type': 'loss', 'content': 0.012599697336554527, 'timestamp': '2025-10-01 04:19:12.543063', 'step': 2262, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:12.575868', 'step': 2262, 'epoch': 2} {'type': 'loss', 'content': 0.008603425696492195, 'timestamp': '2025-10-01 04:19:12.584157', 'step': 2263, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:12.623860', 'step': 2263, 'epoch': 2} {'type': 'loss', 'content': 0.013321705162525177, 'timestamp': '2025-10-01 04:19:12.658753', 'step': 2264, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:12.689667', 'step': 2264, 'epoch': 2} {'type': 'loss', 'content': 0.011240750551223755, 'timestamp': '2025-10-01 04:19:12.695195', 'step': 2265, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:12.733655', 'step': 2265, 'epoch': 2} {'type': 'loss', 'content': 0.008608750998973846, 'timestamp': '2025-10-01 04:19:12.747685', 'step': 2266, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:12.782165', 'step': 2266, 'epoch': 2} {'type': 'loss', 'content': 0.006943197455257177, 'timestamp': '2025-10-01 04:19:12.795714', 'step': 2267, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:12.839523', 'step': 2267, 'epoch': 2} {'type': 'loss', 'content': 0.011456957086920738, 'timestamp': '2025-10-01 04:19:12.874491', 'step': 2268, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:19:12.920734', 'step': 2268, 'epoch': 2} {'type': 'loss', 'content': 0.0054956721141934395, 'timestamp': '2025-10-01 04:19:12.937538', 'step': 2269, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:12.975062', 'step': 2269, 'epoch': 2} {'type': 'loss', 'content': 0.011643268167972565, 'timestamp': '2025-10-01 04:19:12.985977', 'step': 2270, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:13.024733', 'step': 2270, 'epoch': 2} {'type': 'loss', 'content': 0.034327466040849686, 'timestamp': '2025-10-01 04:19:13.032946', 'step': 2271, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:13.066272', 'step': 2271, 'epoch': 2} {'type': 'loss', 'content': 0.014183835126459599, 'timestamp': '2025-10-01 04:19:13.099120', 'step': 2272, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:13.132170', 'step': 2272, 'epoch': 2} {'type': 'loss', 'content': 0.013837282545864582, 'timestamp': '2025-10-01 04:19:13.137954', 'step': 2273, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:13.182180', 'step': 2273, 'epoch': 2} {'type': 'loss', 'content': 0.005985114723443985, 'timestamp': '2025-10-01 04:19:13.198200', 'step': 2274, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:13.239118', 'step': 2274, 'epoch': 2} {'type': 'loss', 'content': 0.008214767090976238, 'timestamp': '2025-10-01 04:19:13.253318', 'step': 2275, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:13.293449', 'step': 2275, 'epoch': 2} {'type': 'loss', 'content': 0.0113050676882267, 'timestamp': '2025-10-01 04:19:13.328396', 'step': 2276, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:13.364120', 'step': 2276, 'epoch': 2} {'type': 'loss', 'content': 0.0067376974038779736, 'timestamp': '2025-10-01 04:19:13.373391', 'step': 2277, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:19:13.404399', 'step': 2277, 'epoch': 2} {'type': 'loss', 'content': 0.011021988466382027, 'timestamp': '2025-10-01 04:19:13.411096', 'step': 2278, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:13.446092', 'step': 2278, 'epoch': 2} {'type': 'loss', 'content': 0.014089785516262054, 'timestamp': '2025-10-01 04:19:13.458527', 'step': 2279, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:13.489815', 'step': 2279, 'epoch': 2} {'type': 'loss', 'content': 0.010907324030995369, 'timestamp': '2025-10-01 04:19:13.522351', 'step': 2280, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:13.557117', 'step': 2280, 'epoch': 2} {'type': 'loss', 'content': 0.0150785893201828, 'timestamp': '2025-10-01 04:19:13.569985', 'step': 2281, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:13.603887', 'step': 2281, 'epoch': 2} {'type': 'loss', 'content': 0.011201570741832256, 'timestamp': '2025-10-01 04:19:13.611754', 'step': 2282, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:13.644988', 'step': 2282, 'epoch': 2} {'type': 'loss', 'content': 0.012201984412968159, 'timestamp': '2025-10-01 04:19:13.656504', 'step': 2283, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:13.690941', 'step': 2283, 'epoch': 2} {'type': 'loss', 'content': 0.007954757660627365, 'timestamp': '2025-10-01 04:19:13.723410', 'step': 2284, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:13.758704', 'step': 2284, 'epoch': 2} {'type': 'loss', 'content': 0.01756960153579712, 'timestamp': '2025-10-01 04:19:13.769880', 'step': 2285, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:13.801791', 'step': 2285, 'epoch': 2} {'type': 'loss', 'content': 0.02172759175300598, 'timestamp': '2025-10-01 04:19:13.810401', 'step': 2286, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:13.850068', 'step': 2286, 'epoch': 2} {'type': 'loss', 'content': 0.011571618728339672, 'timestamp': '2025-10-01 04:19:13.862761', 'step': 2287, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:13.909543', 'step': 2287, 'epoch': 2} {'type': 'loss', 'content': 0.009310690686106682, 'timestamp': '2025-10-01 04:19:13.944479', 'step': 2288, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:13.979790', 'step': 2288, 'epoch': 2} {'type': 'loss', 'content': 0.006786558777093887, 'timestamp': '2025-10-01 04:19:13.993182', 'step': 2289, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:14.039923', 'step': 2289, 'epoch': 2} {'type': 'loss', 'content': 0.011490142904222012, 'timestamp': '2025-10-01 04:19:14.053996', 'step': 2290, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:19:14.110879', 'step': 2290, 'epoch': 2} {'type': 'loss', 'content': 0.01951078325510025, 'timestamp': '2025-10-01 04:19:14.127411', 'step': 2291, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:14.159944', 'step': 2291, 'epoch': 2} {'type': 'loss', 'content': 0.010025802999734879, 'timestamp': '2025-10-01 04:19:14.195189', 'step': 2292, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:14.239369', 'step': 2292, 'epoch': 2} {'type': 'loss', 'content': 0.010769554413855076, 'timestamp': '2025-10-01 04:19:14.252188', 'step': 2293, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:14.293650', 'step': 2293, 'epoch': 2} {'type': 'loss', 'content': 0.008517670445144176, 'timestamp': '2025-10-01 04:19:14.309803', 'step': 2294, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:14.354099', 'step': 2294, 'epoch': 2} {'type': 'loss', 'content': 0.016392048448324203, 'timestamp': '2025-10-01 04:19:14.364998', 'step': 2295, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:14.409964', 'step': 2295, 'epoch': 2} {'type': 'loss', 'content': 0.006362204905599356, 'timestamp': '2025-10-01 04:19:14.443366', 'step': 2296, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:14.484649', 'step': 2296, 'epoch': 2} {'type': 'loss', 'content': 0.011040894314646721, 'timestamp': '2025-10-01 04:19:14.493024', 'step': 2297, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:14.529414', 'step': 2297, 'epoch': 2} {'type': 'loss', 'content': 0.00923159345984459, 'timestamp': '2025-10-01 04:19:14.544842', 'step': 2298, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:14.592888', 'step': 2298, 'epoch': 2} {'type': 'loss', 'content': 0.007049804553389549, 'timestamp': '2025-10-01 04:19:14.609176', 'step': 2299, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:14.642497', 'step': 2299, 'epoch': 2} {'type': 'loss', 'content': 0.010695367120206356, 'timestamp': '2025-10-01 04:19:14.677388', 'step': 2300, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:19:17.192897', 'step': 2300, 'epoch': 2} {'type': 'pplx', 'content': 5.645433573690867, 'timestamp': '2025-10-01 04:19:17.198531', 'step': 2300, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:17.234975', 'step': 2300, 'epoch': 2} {'type': 'loss', 'content': 0.011024121195077896, 'timestamp': '2025-10-01 04:19:17.244182', 'step': 2301, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:17.295138', 'step': 2301, 'epoch': 2} {'type': 'loss', 'content': 0.013958295807242393, 'timestamp': '2025-10-01 04:19:17.307897', 'step': 2302, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:17.350579', 'step': 2302, 'epoch': 2} {'type': 'loss', 'content': 0.0125004593282938, 'timestamp': '2025-10-01 04:19:17.358754', 'step': 2303, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:19:17.405894', 'step': 2303, 'epoch': 2} {'type': 'loss', 'content': 0.010168910957872868, 'timestamp': '2025-10-01 04:19:17.442604', 'step': 2304, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:17.491562', 'step': 2304, 'epoch': 2} {'type': 'loss', 'content': 0.006456979084759951, 'timestamp': '2025-10-01 04:19:17.507144', 'step': 2305, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:17.553178', 'step': 2305, 'epoch': 2} {'type': 'loss', 'content': 0.009935032576322556, 'timestamp': '2025-10-01 04:19:17.567194', 'step': 2306, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:17.605385', 'step': 2306, 'epoch': 2} {'type': 'loss', 'content': 0.012955346144735813, 'timestamp': '2025-10-01 04:19:17.618149', 'step': 2307, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:17.658974', 'step': 2307, 'epoch': 2} {'type': 'loss', 'content': 0.008229161612689495, 'timestamp': '2025-10-01 04:19:17.694133', 'step': 2308, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:17.740752', 'step': 2308, 'epoch': 2} {'type': 'loss', 'content': 0.010822479613125324, 'timestamp': '2025-10-01 04:19:17.745482', 'step': 2309, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:17.796877', 'step': 2309, 'epoch': 2} {'type': 'loss', 'content': 0.006300068460404873, 'timestamp': '2025-10-01 04:19:17.810880', 'step': 2310, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:17.855721', 'step': 2310, 'epoch': 2} {'type': 'loss', 'content': 0.018858889117836952, 'timestamp': '2025-10-01 04:19:17.863992', 'step': 2311, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:17.907406', 'step': 2311, 'epoch': 2} {'type': 'loss', 'content': 0.01361115649342537, 'timestamp': '2025-10-01 04:19:17.936165', 'step': 2312, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:19:17.991943', 'step': 2312, 'epoch': 2} {'type': 'loss', 'content': 0.00546282110735774, 'timestamp': '2025-10-01 04:19:18.009118', 'step': 2313, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:18.051013', 'step': 2313, 'epoch': 2} {'type': 'loss', 'content': 0.022815171629190445, 'timestamp': '2025-10-01 04:19:18.059257', 'step': 2314, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:18.098863', 'step': 2314, 'epoch': 2} {'type': 'loss', 'content': 0.012264501303434372, 'timestamp': '2025-10-01 04:19:18.106715', 'step': 2315, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:18.151945', 'step': 2315, 'epoch': 2} {'type': 'loss', 'content': 0.011121842078864574, 'timestamp': '2025-10-01 04:19:18.183517', 'step': 2316, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:18.228836', 'step': 2316, 'epoch': 2} {'type': 'loss', 'content': 0.010460934601724148, 'timestamp': '2025-10-01 04:19:18.240029', 'step': 2317, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:19:18.296945', 'step': 2317, 'epoch': 2} {'type': 'loss', 'content': 0.009002177976071835, 'timestamp': '2025-10-01 04:19:18.313323', 'step': 2318, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:18.365625', 'step': 2318, 'epoch': 2} {'type': 'loss', 'content': 0.011473285034298897, 'timestamp': '2025-10-01 04:19:18.378153', 'step': 2319, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:18.435097', 'step': 2319, 'epoch': 2} {'type': 'loss', 'content': 0.01238553412258625, 'timestamp': '2025-10-01 04:19:18.470536', 'step': 2320, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:18.513677', 'step': 2320, 'epoch': 2} {'type': 'loss', 'content': 0.012078301049768925, 'timestamp': '2025-10-01 04:19:18.526281', 'step': 2321, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:18.571682', 'step': 2321, 'epoch': 2} {'type': 'loss', 'content': 0.011172293685376644, 'timestamp': '2025-10-01 04:19:18.588016', 'step': 2322, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:18.637526', 'step': 2322, 'epoch': 2} {'type': 'loss', 'content': 0.009172617457807064, 'timestamp': '2025-10-01 04:19:18.645620', 'step': 2323, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:18.685923', 'step': 2323, 'epoch': 2} {'type': 'loss', 'content': 0.009440874680876732, 'timestamp': '2025-10-01 04:19:18.719419', 'step': 2324, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:18.764614', 'step': 2324, 'epoch': 2} {'type': 'loss', 'content': 0.0068310583010315895, 'timestamp': '2025-10-01 04:19:18.777407', 'step': 2325, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:18.822920', 'step': 2325, 'epoch': 2} {'type': 'loss', 'content': 0.01139372494071722, 'timestamp': '2025-10-01 04:19:18.835673', 'step': 2326, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:18.883813', 'step': 2326, 'epoch': 2} {'type': 'loss', 'content': 0.00898691639304161, 'timestamp': '2025-10-01 04:19:18.897819', 'step': 2327, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:18.942462', 'step': 2327, 'epoch': 2} {'type': 'loss', 'content': 0.00855393148958683, 'timestamp': '2025-10-01 04:19:18.974867', 'step': 2328, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:19.016245', 'step': 2328, 'epoch': 2} {'type': 'loss', 'content': 0.005201603751629591, 'timestamp': '2025-10-01 04:19:19.025576', 'step': 2329, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:19.067499', 'step': 2329, 'epoch': 2} {'type': 'loss', 'content': 0.014490672387182713, 'timestamp': '2025-10-01 04:19:19.079514', 'step': 2330, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:19.125783', 'step': 2330, 'epoch': 2} {'type': 'loss', 'content': 0.01185675896704197, 'timestamp': '2025-10-01 04:19:19.136683', 'step': 2331, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:19:19.194294', 'step': 2331, 'epoch': 2} {'type': 'loss', 'content': 0.007376262918114662, 'timestamp': '2025-10-01 04:19:19.231053', 'step': 2332, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:19.283846', 'step': 2332, 'epoch': 2} {'type': 'loss', 'content': 0.014044169336557388, 'timestamp': '2025-10-01 04:19:19.296728', 'step': 2333, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:19.343290', 'step': 2333, 'epoch': 2} {'type': 'loss', 'content': 0.006928737740963697, 'timestamp': '2025-10-01 04:19:19.357271', 'step': 2334, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:19.401562', 'step': 2334, 'epoch': 2} {'type': 'loss', 'content': 0.010382061824202538, 'timestamp': '2025-10-01 04:19:19.412134', 'step': 2335, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:19.450427', 'step': 2335, 'epoch': 2} {'type': 'loss', 'content': 0.009514085948467255, 'timestamp': '2025-10-01 04:19:19.482131', 'step': 2336, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:19.523530', 'step': 2336, 'epoch': 2} {'type': 'loss', 'content': 0.009212557226419449, 'timestamp': '2025-10-01 04:19:19.534623', 'step': 2337, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:19.590170', 'step': 2337, 'epoch': 2} {'type': 'loss', 'content': 0.006827318575233221, 'timestamp': '2025-10-01 04:19:19.604198', 'step': 2338, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:19.658164', 'step': 2338, 'epoch': 2} {'type': 'loss', 'content': 0.0062919072806835175, 'timestamp': '2025-10-01 04:19:19.672302', 'step': 2339, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:19.720196', 'step': 2339, 'epoch': 2} {'type': 'loss', 'content': 0.004459880758076906, 'timestamp': '2025-10-01 04:19:19.755136', 'step': 2340, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:19.801074', 'step': 2340, 'epoch': 2} {'type': 'loss', 'content': 0.00800326094031334, 'timestamp': '2025-10-01 04:19:19.814473', 'step': 2341, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:19.860904', 'step': 2341, 'epoch': 2} {'type': 'loss', 'content': 0.009779841639101505, 'timestamp': '2025-10-01 04:19:19.873430', 'step': 2342, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:19.919020', 'step': 2342, 'epoch': 2} {'type': 'loss', 'content': 0.014758595265448093, 'timestamp': '2025-10-01 04:19:19.927303', 'step': 2343, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:19.971503', 'step': 2343, 'epoch': 2} {'type': 'loss', 'content': 0.013375641778111458, 'timestamp': '2025-10-01 04:19:20.005057', 'step': 2344, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:20.048333', 'step': 2344, 'epoch': 2} {'type': 'loss', 'content': 0.010102779604494572, 'timestamp': '2025-10-01 04:19:20.057083', 'step': 2345, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:20.102084', 'step': 2345, 'epoch': 2} {'type': 'loss', 'content': 0.014045996591448784, 'timestamp': '2025-10-01 04:19:20.110402', 'step': 2346, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:19:20.162047', 'step': 2346, 'epoch': 2} {'type': 'loss', 'content': 0.008576063439249992, 'timestamp': '2025-10-01 04:19:20.177884', 'step': 2347, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:20.224551', 'step': 2347, 'epoch': 2} {'type': 'loss', 'content': 0.011928622610867023, 'timestamp': '2025-10-01 04:19:20.253842', 'step': 2348, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:20.300630', 'step': 2348, 'epoch': 2} {'type': 'loss', 'content': 0.009862824343144894, 'timestamp': '2025-10-01 04:19:20.313514', 'step': 2349, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:20.361542', 'step': 2349, 'epoch': 2} {'type': 'loss', 'content': 0.009112492203712463, 'timestamp': '2025-10-01 04:19:20.373143', 'step': 2350, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:20.424811', 'step': 2350, 'epoch': 2} {'type': 'loss', 'content': 0.005028019659221172, 'timestamp': '2025-10-01 04:19:20.438863', 'step': 2351, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:20.484403', 'step': 2351, 'epoch': 2} {'type': 'loss', 'content': 0.00407068058848381, 'timestamp': '2025-10-01 04:19:20.518956', 'step': 2352, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:20.577632', 'step': 2352, 'epoch': 2} {'type': 'loss', 'content': 0.00809461809694767, 'timestamp': '2025-10-01 04:19:20.588527', 'step': 2353, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:20.634982', 'step': 2353, 'epoch': 2} {'type': 'loss', 'content': 0.006641853600740433, 'timestamp': '2025-10-01 04:19:20.647518', 'step': 2354, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:20.704885', 'step': 2354, 'epoch': 2} {'type': 'loss', 'content': 0.00778185436502099, 'timestamp': '2025-10-01 04:19:20.720896', 'step': 2355, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:20.771964', 'step': 2355, 'epoch': 2} {'type': 'loss', 'content': 0.008167404681444168, 'timestamp': '2025-10-01 04:19:20.807121', 'step': 2356, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:20.852900', 'step': 2356, 'epoch': 2} {'type': 'loss', 'content': 0.007187299896031618, 'timestamp': '2025-10-01 04:19:20.861112', 'step': 2357, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:20.921803', 'step': 2357, 'epoch': 2} {'type': 'loss', 'content': 0.00637078657746315, 'timestamp': '2025-10-01 04:19:20.935814', 'step': 2358, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:20.981381', 'step': 2358, 'epoch': 2} {'type': 'loss', 'content': 0.007564273662865162, 'timestamp': '2025-10-01 04:19:20.993886', 'step': 2359, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:21.047285', 'step': 2359, 'epoch': 2} {'type': 'loss', 'content': 0.008830086328089237, 'timestamp': '2025-10-01 04:19:21.082674', 'step': 2360, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:21.133127', 'step': 2360, 'epoch': 2} {'type': 'loss', 'content': 0.013163380324840546, 'timestamp': '2025-10-01 04:19:21.138427', 'step': 2361, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:21.185596', 'step': 2361, 'epoch': 2} {'type': 'loss', 'content': 0.012125848792493343, 'timestamp': '2025-10-01 04:19:21.196045', 'step': 2362, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:21.250520', 'step': 2362, 'epoch': 2} {'type': 'loss', 'content': 0.006338410545140505, 'timestamp': '2025-10-01 04:19:21.262001', 'step': 2363, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:21.313549', 'step': 2363, 'epoch': 2} {'type': 'loss', 'content': 0.006842540577054024, 'timestamp': '2025-10-01 04:19:21.349031', 'step': 2364, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:21.394347', 'step': 2364, 'epoch': 2} {'type': 'loss', 'content': 0.012909031473100185, 'timestamp': '2025-10-01 04:19:21.400278', 'step': 2365, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:21.452702', 'step': 2365, 'epoch': 2} {'type': 'loss', 'content': 0.01249758806079626, 'timestamp': '2025-10-01 04:19:21.460220', 'step': 2366, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:21.517274', 'step': 2366, 'epoch': 2} {'type': 'loss', 'content': 0.008362263441085815, 'timestamp': '2025-10-01 04:19:21.531254', 'step': 2367, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:21.586957', 'step': 2367, 'epoch': 2} {'type': 'loss', 'content': 0.005183870438486338, 'timestamp': '2025-10-01 04:19:21.616139', 'step': 2368, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:21.665922', 'step': 2368, 'epoch': 2} {'type': 'loss', 'content': 0.006676408462226391, 'timestamp': '2025-10-01 04:19:21.679355', 'step': 2369, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:21.722214', 'step': 2369, 'epoch': 2} {'type': 'loss', 'content': 0.008973420597612858, 'timestamp': '2025-10-01 04:19:21.735126', 'step': 2370, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:21.781156', 'step': 2370, 'epoch': 2} {'type': 'loss', 'content': 0.010668257251381874, 'timestamp': '2025-10-01 04:19:21.789288', 'step': 2371, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:21.830000', 'step': 2371, 'epoch': 2} {'type': 'loss', 'content': 0.01373037975281477, 'timestamp': '2025-10-01 04:19:21.863438', 'step': 2372, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:21.908154', 'step': 2372, 'epoch': 2} {'type': 'loss', 'content': 0.015149123966693878, 'timestamp': '2025-10-01 04:19:21.921785', 'step': 2373, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:21.963279', 'step': 2373, 'epoch': 2} {'type': 'loss', 'content': 0.004664137493818998, 'timestamp': '2025-10-01 04:19:21.976154', 'step': 2374, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:22.029390', 'step': 2374, 'epoch': 2} {'type': 'loss', 'content': 0.0040018861182034016, 'timestamp': '2025-10-01 04:19:22.043362', 'step': 2375, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:22.108202', 'step': 2375, 'epoch': 2} {'type': 'loss', 'content': 0.016274072229862213, 'timestamp': '2025-10-01 04:19:22.142776', 'step': 2376, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:22.190768', 'step': 2376, 'epoch': 2} {'type': 'loss', 'content': 0.016006361693143845, 'timestamp': '2025-10-01 04:19:22.195726', 'step': 2377, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:22.231064', 'step': 2377, 'epoch': 2} {'type': 'loss', 'content': 0.011966003105044365, 'timestamp': '2025-10-01 04:19:22.239209', 'step': 2378, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:22.289196', 'step': 2378, 'epoch': 2} {'type': 'loss', 'content': 0.007365915924310684, 'timestamp': '2025-10-01 04:19:22.302707', 'step': 2379, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:22.365358', 'step': 2379, 'epoch': 2} {'type': 'loss', 'content': 0.007993210107088089, 'timestamp': '2025-10-01 04:19:22.398134', 'step': 2380, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:22.439230', 'step': 2380, 'epoch': 2} {'type': 'loss', 'content': 0.012932179495692253, 'timestamp': '2025-10-01 04:19:22.449054', 'step': 2381, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:22.500905', 'step': 2381, 'epoch': 2} {'type': 'loss', 'content': 0.008378755301237106, 'timestamp': '2025-10-01 04:19:22.515954', 'step': 2382, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:22.564567', 'step': 2382, 'epoch': 2} {'type': 'loss', 'content': 0.007461161352694035, 'timestamp': '2025-10-01 04:19:22.578088', 'step': 2383, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:22.624011', 'step': 2383, 'epoch': 2} {'type': 'loss', 'content': 0.01013202965259552, 'timestamp': '2025-10-01 04:19:22.656347', 'step': 2384, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:22.715530', 'step': 2384, 'epoch': 2} {'type': 'loss', 'content': 0.008043903857469559, 'timestamp': '2025-10-01 04:19:22.728793', 'step': 2385, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:22.772902', 'step': 2385, 'epoch': 2} {'type': 'loss', 'content': 0.010093423537909985, 'timestamp': '2025-10-01 04:19:22.786818', 'step': 2386, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:22.838627', 'step': 2386, 'epoch': 2} {'type': 'loss', 'content': 0.013087011873722076, 'timestamp': '2025-10-01 04:19:22.852228', 'step': 2387, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:22.893335', 'step': 2387, 'epoch': 2} {'type': 'loss', 'content': 0.010724530555307865, 'timestamp': '2025-10-01 04:19:22.928451', 'step': 2388, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:22.967319', 'step': 2388, 'epoch': 2} {'type': 'loss', 'content': 0.005468372721225023, 'timestamp': '2025-10-01 04:19:22.978105', 'step': 2389, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:23.021986', 'step': 2389, 'epoch': 2} {'type': 'loss', 'content': 0.010745416395366192, 'timestamp': '2025-10-01 04:19:23.033499', 'step': 2390, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:23.086212', 'step': 2390, 'epoch': 2} {'type': 'loss', 'content': 0.0075510405004024506, 'timestamp': '2025-10-01 04:19:23.098945', 'step': 2391, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:23.135858', 'step': 2391, 'epoch': 2} {'type': 'loss', 'content': 0.006814200431108475, 'timestamp': '2025-10-01 04:19:23.169498', 'step': 2392, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:23.207859', 'step': 2392, 'epoch': 2} {'type': 'loss', 'content': 0.011218193918466568, 'timestamp': '2025-10-01 04:19:23.215903', 'step': 2393, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:23.265060', 'step': 2393, 'epoch': 2} {'type': 'loss', 'content': 0.013960839249193668, 'timestamp': '2025-10-01 04:19:23.272717', 'step': 2394, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:23.320912', 'step': 2394, 'epoch': 2} {'type': 'loss', 'content': 0.010626561008393764, 'timestamp': '2025-10-01 04:19:23.328226', 'step': 2395, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:23.379950', 'step': 2395, 'epoch': 2} {'type': 'loss', 'content': 0.009455938823521137, 'timestamp': '2025-10-01 04:19:23.413115', 'step': 2396, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:23.452223', 'step': 2396, 'epoch': 2} {'type': 'loss', 'content': 0.0077408356592059135, 'timestamp': '2025-10-01 04:19:23.459204', 'step': 2397, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:23.506363', 'step': 2397, 'epoch': 2} {'type': 'loss', 'content': 0.01656491681933403, 'timestamp': '2025-10-01 04:19:23.516759', 'step': 2398, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:19:23.571369', 'step': 2398, 'epoch': 2} {'type': 'loss', 'content': 0.009400352835655212, 'timestamp': '2025-10-01 04:19:23.587178', 'step': 2399, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:23.632581', 'step': 2399, 'epoch': 2} {'type': 'loss', 'content': 0.009133011102676392, 'timestamp': '2025-10-01 04:19:23.667497', 'step': 2400, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:23.716008', 'step': 2400, 'epoch': 2} {'type': 'loss', 'content': 0.006245605647563934, 'timestamp': '2025-10-01 04:19:23.723284', 'step': 2401, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:19:23.770872', 'step': 2401, 'epoch': 2} {'type': 'loss', 'content': 0.015264391899108887, 'timestamp': '2025-10-01 04:19:23.782677', 'step': 2402, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:23.845820', 'step': 2402, 'epoch': 2} {'type': 'loss', 'content': 0.011726902797818184, 'timestamp': '2025-10-01 04:19:23.849757', 'step': 2403, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:23.905893', 'step': 2403, 'epoch': 2} {'type': 'loss', 'content': 0.007087836042046547, 'timestamp': '2025-10-01 04:19:23.941066', 'step': 2404, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:23.986568', 'step': 2404, 'epoch': 2} {'type': 'loss', 'content': 0.010818040929734707, 'timestamp': '2025-10-01 04:19:23.992169', 'step': 2405, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:24.030590', 'step': 2405, 'epoch': 2} {'type': 'loss', 'content': 0.006523734424263239, 'timestamp': '2025-10-01 04:19:24.043164', 'step': 2406, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:24.085516', 'step': 2406, 'epoch': 2} {'type': 'loss', 'content': 0.007816510275006294, 'timestamp': '2025-10-01 04:19:24.100420', 'step': 2407, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:24.155998', 'step': 2407, 'epoch': 2} {'type': 'loss', 'content': 0.007565835025161505, 'timestamp': '2025-10-01 04:19:24.185262', 'step': 2408, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:24.231724', 'step': 2408, 'epoch': 2} {'type': 'loss', 'content': 0.012403231114149094, 'timestamp': '2025-10-01 04:19:24.238772', 'step': 2409, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:24.283610', 'step': 2409, 'epoch': 2} {'type': 'loss', 'content': 0.007091097068041563, 'timestamp': '2025-10-01 04:19:24.296414', 'step': 2410, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:24.345675', 'step': 2410, 'epoch': 2} {'type': 'loss', 'content': 0.007058456540107727, 'timestamp': '2025-10-01 04:19:24.360079', 'step': 2411, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:19:24.407241', 'step': 2411, 'epoch': 2} {'type': 'loss', 'content': 0.006771209184080362, 'timestamp': '2025-10-01 04:19:24.434291', 'step': 2412, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:24.478019', 'step': 2412, 'epoch': 2} {'type': 'loss', 'content': 0.007517782039940357, 'timestamp': '2025-10-01 04:19:24.485968', 'step': 2413, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:24.539070', 'step': 2413, 'epoch': 2} {'type': 'loss', 'content': 0.011539969593286514, 'timestamp': '2025-10-01 04:19:24.546470', 'step': 2414, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:24.588553', 'step': 2414, 'epoch': 2} {'type': 'loss', 'content': 0.00848992820829153, 'timestamp': '2025-10-01 04:19:24.602546', 'step': 2415, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:19:27.412678', 'step': 2415, 'epoch': 2} {'type': 'pplx', 'content': 5.75591141053313, 'timestamp': '2025-10-01 04:19:27.419731', 'step': 2415, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:27.460563', 'step': 2415, 'epoch': 2} {'type': 'loss', 'content': 0.006172639783471823, 'timestamp': '2025-10-01 04:19:27.495492', 'step': 2416, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:27.543467', 'step': 2416, 'epoch': 2} {'type': 'loss', 'content': 0.005107243545353413, 'timestamp': '2025-10-01 04:19:27.556320', 'step': 2417, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:27.603606', 'step': 2417, 'epoch': 2} {'type': 'loss', 'content': 0.011720502749085426, 'timestamp': '2025-10-01 04:19:27.614751', 'step': 2418, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:27.659373', 'step': 2418, 'epoch': 2} {'type': 'loss', 'content': 0.00529983127489686, 'timestamp': '2025-10-01 04:19:27.667988', 'step': 2419, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:27.709689', 'step': 2419, 'epoch': 2} {'type': 'loss', 'content': 0.007049353793263435, 'timestamp': '2025-10-01 04:19:27.741995', 'step': 2420, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:19:27.785275', 'step': 2420, 'epoch': 2} {'type': 'loss', 'content': 0.0081711420789361, 'timestamp': '2025-10-01 04:19:27.801278', 'step': 2421, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:27.843532', 'step': 2421, 'epoch': 2} {'type': 'loss', 'content': 0.01576230116188526, 'timestamp': '2025-10-01 04:19:27.855099', 'step': 2422, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:27.895557', 'step': 2422, 'epoch': 2} {'type': 'loss', 'content': 0.009834472090005875, 'timestamp': '2025-10-01 04:19:27.908043', 'step': 2423, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:27.951043', 'step': 2423, 'epoch': 2} {'type': 'loss', 'content': 0.00867343507707119, 'timestamp': '2025-10-01 04:19:27.980287', 'step': 2424, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 17085996872448}, 'timestamp': '2025-10-01 04:19:28.038009', 'step': 2424, 'epoch': 2} {'type': 'loss', 'content': 0.00831505749374628, 'timestamp': '2025-10-01 04:19:28.057432', 'step': 2425, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:28.104964', 'step': 2425, 'epoch': 2} {'type': 'loss', 'content': 0.005719278007745743, 'timestamp': '2025-10-01 04:19:28.119155', 'step': 2426, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:28.158114', 'step': 2426, 'epoch': 2} {'type': 'loss', 'content': 0.013783016242086887, 'timestamp': '2025-10-01 04:19:28.168899', 'step': 2427, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:28.208582', 'step': 2427, 'epoch': 2} {'type': 'loss', 'content': 0.013215205632150173, 'timestamp': '2025-10-01 04:19:28.240627', 'step': 2428, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:28.285742', 'step': 2428, 'epoch': 2} {'type': 'loss', 'content': 0.011243275366723537, 'timestamp': '2025-10-01 04:19:28.296764', 'step': 2429, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:28.348452', 'step': 2429, 'epoch': 2} {'type': 'loss', 'content': 0.0034785715397447348, 'timestamp': '2025-10-01 04:19:28.362523', 'step': 2430, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:28.418291', 'step': 2430, 'epoch': 2} {'type': 'loss', 'content': 0.006271759048104286, 'timestamp': '2025-10-01 04:19:28.432261', 'step': 2431, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:28.482854', 'step': 2431, 'epoch': 2} {'type': 'loss', 'content': 0.014758102595806122, 'timestamp': '2025-10-01 04:19:28.511244', 'step': 2432, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:28.557592', 'step': 2432, 'epoch': 2} {'type': 'loss', 'content': 0.008314085192978382, 'timestamp': '2025-10-01 04:19:28.565925', 'step': 2433, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:28.611095', 'step': 2433, 'epoch': 2} {'type': 'loss', 'content': 0.011514493264257908, 'timestamp': '2025-10-01 04:19:28.622675', 'step': 2434, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:28.666407', 'step': 2434, 'epoch': 2} {'type': 'loss', 'content': 0.011734064668416977, 'timestamp': '2025-10-01 04:19:28.677785', 'step': 2435, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:28.723797', 'step': 2435, 'epoch': 2} {'type': 'loss', 'content': 0.007003100588917732, 'timestamp': '2025-10-01 04:19:28.755395', 'step': 2436, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:19:28.812796', 'step': 2436, 'epoch': 2} {'type': 'loss', 'content': 0.005431232508271933, 'timestamp': '2025-10-01 04:19:28.829606', 'step': 2437, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:28.880220', 'step': 2437, 'epoch': 2} {'type': 'loss', 'content': 0.007207023911178112, 'timestamp': '2025-10-01 04:19:28.887791', 'step': 2438, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:28.932528', 'step': 2438, 'epoch': 2} {'type': 'loss', 'content': 0.011103983037173748, 'timestamp': '2025-10-01 04:19:28.943227', 'step': 2439, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:28.994315', 'step': 2439, 'epoch': 2} {'type': 'loss', 'content': 0.015481662005186081, 'timestamp': '2025-10-01 04:19:29.023530', 'step': 2440, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:29.077241', 'step': 2440, 'epoch': 2} {'type': 'loss', 'content': 0.0035067072603851557, 'timestamp': '2025-10-01 04:19:29.083177', 'step': 2441, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:29.124239', 'step': 2441, 'epoch': 2} {'type': 'loss', 'content': 0.011361384764313698, 'timestamp': '2025-10-01 04:19:29.135863', 'step': 2442, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:29.189106', 'step': 2442, 'epoch': 2} {'type': 'loss', 'content': 0.012891820631921291, 'timestamp': '2025-10-01 04:19:29.202666', 'step': 2443, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:29.252545', 'step': 2443, 'epoch': 2} {'type': 'loss', 'content': 0.010958991013467312, 'timestamp': '2025-10-01 04:19:29.285029', 'step': 2444, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:29.331203', 'step': 2444, 'epoch': 2} {'type': 'loss', 'content': 0.01773439161479473, 'timestamp': '2025-10-01 04:19:29.345117', 'step': 2445, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:29.381785', 'step': 2445, 'epoch': 2} {'type': 'loss', 'content': 0.013530808500945568, 'timestamp': '2025-10-01 04:19:29.393991', 'step': 2446, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:29.457801', 'step': 2446, 'epoch': 2} {'type': 'loss', 'content': 0.009291102178394794, 'timestamp': '2025-10-01 04:19:29.465568', 'step': 2447, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:29.516534', 'step': 2447, 'epoch': 2} {'type': 'loss', 'content': 0.00866714771836996, 'timestamp': '2025-10-01 04:19:29.548858', 'step': 2448, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:29.595563', 'step': 2448, 'epoch': 2} {'type': 'loss', 'content': 0.010234866291284561, 'timestamp': '2025-10-01 04:19:29.601172', 'step': 2449, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:29.638527', 'step': 2449, 'epoch': 2} {'type': 'loss', 'content': 0.008862472139298916, 'timestamp': '2025-10-01 04:19:29.649292', 'step': 2450, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:29.706220', 'step': 2450, 'epoch': 2} {'type': 'loss', 'content': 0.00828456599265337, 'timestamp': '2025-10-01 04:19:29.720253', 'step': 2451, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:29.761561', 'step': 2451, 'epoch': 2} {'type': 'loss', 'content': 0.011661963537335396, 'timestamp': '2025-10-01 04:19:29.794314', 'step': 2452, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:29.834856', 'step': 2452, 'epoch': 2} {'type': 'loss', 'content': 0.013549159280955791, 'timestamp': '2025-10-01 04:19:29.843932', 'step': 2453, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:29.881176', 'step': 2453, 'epoch': 2} {'type': 'loss', 'content': 0.017223162576556206, 'timestamp': '2025-10-01 04:19:29.892742', 'step': 2454, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:29.938762', 'step': 2454, 'epoch': 2} {'type': 'loss', 'content': 0.007560073398053646, 'timestamp': '2025-10-01 04:19:29.950286', 'step': 2455, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:29.998585', 'step': 2455, 'epoch': 2} {'type': 'loss', 'content': 0.011876596137881279, 'timestamp': '2025-10-01 04:19:30.031071', 'step': 2456, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:30.071166', 'step': 2456, 'epoch': 2} {'type': 'loss', 'content': 0.005895639769732952, 'timestamp': '2025-10-01 04:19:30.076452', 'step': 2457, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:30.116790', 'step': 2457, 'epoch': 2} {'type': 'loss', 'content': 0.012462913058698177, 'timestamp': '2025-10-01 04:19:30.128480', 'step': 2458, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:30.166449', 'step': 2458, 'epoch': 2} {'type': 'loss', 'content': 0.004901203792542219, 'timestamp': '2025-10-01 04:19:30.178027', 'step': 2459, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:30.231275', 'step': 2459, 'epoch': 2} {'type': 'loss', 'content': 0.00970637146383524, 'timestamp': '2025-10-01 04:19:30.264902', 'step': 2460, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:30.316729', 'step': 2460, 'epoch': 2} {'type': 'loss', 'content': 0.005252528935670853, 'timestamp': '2025-10-01 04:19:30.322414', 'step': 2461, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:30.363739', 'step': 2461, 'epoch': 2} {'type': 'loss', 'content': 0.010704261250793934, 'timestamp': '2025-10-01 04:19:30.374498', 'step': 2462, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:30.414976', 'step': 2462, 'epoch': 2} {'type': 'loss', 'content': 0.009654855355620384, 'timestamp': '2025-10-01 04:19:30.426821', 'step': 2463, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:30.471501', 'step': 2463, 'epoch': 2} {'type': 'loss', 'content': 0.013850544579327106, 'timestamp': '2025-10-01 04:19:30.503215', 'step': 2464, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:30.555181', 'step': 2464, 'epoch': 2} {'type': 'loss', 'content': 0.007819890044629574, 'timestamp': '2025-10-01 04:19:30.568532', 'step': 2465, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:19:30.625599', 'step': 2465, 'epoch': 2} {'type': 'loss', 'content': 0.012108131311833858, 'timestamp': '2025-10-01 04:19:30.643068', 'step': 2466, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:30.692844', 'step': 2466, 'epoch': 2} {'type': 'loss', 'content': 0.007548796944320202, 'timestamp': '2025-10-01 04:19:30.707124', 'step': 2467, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:30.756901', 'step': 2467, 'epoch': 2} {'type': 'loss', 'content': 0.011511501856148243, 'timestamp': '2025-10-01 04:19:30.791833', 'step': 2468, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:30.840138', 'step': 2468, 'epoch': 2} {'type': 'loss', 'content': 0.012634082697331905, 'timestamp': '2025-10-01 04:19:30.845531', 'step': 2469, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:30.892449', 'step': 2469, 'epoch': 2} {'type': 'loss', 'content': 0.008832798339426517, 'timestamp': '2025-10-01 04:19:30.906477', 'step': 2470, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:19:30.962398', 'step': 2470, 'epoch': 2} {'type': 'loss', 'content': 0.005481818690896034, 'timestamp': '2025-10-01 04:19:30.979637', 'step': 2471, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:31.020015', 'step': 2471, 'epoch': 2} {'type': 'loss', 'content': 0.008529743179678917, 'timestamp': '2025-10-01 04:19:31.055002', 'step': 2472, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:31.116069', 'step': 2472, 'epoch': 2} {'type': 'loss', 'content': 0.011244467459619045, 'timestamp': '2025-10-01 04:19:31.129553', 'step': 2473, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:19:31.180228', 'step': 2473, 'epoch': 2} {'type': 'loss', 'content': 0.008086386136710644, 'timestamp': '2025-10-01 04:19:31.196831', 'step': 2474, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:19:31.249690', 'step': 2474, 'epoch': 2} {'type': 'loss', 'content': 0.003994782455265522, 'timestamp': '2025-10-01 04:19:31.267192', 'step': 2475, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:31.320863', 'step': 2475, 'epoch': 2} {'type': 'loss', 'content': 0.008034972473978996, 'timestamp': '2025-10-01 04:19:31.355918', 'step': 2476, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:19:31.417211', 'step': 2476, 'epoch': 2} {'type': 'loss', 'content': 0.004500200040638447, 'timestamp': '2025-10-01 04:19:31.433244', 'step': 2477, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:31.480695', 'step': 2477, 'epoch': 2} {'type': 'loss', 'content': 0.03819195181131363, 'timestamp': '2025-10-01 04:19:31.488534', 'step': 2478, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:31.533028', 'step': 2478, 'epoch': 2} {'type': 'loss', 'content': 0.003303627949208021, 'timestamp': '2025-10-01 04:19:31.540989', 'step': 2479, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:31.584070', 'step': 2479, 'epoch': 2} {'type': 'loss', 'content': 0.011750278994441032, 'timestamp': '2025-10-01 04:19:31.617546', 'step': 2480, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:31.654534', 'step': 2480, 'epoch': 2} {'type': 'loss', 'content': 0.007589318323880434, 'timestamp': '2025-10-01 04:19:31.664981', 'step': 2481, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:31.708945', 'step': 2481, 'epoch': 2} {'type': 'loss', 'content': 0.005739981308579445, 'timestamp': '2025-10-01 04:19:31.721502', 'step': 2482, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:31.761215', 'step': 2482, 'epoch': 2} {'type': 'loss', 'content': 0.00855331402271986, 'timestamp': '2025-10-01 04:19:31.773981', 'step': 2483, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:31.835203', 'step': 2483, 'epoch': 2} {'type': 'loss', 'content': 0.004858980420976877, 'timestamp': '2025-10-01 04:19:31.869408', 'step': 2484, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:31.914433', 'step': 2484, 'epoch': 2} {'type': 'loss', 'content': 0.006589639000594616, 'timestamp': '2025-10-01 04:19:31.927513', 'step': 2485, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:31.974724', 'step': 2485, 'epoch': 2} {'type': 'loss', 'content': 0.013014500960707664, 'timestamp': '2025-10-01 04:19:31.988318', 'step': 2486, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:32.041849', 'step': 2486, 'epoch': 2} {'type': 'loss', 'content': 0.01757582277059555, 'timestamp': '2025-10-01 04:19:32.049847', 'step': 2487, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:32.101503', 'step': 2487, 'epoch': 2} {'type': 'loss', 'content': 0.014721767976880074, 'timestamp': '2025-10-01 04:19:32.129764', 'step': 2488, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:32.171702', 'step': 2488, 'epoch': 2} {'type': 'loss', 'content': 0.0093047134578228, 'timestamp': '2025-10-01 04:19:32.180850', 'step': 2489, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:32.224833', 'step': 2489, 'epoch': 2} {'type': 'loss', 'content': 0.0055207498371601105, 'timestamp': '2025-10-01 04:19:32.237356', 'step': 2490, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:32.285304', 'step': 2490, 'epoch': 2} {'type': 'loss', 'content': 0.008066551759839058, 'timestamp': '2025-10-01 04:19:32.292941', 'step': 2491, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:32.337208', 'step': 2491, 'epoch': 2} {'type': 'loss', 'content': 0.006797643378376961, 'timestamp': '2025-10-01 04:19:32.370660', 'step': 2492, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:32.409594', 'step': 2492, 'epoch': 2} {'type': 'loss', 'content': 0.008562956005334854, 'timestamp': '2025-10-01 04:19:32.415022', 'step': 2493, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:32.456447', 'step': 2493, 'epoch': 2} {'type': 'loss', 'content': 0.011179862543940544, 'timestamp': '2025-10-01 04:19:32.467431', 'step': 2494, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:32.503880', 'step': 2494, 'epoch': 2} {'type': 'loss', 'content': 0.010123427957296371, 'timestamp': '2025-10-01 04:19:32.515257', 'step': 2495, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:32.562348', 'step': 2495, 'epoch': 2} {'type': 'loss', 'content': 0.013084394857287407, 'timestamp': '2025-10-01 04:19:32.594202', 'step': 2496, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:32.649567', 'step': 2496, 'epoch': 2} {'type': 'loss', 'content': 0.005194587633013725, 'timestamp': '2025-10-01 04:19:32.659405', 'step': 2497, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:32.706218', 'step': 2497, 'epoch': 2} {'type': 'loss', 'content': 0.01395785715430975, 'timestamp': '2025-10-01 04:19:32.718956', 'step': 2498, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:32.765589', 'step': 2498, 'epoch': 2} {'type': 'loss', 'content': 0.007155020255595446, 'timestamp': '2025-10-01 04:19:32.776606', 'step': 2499, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:32.826321', 'step': 2499, 'epoch': 2} {'type': 'loss', 'content': 0.005470855161547661, 'timestamp': '2025-10-01 04:19:32.855564', 'step': 2500, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2500', 'timestamp': '2025-10-01 04:19:38.278035', 'step': 2500, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:38.324784', 'step': 2500, 'epoch': 2} {'type': 'loss', 'content': 0.012123343534767628, 'timestamp': '2025-10-01 04:19:38.331132', 'step': 2501, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:38.400303', 'step': 2501, 'epoch': 2} {'type': 'loss', 'content': 0.007479384075850248, 'timestamp': '2025-10-01 04:19:38.407497', 'step': 2502, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:38.479732', 'step': 2502, 'epoch': 2} {'type': 'loss', 'content': 0.012820422649383545, 'timestamp': '2025-10-01 04:19:38.487971', 'step': 2503, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:38.553794', 'step': 2503, 'epoch': 2} {'type': 'loss', 'content': 0.009034382179379463, 'timestamp': '2025-10-01 04:19:38.583058', 'step': 2504, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:38.647165', 'step': 2504, 'epoch': 2} {'type': 'loss', 'content': 0.006063381675630808, 'timestamp': '2025-10-01 04:19:38.655555', 'step': 2505, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:38.722157', 'step': 2505, 'epoch': 2} {'type': 'loss', 'content': 0.004940928425639868, 'timestamp': '2025-10-01 04:19:38.734908', 'step': 2506, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:19:38.798460', 'step': 2506, 'epoch': 2} {'type': 'loss', 'content': 0.006269724108278751, 'timestamp': '2025-10-01 04:19:38.808445', 'step': 2507, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:38.877301', 'step': 2507, 'epoch': 2} {'type': 'loss', 'content': 0.02730487659573555, 'timestamp': '2025-10-01 04:19:38.905671', 'step': 2508, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:38.975318', 'step': 2508, 'epoch': 2} {'type': 'loss', 'content': 0.015841586515307426, 'timestamp': '2025-10-01 04:19:38.985563', 'step': 2509, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:39.048142', 'step': 2509, 'epoch': 2} {'type': 'loss', 'content': 0.005090412683784962, 'timestamp': '2025-10-01 04:19:39.059076', 'step': 2510, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:39.126418', 'step': 2510, 'epoch': 2} {'type': 'loss', 'content': 0.007235792465507984, 'timestamp': '2025-10-01 04:19:39.140030', 'step': 2511, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:39.201924', 'step': 2511, 'epoch': 2} {'type': 'loss', 'content': 0.014398216269910336, 'timestamp': '2025-10-01 04:19:39.231240', 'step': 2512, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:39.269377', 'step': 2512, 'epoch': 2} {'type': 'loss', 'content': 0.006891167256981134, 'timestamp': '2025-10-01 04:19:39.277793', 'step': 2513, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:39.353114', 'step': 2513, 'epoch': 2} {'type': 'loss', 'content': 0.010726706124842167, 'timestamp': '2025-10-01 04:19:39.369131', 'step': 2514, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:39.440688', 'step': 2514, 'epoch': 2} {'type': 'loss', 'content': 0.005380088929086924, 'timestamp': '2025-10-01 04:19:39.454274', 'step': 2515, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:39.515792', 'step': 2515, 'epoch': 2} {'type': 'loss', 'content': 0.009253215044736862, 'timestamp': '2025-10-01 04:19:39.550741', 'step': 2516, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:39.616209', 'step': 2516, 'epoch': 2} {'type': 'loss', 'content': 0.009510495699942112, 'timestamp': '2025-10-01 04:19:39.629499', 'step': 2517, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:39.692940', 'step': 2517, 'epoch': 2} {'type': 'loss', 'content': 0.00690173776820302, 'timestamp': '2025-10-01 04:19:39.705456', 'step': 2518, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:39.758908', 'step': 2518, 'epoch': 2} {'type': 'loss', 'content': 0.010759741067886353, 'timestamp': '2025-10-01 04:19:39.767158', 'step': 2519, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:39.807626', 'step': 2519, 'epoch': 2} {'type': 'loss', 'content': 0.007469541393220425, 'timestamp': '2025-10-01 04:19:39.842598', 'step': 2520, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:39.909105', 'step': 2520, 'epoch': 2} {'type': 'loss', 'content': 0.011216584593057632, 'timestamp': '2025-10-01 04:19:39.921980', 'step': 2521, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:39.991287', 'step': 2521, 'epoch': 2} {'type': 'loss', 'content': 0.0102983508259058, 'timestamp': '2025-10-01 04:19:40.005283', 'step': 2522, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:40.063697', 'step': 2522, 'epoch': 2} {'type': 'loss', 'content': 0.007040380034595728, 'timestamp': '2025-10-01 04:19:40.070930', 'step': 2523, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:40.122278', 'step': 2523, 'epoch': 2} {'type': 'loss', 'content': 0.004817429929971695, 'timestamp': '2025-10-01 04:19:40.151090', 'step': 2524, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:19:40.216806', 'step': 2524, 'epoch': 2} {'type': 'loss', 'content': 0.008786136284470558, 'timestamp': '2025-10-01 04:19:40.232066', 'step': 2525, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:40.293397', 'step': 2525, 'epoch': 2} {'type': 'loss', 'content': 0.004219204653054476, 'timestamp': '2025-10-01 04:19:40.304115', 'step': 2526, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:40.359681', 'step': 2526, 'epoch': 2} {'type': 'loss', 'content': 0.007273993920534849, 'timestamp': '2025-10-01 04:19:40.367172', 'step': 2527, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:40.421998', 'step': 2527, 'epoch': 2} {'type': 'loss', 'content': 0.0066550010815262794, 'timestamp': '2025-10-01 04:19:40.454562', 'step': 2528, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:40.509566', 'step': 2528, 'epoch': 2} {'type': 'loss', 'content': 0.010674547404050827, 'timestamp': '2025-10-01 04:19:40.515416', 'step': 2529, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:40.577605', 'step': 2529, 'epoch': 2} {'type': 'loss', 'content': 0.01485006045550108, 'timestamp': '2025-10-01 04:19:40.590156', 'step': 2530, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:19:43.888438', 'step': 2530, 'epoch': 2} {'type': 'pplx', 'content': 5.6826618173887855, 'timestamp': '2025-10-01 04:19:43.898991', 'step': 2530, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:43.942624', 'step': 2530, 'epoch': 2} {'type': 'loss', 'content': 0.009853575378656387, 'timestamp': '2025-10-01 04:19:43.952207', 'step': 2531, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:44.002483', 'step': 2531, 'epoch': 2} {'type': 'loss', 'content': 0.018134772777557373, 'timestamp': '2025-10-01 04:19:44.036058', 'step': 2532, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:44.081518', 'step': 2532, 'epoch': 2} {'type': 'loss', 'content': 0.00989855732768774, 'timestamp': '2025-10-01 04:19:44.094912', 'step': 2533, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:44.140793', 'step': 2533, 'epoch': 2} {'type': 'loss', 'content': 0.008898516185581684, 'timestamp': '2025-10-01 04:19:44.151761', 'step': 2534, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:44.194006', 'step': 2534, 'epoch': 2} {'type': 'loss', 'content': 0.012228530831634998, 'timestamp': '2025-10-01 04:19:44.205020', 'step': 2535, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:44.249338', 'step': 2535, 'epoch': 2} {'type': 'loss', 'content': 0.008941440843045712, 'timestamp': '2025-10-01 04:19:44.282069', 'step': 2536, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:44.321666', 'step': 2536, 'epoch': 2} {'type': 'loss', 'content': 0.0027018312830477953, 'timestamp': '2025-10-01 04:19:44.330158', 'step': 2537, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:44.377428', 'step': 2537, 'epoch': 2} {'type': 'loss', 'content': 0.010913853533565998, 'timestamp': '2025-10-01 04:19:44.385173', 'step': 2538, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:44.427673', 'step': 2538, 'epoch': 2} {'type': 'loss', 'content': 0.009436598978936672, 'timestamp': '2025-10-01 04:19:44.439325', 'step': 2539, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:19:44.514767', 'step': 2539, 'epoch': 2} {'type': 'loss', 'content': 0.005883959122002125, 'timestamp': '2025-10-01 04:19:44.552052', 'step': 2540, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:19:44.604500', 'step': 2540, 'epoch': 2} {'type': 'loss', 'content': 0.005892371758818626, 'timestamp': '2025-10-01 04:19:44.611208', 'step': 2541, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:44.654110', 'step': 2541, 'epoch': 2} {'type': 'loss', 'content': 0.016915790736675262, 'timestamp': '2025-10-01 04:19:44.661572', 'step': 2542, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:44.712097', 'step': 2542, 'epoch': 2} {'type': 'loss', 'content': 0.011818713508546352, 'timestamp': '2025-10-01 04:19:44.720084', 'step': 2543, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:44.767284', 'step': 2543, 'epoch': 2} {'type': 'loss', 'content': 0.006536450702697039, 'timestamp': '2025-10-01 04:19:44.799958', 'step': 2544, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:44.842166', 'step': 2544, 'epoch': 2} {'type': 'loss', 'content': 0.00921714399009943, 'timestamp': '2025-10-01 04:19:44.848351', 'step': 2545, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:44.894056', 'step': 2545, 'epoch': 2} {'type': 'loss', 'content': 0.011850626207888126, 'timestamp': '2025-10-01 04:19:44.905768', 'step': 2546, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:44.952396', 'step': 2546, 'epoch': 2} {'type': 'loss', 'content': 0.0098267188295722, 'timestamp': '2025-10-01 04:19:44.964895', 'step': 2547, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:45.008038', 'step': 2547, 'epoch': 2} {'type': 'loss', 'content': 0.007137166801840067, 'timestamp': '2025-10-01 04:19:45.039580', 'step': 2548, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:45.082181', 'step': 2548, 'epoch': 2} {'type': 'loss', 'content': 0.011120856739580631, 'timestamp': '2025-10-01 04:19:45.091394', 'step': 2549, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:45.136427', 'step': 2549, 'epoch': 2} {'type': 'loss', 'content': 0.012665021233260632, 'timestamp': '2025-10-01 04:19:45.144044', 'step': 2550, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:45.183639', 'step': 2550, 'epoch': 2} {'type': 'loss', 'content': 0.009229086339473724, 'timestamp': '2025-10-01 04:19:45.191010', 'step': 2551, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:45.230872', 'step': 2551, 'epoch': 2} {'type': 'loss', 'content': 0.015306942164897919, 'timestamp': '2025-10-01 04:19:45.259922', 'step': 2552, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:45.309179', 'step': 2552, 'epoch': 2} {'type': 'loss', 'content': 0.009130693972110748, 'timestamp': '2025-10-01 04:19:45.314964', 'step': 2553, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:45.361717', 'step': 2553, 'epoch': 2} {'type': 'loss', 'content': 0.004679187200963497, 'timestamp': '2025-10-01 04:19:45.369288', 'step': 2554, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:45.416556', 'step': 2554, 'epoch': 2} {'type': 'loss', 'content': 0.015215248800814152, 'timestamp': '2025-10-01 04:19:45.426563', 'step': 2555, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:45.486675', 'step': 2555, 'epoch': 2} {'type': 'loss', 'content': 0.005208189133554697, 'timestamp': '2025-10-01 04:19:45.518569', 'step': 2556, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:45.557285', 'step': 2556, 'epoch': 2} {'type': 'loss', 'content': 0.007675572298467159, 'timestamp': '2025-10-01 04:19:45.562498', 'step': 2557, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:45.602316', 'step': 2557, 'epoch': 2} {'type': 'loss', 'content': 0.009726385585963726, 'timestamp': '2025-10-01 04:19:45.613274', 'step': 2558, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:19:45.652049', 'step': 2558, 'epoch': 2} {'type': 'loss', 'content': 0.005649697035551071, 'timestamp': '2025-10-01 04:19:45.660378', 'step': 2559, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:19:45.695826', 'step': 2559, 'epoch': 2} {'type': 'loss', 'content': 0.014997649937868118, 'timestamp': '2025-10-01 04:19:45.722847', 'step': 2560, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:45.760610', 'step': 2560, 'epoch': 2} {'type': 'loss', 'content': 0.009228796698153019, 'timestamp': '2025-10-01 04:19:45.767952', 'step': 2561, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:45.801028', 'step': 2561, 'epoch': 2} {'type': 'loss', 'content': 0.023889822885394096, 'timestamp': '2025-10-01 04:19:45.808483', 'step': 2562, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:45.861940', 'step': 2562, 'epoch': 2} {'type': 'loss', 'content': 0.00889755692332983, 'timestamp': '2025-10-01 04:19:45.869507', 'step': 2563, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:45.915215', 'step': 2563, 'epoch': 2} {'type': 'loss', 'content': 0.00137180439196527, 'timestamp': '2025-10-01 04:19:45.943921', 'step': 2564, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:45.984348', 'step': 2564, 'epoch': 2} {'type': 'loss', 'content': 0.006962904240936041, 'timestamp': '2025-10-01 04:19:45.990369', 'step': 2565, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:46.039059', 'step': 2565, 'epoch': 2} {'type': 'loss', 'content': 0.0039288648404181, 'timestamp': '2025-10-01 04:19:46.046618', 'step': 2566, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:46.090314', 'step': 2566, 'epoch': 2} {'type': 'loss', 'content': 0.01098353136330843, 'timestamp': '2025-10-01 04:19:46.098035', 'step': 2567, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:46.150972', 'step': 2567, 'epoch': 2} {'type': 'loss', 'content': 0.008943567983806133, 'timestamp': '2025-10-01 04:19:46.188027', 'step': 2568, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:46.237275', 'step': 2568, 'epoch': 2} {'type': 'loss', 'content': 0.011124568060040474, 'timestamp': '2025-10-01 04:19:46.244244', 'step': 2569, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:46.300074', 'step': 2569, 'epoch': 2} {'type': 'loss', 'content': 0.008334394544363022, 'timestamp': '2025-10-01 04:19:46.307757', 'step': 2570, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:46.354146', 'step': 2570, 'epoch': 2} {'type': 'loss', 'content': 0.008906105533242226, 'timestamp': '2025-10-01 04:19:46.365832', 'step': 2571, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:46.416758', 'step': 2571, 'epoch': 2} {'type': 'loss', 'content': 0.004849575459957123, 'timestamp': '2025-10-01 04:19:46.451740', 'step': 2572, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:46.499899', 'step': 2572, 'epoch': 2} {'type': 'loss', 'content': 0.020320238545536995, 'timestamp': '2025-10-01 04:19:46.509309', 'step': 2573, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:46.558033', 'step': 2573, 'epoch': 2} {'type': 'loss', 'content': 0.008675608783960342, 'timestamp': '2025-10-01 04:19:46.568095', 'step': 2574, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:46.616400', 'step': 2574, 'epoch': 2} {'type': 'loss', 'content': 0.0037062575574964285, 'timestamp': '2025-10-01 04:19:46.624437', 'step': 2575, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:46.680467', 'step': 2575, 'epoch': 2} {'type': 'loss', 'content': 0.012477071955800056, 'timestamp': '2025-10-01 04:19:46.712400', 'step': 2576, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:46.765943', 'step': 2576, 'epoch': 2} {'type': 'loss', 'content': 0.006453280337154865, 'timestamp': '2025-10-01 04:19:46.771872', 'step': 2577, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:46.819434', 'step': 2577, 'epoch': 2} {'type': 'loss', 'content': 0.00686405086889863, 'timestamp': '2025-10-01 04:19:46.832116', 'step': 2578, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:46.881790', 'step': 2578, 'epoch': 2} {'type': 'loss', 'content': 0.007903585210442543, 'timestamp': '2025-10-01 04:19:46.893506', 'step': 2579, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:46.935247', 'step': 2579, 'epoch': 2} {'type': 'loss', 'content': 0.011966027319431305, 'timestamp': '2025-10-01 04:19:46.965843', 'step': 2580, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:19:47.021084', 'step': 2580, 'epoch': 2} {'type': 'loss', 'content': 0.007635287940502167, 'timestamp': '2025-10-01 04:19:47.036391', 'step': 2581, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:47.086538', 'step': 2581, 'epoch': 2} {'type': 'loss', 'content': 0.01323575247079134, 'timestamp': '2025-10-01 04:19:47.094661', 'step': 2582, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:47.141320', 'step': 2582, 'epoch': 2} {'type': 'loss', 'content': 0.0049056559801101685, 'timestamp': '2025-10-01 04:19:47.151893', 'step': 2583, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:47.197349', 'step': 2583, 'epoch': 2} {'type': 'loss', 'content': 0.011528060771524906, 'timestamp': '2025-10-01 04:19:47.229082', 'step': 2584, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:47.283676', 'step': 2584, 'epoch': 2} {'type': 'loss', 'content': 0.015017666853964329, 'timestamp': '2025-10-01 04:19:47.289690', 'step': 2585, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:19:47.347988', 'step': 2585, 'epoch': 2} {'type': 'loss', 'content': 0.010478155687451363, 'timestamp': '2025-10-01 04:19:47.365440', 'step': 2586, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:47.413212', 'step': 2586, 'epoch': 2} {'type': 'loss', 'content': 0.009849750436842442, 'timestamp': '2025-10-01 04:19:47.425766', 'step': 2587, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:47.479508', 'step': 2587, 'epoch': 2} {'type': 'loss', 'content': 0.017766831442713737, 'timestamp': '2025-10-01 04:19:47.508605', 'step': 2588, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:47.553795', 'step': 2588, 'epoch': 2} {'type': 'loss', 'content': 0.009823139756917953, 'timestamp': '2025-10-01 04:19:47.564843', 'step': 2589, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:47.609392', 'step': 2589, 'epoch': 2} {'type': 'loss', 'content': 0.014855247922241688, 'timestamp': '2025-10-01 04:19:47.618435', 'step': 2590, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:47.665944', 'step': 2590, 'epoch': 2} {'type': 'loss', 'content': 0.011815907433629036, 'timestamp': '2025-10-01 04:19:47.676810', 'step': 2591, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:47.721558', 'step': 2591, 'epoch': 2} {'type': 'loss', 'content': 0.013904322870075703, 'timestamp': '2025-10-01 04:19:47.752595', 'step': 2592, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:47.794057', 'step': 2592, 'epoch': 2} {'type': 'loss', 'content': 0.009444703347980976, 'timestamp': '2025-10-01 04:19:47.803342', 'step': 2593, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:47.839732', 'step': 2593, 'epoch': 2} {'type': 'loss', 'content': 0.008178465999662876, 'timestamp': '2025-10-01 04:19:47.851381', 'step': 2594, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:47.897173', 'step': 2594, 'epoch': 2} {'type': 'loss', 'content': 0.005212042015045881, 'timestamp': '2025-10-01 04:19:47.910542', 'step': 2595, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:47.972103', 'step': 2595, 'epoch': 2} {'type': 'loss', 'content': 0.005496013909578323, 'timestamp': '2025-10-01 04:19:48.007081', 'step': 2596, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:48.052548', 'step': 2596, 'epoch': 2} {'type': 'loss', 'content': 0.008420528843998909, 'timestamp': '2025-10-01 04:19:48.061556', 'step': 2597, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:48.104464', 'step': 2597, 'epoch': 2} {'type': 'loss', 'content': 0.009678514674305916, 'timestamp': '2025-10-01 04:19:48.116151', 'step': 2598, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:48.155680', 'step': 2598, 'epoch': 2} {'type': 'loss', 'content': 0.009122279472649097, 'timestamp': '2025-10-01 04:19:48.167117', 'step': 2599, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:48.213899', 'step': 2599, 'epoch': 2} {'type': 'loss', 'content': 0.00734214810654521, 'timestamp': '2025-10-01 04:19:48.247612', 'step': 2600, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:48.287285', 'step': 2600, 'epoch': 2} {'type': 'loss', 'content': 0.01454841997474432, 'timestamp': '2025-10-01 04:19:48.295767', 'step': 2601, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:48.342030', 'step': 2601, 'epoch': 2} {'type': 'loss', 'content': 0.008531257510185242, 'timestamp': '2025-10-01 04:19:48.353432', 'step': 2602, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:48.403723', 'step': 2602, 'epoch': 2} {'type': 'loss', 'content': 0.005879604257643223, 'timestamp': '2025-10-01 04:19:48.417800', 'step': 2603, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:48.459535', 'step': 2603, 'epoch': 2} {'type': 'loss', 'content': 0.01977338083088398, 'timestamp': '2025-10-01 04:19:48.494529', 'step': 2604, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:48.535235', 'step': 2604, 'epoch': 2} {'type': 'loss', 'content': 0.010712184011936188, 'timestamp': '2025-10-01 04:19:48.548539', 'step': 2605, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:48.595032', 'step': 2605, 'epoch': 2} {'type': 'loss', 'content': 0.006467817816883326, 'timestamp': '2025-10-01 04:19:48.605928', 'step': 2606, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:48.656182', 'step': 2606, 'epoch': 2} {'type': 'loss', 'content': 0.004199515096843243, 'timestamp': '2025-10-01 04:19:48.670150', 'step': 2607, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:48.706200', 'step': 2607, 'epoch': 2} {'type': 'loss', 'content': 0.011752782389521599, 'timestamp': '2025-10-01 04:19:48.738744', 'step': 2608, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:48.788448', 'step': 2608, 'epoch': 2} {'type': 'loss', 'content': 0.008688748814165592, 'timestamp': '2025-10-01 04:19:48.798800', 'step': 2609, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 18984411776512}, 'timestamp': '2025-10-01 04:19:48.867465', 'step': 2609, 'epoch': 2} {'type': 'loss', 'content': 0.005211221054196358, 'timestamp': '2025-10-01 04:19:48.889458', 'step': 2610, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:48.938209', 'step': 2610, 'epoch': 2} {'type': 'loss', 'content': 0.004088517744094133, 'timestamp': '2025-10-01 04:19:48.949019', 'step': 2611, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:19:48.993182', 'step': 2611, 'epoch': 2} {'type': 'loss', 'content': 0.006693352945148945, 'timestamp': '2025-10-01 04:19:49.019356', 'step': 2612, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:49.055855', 'step': 2612, 'epoch': 2} {'type': 'loss', 'content': 0.020728131756186485, 'timestamp': '2025-10-01 04:19:49.060729', 'step': 2613, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:49.098993', 'step': 2613, 'epoch': 2} {'type': 'loss', 'content': 0.002308117225766182, 'timestamp': '2025-10-01 04:19:49.107082', 'step': 2614, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:49.145496', 'step': 2614, 'epoch': 2} {'type': 'loss', 'content': 0.0130852609872818, 'timestamp': '2025-10-01 04:19:49.153570', 'step': 2615, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:49.210467', 'step': 2615, 'epoch': 2} {'type': 'loss', 'content': 0.003523361636325717, 'timestamp': '2025-10-01 04:19:49.245423', 'step': 2616, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:49.294055', 'step': 2616, 'epoch': 2} {'type': 'loss', 'content': 0.010371292941272259, 'timestamp': '2025-10-01 04:19:49.302659', 'step': 2617, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:49.349602', 'step': 2617, 'epoch': 2} {'type': 'loss', 'content': 0.007453656755387783, 'timestamp': '2025-10-01 04:19:49.363190', 'step': 2618, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:49.408879', 'step': 2618, 'epoch': 2} {'type': 'loss', 'content': 0.0065449816174805164, 'timestamp': '2025-10-01 04:19:49.422447', 'step': 2619, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:49.471083', 'step': 2619, 'epoch': 2} {'type': 'loss', 'content': 0.00720055541023612, 'timestamp': '2025-10-01 04:19:49.506086', 'step': 2620, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:49.551408', 'step': 2620, 'epoch': 2} {'type': 'loss', 'content': 0.01139106135815382, 'timestamp': '2025-10-01 04:19:49.564960', 'step': 2621, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:49.604786', 'step': 2621, 'epoch': 2} {'type': 'loss', 'content': 0.016818488016724586, 'timestamp': '2025-10-01 04:19:49.613139', 'step': 2622, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:49.656074', 'step': 2622, 'epoch': 2} {'type': 'loss', 'content': 0.011159333400428295, 'timestamp': '2025-10-01 04:19:49.669576', 'step': 2623, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:49.714038', 'step': 2623, 'epoch': 2} {'type': 'loss', 'content': 0.010750924237072468, 'timestamp': '2025-10-01 04:19:49.747766', 'step': 2624, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:19:49.797065', 'step': 2624, 'epoch': 2} {'type': 'loss', 'content': 0.003972245380282402, 'timestamp': '2025-10-01 04:19:49.812719', 'step': 2625, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:49.855982', 'step': 2625, 'epoch': 2} {'type': 'loss', 'content': 0.010046936571598053, 'timestamp': '2025-10-01 04:19:49.868357', 'step': 2626, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:49.923150', 'step': 2626, 'epoch': 2} {'type': 'loss', 'content': 0.006403615698218346, 'timestamp': '2025-10-01 04:19:49.936711', 'step': 2627, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:49.985876', 'step': 2627, 'epoch': 2} {'type': 'loss', 'content': 0.022906264290213585, 'timestamp': '2025-10-01 04:19:50.019564', 'step': 2628, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:50.067728', 'step': 2628, 'epoch': 2} {'type': 'loss', 'content': 0.008339909836649895, 'timestamp': '2025-10-01 04:19:50.076211', 'step': 2629, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:50.121052', 'step': 2629, 'epoch': 2} {'type': 'loss', 'content': 0.007127535995095968, 'timestamp': '2025-10-01 04:19:50.128792', 'step': 2630, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-10-01 04:19:50.189399', 'step': 2630, 'epoch': 2} {'type': 'loss', 'content': 0.004706819541752338, 'timestamp': '2025-10-01 04:19:50.208692', 'step': 2631, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:50.253680', 'step': 2631, 'epoch': 2} {'type': 'loss', 'content': 0.011458968743681908, 'timestamp': '2025-10-01 04:19:50.282037', 'step': 2632, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:50.322346', 'step': 2632, 'epoch': 2} {'type': 'loss', 'content': 0.005395845975726843, 'timestamp': '2025-10-01 04:19:50.330160', 'step': 2633, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:50.387834', 'step': 2633, 'epoch': 2} {'type': 'loss', 'content': 0.007787360809743404, 'timestamp': '2025-10-01 04:19:50.401829', 'step': 2634, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:50.456784', 'step': 2634, 'epoch': 2} {'type': 'loss', 'content': 0.013342727907001972, 'timestamp': '2025-10-01 04:19:50.464517', 'step': 2635, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:50.508255', 'step': 2635, 'epoch': 2} {'type': 'loss', 'content': 0.007115037180483341, 'timestamp': '2025-10-01 04:19:50.537474', 'step': 2636, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:50.579203', 'step': 2636, 'epoch': 2} {'type': 'loss', 'content': 0.007463380694389343, 'timestamp': '2025-10-01 04:19:50.590450', 'step': 2637, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:50.636798', 'step': 2637, 'epoch': 2} {'type': 'loss', 'content': 0.00755828432738781, 'timestamp': '2025-10-01 04:19:50.647577', 'step': 2638, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:50.694065', 'step': 2638, 'epoch': 2} {'type': 'loss', 'content': 0.0074048419483006, 'timestamp': '2025-10-01 04:19:50.702466', 'step': 2639, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:50.743305', 'step': 2639, 'epoch': 2} {'type': 'loss', 'content': 0.007771433796733618, 'timestamp': '2025-10-01 04:19:50.777068', 'step': 2640, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:50.824921', 'step': 2640, 'epoch': 2} {'type': 'loss', 'content': 0.012903441675007343, 'timestamp': '2025-10-01 04:19:50.835846', 'step': 2641, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:50.880028', 'step': 2641, 'epoch': 2} {'type': 'loss', 'content': 0.007357155438512564, 'timestamp': '2025-10-01 04:19:50.887524', 'step': 2642, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:50.932261', 'step': 2642, 'epoch': 2} {'type': 'loss', 'content': 0.045794859528541565, 'timestamp': '2025-10-01 04:19:50.940647', 'step': 2643, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:50.979849', 'step': 2643, 'epoch': 2} {'type': 'loss', 'content': 0.004533797036856413, 'timestamp': '2025-10-01 04:19:51.008616', 'step': 2644, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:51.060071', 'step': 2644, 'epoch': 2} {'type': 'loss', 'content': 0.00794311985373497, 'timestamp': '2025-10-01 04:19:51.070173', 'step': 2645, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:19:53.862924', 'step': 2645, 'epoch': 2} {'type': 'pplx', 'content': 5.64380305041824, 'timestamp': '2025-10-01 04:19:53.866331', 'step': 2645, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:53.904038', 'step': 2645, 'epoch': 2} {'type': 'loss', 'content': 0.00503899622708559, 'timestamp': '2025-10-01 04:19:53.911542', 'step': 2646, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:53.956768', 'step': 2646, 'epoch': 2} {'type': 'loss', 'content': 0.009994405321776867, 'timestamp': '2025-10-01 04:19:53.968980', 'step': 2647, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:54.011533', 'step': 2647, 'epoch': 2} {'type': 'loss', 'content': 0.008538792841136456, 'timestamp': '2025-10-01 04:19:54.043177', 'step': 2648, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:54.092405', 'step': 2648, 'epoch': 2} {'type': 'loss', 'content': 0.014455202035605907, 'timestamp': '2025-10-01 04:19:54.100926', 'step': 2649, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:54.142188', 'step': 2649, 'epoch': 2} {'type': 'loss', 'content': 0.004924486391246319, 'timestamp': '2025-10-01 04:19:54.150109', 'step': 2650, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:19:54.198029', 'step': 2650, 'epoch': 2} {'type': 'loss', 'content': 0.007674529682844877, 'timestamp': '2025-10-01 04:19:54.213803', 'step': 2651, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:54.252544', 'step': 2651, 'epoch': 2} {'type': 'loss', 'content': 0.009011272341012955, 'timestamp': '2025-10-01 04:19:54.280760', 'step': 2652, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:54.327484', 'step': 2652, 'epoch': 2} {'type': 'loss', 'content': 0.005595976486802101, 'timestamp': '2025-10-01 04:19:54.340901', 'step': 2653, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:19:54.390988', 'step': 2653, 'epoch': 2} {'type': 'loss', 'content': 0.0038728772196918726, 'timestamp': '2025-10-01 04:19:54.408510', 'step': 2654, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:54.445043', 'step': 2654, 'epoch': 2} {'type': 'loss', 'content': 0.007697528228163719, 'timestamp': '2025-10-01 04:19:54.453297', 'step': 2655, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:54.489806', 'step': 2655, 'epoch': 2} {'type': 'loss', 'content': 0.006514648906886578, 'timestamp': '2025-10-01 04:19:54.521642', 'step': 2656, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:54.564447', 'step': 2656, 'epoch': 2} {'type': 'loss', 'content': 0.004579686094075441, 'timestamp': '2025-10-01 04:19:54.577751', 'step': 2657, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:54.614950', 'step': 2657, 'epoch': 2} {'type': 'loss', 'content': 0.007301211357116699, 'timestamp': '2025-10-01 04:19:54.626547', 'step': 2658, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:54.663289', 'step': 2658, 'epoch': 2} {'type': 'loss', 'content': 0.012674224562942982, 'timestamp': '2025-10-01 04:19:54.674444', 'step': 2659, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:54.723752', 'step': 2659, 'epoch': 2} {'type': 'loss', 'content': 0.008797531947493553, 'timestamp': '2025-10-01 04:19:54.758819', 'step': 2660, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:54.798249', 'step': 2660, 'epoch': 2} {'type': 'loss', 'content': 0.008878525346517563, 'timestamp': '2025-10-01 04:19:54.809318', 'step': 2661, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:54.848062', 'step': 2661, 'epoch': 2} {'type': 'loss', 'content': 0.007564171217381954, 'timestamp': '2025-10-01 04:19:54.859607', 'step': 2662, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:54.899258', 'step': 2662, 'epoch': 2} {'type': 'loss', 'content': 0.011105844751000404, 'timestamp': '2025-10-01 04:19:54.913482', 'step': 2663, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:54.951564', 'step': 2663, 'epoch': 2} {'type': 'loss', 'content': 0.012026282027363777, 'timestamp': '2025-10-01 04:19:54.980391', 'step': 2664, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:55.017870', 'step': 2664, 'epoch': 2} {'type': 'loss', 'content': 0.008663873188197613, 'timestamp': '2025-10-01 04:19:55.028859', 'step': 2665, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:55.068521', 'step': 2665, 'epoch': 2} {'type': 'loss', 'content': 0.007833486422896385, 'timestamp': '2025-10-01 04:19:55.076581', 'step': 2666, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:55.119328', 'step': 2666, 'epoch': 2} {'type': 'loss', 'content': 0.011712497100234032, 'timestamp': '2025-10-01 04:19:55.127428', 'step': 2667, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:55.169140', 'step': 2667, 'epoch': 2} {'type': 'loss', 'content': 0.005806285887956619, 'timestamp': '2025-10-01 04:19:55.200845', 'step': 2668, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:55.241286', 'step': 2668, 'epoch': 2} {'type': 'loss', 'content': 0.01537516713142395, 'timestamp': '2025-10-01 04:19:55.247207', 'step': 2669, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:55.290915', 'step': 2669, 'epoch': 2} {'type': 'loss', 'content': 0.004093547351658344, 'timestamp': '2025-10-01 04:19:55.303658', 'step': 2670, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:55.352046', 'step': 2670, 'epoch': 2} {'type': 'loss', 'content': 0.005733805242925882, 'timestamp': '2025-10-01 04:19:55.364810', 'step': 2671, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:55.412669', 'step': 2671, 'epoch': 2} {'type': 'loss', 'content': 0.0058628409169614315, 'timestamp': '2025-10-01 04:19:55.445248', 'step': 2672, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:55.491121', 'step': 2672, 'epoch': 2} {'type': 'loss', 'content': 0.01177005935460329, 'timestamp': '2025-10-01 04:19:55.496873', 'step': 2673, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:55.543995', 'step': 2673, 'epoch': 2} {'type': 'loss', 'content': 0.007424590643495321, 'timestamp': '2025-10-01 04:19:55.555608', 'step': 2674, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:55.611686', 'step': 2674, 'epoch': 2} {'type': 'loss', 'content': 0.007167980074882507, 'timestamp': '2025-10-01 04:19:55.625695', 'step': 2675, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:55.677699', 'step': 2675, 'epoch': 2} {'type': 'loss', 'content': 0.00835730042308569, 'timestamp': '2025-10-01 04:19:55.709458', 'step': 2676, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:55.753268', 'step': 2676, 'epoch': 2} {'type': 'loss', 'content': 0.016567528247833252, 'timestamp': '2025-10-01 04:19:55.759083', 'step': 2677, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:55.806057', 'step': 2677, 'epoch': 2} {'type': 'loss', 'content': 0.004802677780389786, 'timestamp': '2025-10-01 04:19:55.817377', 'step': 2678, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:55.857140', 'step': 2678, 'epoch': 2} {'type': 'loss', 'content': 0.018324336037039757, 'timestamp': '2025-10-01 04:19:55.867721', 'step': 2679, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:55.911079', 'step': 2679, 'epoch': 2} {'type': 'loss', 'content': 0.006317678838968277, 'timestamp': '2025-10-01 04:19:55.944589', 'step': 2680, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:55.992962', 'step': 2680, 'epoch': 2} {'type': 'loss', 'content': 0.008088508620858192, 'timestamp': '2025-10-01 04:19:56.002372', 'step': 2681, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:56.048991', 'step': 2681, 'epoch': 2} {'type': 'loss', 'content': 0.006050795782357454, 'timestamp': '2025-10-01 04:19:56.057370', 'step': 2682, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:56.090296', 'step': 2682, 'epoch': 2} {'type': 'loss', 'content': 0.009526766836643219, 'timestamp': '2025-10-01 04:19:56.097895', 'step': 2683, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:56.159726', 'step': 2683, 'epoch': 2} {'type': 'loss', 'content': 0.011680320836603642, 'timestamp': '2025-10-01 04:19:56.188213', 'step': 2684, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:56.237792', 'step': 2684, 'epoch': 2} {'type': 'loss', 'content': 0.010558126494288445, 'timestamp': '2025-10-01 04:19:56.248189', 'step': 2685, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:56.289222', 'step': 2685, 'epoch': 2} {'type': 'loss', 'content': 0.010066114366054535, 'timestamp': '2025-10-01 04:19:56.302048', 'step': 2686, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:56.348722', 'step': 2686, 'epoch': 2} {'type': 'loss', 'content': 0.005572781432420015, 'timestamp': '2025-10-01 04:19:56.362838', 'step': 2687, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:56.405008', 'step': 2687, 'epoch': 2} {'type': 'loss', 'content': 0.008295274339616299, 'timestamp': '2025-10-01 04:19:56.435779', 'step': 2688, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:56.484194', 'step': 2688, 'epoch': 2} {'type': 'loss', 'content': 0.007491642609238625, 'timestamp': '2025-10-01 04:19:56.495514', 'step': 2689, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:56.548356', 'step': 2689, 'epoch': 2} {'type': 'loss', 'content': 0.008724281564354897, 'timestamp': '2025-10-01 04:19:56.559249', 'step': 2690, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:56.613237', 'step': 2690, 'epoch': 2} {'type': 'loss', 'content': 0.00900518149137497, 'timestamp': '2025-10-01 04:19:56.625106', 'step': 2691, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:56.674408', 'step': 2691, 'epoch': 2} {'type': 'loss', 'content': 0.011000791564583778, 'timestamp': '2025-10-01 04:19:56.703811', 'step': 2692, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 15662185694400}, 'timestamp': '2025-10-01 04:19:56.765668', 'step': 2692, 'epoch': 2} {'type': 'loss', 'content': 0.006109387148171663, 'timestamp': '2025-10-01 04:19:56.784749', 'step': 2693, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:56.832930', 'step': 2693, 'epoch': 2} {'type': 'loss', 'content': 0.006541448179632425, 'timestamp': '2025-10-01 04:19:56.845735', 'step': 2694, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:56.902053', 'step': 2694, 'epoch': 2} {'type': 'loss', 'content': 0.01174502819776535, 'timestamp': '2025-10-01 04:19:56.916139', 'step': 2695, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:56.962291', 'step': 2695, 'epoch': 2} {'type': 'loss', 'content': 0.006854834035038948, 'timestamp': '2025-10-01 04:19:56.996815', 'step': 2696, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:57.036105', 'step': 2696, 'epoch': 2} {'type': 'loss', 'content': 0.006129550747573376, 'timestamp': '2025-10-01 04:19:57.046480', 'step': 2697, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:57.093014', 'step': 2697, 'epoch': 2} {'type': 'loss', 'content': 0.008765568025410175, 'timestamp': '2025-10-01 04:19:57.107047', 'step': 2698, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:57.148962', 'step': 2698, 'epoch': 2} {'type': 'loss', 'content': 0.01027489174157381, 'timestamp': '2025-10-01 04:19:57.161531', 'step': 2699, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:57.199780', 'step': 2699, 'epoch': 2} {'type': 'loss', 'content': 0.007682595402002335, 'timestamp': '2025-10-01 04:19:57.233297', 'step': 2700, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:57.269322', 'step': 2700, 'epoch': 2} {'type': 'loss', 'content': 0.017913540825247765, 'timestamp': '2025-10-01 04:19:57.280021', 'step': 2701, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:57.318581', 'step': 2701, 'epoch': 2} {'type': 'loss', 'content': 0.006379545200616121, 'timestamp': '2025-10-01 04:19:57.331140', 'step': 2702, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:57.372577', 'step': 2702, 'epoch': 2} {'type': 'loss', 'content': 0.01248596515506506, 'timestamp': '2025-10-01 04:19:57.386850', 'step': 2703, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:57.426175', 'step': 2703, 'epoch': 2} {'type': 'loss', 'content': 0.0059988247230648994, 'timestamp': '2025-10-01 04:19:57.455554', 'step': 2704, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:57.502213', 'step': 2704, 'epoch': 2} {'type': 'loss', 'content': 0.004863103851675987, 'timestamp': '2025-10-01 04:19:57.515773', 'step': 2705, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:57.559685', 'step': 2705, 'epoch': 2} {'type': 'loss', 'content': 0.008691847324371338, 'timestamp': '2025-10-01 04:19:57.572423', 'step': 2706, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:57.616858', 'step': 2706, 'epoch': 2} {'type': 'loss', 'content': 0.007914803922176361, 'timestamp': '2025-10-01 04:19:57.624870', 'step': 2707, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:57.668878', 'step': 2707, 'epoch': 2} {'type': 'loss', 'content': 0.010588197037577629, 'timestamp': '2025-10-01 04:19:57.697886', 'step': 2708, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:57.733033', 'step': 2708, 'epoch': 2} {'type': 'loss', 'content': 0.008305161260068417, 'timestamp': '2025-10-01 04:19:57.743811', 'step': 2709, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:57.783002', 'step': 2709, 'epoch': 2} {'type': 'loss', 'content': 0.006590509321540594, 'timestamp': '2025-10-01 04:19:57.797079', 'step': 2710, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:57.841844', 'step': 2710, 'epoch': 2} {'type': 'loss', 'content': 0.010043567046523094, 'timestamp': '2025-10-01 04:19:57.854423', 'step': 2711, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:57.893942', 'step': 2711, 'epoch': 2} {'type': 'loss', 'content': 0.004512489307671785, 'timestamp': '2025-10-01 04:19:57.927458', 'step': 2712, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:57.965074', 'step': 2712, 'epoch': 2} {'type': 'loss', 'content': 0.010248801670968533, 'timestamp': '2025-10-01 04:19:57.975188', 'step': 2713, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:58.014523', 'step': 2713, 'epoch': 2} {'type': 'loss', 'content': 0.00948659423738718, 'timestamp': '2025-10-01 04:19:58.027046', 'step': 2714, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:58.061464', 'step': 2714, 'epoch': 2} {'type': 'loss', 'content': 0.011193429119884968, 'timestamp': '2025-10-01 04:19:58.072076', 'step': 2715, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:19:58.106730', 'step': 2715, 'epoch': 2} {'type': 'loss', 'content': 0.007273251190781593, 'timestamp': '2025-10-01 04:19:58.135639', 'step': 2716, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:58.173744', 'step': 2716, 'epoch': 2} {'type': 'loss', 'content': 0.010983197018504143, 'timestamp': '2025-10-01 04:19:58.187255', 'step': 2717, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:19:58.231699', 'step': 2717, 'epoch': 2} {'type': 'loss', 'content': 0.0073629203252494335, 'timestamp': '2025-10-01 04:19:58.245707', 'step': 2718, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:58.300014', 'step': 2718, 'epoch': 2} {'type': 'loss', 'content': 0.0061669303104281425, 'timestamp': '2025-10-01 04:19:58.312756', 'step': 2719, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:58.354262', 'step': 2719, 'epoch': 2} {'type': 'loss', 'content': 0.012418197467923164, 'timestamp': '2025-10-01 04:19:58.388768', 'step': 2720, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:58.431237', 'step': 2720, 'epoch': 2} {'type': 'loss', 'content': 0.007612817920744419, 'timestamp': '2025-10-01 04:19:58.441734', 'step': 2721, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:58.479209', 'step': 2721, 'epoch': 2} {'type': 'loss', 'content': 0.007093400694429874, 'timestamp': '2025-10-01 04:19:58.490086', 'step': 2722, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:58.524617', 'step': 2722, 'epoch': 2} {'type': 'loss', 'content': 0.021469037979841232, 'timestamp': '2025-10-01 04:19:58.532391', 'step': 2723, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:19:58.575313', 'step': 2723, 'epoch': 2} {'type': 'loss', 'content': 0.0036643273197114468, 'timestamp': '2025-10-01 04:19:58.610455', 'step': 2724, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:58.650548', 'step': 2724, 'epoch': 2} {'type': 'loss', 'content': 0.016714150086045265, 'timestamp': '2025-10-01 04:19:58.661212', 'step': 2725, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:58.702483', 'step': 2725, 'epoch': 2} {'type': 'loss', 'content': 0.011271446011960506, 'timestamp': '2025-10-01 04:19:58.716499', 'step': 2726, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:58.753096', 'step': 2726, 'epoch': 2} {'type': 'loss', 'content': 0.008345658890902996, 'timestamp': '2025-10-01 04:19:58.765700', 'step': 2727, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:58.803454', 'step': 2727, 'epoch': 2} {'type': 'loss', 'content': 0.01077754981815815, 'timestamp': '2025-10-01 04:19:58.836173', 'step': 2728, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:58.870271', 'step': 2728, 'epoch': 2} {'type': 'loss', 'content': 0.015486798249185085, 'timestamp': '2025-10-01 04:19:58.875990', 'step': 2729, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:19:58.917968', 'step': 2729, 'epoch': 2} {'type': 'loss', 'content': 0.010050512850284576, 'timestamp': '2025-10-01 04:19:58.931981', 'step': 2730, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:58.980067', 'step': 2730, 'epoch': 2} {'type': 'loss', 'content': 0.008703194558620453, 'timestamp': '2025-10-01 04:19:58.992589', 'step': 2731, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:19:59.052901', 'step': 2731, 'epoch': 2} {'type': 'loss', 'content': 0.005330631509423256, 'timestamp': '2025-10-01 04:19:59.089618', 'step': 2732, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:19:59.128500', 'step': 2732, 'epoch': 2} {'type': 'loss', 'content': 0.010040638968348503, 'timestamp': '2025-10-01 04:19:59.139648', 'step': 2733, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:59.196133', 'step': 2733, 'epoch': 2} {'type': 'loss', 'content': 0.011791720055043697, 'timestamp': '2025-10-01 04:19:59.208685', 'step': 2734, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:19:59.254004', 'step': 2734, 'epoch': 2} {'type': 'loss', 'content': 0.005446192342787981, 'timestamp': '2025-10-01 04:19:59.258995', 'step': 2735, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:59.304873', 'step': 2735, 'epoch': 2} {'type': 'loss', 'content': 0.007481151260435581, 'timestamp': '2025-10-01 04:19:59.333226', 'step': 2736, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:19:59.370357', 'step': 2736, 'epoch': 2} {'type': 'loss', 'content': 0.0038906699046492577, 'timestamp': '2025-10-01 04:19:59.375337', 'step': 2737, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:19:59.407100', 'step': 2737, 'epoch': 2} {'type': 'loss', 'content': 0.005662490613758564, 'timestamp': '2025-10-01 04:19:59.415430', 'step': 2738, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:59.450573', 'step': 2738, 'epoch': 2} {'type': 'loss', 'content': 0.007225668523460627, 'timestamp': '2025-10-01 04:19:59.461538', 'step': 2739, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:59.495863', 'step': 2739, 'epoch': 2} {'type': 'loss', 'content': 0.005644384305924177, 'timestamp': '2025-10-01 04:19:59.528656', 'step': 2740, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:59.570735', 'step': 2740, 'epoch': 2} {'type': 'loss', 'content': 0.006858758628368378, 'timestamp': '2025-10-01 04:19:59.579591', 'step': 2741, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:19:59.622174', 'step': 2741, 'epoch': 2} {'type': 'loss', 'content': 0.004963989369571209, 'timestamp': '2025-10-01 04:19:59.635781', 'step': 2742, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:59.671039', 'step': 2742, 'epoch': 2} {'type': 'loss', 'content': 0.008574236184358597, 'timestamp': '2025-10-01 04:19:59.682359', 'step': 2743, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:59.715391', 'step': 2743, 'epoch': 2} {'type': 'loss', 'content': 0.0022066242527216673, 'timestamp': '2025-10-01 04:19:59.743877', 'step': 2744, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:59.780589', 'step': 2744, 'epoch': 2} {'type': 'loss', 'content': 0.012227991595864296, 'timestamp': '2025-10-01 04:19:59.789853', 'step': 2745, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:19:59.826096', 'step': 2745, 'epoch': 2} {'type': 'loss', 'content': 0.012536893598735332, 'timestamp': '2025-10-01 04:19:59.837941', 'step': 2746, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:19:59.875710', 'step': 2746, 'epoch': 2} {'type': 'loss', 'content': 0.009590533562004566, 'timestamp': '2025-10-01 04:19:59.886815', 'step': 2747, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:19:59.921381', 'step': 2747, 'epoch': 2} {'type': 'loss', 'content': 0.011384689249098301, 'timestamp': '2025-10-01 04:19:59.949890', 'step': 2748, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:19:59.986175', 'step': 2748, 'epoch': 2} {'type': 'loss', 'content': 0.005200240761041641, 'timestamp': '2025-10-01 04:19:59.996618', 'step': 2749, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:00.041188', 'step': 2749, 'epoch': 2} {'type': 'loss', 'content': 0.0013764083851128817, 'timestamp': '2025-10-01 04:20:00.046061', 'step': 2750, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:00.090655', 'step': 2750, 'epoch': 2} {'type': 'loss', 'content': 0.006514886859804392, 'timestamp': '2025-10-01 04:20:00.098892', 'step': 2751, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:00.135530', 'step': 2751, 'epoch': 2} {'type': 'loss', 'content': 0.008776111528277397, 'timestamp': '2025-10-01 04:20:00.161366', 'step': 2752, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:00.198944', 'step': 2752, 'epoch': 2} {'type': 'loss', 'content': 0.007739328313618898, 'timestamp': '2025-10-01 04:20:00.204175', 'step': 2753, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:00.241928', 'step': 2753, 'epoch': 2} {'type': 'loss', 'content': 0.02466069720685482, 'timestamp': '2025-10-01 04:20:00.249991', 'step': 2754, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:00.298941', 'step': 2754, 'epoch': 2} {'type': 'loss', 'content': 0.009229732677340508, 'timestamp': '2025-10-01 04:20:00.307123', 'step': 2755, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:00.345486', 'step': 2755, 'epoch': 2} {'type': 'loss', 'content': 0.009838208556175232, 'timestamp': '2025-10-01 04:20:00.377315', 'step': 2756, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:00.417970', 'step': 2756, 'epoch': 2} {'type': 'loss', 'content': 0.011494297534227371, 'timestamp': '2025-10-01 04:20:00.426632', 'step': 2757, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:00.466624', 'step': 2757, 'epoch': 2} {'type': 'loss', 'content': 0.002348638139665127, 'timestamp': '2025-10-01 04:20:00.474598', 'step': 2758, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:20:00.527435', 'step': 2758, 'epoch': 2} {'type': 'loss', 'content': 0.008111391216516495, 'timestamp': '2025-10-01 04:20:00.545354', 'step': 2759, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:00.582255', 'step': 2759, 'epoch': 2} {'type': 'loss', 'content': 0.013759814202785492, 'timestamp': '2025-10-01 04:20:00.615996', 'step': 2760, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:20:03.393646', 'step': 2760, 'epoch': 2} {'type': 'pplx', 'content': 5.662969614205734, 'timestamp': '2025-10-01 04:20:03.396591', 'step': 2760, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:03.430718', 'step': 2760, 'epoch': 2} {'type': 'loss', 'content': 0.009472260251641273, 'timestamp': '2025-10-01 04:20:03.443494', 'step': 2761, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:03.487734', 'step': 2761, 'epoch': 2} {'type': 'loss', 'content': 0.006601423490792513, 'timestamp': '2025-10-01 04:20:03.498261', 'step': 2762, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:03.537583', 'step': 2762, 'epoch': 2} {'type': 'loss', 'content': 0.0069335754960775375, 'timestamp': '2025-10-01 04:20:03.548399', 'step': 2763, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:03.593701', 'step': 2763, 'epoch': 2} {'type': 'loss', 'content': 0.004476483445614576, 'timestamp': '2025-10-01 04:20:03.622611', 'step': 2764, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:03.691210', 'step': 2764, 'epoch': 2} {'type': 'loss', 'content': 0.009364171884953976, 'timestamp': '2025-10-01 04:20:03.699606', 'step': 2765, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:03.739207', 'step': 2765, 'epoch': 2} {'type': 'loss', 'content': 0.0022827431093901396, 'timestamp': '2025-10-01 04:20:03.751755', 'step': 2766, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:20:03.800126', 'step': 2766, 'epoch': 2} {'type': 'loss', 'content': 0.0028697082307189703, 'timestamp': '2025-10-01 04:20:03.817378', 'step': 2767, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:03.861473', 'step': 2767, 'epoch': 2} {'type': 'loss', 'content': 0.004702012985944748, 'timestamp': '2025-10-01 04:20:03.893210', 'step': 2768, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:03.934378', 'step': 2768, 'epoch': 2} {'type': 'loss', 'content': 0.001989358803257346, 'timestamp': '2025-10-01 04:20:03.940429', 'step': 2769, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:03.974740', 'step': 2769, 'epoch': 2} {'type': 'loss', 'content': 0.0012353956699371338, 'timestamp': '2025-10-01 04:20:03.981736', 'step': 2770, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:04.014517', 'step': 2770, 'epoch': 2} {'type': 'loss', 'content': 0.0037311892956495285, 'timestamp': '2025-10-01 04:20:04.025296', 'step': 2771, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:04.062315', 'step': 2771, 'epoch': 2} {'type': 'loss', 'content': 0.001359867979772389, 'timestamp': '2025-10-01 04:20:04.088021', 'step': 2772, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:04.128244', 'step': 2772, 'epoch': 2} {'type': 'loss', 'content': 0.005257639102637768, 'timestamp': '2025-10-01 04:20:04.133515', 'step': 2773, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:04.172486', 'step': 2773, 'epoch': 2} {'type': 'loss', 'content': 0.0098988963291049, 'timestamp': '2025-10-01 04:20:04.186445', 'step': 2774, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:04.229850', 'step': 2774, 'epoch': 2} {'type': 'loss', 'content': 0.0013850328978151083, 'timestamp': '2025-10-01 04:20:04.237482', 'step': 2775, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:04.277560', 'step': 2775, 'epoch': 2} {'type': 'loss', 'content': 0.009057503193616867, 'timestamp': '2025-10-01 04:20:04.306478', 'step': 2776, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:04.353026', 'step': 2776, 'epoch': 2} {'type': 'loss', 'content': 0.0076464139856398106, 'timestamp': '2025-10-01 04:20:04.364089', 'step': 2777, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:04.399307', 'step': 2777, 'epoch': 2} {'type': 'loss', 'content': 0.01059963833540678, 'timestamp': '2025-10-01 04:20:04.404114', 'step': 2778, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:04.452526', 'step': 2778, 'epoch': 2} {'type': 'loss', 'content': 0.0038283111061900854, 'timestamp': '2025-10-01 04:20:04.460022', 'step': 2779, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:04.509703', 'step': 2779, 'epoch': 2} {'type': 'loss', 'content': 0.005443652626127005, 'timestamp': '2025-10-01 04:20:04.544148', 'step': 2780, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:20:04.586015', 'step': 2780, 'epoch': 2} {'type': 'loss', 'content': 0.006291181314736605, 'timestamp': '2025-10-01 04:20:04.601556', 'step': 2781, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:04.636742', 'step': 2781, 'epoch': 2} {'type': 'loss', 'content': 0.004968506749719381, 'timestamp': '2025-10-01 04:20:04.647526', 'step': 2782, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:04.691326', 'step': 2782, 'epoch': 2} {'type': 'loss', 'content': 0.015688329935073853, 'timestamp': '2025-10-01 04:20:04.705332', 'step': 2783, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:20:04.766131', 'step': 2783, 'epoch': 2} {'type': 'loss', 'content': 0.006138908211141825, 'timestamp': '2025-10-01 04:20:04.804315', 'step': 2784, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:04.841134', 'step': 2784, 'epoch': 2} {'type': 'loss', 'content': 0.004718221724033356, 'timestamp': '2025-10-01 04:20:04.851553', 'step': 2785, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:04.887403', 'step': 2785, 'epoch': 2} {'type': 'loss', 'content': 0.006512233521789312, 'timestamp': '2025-10-01 04:20:04.898198', 'step': 2786, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:04.941644', 'step': 2786, 'epoch': 2} {'type': 'loss', 'content': 0.0053376974537968636, 'timestamp': '2025-10-01 04:20:04.955660', 'step': 2787, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:04.999154', 'step': 2787, 'epoch': 2} {'type': 'loss', 'content': 0.015755876898765564, 'timestamp': '2025-10-01 04:20:05.037322', 'step': 2788, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:05.080578', 'step': 2788, 'epoch': 2} {'type': 'loss', 'content': 0.00944597739726305, 'timestamp': '2025-10-01 04:20:05.088705', 'step': 2789, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:20:05.150060', 'step': 2789, 'epoch': 2} {'type': 'loss', 'content': 0.005791092291474342, 'timestamp': '2025-10-01 04:20:05.165909', 'step': 2790, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:05.210971', 'step': 2790, 'epoch': 2} {'type': 'loss', 'content': 0.007202432490885258, 'timestamp': '2025-10-01 04:20:05.223710', 'step': 2791, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:05.266466', 'step': 2791, 'epoch': 2} {'type': 'loss', 'content': 0.00627396022900939, 'timestamp': '2025-10-01 04:20:05.301422', 'step': 2792, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:05.341848', 'step': 2792, 'epoch': 2} {'type': 'loss', 'content': 0.005290024448186159, 'timestamp': '2025-10-01 04:20:05.352328', 'step': 2793, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:05.400068', 'step': 2793, 'epoch': 2} {'type': 'loss', 'content': 0.007185117807239294, 'timestamp': '2025-10-01 04:20:05.408075', 'step': 2794, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:05.451693', 'step': 2794, 'epoch': 2} {'type': 'loss', 'content': 0.004961288999766111, 'timestamp': '2025-10-01 04:20:05.459898', 'step': 2795, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:05.500707', 'step': 2795, 'epoch': 2} {'type': 'loss', 'content': 0.009141203947365284, 'timestamp': '2025-10-01 04:20:05.529116', 'step': 2796, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:05.582549', 'step': 2796, 'epoch': 2} {'type': 'loss', 'content': 0.00777080375701189, 'timestamp': '2025-10-01 04:20:05.588484', 'step': 2797, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:05.645348', 'step': 2797, 'epoch': 2} {'type': 'loss', 'content': 0.011773726902902126, 'timestamp': '2025-10-01 04:20:05.656865', 'step': 2798, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:05.708808', 'step': 2798, 'epoch': 2} {'type': 'loss', 'content': 0.009269775822758675, 'timestamp': '2025-10-01 04:20:05.717093', 'step': 2799, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:05.761673', 'step': 2799, 'epoch': 2} {'type': 'loss', 'content': 0.008173949085175991, 'timestamp': '2025-10-01 04:20:05.794298', 'step': 2800, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:05.834054', 'step': 2800, 'epoch': 2} {'type': 'loss', 'content': 0.00479099852964282, 'timestamp': '2025-10-01 04:20:05.845090', 'step': 2801, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:05.881843', 'step': 2801, 'epoch': 2} {'type': 'loss', 'content': 0.0030352564062923193, 'timestamp': '2025-10-01 04:20:05.893106', 'step': 2802, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:05.929675', 'step': 2802, 'epoch': 2} {'type': 'loss', 'content': 0.012404393404722214, 'timestamp': '2025-10-01 04:20:05.942415', 'step': 2803, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:05.986064', 'step': 2803, 'epoch': 2} {'type': 'loss', 'content': 0.008430743589997292, 'timestamp': '2025-10-01 04:20:06.014892', 'step': 2804, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:06.046363', 'step': 2804, 'epoch': 2} {'type': 'loss', 'content': 0.010874360799789429, 'timestamp': '2025-10-01 04:20:06.055455', 'step': 2805, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:06.096503', 'step': 2805, 'epoch': 2} {'type': 'loss', 'content': 0.005739609245210886, 'timestamp': '2025-10-01 04:20:06.104411', 'step': 2806, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:06.144967', 'step': 2806, 'epoch': 2} {'type': 'loss', 'content': 0.007560627069324255, 'timestamp': '2025-10-01 04:20:06.156354', 'step': 2807, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:06.193958', 'step': 2807, 'epoch': 2} {'type': 'loss', 'content': 0.0011318337637931108, 'timestamp': '2025-10-01 04:20:06.222435', 'step': 2808, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:20:06.272701', 'step': 2808, 'epoch': 2} {'type': 'loss', 'content': 0.008041491732001305, 'timestamp': '2025-10-01 04:20:06.286148', 'step': 2809, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:06.327711', 'step': 2809, 'epoch': 2} {'type': 'loss', 'content': 0.018802277743816376, 'timestamp': '2025-10-01 04:20:06.336135', 'step': 2810, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:06.378744', 'step': 2810, 'epoch': 2} {'type': 'loss', 'content': 0.00951474066823721, 'timestamp': '2025-10-01 04:20:06.385915', 'step': 2811, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:06.428568', 'step': 2811, 'epoch': 2} {'type': 'loss', 'content': 0.004624214954674244, 'timestamp': '2025-10-01 04:20:06.463528', 'step': 2812, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:06.503991', 'step': 2812, 'epoch': 2} {'type': 'loss', 'content': 0.0072152335196733475, 'timestamp': '2025-10-01 04:20:06.516779', 'step': 2813, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:06.555851', 'step': 2813, 'epoch': 2} {'type': 'loss', 'content': 0.009925422258675098, 'timestamp': '2025-10-01 04:20:06.568358', 'step': 2814, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:06.615452', 'step': 2814, 'epoch': 2} {'type': 'loss', 'content': 0.0090789208188653, 'timestamp': '2025-10-01 04:20:06.628219', 'step': 2815, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:06.677380', 'step': 2815, 'epoch': 2} {'type': 'loss', 'content': 0.011762382462620735, 'timestamp': '2025-10-01 04:20:06.712336', 'step': 2816, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 15662185694400}, 'timestamp': '2025-10-01 04:20:06.763849', 'step': 2816, 'epoch': 2} {'type': 'loss', 'content': 0.0055811055935919285, 'timestamp': '2025-10-01 04:20:06.782969', 'step': 2817, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:20:06.827372', 'step': 2817, 'epoch': 2} {'type': 'loss', 'content': 0.008749759756028652, 'timestamp': '2025-10-01 04:20:06.843198', 'step': 2818, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:06.885683', 'step': 2818, 'epoch': 2} {'type': 'loss', 'content': 0.015758449211716652, 'timestamp': '2025-10-01 04:20:06.896999', 'step': 2819, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:06.937423', 'step': 2819, 'epoch': 2} {'type': 'loss', 'content': 0.012571955099701881, 'timestamp': '2025-10-01 04:20:06.972433', 'step': 2820, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:07.017787', 'step': 2820, 'epoch': 2} {'type': 'loss', 'content': 0.006868722848594189, 'timestamp': '2025-10-01 04:20:07.023784', 'step': 2821, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:07.065622', 'step': 2821, 'epoch': 2} {'type': 'loss', 'content': 0.012446926906704903, 'timestamp': '2025-10-01 04:20:07.078317', 'step': 2822, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:20:07.122004', 'step': 2822, 'epoch': 2} {'type': 'loss', 'content': 0.002157803624868393, 'timestamp': '2025-10-01 04:20:07.137761', 'step': 2823, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:07.178853', 'step': 2823, 'epoch': 2} {'type': 'loss', 'content': 0.006884641945362091, 'timestamp': '2025-10-01 04:20:07.213800', 'step': 2824, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:07.249881', 'step': 2824, 'epoch': 2} {'type': 'loss', 'content': 0.0132798682898283, 'timestamp': '2025-10-01 04:20:07.259141', 'step': 2825, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:07.295812', 'step': 2825, 'epoch': 2} {'type': 'loss', 'content': 0.0035172479692846537, 'timestamp': '2025-10-01 04:20:07.309277', 'step': 2826, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:07.343689', 'step': 2826, 'epoch': 2} {'type': 'loss', 'content': 0.003914228640496731, 'timestamp': '2025-10-01 04:20:07.351062', 'step': 2827, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:07.393689', 'step': 2827, 'epoch': 2} {'type': 'loss', 'content': 0.0036260385531932116, 'timestamp': '2025-10-01 04:20:07.425400', 'step': 2828, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:07.465854', 'step': 2828, 'epoch': 2} {'type': 'loss', 'content': 0.007590958382934332, 'timestamp': '2025-10-01 04:20:07.476674', 'step': 2829, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:07.517787', 'step': 2829, 'epoch': 2} {'type': 'loss', 'content': 0.003985843155533075, 'timestamp': '2025-10-01 04:20:07.525301', 'step': 2830, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:07.565399', 'step': 2830, 'epoch': 2} {'type': 'loss', 'content': 0.0084436209872365, 'timestamp': '2025-10-01 04:20:07.576774', 'step': 2831, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-10-01 04:20:07.629833', 'step': 2831, 'epoch': 2} {'type': 'loss', 'content': 0.006658701691776514, 'timestamp': '2025-10-01 04:20:07.670001', 'step': 2832, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:07.704999', 'step': 2832, 'epoch': 2} {'type': 'loss', 'content': 0.0058806296437978745, 'timestamp': '2025-10-01 04:20:07.714267', 'step': 2833, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:07.751382', 'step': 2833, 'epoch': 2} {'type': 'loss', 'content': 0.0006465135957114398, 'timestamp': '2025-10-01 04:20:07.758985', 'step': 2834, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:20:07.806171', 'step': 2834, 'epoch': 2} {'type': 'loss', 'content': 0.005202346947044134, 'timestamp': '2025-10-01 04:20:07.822634', 'step': 2835, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:07.859877', 'step': 2835, 'epoch': 2} {'type': 'loss', 'content': 0.009371449239552021, 'timestamp': '2025-10-01 04:20:07.888984', 'step': 2836, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:07.924343', 'step': 2836, 'epoch': 2} {'type': 'loss', 'content': 0.0066033718176186085, 'timestamp': '2025-10-01 04:20:07.934679', 'step': 2837, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:07.973204', 'step': 2837, 'epoch': 2} {'type': 'loss', 'content': 0.0034658657386898994, 'timestamp': '2025-10-01 04:20:07.981390', 'step': 2838, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:08.019056', 'step': 2838, 'epoch': 2} {'type': 'loss', 'content': 0.008503942750394344, 'timestamp': '2025-10-01 04:20:08.027275', 'step': 2839, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:08.068725', 'step': 2839, 'epoch': 2} {'type': 'loss', 'content': 0.007493161130696535, 'timestamp': '2025-10-01 04:20:08.103207', 'step': 2840, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:08.141443', 'step': 2840, 'epoch': 2} {'type': 'loss', 'content': 0.005511393770575523, 'timestamp': '2025-10-01 04:20:08.147089', 'step': 2841, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:08.183865', 'step': 2841, 'epoch': 2} {'type': 'loss', 'content': 0.01198074221611023, 'timestamp': '2025-10-01 04:20:08.191236', 'step': 2842, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:08.225751', 'step': 2842, 'epoch': 2} {'type': 'loss', 'content': 0.013743419200181961, 'timestamp': '2025-10-01 04:20:08.233618', 'step': 2843, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:08.270046', 'step': 2843, 'epoch': 2} {'type': 'loss', 'content': 0.007173336576670408, 'timestamp': '2025-10-01 04:20:08.302676', 'step': 2844, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:08.336681', 'step': 2844, 'epoch': 2} {'type': 'loss', 'content': 0.013284138403832912, 'timestamp': '2025-10-01 04:20:08.340236', 'step': 2845, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:08.375875', 'step': 2845, 'epoch': 2} {'type': 'loss', 'content': 0.005459666717797518, 'timestamp': '2025-10-01 04:20:08.387219', 'step': 2846, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:20:08.423991', 'step': 2846, 'epoch': 2} {'type': 'loss', 'content': 0.011744055896997452, 'timestamp': '2025-10-01 04:20:08.428604', 'step': 2847, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:08.465050', 'step': 2847, 'epoch': 2} {'type': 'loss', 'content': 0.010003515519201756, 'timestamp': '2025-10-01 04:20:08.493466', 'step': 2848, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:08.527674', 'step': 2848, 'epoch': 2} {'type': 'loss', 'content': 0.003672449616715312, 'timestamp': '2025-10-01 04:20:08.532819', 'step': 2849, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:08.567844', 'step': 2849, 'epoch': 2} {'type': 'loss', 'content': 0.005920943804085255, 'timestamp': '2025-10-01 04:20:08.572880', 'step': 2850, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:08.611133', 'step': 2850, 'epoch': 2} {'type': 'loss', 'content': 0.008229375816881657, 'timestamp': '2025-10-01 04:20:08.621978', 'step': 2851, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:08.661303', 'step': 2851, 'epoch': 2} {'type': 'loss', 'content': 0.005234968848526478, 'timestamp': '2025-10-01 04:20:08.696161', 'step': 2852, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:08.734095', 'step': 2852, 'epoch': 2} {'type': 'loss', 'content': 0.008635127916932106, 'timestamp': '2025-10-01 04:20:08.747431', 'step': 2853, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:08.782217', 'step': 2853, 'epoch': 2} {'type': 'loss', 'content': 0.013977881520986557, 'timestamp': '2025-10-01 04:20:08.789304', 'step': 2854, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:08.827823', 'step': 2854, 'epoch': 2} {'type': 'loss', 'content': 0.017527638003230095, 'timestamp': '2025-10-01 04:20:08.835719', 'step': 2855, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:08.871738', 'step': 2855, 'epoch': 2} {'type': 'loss', 'content': 0.015911849215626717, 'timestamp': '2025-10-01 04:20:08.904385', 'step': 2856, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:08.938039', 'step': 2856, 'epoch': 2} {'type': 'loss', 'content': 0.006404948886483908, 'timestamp': '2025-10-01 04:20:08.947301', 'step': 2857, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:08.982816', 'step': 2857, 'epoch': 2} {'type': 'loss', 'content': 0.017667852342128754, 'timestamp': '2025-10-01 04:20:08.993857', 'step': 2858, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:09.036096', 'step': 2858, 'epoch': 2} {'type': 'loss', 'content': 0.01043705828487873, 'timestamp': '2025-10-01 04:20:09.050087', 'step': 2859, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:09.086734', 'step': 2859, 'epoch': 2} {'type': 'loss', 'content': 0.011181355454027653, 'timestamp': '2025-10-01 04:20:09.119001', 'step': 2860, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:09.154021', 'step': 2860, 'epoch': 2} {'type': 'loss', 'content': 0.009997529909014702, 'timestamp': '2025-10-01 04:20:09.159409', 'step': 2861, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:09.193585', 'step': 2861, 'epoch': 2} {'type': 'loss', 'content': 0.003989727236330509, 'timestamp': '2025-10-01 04:20:09.201715', 'step': 2862, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:20:09.243706', 'step': 2862, 'epoch': 2} {'type': 'loss', 'content': 0.011372188106179237, 'timestamp': '2025-10-01 04:20:09.257848', 'step': 2863, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 17560600598464}, 'timestamp': '2025-10-01 04:20:09.314786', 'step': 2863, 'epoch': 2} {'type': 'loss', 'content': 0.003729795338585973, 'timestamp': '2025-10-01 04:20:09.356989', 'step': 2864, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:09.389513', 'step': 2864, 'epoch': 2} {'type': 'loss', 'content': 0.008692595176398754, 'timestamp': '2025-10-01 04:20:09.394981', 'step': 2865, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:09.434843', 'step': 2865, 'epoch': 2} {'type': 'loss', 'content': 0.001967673422768712, 'timestamp': '2025-10-01 04:20:09.448832', 'step': 2866, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:09.482934', 'step': 2866, 'epoch': 2} {'type': 'loss', 'content': 0.0047103953547775745, 'timestamp': '2025-10-01 04:20:09.490011', 'step': 2867, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:09.521022', 'step': 2867, 'epoch': 2} {'type': 'loss', 'content': 0.0046462141908705235, 'timestamp': '2025-10-01 04:20:09.552732', 'step': 2868, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:09.588288', 'step': 2868, 'epoch': 2} {'type': 'loss', 'content': 0.006229673977941275, 'timestamp': '2025-10-01 04:20:09.593695', 'step': 2869, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:09.630153', 'step': 2869, 'epoch': 2} {'type': 'loss', 'content': 0.006125009153038263, 'timestamp': '2025-10-01 04:20:09.638191', 'step': 2870, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:09.674075', 'step': 2870, 'epoch': 2} {'type': 'loss', 'content': 0.012813194654881954, 'timestamp': '2025-10-01 04:20:09.681746', 'step': 2871, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:09.720459', 'step': 2871, 'epoch': 2} {'type': 'loss', 'content': 0.011893192306160927, 'timestamp': '2025-10-01 04:20:09.754130', 'step': 2872, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:09.791797', 'step': 2872, 'epoch': 2} {'type': 'loss', 'content': 0.006346710026264191, 'timestamp': '2025-10-01 04:20:09.800973', 'step': 2873, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:09.838303', 'step': 2873, 'epoch': 2} {'type': 'loss', 'content': 0.003786935005337, 'timestamp': '2025-10-01 04:20:09.851020', 'step': 2874, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:09.889794', 'step': 2874, 'epoch': 2} {'type': 'loss', 'content': 0.006343259941786528, 'timestamp': '2025-10-01 04:20:09.897561', 'step': 2875, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:20:12.579614', 'step': 2875, 'epoch': 2} {'type': 'pplx', 'content': 5.7017018237464265, 'timestamp': '2025-10-01 04:20:12.583411', 'step': 2875, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:12.619390', 'step': 2875, 'epoch': 2} {'type': 'loss', 'content': 0.011050320230424404, 'timestamp': '2025-10-01 04:20:12.650024', 'step': 2876, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:20:12.691326', 'step': 2876, 'epoch': 2} {'type': 'loss', 'content': 0.007033767644315958, 'timestamp': '2025-10-01 04:20:12.694791', 'step': 2877, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:12.738262', 'step': 2877, 'epoch': 2} {'type': 'loss', 'content': 0.006943574640899897, 'timestamp': '2025-10-01 04:20:12.746260', 'step': 2878, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:12.787790', 'step': 2878, 'epoch': 2} {'type': 'loss', 'content': 0.00696661276742816, 'timestamp': '2025-10-01 04:20:12.801312', 'step': 2879, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:12.840529', 'step': 2879, 'epoch': 2} {'type': 'loss', 'content': 0.01203068345785141, 'timestamp': '2025-10-01 04:20:12.869757', 'step': 2880, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:12.910943', 'step': 2880, 'epoch': 2} {'type': 'loss', 'content': 0.00823859591037035, 'timestamp': '2025-10-01 04:20:12.919511', 'step': 2881, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:12.954287', 'step': 2881, 'epoch': 2} {'type': 'loss', 'content': 0.01075970008969307, 'timestamp': '2025-10-01 04:20:12.962615', 'step': 2882, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:13.000895', 'step': 2882, 'epoch': 2} {'type': 'loss', 'content': 0.006671941839158535, 'timestamp': '2025-10-01 04:20:13.013394', 'step': 2883, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:13.054683', 'step': 2883, 'epoch': 2} {'type': 'loss', 'content': 0.008895883336663246, 'timestamp': '2025-10-01 04:20:13.089190', 'step': 2884, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:13.130699', 'step': 2884, 'epoch': 2} {'type': 'loss', 'content': 0.010510117746889591, 'timestamp': '2025-10-01 04:20:13.136509', 'step': 2885, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:13.175802', 'step': 2885, 'epoch': 2} {'type': 'loss', 'content': 0.00869205966591835, 'timestamp': '2025-10-01 04:20:13.184181', 'step': 2886, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:13.221586', 'step': 2886, 'epoch': 2} {'type': 'loss', 'content': 0.005199786741286516, 'timestamp': '2025-10-01 04:20:13.230029', 'step': 2887, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:13.267461', 'step': 2887, 'epoch': 2} {'type': 'loss', 'content': 0.00498600909486413, 'timestamp': '2025-10-01 04:20:13.301171', 'step': 2888, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:20:13.354169', 'step': 2888, 'epoch': 2} {'type': 'loss', 'content': 0.006628416944295168, 'timestamp': '2025-10-01 04:20:13.369447', 'step': 2889, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:13.428841', 'step': 2889, 'epoch': 2} {'type': 'loss', 'content': 0.01063564233481884, 'timestamp': '2025-10-01 04:20:13.442820', 'step': 2890, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:13.482147', 'step': 2890, 'epoch': 2} {'type': 'loss', 'content': 0.01249577198177576, 'timestamp': '2025-10-01 04:20:13.489327', 'step': 2891, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:20:13.536603', 'step': 2891, 'epoch': 2} {'type': 'loss', 'content': 0.006182270124554634, 'timestamp': '2025-10-01 04:20:13.574020', 'step': 2892, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:13.611898', 'step': 2892, 'epoch': 2} {'type': 'loss', 'content': 0.00674985907971859, 'timestamp': '2025-10-01 04:20:13.624728', 'step': 2893, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:13.668548', 'step': 2893, 'epoch': 2} {'type': 'loss', 'content': 0.003937273286283016, 'timestamp': '2025-10-01 04:20:13.682534', 'step': 2894, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:13.717917', 'step': 2894, 'epoch': 2} {'type': 'loss', 'content': 0.008274332620203495, 'timestamp': '2025-10-01 04:20:13.726154', 'step': 2895, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:20:13.763970', 'step': 2895, 'epoch': 2} {'type': 'loss', 'content': 0.009593036025762558, 'timestamp': '2025-10-01 04:20:13.799016', 'step': 2896, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:13.836357', 'step': 2896, 'epoch': 2} {'type': 'loss', 'content': 0.0026286705397069454, 'timestamp': '2025-10-01 04:20:13.842035', 'step': 2897, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:13.876484', 'step': 2897, 'epoch': 2} {'type': 'loss', 'content': 0.0030282428488135338, 'timestamp': '2025-10-01 04:20:13.884850', 'step': 2898, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:13.919982', 'step': 2898, 'epoch': 2} {'type': 'loss', 'content': 0.0075779864564538, 'timestamp': '2025-10-01 04:20:13.930754', 'step': 2899, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:13.971904', 'step': 2899, 'epoch': 2} {'type': 'loss', 'content': 0.0035738463047891855, 'timestamp': '2025-10-01 04:20:14.005557', 'step': 2900, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:20:14.050048', 'step': 2900, 'epoch': 2} {'type': 'loss', 'content': 0.0046758693642914295, 'timestamp': '2025-10-01 04:20:14.066045', 'step': 2901, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:14.102176', 'step': 2901, 'epoch': 2} {'type': 'loss', 'content': 0.009584681130945683, 'timestamp': '2025-10-01 04:20:14.112978', 'step': 2902, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:14.163038', 'step': 2902, 'epoch': 2} {'type': 'loss', 'content': 0.01251443475484848, 'timestamp': '2025-10-01 04:20:14.171956', 'step': 2903, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:14.206648', 'step': 2903, 'epoch': 2} {'type': 'loss', 'content': 0.002298504114151001, 'timestamp': '2025-10-01 04:20:14.235640', 'step': 2904, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:14.275312', 'step': 2904, 'epoch': 2} {'type': 'loss', 'content': 0.0067415363155305386, 'timestamp': '2025-10-01 04:20:14.288645', 'step': 2905, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:14.330703', 'step': 2905, 'epoch': 2} {'type': 'loss', 'content': 0.004968035034835339, 'timestamp': '2025-10-01 04:20:14.338126', 'step': 2906, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:14.371465', 'step': 2906, 'epoch': 2} {'type': 'loss', 'content': 0.01209714263677597, 'timestamp': '2025-10-01 04:20:14.379512', 'step': 2907, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:14.417696', 'step': 2907, 'epoch': 2} {'type': 'loss', 'content': 0.008974813856184483, 'timestamp': '2025-10-01 04:20:14.450085', 'step': 2908, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:14.484972', 'step': 2908, 'epoch': 2} {'type': 'loss', 'content': 0.00694839283823967, 'timestamp': '2025-10-01 04:20:14.492901', 'step': 2909, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:14.526794', 'step': 2909, 'epoch': 2} {'type': 'loss', 'content': 0.01402649562805891, 'timestamp': '2025-10-01 04:20:14.537399', 'step': 2910, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:14.582148', 'step': 2910, 'epoch': 2} {'type': 'loss', 'content': 0.013558730483055115, 'timestamp': '2025-10-01 04:20:14.594629', 'step': 2911, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:14.652591', 'step': 2911, 'epoch': 2} {'type': 'loss', 'content': 0.005118176341056824, 'timestamp': '2025-10-01 04:20:14.687078', 'step': 2912, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:14.722118', 'step': 2912, 'epoch': 2} {'type': 'loss', 'content': 0.011846575886011124, 'timestamp': '2025-10-01 04:20:14.730500', 'step': 2913, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:14.765560', 'step': 2913, 'epoch': 2} {'type': 'loss', 'content': 0.00419828575104475, 'timestamp': '2025-10-01 04:20:14.773573', 'step': 2914, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:14.815435', 'step': 2914, 'epoch': 2} {'type': 'loss', 'content': 0.004554925486445427, 'timestamp': '2025-10-01 04:20:14.820360', 'step': 2915, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:14.862402', 'step': 2915, 'epoch': 2} {'type': 'loss', 'content': 0.011143339797854424, 'timestamp': '2025-10-01 04:20:14.894027', 'step': 2916, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:14.929619', 'step': 2916, 'epoch': 2} {'type': 'loss', 'content': 0.011164728552103043, 'timestamp': '2025-10-01 04:20:14.932724', 'step': 2917, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:14.971880', 'step': 2917, 'epoch': 2} {'type': 'loss', 'content': 0.011686003766953945, 'timestamp': '2025-10-01 04:20:14.979126', 'step': 2918, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:20:15.018335', 'step': 2918, 'epoch': 2} {'type': 'loss', 'content': 0.005336758214980364, 'timestamp': '2025-10-01 04:20:15.022859', 'step': 2919, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:15.060816', 'step': 2919, 'epoch': 2} {'type': 'loss', 'content': 0.007220887113362551, 'timestamp': '2025-10-01 04:20:15.094493', 'step': 2920, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:20:15.127115', 'step': 2920, 'epoch': 2} {'type': 'loss', 'content': 0.007441872730851173, 'timestamp': '2025-10-01 04:20:15.131215', 'step': 2921, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:15.165808', 'step': 2921, 'epoch': 2} {'type': 'loss', 'content': 0.005334662273526192, 'timestamp': '2025-10-01 04:20:15.170715', 'step': 2922, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:15.202923', 'step': 2922, 'epoch': 2} {'type': 'loss', 'content': 0.0018872080836445093, 'timestamp': '2025-10-01 04:20:15.207840', 'step': 2923, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:15.252942', 'step': 2923, 'epoch': 2} {'type': 'loss', 'content': 0.008337759412825108, 'timestamp': '2025-10-01 04:20:15.282211', 'step': 2924, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:15.323280', 'step': 2924, 'epoch': 2} {'type': 'loss', 'content': 0.00440233526751399, 'timestamp': '2025-10-01 04:20:15.339768', 'step': 2925, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:15.377269', 'step': 2925, 'epoch': 2} {'type': 'loss', 'content': 0.007447332609444857, 'timestamp': '2025-10-01 04:20:15.388126', 'step': 2926, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:15.425841', 'step': 2926, 'epoch': 2} {'type': 'loss', 'content': 0.005711056757718325, 'timestamp': '2025-10-01 04:20:15.433796', 'step': 2927, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:15.470729', 'step': 2927, 'epoch': 2} {'type': 'loss', 'content': 0.008309951983392239, 'timestamp': '2025-10-01 04:20:15.498893', 'step': 2928, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:15.539383', 'step': 2928, 'epoch': 2} {'type': 'loss', 'content': 0.009367150254547596, 'timestamp': '2025-10-01 04:20:15.544507', 'step': 2929, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:15.581481', 'step': 2929, 'epoch': 2} {'type': 'loss', 'content': 0.005230510141700506, 'timestamp': '2025-10-01 04:20:15.592802', 'step': 2930, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:15.627872', 'step': 2930, 'epoch': 2} {'type': 'loss', 'content': 0.003488334361463785, 'timestamp': '2025-10-01 04:20:15.640404', 'step': 2931, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:20:15.687196', 'step': 2931, 'epoch': 2} {'type': 'loss', 'content': 0.011724540963768959, 'timestamp': '2025-10-01 04:20:15.722368', 'step': 2932, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:15.756854', 'step': 2932, 'epoch': 2} {'type': 'loss', 'content': 0.00728189991787076, 'timestamp': '2025-10-01 04:20:15.767497', 'step': 2933, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:15.800888', 'step': 2933, 'epoch': 2} {'type': 'loss', 'content': 0.007082611322402954, 'timestamp': '2025-10-01 04:20:15.812347', 'step': 2934, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:15.845082', 'step': 2934, 'epoch': 2} {'type': 'loss', 'content': 0.007455884478986263, 'timestamp': '2025-10-01 04:20:15.855777', 'step': 2935, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:15.891262', 'step': 2935, 'epoch': 2} {'type': 'loss', 'content': 0.0036624986678361893, 'timestamp': '2025-10-01 04:20:15.919175', 'step': 2936, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:20:15.959300', 'step': 2936, 'epoch': 2} {'type': 'loss', 'content': 0.012950951233506203, 'timestamp': '2025-10-01 04:20:15.974567', 'step': 2937, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:16.015197', 'step': 2937, 'epoch': 2} {'type': 'loss', 'content': 0.007570296060293913, 'timestamp': '2025-10-01 04:20:16.029188', 'step': 2938, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:16.065429', 'step': 2938, 'epoch': 2} {'type': 'loss', 'content': 0.014969254843890667, 'timestamp': '2025-10-01 04:20:16.076211', 'step': 2939, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:16.115724', 'step': 2939, 'epoch': 2} {'type': 'loss', 'content': 0.007613092660903931, 'timestamp': '2025-10-01 04:20:16.144934', 'step': 2940, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:16.179698', 'step': 2940, 'epoch': 2} {'type': 'loss', 'content': 0.005192306824028492, 'timestamp': '2025-10-01 04:20:16.185348', 'step': 2941, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:16.220040', 'step': 2941, 'epoch': 2} {'type': 'loss', 'content': 0.008015657775104046, 'timestamp': '2025-10-01 04:20:16.227901', 'step': 2942, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:20:16.266914', 'step': 2942, 'epoch': 2} {'type': 'loss', 'content': 0.007415547966957092, 'timestamp': '2025-10-01 04:20:16.281116', 'step': 2943, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:16.315714', 'step': 2943, 'epoch': 2} {'type': 'loss', 'content': 0.005365028046071529, 'timestamp': '2025-10-01 04:20:16.344899', 'step': 2944, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:20:16.388955', 'step': 2944, 'epoch': 2} {'type': 'loss', 'content': 0.006125005427747965, 'timestamp': '2025-10-01 04:20:16.406062', 'step': 2945, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:16.439680', 'step': 2945, 'epoch': 2} {'type': 'loss', 'content': 0.004234274383634329, 'timestamp': '2025-10-01 04:20:16.448019', 'step': 2946, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:16.484066', 'step': 2946, 'epoch': 2} {'type': 'loss', 'content': 0.00722013134509325, 'timestamp': '2025-10-01 04:20:16.491484', 'step': 2947, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:16.524799', 'step': 2947, 'epoch': 2} {'type': 'loss', 'content': 0.008409693837165833, 'timestamp': '2025-10-01 04:20:16.553849', 'step': 2948, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:16.589105', 'step': 2948, 'epoch': 2} {'type': 'loss', 'content': 0.010376404039561749, 'timestamp': '2025-10-01 04:20:16.594775', 'step': 2949, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:16.629290', 'step': 2949, 'epoch': 2} {'type': 'loss', 'content': 0.007167127449065447, 'timestamp': '2025-10-01 04:20:16.636899', 'step': 2950, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:16.671689', 'step': 2950, 'epoch': 2} {'type': 'loss', 'content': 0.0053911940194666386, 'timestamp': '2025-10-01 04:20:16.684101', 'step': 2951, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:16.717213', 'step': 2951, 'epoch': 2} {'type': 'loss', 'content': 0.0021107583306729794, 'timestamp': '2025-10-01 04:20:16.745684', 'step': 2952, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:16.780263', 'step': 2952, 'epoch': 2} {'type': 'loss', 'content': 0.005493012256920338, 'timestamp': '2025-10-01 04:20:16.785984', 'step': 2953, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:16.820976', 'step': 2953, 'epoch': 2} {'type': 'loss', 'content': 0.011599089950323105, 'timestamp': '2025-10-01 04:20:16.833760', 'step': 2954, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:16.866704', 'step': 2954, 'epoch': 2} {'type': 'loss', 'content': 0.007048914674669504, 'timestamp': '2025-10-01 04:20:16.877466', 'step': 2955, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:16.917623', 'step': 2955, 'epoch': 2} {'type': 'loss', 'content': 0.0037417348939925432, 'timestamp': '2025-10-01 04:20:16.946176', 'step': 2956, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:16.986993', 'step': 2956, 'epoch': 2} {'type': 'loss', 'content': 0.007826708257198334, 'timestamp': '2025-10-01 04:20:17.000337', 'step': 2957, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:17.035678', 'step': 2957, 'epoch': 2} {'type': 'loss', 'content': 0.004416683688759804, 'timestamp': '2025-10-01 04:20:17.043140', 'step': 2958, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:17.079399', 'step': 2958, 'epoch': 2} {'type': 'loss', 'content': 0.004867905750870705, 'timestamp': '2025-10-01 04:20:17.090063', 'step': 2959, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:17.122844', 'step': 2959, 'epoch': 2} {'type': 'loss', 'content': 0.004675948992371559, 'timestamp': '2025-10-01 04:20:17.151244', 'step': 2960, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:20:17.197181', 'step': 2960, 'epoch': 2} {'type': 'loss', 'content': 0.0012698180507868528, 'timestamp': '2025-10-01 04:20:17.213268', 'step': 2961, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:17.249993', 'step': 2961, 'epoch': 2} {'type': 'loss', 'content': 0.010383439250290394, 'timestamp': '2025-10-01 04:20:17.257322', 'step': 2962, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:17.293276', 'step': 2962, 'epoch': 2} {'type': 'loss', 'content': 0.006809711456298828, 'timestamp': '2025-10-01 04:20:17.305981', 'step': 2963, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:17.343694', 'step': 2963, 'epoch': 2} {'type': 'loss', 'content': 0.009840160608291626, 'timestamp': '2025-10-01 04:20:17.375469', 'step': 2964, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:20:17.413197', 'step': 2964, 'epoch': 2} {'type': 'loss', 'content': 0.011258180253207684, 'timestamp': '2025-10-01 04:20:17.426742', 'step': 2965, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:20:17.465468', 'step': 2965, 'epoch': 2} {'type': 'loss', 'content': 0.0049808816984295845, 'timestamp': '2025-10-01 04:20:17.479674', 'step': 2966, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:17.514103', 'step': 2966, 'epoch': 2} {'type': 'loss', 'content': 0.003090056125074625, 'timestamp': '2025-10-01 04:20:17.526616', 'step': 2967, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:17.562176', 'step': 2967, 'epoch': 2} {'type': 'loss', 'content': 0.012052194215357304, 'timestamp': '2025-10-01 04:20:17.591006', 'step': 2968, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:20:17.627111', 'step': 2968, 'epoch': 2} {'type': 'loss', 'content': 0.00992939155548811, 'timestamp': '2025-10-01 04:20:17.640655', 'step': 2969, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:17.675183', 'step': 2969, 'epoch': 2} {'type': 'loss', 'content': 0.0038914831820875406, 'timestamp': '2025-10-01 04:20:17.687944', 'step': 2970, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:20:17.730015', 'step': 2970, 'epoch': 2} {'type': 'loss', 'content': 0.013575401157140732, 'timestamp': '2025-10-01 04:20:17.745743', 'step': 2971, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:17.780794', 'step': 2971, 'epoch': 2} {'type': 'loss', 'content': 0.006032585632055998, 'timestamp': '2025-10-01 04:20:17.809983', 'step': 2972, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:17.848010', 'step': 2972, 'epoch': 2} {'type': 'loss', 'content': 0.008904431946575642, 'timestamp': '2025-10-01 04:20:17.861333', 'step': 2973, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:17.899792', 'step': 2973, 'epoch': 2} {'type': 'loss', 'content': 0.006328975316137075, 'timestamp': '2025-10-01 04:20:17.913756', 'step': 2974, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:17.947341', 'step': 2974, 'epoch': 2} {'type': 'loss', 'content': 0.007992769591510296, 'timestamp': '2025-10-01 04:20:17.955007', 'step': 2975, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:17.995322', 'step': 2975, 'epoch': 2} {'type': 'loss', 'content': 0.00900503434240818, 'timestamp': '2025-10-01 04:20:18.029835', 'step': 2976, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:18.066103', 'step': 2976, 'epoch': 2} {'type': 'loss', 'content': 0.008098537102341652, 'timestamp': '2025-10-01 04:20:18.077008', 'step': 2977, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:18.113972', 'step': 2977, 'epoch': 2} {'type': 'loss', 'content': 0.0017875717021524906, 'timestamp': '2025-10-01 04:20:18.126500', 'step': 2978, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:18.162034', 'step': 2978, 'epoch': 2} {'type': 'loss', 'content': 0.006715565454214811, 'timestamp': '2025-10-01 04:20:18.173599', 'step': 2979, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:18.207671', 'step': 2979, 'epoch': 2} {'type': 'loss', 'content': 0.011321460828185081, 'timestamp': '2025-10-01 04:20:18.236103', 'step': 2980, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:18.271408', 'step': 2980, 'epoch': 2} {'type': 'loss', 'content': 0.001267609535716474, 'timestamp': '2025-10-01 04:20:18.276552', 'step': 2981, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:18.313841', 'step': 2981, 'epoch': 2} {'type': 'loss', 'content': 0.009146043099462986, 'timestamp': '2025-10-01 04:20:18.326403', 'step': 2982, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:18.364553', 'step': 2982, 'epoch': 2} {'type': 'loss', 'content': 0.012785335071384907, 'timestamp': '2025-10-01 04:20:18.376317', 'step': 2983, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:18.414338', 'step': 2983, 'epoch': 2} {'type': 'loss', 'content': 0.0038476521149277687, 'timestamp': '2025-10-01 04:20:18.446932', 'step': 2984, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:18.485409', 'step': 2984, 'epoch': 2} {'type': 'loss', 'content': 0.010602974332869053, 'timestamp': '2025-10-01 04:20:18.498796', 'step': 2985, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:18.537709', 'step': 2985, 'epoch': 2} {'type': 'loss', 'content': 0.007367995101958513, 'timestamp': '2025-10-01 04:20:18.545584', 'step': 2986, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:18.582975', 'step': 2986, 'epoch': 2} {'type': 'loss', 'content': 0.005316292867064476, 'timestamp': '2025-10-01 04:20:18.595454', 'step': 2987, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:18.629902', 'step': 2987, 'epoch': 2} {'type': 'loss', 'content': 0.013725318014621735, 'timestamp': '2025-10-01 04:20:18.662475', 'step': 2988, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:18.697930', 'step': 2988, 'epoch': 2} {'type': 'loss', 'content': 0.003205403219908476, 'timestamp': '2025-10-01 04:20:18.703363', 'step': 2989, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:18.741418', 'step': 2989, 'epoch': 2} {'type': 'loss', 'content': 0.004831740632653236, 'timestamp': '2025-10-01 04:20:18.753913', 'step': 2990, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:20:21.308196', 'step': 2990, 'epoch': 2} {'type': 'pplx', 'content': 6.08135961289695, 'timestamp': '2025-10-01 04:20:21.310617', 'step': 2990, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:21.343467', 'step': 2990, 'epoch': 2} {'type': 'loss', 'content': 0.018393509089946747, 'timestamp': '2025-10-01 04:20:21.350073', 'step': 2991, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:21.387354', 'step': 2991, 'epoch': 2} {'type': 'loss', 'content': 0.0023882524110376835, 'timestamp': '2025-10-01 04:20:21.420881', 'step': 2992, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:21.457922', 'step': 2992, 'epoch': 2} {'type': 'loss', 'content': 0.006412725895643234, 'timestamp': '2025-10-01 04:20:21.467069', 'step': 2993, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:21.504226', 'step': 2993, 'epoch': 2} {'type': 'loss', 'content': 0.002937781158834696, 'timestamp': '2025-10-01 04:20:21.512000', 'step': 2994, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:21.549049', 'step': 2994, 'epoch': 2} {'type': 'loss', 'content': 0.01626615598797798, 'timestamp': '2025-10-01 04:20:21.560737', 'step': 2995, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:21.597217', 'step': 2995, 'epoch': 2} {'type': 'loss', 'content': 0.011737152002751827, 'timestamp': '2025-10-01 04:20:21.629521', 'step': 2996, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:20:21.673544', 'step': 2996, 'epoch': 2} {'type': 'loss', 'content': 0.0038055642507970333, 'timestamp': '2025-10-01 04:20:21.690389', 'step': 2997, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:21.729196', 'step': 2997, 'epoch': 2} {'type': 'loss', 'content': 0.008987641893327236, 'timestamp': '2025-10-01 04:20:21.742754', 'step': 2998, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:21.783076', 'step': 2998, 'epoch': 2} {'type': 'loss', 'content': 0.022937040776014328, 'timestamp': '2025-10-01 04:20:21.797136', 'step': 2999, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:21.835141', 'step': 2999, 'epoch': 2} {'type': 'loss', 'content': 0.014617622829973698, 'timestamp': '2025-10-01 04:20:21.869683', 'step': 3000, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 3000', 'timestamp': '2025-10-01 04:20:27.242296', 'step': 3000, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:27.290092', 'step': 3000, 'epoch': 2} {'type': 'loss', 'content': 0.0068433526903390884, 'timestamp': '2025-10-01 04:20:27.303306', 'step': 3001, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:27.335775', 'step': 3001, 'epoch': 2} {'type': 'loss', 'content': 0.006373145617544651, 'timestamp': '2025-10-01 04:20:27.348340', 'step': 3002, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:27.380369', 'step': 3002, 'epoch': 2} {'type': 'loss', 'content': 0.00855661928653717, 'timestamp': '2025-10-01 04:20:27.392895', 'step': 3003, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:27.428026', 'step': 3003, 'epoch': 2} {'type': 'loss', 'content': 0.003829771187156439, 'timestamp': '2025-10-01 04:20:27.462990', 'step': 3004, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:20:27.500253', 'step': 3004, 'epoch': 2} {'type': 'loss', 'content': 0.005711012054234743, 'timestamp': '2025-10-01 04:20:27.516147', 'step': 3005, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:27.548537', 'step': 3005, 'epoch': 2} {'type': 'loss', 'content': 0.010866481810808182, 'timestamp': '2025-10-01 04:20:27.559976', 'step': 3006, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:27.592977', 'step': 3006, 'epoch': 2} {'type': 'loss', 'content': 0.006775491870939732, 'timestamp': '2025-10-01 04:20:27.605726', 'step': 3007, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:27.640926', 'step': 3007, 'epoch': 2} {'type': 'loss', 'content': 0.0068846396170556545, 'timestamp': '2025-10-01 04:20:27.675996', 'step': 3008, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:27.706675', 'step': 3008, 'epoch': 2} {'type': 'loss', 'content': 0.012444336898624897, 'timestamp': '2025-10-01 04:20:27.717462', 'step': 3009, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:27.748947', 'step': 3009, 'epoch': 2} {'type': 'loss', 'content': 0.007706195581704378, 'timestamp': '2025-10-01 04:20:27.761833', 'step': 3010, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:27.793890', 'step': 3010, 'epoch': 2} {'type': 'loss', 'content': 0.007974070496857166, 'timestamp': '2025-10-01 04:20:27.806680', 'step': 3011, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:27.838820', 'step': 3011, 'epoch': 2} {'type': 'loss', 'content': 0.007637516595423222, 'timestamp': '2025-10-01 04:20:27.872549', 'step': 3012, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:27.904863', 'step': 3012, 'epoch': 2} {'type': 'loss', 'content': 0.008238261565566063, 'timestamp': '2025-10-01 04:20:27.915825', 'step': 3013, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:27.955990', 'step': 3013, 'epoch': 2} {'type': 'loss', 'content': 0.009296061471104622, 'timestamp': '2025-10-01 04:20:27.969966', 'step': 3014, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:20:28.011701', 'step': 3014, 'epoch': 2} {'type': 'loss', 'content': 0.008963196538388729, 'timestamp': '2025-10-01 04:20:28.027572', 'step': 3015, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:28.067679', 'step': 3015, 'epoch': 2} {'type': 'loss', 'content': 0.009730679914355278, 'timestamp': '2025-10-01 04:20:28.102641', 'step': 3016, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:20:28.146407', 'step': 3016, 'epoch': 2} {'type': 'loss', 'content': 0.00497211841866374, 'timestamp': '2025-10-01 04:20:28.163953', 'step': 3017, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:28.200858', 'step': 3017, 'epoch': 2} {'type': 'loss', 'content': 0.019599618390202522, 'timestamp': '2025-10-01 04:20:28.214549', 'step': 3018, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:28.253214', 'step': 3018, 'epoch': 2} {'type': 'loss', 'content': 0.005168627016246319, 'timestamp': '2025-10-01 04:20:28.264793', 'step': 3019, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:28.304016', 'step': 3019, 'epoch': 2} {'type': 'loss', 'content': 0.008879032917320728, 'timestamp': '2025-10-01 04:20:28.337557', 'step': 3020, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:28.372448', 'step': 3020, 'epoch': 2} {'type': 'loss', 'content': 0.008503242395818233, 'timestamp': '2025-10-01 04:20:28.385293', 'step': 3021, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:28.425424', 'step': 3021, 'epoch': 2} {'type': 'loss', 'content': 0.00830403808504343, 'timestamp': '2025-10-01 04:20:28.439463', 'step': 3022, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:28.476692', 'step': 3022, 'epoch': 2} {'type': 'loss', 'content': 0.00783919170498848, 'timestamp': '2025-10-01 04:20:28.490668', 'step': 3023, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:28.529270', 'step': 3023, 'epoch': 2} {'type': 'loss', 'content': 0.006628255359828472, 'timestamp': '2025-10-01 04:20:28.563759', 'step': 3024, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:28.599517', 'step': 3024, 'epoch': 2} {'type': 'loss', 'content': 0.011155834421515465, 'timestamp': '2025-10-01 04:20:28.609737', 'step': 3025, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:28.655170', 'step': 3025, 'epoch': 2} {'type': 'loss', 'content': 0.011049460619688034, 'timestamp': '2025-10-01 04:20:28.666857', 'step': 3026, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:28.708244', 'step': 3026, 'epoch': 2} {'type': 'loss', 'content': 0.020688027143478394, 'timestamp': '2025-10-01 04:20:28.719719', 'step': 3027, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:28.757908', 'step': 3027, 'epoch': 2} {'type': 'loss', 'content': 0.009577865712344646, 'timestamp': '2025-10-01 04:20:28.796515', 'step': 3028, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:28.831972', 'step': 3028, 'epoch': 2} {'type': 'loss', 'content': 0.007822707295417786, 'timestamp': '2025-10-01 04:20:28.850161', 'step': 3029, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:28.888960', 'step': 3029, 'epoch': 2} {'type': 'loss', 'content': 0.003746434347704053, 'timestamp': '2025-10-01 04:20:28.901002', 'step': 3030, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:20:28.952551', 'step': 3030, 'epoch': 2} {'type': 'loss', 'content': 0.014493756927549839, 'timestamp': '2025-10-01 04:20:28.969844', 'step': 3031, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:29.006515', 'step': 3031, 'epoch': 2} {'type': 'loss', 'content': 0.0071036322042346, 'timestamp': '2025-10-01 04:20:29.039939', 'step': 3032, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:29.077922', 'step': 3032, 'epoch': 2} {'type': 'loss', 'content': 0.007076086942106485, 'timestamp': '2025-10-01 04:20:29.094055', 'step': 3033, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:20:29.141569', 'step': 3033, 'epoch': 2} {'type': 'loss', 'content': 0.006607582326978445, 'timestamp': '2025-10-01 04:20:29.157541', 'step': 3034, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-10-01 04:20:29.214139', 'step': 3034, 'epoch': 2} {'type': 'loss', 'content': 0.005411273799836636, 'timestamp': '2025-10-01 04:20:29.233447', 'step': 3035, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:20:29.284340', 'step': 3035, 'epoch': 2} {'type': 'loss', 'content': 0.008733340539038181, 'timestamp': '2025-10-01 04:20:29.319469', 'step': 3036, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:29.371063', 'step': 3036, 'epoch': 2} {'type': 'loss', 'content': 0.02402772754430771, 'timestamp': '2025-10-01 04:20:29.381554', 'step': 3037, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:29.422591', 'step': 3037, 'epoch': 2} {'type': 'loss', 'content': 0.00776132196187973, 'timestamp': '2025-10-01 04:20:29.435140', 'step': 3038, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:29.477603', 'step': 3038, 'epoch': 2} {'type': 'loss', 'content': 0.004589858464896679, 'timestamp': '2025-10-01 04:20:29.491664', 'step': 3039, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:29.543535', 'step': 3039, 'epoch': 2} {'type': 'loss', 'content': 0.008054569363594055, 'timestamp': '2025-10-01 04:20:29.578662', 'step': 3040, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:20:29.620603', 'step': 3040, 'epoch': 2} {'type': 'loss', 'content': 0.007364220917224884, 'timestamp': '2025-10-01 04:20:29.636345', 'step': 3041, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:29.675854', 'step': 3041, 'epoch': 2} {'type': 'loss', 'content': 0.00405152840539813, 'timestamp': '2025-10-01 04:20:29.689858', 'step': 3042, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:29.723521', 'step': 3042, 'epoch': 2} {'type': 'loss', 'content': 0.006143369246274233, 'timestamp': '2025-10-01 04:20:29.735103', 'step': 3043, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:29.772628', 'step': 3043, 'epoch': 2} {'type': 'loss', 'content': 0.008318744599819183, 'timestamp': '2025-10-01 04:20:29.806401', 'step': 3044, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:20:29.855402', 'step': 3044, 'epoch': 2} {'type': 'loss', 'content': 0.008964316919445992, 'timestamp': '2025-10-01 04:20:29.871458', 'step': 3045, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:29.913367', 'step': 3045, 'epoch': 2} {'type': 'loss', 'content': 0.01345342118293047, 'timestamp': '2025-10-01 04:20:29.924983', 'step': 3046, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:29.966844', 'step': 3046, 'epoch': 2} {'type': 'loss', 'content': 0.007234620861709118, 'timestamp': '2025-10-01 04:20:29.980392', 'step': 3047, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:20:30.034152', 'step': 3047, 'epoch': 2} {'type': 'loss', 'content': 0.002688752254471183, 'timestamp': '2025-10-01 04:20:30.071435', 'step': 3048, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:30.108619', 'step': 3048, 'epoch': 2} {'type': 'loss', 'content': 0.008336780592799187, 'timestamp': '2025-10-01 04:20:30.121950', 'step': 3049, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:30.167426', 'step': 3049, 'epoch': 2} {'type': 'loss', 'content': 0.006187029182910919, 'timestamp': '2025-10-01 04:20:30.180328', 'step': 3050, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:20:30.229549', 'step': 3050, 'epoch': 2} {'type': 'loss', 'content': 0.007515027653425932, 'timestamp': '2025-10-01 04:20:30.247219', 'step': 3051, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:20:30.287298', 'step': 3051, 'epoch': 2} {'type': 'loss', 'content': 0.0061639584600925446, 'timestamp': '2025-10-01 04:20:30.322424', 'step': 3052, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:30.355786', 'step': 3052, 'epoch': 2} {'type': 'loss', 'content': 0.009405029937624931, 'timestamp': '2025-10-01 04:20:30.361692', 'step': 3053, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:30.393406', 'step': 3053, 'epoch': 2} {'type': 'loss', 'content': 0.00865138415247202, 'timestamp': '2025-10-01 04:20:30.400874', 'step': 3054, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:30.441805', 'step': 3054, 'epoch': 2} {'type': 'loss', 'content': 0.016464892774820328, 'timestamp': '2025-10-01 04:20:30.451235', 'step': 3055, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:30.487870', 'step': 3055, 'epoch': 2} {'type': 'loss', 'content': 0.016459010541439056, 'timestamp': '2025-10-01 04:20:30.516935', 'step': 3056, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:30.553900', 'step': 3056, 'epoch': 2} {'type': 'loss', 'content': 0.009026876650750637, 'timestamp': '2025-10-01 04:20:30.559742', 'step': 3057, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:30.593391', 'step': 3057, 'epoch': 2} {'type': 'loss', 'content': 0.0064561194740235806, 'timestamp': '2025-10-01 04:20:30.601759', 'step': 3058, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:30.635842', 'step': 3058, 'epoch': 2} {'type': 'loss', 'content': 0.009401097893714905, 'timestamp': '2025-10-01 04:20:30.647451', 'step': 3059, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:30.693591', 'step': 3059, 'epoch': 2} {'type': 'loss', 'content': 0.007500442676246166, 'timestamp': '2025-10-01 04:20:30.729194', 'step': 3060, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:30.771855', 'step': 3060, 'epoch': 2} {'type': 'loss', 'content': 0.007794241886585951, 'timestamp': '2025-10-01 04:20:30.777213', 'step': 3061, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:30.813724', 'step': 3061, 'epoch': 2} {'type': 'loss', 'content': 0.010098347440361977, 'timestamp': '2025-10-01 04:20:30.824513', 'step': 3062, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:30.860259', 'step': 3062, 'epoch': 2} {'type': 'loss', 'content': 0.007154751569032669, 'timestamp': '2025-10-01 04:20:30.873029', 'step': 3063, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:30.909694', 'step': 3063, 'epoch': 2} {'type': 'loss', 'content': 0.011734621599316597, 'timestamp': '2025-10-01 04:20:30.938797', 'step': 3064, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:30.973621', 'step': 3064, 'epoch': 2} {'type': 'loss', 'content': 0.0105584766715765, 'timestamp': '2025-10-01 04:20:30.979123', 'step': 3065, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:31.014696', 'step': 3065, 'epoch': 2} {'type': 'loss', 'content': 0.0042999242432415485, 'timestamp': '2025-10-01 04:20:31.025584', 'step': 3066, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:31.056080', 'step': 3066, 'epoch': 2} {'type': 'loss', 'content': 0.008726123720407486, 'timestamp': '2025-10-01 04:20:31.063686', 'step': 3067, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:31.098202', 'step': 3067, 'epoch': 2} {'type': 'loss', 'content': 0.007952769286930561, 'timestamp': '2025-10-01 04:20:31.126584', 'step': 3068, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:31.163240', 'step': 3068, 'epoch': 2} {'type': 'loss', 'content': 0.004213719628751278, 'timestamp': '2025-10-01 04:20:31.165876', 'step': 3069, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:31.201245', 'step': 3069, 'epoch': 2} {'type': 'loss', 'content': 0.00839912798255682, 'timestamp': '2025-10-01 04:20:31.209338', 'step': 3070, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:31.247984', 'step': 3070, 'epoch': 2} {'type': 'loss', 'content': 0.014646388590335846, 'timestamp': '2025-10-01 04:20:31.260435', 'step': 3071, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:31.297085', 'step': 3071, 'epoch': 2} {'type': 'loss', 'content': 0.0067513855174183846, 'timestamp': '2025-10-01 04:20:31.326355', 'step': 3072, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:31.365430', 'step': 3072, 'epoch': 2} {'type': 'loss', 'content': 0.005763888359069824, 'timestamp': '2025-10-01 04:20:31.373397', 'step': 3073, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:31.420178', 'step': 3073, 'epoch': 2} {'type': 'loss', 'content': 0.004169152118265629, 'timestamp': '2025-10-01 04:20:31.432697', 'step': 3074, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:31.472335', 'step': 3074, 'epoch': 2} {'type': 'loss', 'content': 0.0071925814263522625, 'timestamp': '2025-10-01 04:20:31.485923', 'step': 3075, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:31.521948', 'step': 3075, 'epoch': 2} {'type': 'loss', 'content': 0.010902591980993748, 'timestamp': '2025-10-01 04:20:31.550384', 'step': 3076, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:31.589139', 'step': 3076, 'epoch': 2} {'type': 'loss', 'content': 0.004412411246448755, 'timestamp': '2025-10-01 04:20:31.595015', 'step': 3077, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:31.631557', 'step': 3077, 'epoch': 2} {'type': 'loss', 'content': 0.005120635498315096, 'timestamp': '2025-10-01 04:20:31.639929', 'step': 3078, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:31.681070', 'step': 3078, 'epoch': 2} {'type': 'loss', 'content': 0.010697743855416775, 'timestamp': '2025-10-01 04:20:31.692617', 'step': 3079, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:31.727084', 'step': 3079, 'epoch': 2} {'type': 'loss', 'content': 0.0047607626765966415, 'timestamp': '2025-10-01 04:20:31.758898', 'step': 3080, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:31.798055', 'step': 3080, 'epoch': 2} {'type': 'loss', 'content': 0.020256204530596733, 'timestamp': '2025-10-01 04:20:31.803500', 'step': 3081, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:31.839498', 'step': 3081, 'epoch': 2} {'type': 'loss', 'content': 0.00444032484665513, 'timestamp': '2025-10-01 04:20:31.848790', 'step': 3082, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:31.888076', 'step': 3082, 'epoch': 2} {'type': 'loss', 'content': 0.006212353240698576, 'timestamp': '2025-10-01 04:20:31.895591', 'step': 3083, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:31.935333', 'step': 3083, 'epoch': 2} {'type': 'loss', 'content': 0.003711195895448327, 'timestamp': '2025-10-01 04:20:31.969027', 'step': 3084, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:32.003362', 'step': 3084, 'epoch': 2} {'type': 'loss', 'content': 0.002188276033848524, 'timestamp': '2025-10-01 04:20:32.011940', 'step': 3085, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:32.048533', 'step': 3085, 'epoch': 2} {'type': 'loss', 'content': 0.010010977275669575, 'timestamp': '2025-10-01 04:20:32.056592', 'step': 3086, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:32.093344', 'step': 3086, 'epoch': 2} {'type': 'loss', 'content': 0.005256820935755968, 'timestamp': '2025-10-01 04:20:32.101332', 'step': 3087, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:32.137972', 'step': 3087, 'epoch': 2} {'type': 'loss', 'content': 0.005671503022313118, 'timestamp': '2025-10-01 04:20:32.167010', 'step': 3088, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:32.204803', 'step': 3088, 'epoch': 2} {'type': 'loss', 'content': 0.004164811689406633, 'timestamp': '2025-10-01 04:20:32.209924', 'step': 3089, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:32.248677', 'step': 3089, 'epoch': 2} {'type': 'loss', 'content': 0.010228531435132027, 'timestamp': '2025-10-01 04:20:32.256754', 'step': 3090, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:32.291978', 'step': 3090, 'epoch': 2} {'type': 'loss', 'content': 0.009830030612647533, 'timestamp': '2025-10-01 04:20:32.303439', 'step': 3091, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:32.340560', 'step': 3091, 'epoch': 2} {'type': 'loss', 'content': 0.010923854075372219, 'timestamp': '2025-10-01 04:20:32.372197', 'step': 3092, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:32.404077', 'step': 3092, 'epoch': 2} {'type': 'loss', 'content': 0.014564474113285542, 'timestamp': '2025-10-01 04:20:32.413230', 'step': 3093, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:32.452527', 'step': 3093, 'epoch': 2} {'type': 'loss', 'content': 0.0016599894734099507, 'timestamp': '2025-10-01 04:20:32.461423', 'step': 3094, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:32.499997', 'step': 3094, 'epoch': 2} {'type': 'loss', 'content': 0.007619697600603104, 'timestamp': '2025-10-01 04:20:32.507891', 'step': 3095, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:32.543062', 'step': 3095, 'epoch': 2} {'type': 'loss', 'content': 0.006005777046084404, 'timestamp': '2025-10-01 04:20:32.572242', 'step': 3096, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:32.605951', 'step': 3096, 'epoch': 2} {'type': 'loss', 'content': 0.013955282978713512, 'timestamp': '2025-10-01 04:20:32.614429', 'step': 3097, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:32.655140', 'step': 3097, 'epoch': 2} {'type': 'loss', 'content': 0.003889199113473296, 'timestamp': '2025-10-01 04:20:32.665824', 'step': 3098, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:32.705845', 'step': 3098, 'epoch': 2} {'type': 'loss', 'content': 0.003814633935689926, 'timestamp': '2025-10-01 04:20:32.716670', 'step': 3099, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:32.752749', 'step': 3099, 'epoch': 2} {'type': 'loss', 'content': 0.0022650989703834057, 'timestamp': '2025-10-01 04:20:32.785400', 'step': 3100, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:32.820402', 'step': 3100, 'epoch': 2} {'type': 'loss', 'content': 0.00854854192584753, 'timestamp': '2025-10-01 04:20:32.828827', 'step': 3101, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:32.864919', 'step': 3101, 'epoch': 2} {'type': 'loss', 'content': 0.004718634765595198, 'timestamp': '2025-10-01 04:20:32.876495', 'step': 3102, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:32.911844', 'step': 3102, 'epoch': 2} {'type': 'loss', 'content': 0.002176657784730196, 'timestamp': '2025-10-01 04:20:32.920176', 'step': 3103, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:32.954337', 'step': 3103, 'epoch': 2} {'type': 'loss', 'content': 0.005696158390492201, 'timestamp': '2025-10-01 04:20:32.986899', 'step': 3104, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:33.020136', 'step': 3104, 'epoch': 2} {'type': 'loss', 'content': 0.00923148076981306, 'timestamp': '2025-10-01 04:20:33.029385', 'step': 3105, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:20:35.492941', 'step': 3105, 'epoch': 2} {'type': 'pplx', 'content': 5.977851900502005, 'timestamp': '2025-10-01 04:20:35.498186', 'step': 3105, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:35.533262', 'step': 3105, 'epoch': 2} {'type': 'loss', 'content': 0.013309795409440994, 'timestamp': '2025-10-01 04:20:35.545278', 'step': 3106, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:35.583512', 'step': 3106, 'epoch': 2} {'type': 'loss', 'content': 0.0045418147929012775, 'timestamp': '2025-10-01 04:20:35.596299', 'step': 3107, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:35.631646', 'step': 3107, 'epoch': 2} {'type': 'loss', 'content': 0.003550387918949127, 'timestamp': '2025-10-01 04:20:35.664063', 'step': 3108, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:35.701438', 'step': 3108, 'epoch': 2} {'type': 'loss', 'content': 0.003708882723003626, 'timestamp': '2025-10-01 04:20:35.710363', 'step': 3109, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:35.744686', 'step': 3109, 'epoch': 2} {'type': 'loss', 'content': 0.007122459355741739, 'timestamp': '2025-10-01 04:20:35.756323', 'step': 3110, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:35.803921', 'step': 3110, 'epoch': 2} {'type': 'loss', 'content': 0.004338547587394714, 'timestamp': '2025-10-01 04:20:35.816436', 'step': 3111, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:35.851193', 'step': 3111, 'epoch': 2} {'type': 'loss', 'content': 0.00860446784645319, 'timestamp': '2025-10-01 04:20:35.880119', 'step': 3112, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:35.922581', 'step': 3112, 'epoch': 2} {'type': 'loss', 'content': 0.012290001846849918, 'timestamp': '2025-10-01 04:20:35.935412', 'step': 3113, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:35.976002', 'step': 3113, 'epoch': 2} {'type': 'loss', 'content': 0.0010435197036713362, 'timestamp': '2025-10-01 04:20:35.987241', 'step': 3114, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:36.023429', 'step': 3114, 'epoch': 2} {'type': 'loss', 'content': 0.01622907444834709, 'timestamp': '2025-10-01 04:20:36.035971', 'step': 3115, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:36.075679', 'step': 3115, 'epoch': 2} {'type': 'loss', 'content': 0.011676115915179253, 'timestamp': '2025-10-01 04:20:36.110157', 'step': 3116, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:36.150402', 'step': 3116, 'epoch': 2} {'type': 'loss', 'content': 0.002560568740591407, 'timestamp': '2025-10-01 04:20:36.158675', 'step': 3117, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:36.201607', 'step': 3117, 'epoch': 2} {'type': 'loss', 'content': 0.004905077163130045, 'timestamp': '2025-10-01 04:20:36.214377', 'step': 3118, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:36.251916', 'step': 3118, 'epoch': 2} {'type': 'loss', 'content': 0.005320826079696417, 'timestamp': '2025-10-01 04:20:36.264431', 'step': 3119, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:36.316186', 'step': 3119, 'epoch': 2} {'type': 'loss', 'content': 0.009486476890742779, 'timestamp': '2025-10-01 04:20:36.349884', 'step': 3120, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:36.386793', 'step': 3120, 'epoch': 2} {'type': 'loss', 'content': 0.015359492972493172, 'timestamp': '2025-10-01 04:20:36.399606', 'step': 3121, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:36.439417', 'step': 3121, 'epoch': 2} {'type': 'loss', 'content': 0.004783677402883768, 'timestamp': '2025-10-01 04:20:36.450916', 'step': 3122, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:36.492225', 'step': 3122, 'epoch': 2} {'type': 'loss', 'content': 0.01758582703769207, 'timestamp': '2025-10-01 04:20:36.503611', 'step': 3123, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:36.543285', 'step': 3123, 'epoch': 2} {'type': 'loss', 'content': 0.006338105071336031, 'timestamp': '2025-10-01 04:20:36.576800', 'step': 3124, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:36.612384', 'step': 3124, 'epoch': 2} {'type': 'loss', 'content': 0.0022160508669912815, 'timestamp': '2025-10-01 04:20:36.621400', 'step': 3125, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:36.664901', 'step': 3125, 'epoch': 2} {'type': 'loss', 'content': 0.005976912099868059, 'timestamp': '2025-10-01 04:20:36.676482', 'step': 3126, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:36.723536', 'step': 3126, 'epoch': 2} {'type': 'loss', 'content': 0.0010553610045462847, 'timestamp': '2025-10-01 04:20:36.736330', 'step': 3127, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:36.772707', 'step': 3127, 'epoch': 2} {'type': 'loss', 'content': 0.004439316689968109, 'timestamp': '2025-10-01 04:20:36.804998', 'step': 3128, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:36.845793', 'step': 3128, 'epoch': 2} {'type': 'loss', 'content': 0.008871936239302158, 'timestamp': '2025-10-01 04:20:36.856855', 'step': 3129, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:36.898926', 'step': 3129, 'epoch': 2} {'type': 'loss', 'content': 0.0052215326577425, 'timestamp': '2025-10-01 04:20:36.909554', 'step': 3130, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:36.954020', 'step': 3130, 'epoch': 2} {'type': 'loss', 'content': 0.012393408454954624, 'timestamp': '2025-10-01 04:20:36.961922', 'step': 3131, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:37.006160', 'step': 3131, 'epoch': 2} {'type': 'loss', 'content': 0.001462516258470714, 'timestamp': '2025-10-01 04:20:37.039669', 'step': 3132, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:37.077681', 'step': 3132, 'epoch': 2} {'type': 'loss', 'content': 0.006242775823920965, 'timestamp': '2025-10-01 04:20:37.086899', 'step': 3133, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:37.125405', 'step': 3133, 'epoch': 2} {'type': 'loss', 'content': 0.0017451000167056918, 'timestamp': '2025-10-01 04:20:37.133703', 'step': 3134, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:37.169360', 'step': 3134, 'epoch': 2} {'type': 'loss', 'content': 0.0076455543749034405, 'timestamp': '2025-10-01 04:20:37.181940', 'step': 3135, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:37.231312', 'step': 3135, 'epoch': 2} {'type': 'loss', 'content': 0.0017806494142860174, 'timestamp': '2025-10-01 04:20:37.260454', 'step': 3136, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:37.298568', 'step': 3136, 'epoch': 2} {'type': 'loss', 'content': 0.0019059377955272794, 'timestamp': '2025-10-01 04:20:37.309634', 'step': 3137, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:37.345038', 'step': 3137, 'epoch': 2} {'type': 'loss', 'content': 0.0035374020226299763, 'timestamp': '2025-10-01 04:20:37.357622', 'step': 3138, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:37.399957', 'step': 3138, 'epoch': 2} {'type': 'loss', 'content': 0.009922304190695286, 'timestamp': '2025-10-01 04:20:37.412499', 'step': 3139, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:37.456625', 'step': 3139, 'epoch': 2} {'type': 'loss', 'content': 0.00974544882774353, 'timestamp': '2025-10-01 04:20:37.490326', 'step': 3140, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:37.530916', 'step': 3140, 'epoch': 2} {'type': 'loss', 'content': 0.0017779376357793808, 'timestamp': '2025-10-01 04:20:37.543789', 'step': 3141, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:37.580454', 'step': 3141, 'epoch': 2} {'type': 'loss', 'content': 0.0008057583472691476, 'timestamp': '2025-10-01 04:20:37.593229', 'step': 3142, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:37.632671', 'step': 3142, 'epoch': 2} {'type': 'loss', 'content': 0.003261096542701125, 'timestamp': '2025-10-01 04:20:37.644323', 'step': 3143, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:37.682945', 'step': 3143, 'epoch': 2} {'type': 'loss', 'content': 0.0035429815761744976, 'timestamp': '2025-10-01 04:20:37.716490', 'step': 3144, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:37.756694', 'step': 3144, 'epoch': 2} {'type': 'loss', 'content': 0.0020570512861013412, 'timestamp': '2025-10-01 04:20:37.769571', 'step': 3145, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:37.815022', 'step': 3145, 'epoch': 2} {'type': 'loss', 'content': 0.01990596204996109, 'timestamp': '2025-10-01 04:20:37.829051', 'step': 3146, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:37.870556', 'step': 3146, 'epoch': 2} {'type': 'loss', 'content': 0.0030950147192925215, 'timestamp': '2025-10-01 04:20:37.883306', 'step': 3147, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:37.927794', 'step': 3147, 'epoch': 2} {'type': 'loss', 'content': 0.000591388379689306, 'timestamp': '2025-10-01 04:20:37.961571', 'step': 3148, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:38.003200', 'step': 3148, 'epoch': 2} {'type': 'loss', 'content': 0.004586440976709127, 'timestamp': '2025-10-01 04:20:38.012023', 'step': 3149, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:38.050870', 'step': 3149, 'epoch': 2} {'type': 'loss', 'content': 0.0027982089668512344, 'timestamp': '2025-10-01 04:20:38.063595', 'step': 3150, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:38.101940', 'step': 3150, 'epoch': 2} {'type': 'loss', 'content': 0.0019391692476347089, 'timestamp': '2025-10-01 04:20:38.114756', 'step': 3151, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:38.154897', 'step': 3151, 'epoch': 2} {'type': 'loss', 'content': 0.013577991165220737, 'timestamp': '2025-10-01 04:20:38.188433', 'step': 3152, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:38.228138', 'step': 3152, 'epoch': 2} {'type': 'loss', 'content': 0.006959087215363979, 'timestamp': '2025-10-01 04:20:38.233697', 'step': 3153, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:38.271635', 'step': 3153, 'epoch': 2} {'type': 'loss', 'content': 0.015194098465144634, 'timestamp': '2025-10-01 04:20:38.284108', 'step': 3154, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:38.326873', 'step': 3154, 'epoch': 2} {'type': 'loss', 'content': 0.004414226394146681, 'timestamp': '2025-10-01 04:20:38.339626', 'step': 3155, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:38.384694', 'step': 3155, 'epoch': 2} {'type': 'loss', 'content': 0.0072347563691437244, 'timestamp': '2025-10-01 04:20:38.416592', 'step': 3156, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:38.457699', 'step': 3156, 'epoch': 2} {'type': 'loss', 'content': 0.010052269324660301, 'timestamp': '2025-10-01 04:20:38.466915', 'step': 3157, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:38.508158', 'step': 3157, 'epoch': 2} {'type': 'loss', 'content': 0.007560263853520155, 'timestamp': '2025-10-01 04:20:38.519079', 'step': 3158, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:38.558703', 'step': 3158, 'epoch': 2} {'type': 'loss', 'content': 0.005766516551375389, 'timestamp': '2025-10-01 04:20:38.566183', 'step': 3159, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:38.604376', 'step': 3159, 'epoch': 2} {'type': 'loss', 'content': 0.004770737607032061, 'timestamp': '2025-10-01 04:20:38.633127', 'step': 3160, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:38.670847', 'step': 3160, 'epoch': 2} {'type': 'loss', 'content': 0.0015783229609951377, 'timestamp': '2025-10-01 04:20:38.676564', 'step': 3161, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:38.713720', 'step': 3161, 'epoch': 2} {'type': 'loss', 'content': 0.002731951652094722, 'timestamp': '2025-10-01 04:20:38.721681', 'step': 3162, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:38.759051', 'step': 3162, 'epoch': 2} {'type': 'loss', 'content': 0.004524517804384232, 'timestamp': '2025-10-01 04:20:38.766844', 'step': 3163, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:38.808084', 'step': 3163, 'epoch': 2} {'type': 'loss', 'content': 0.0011994423111900687, 'timestamp': '2025-10-01 04:20:38.837124', 'step': 3164, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:38.870695', 'step': 3164, 'epoch': 2} {'type': 'loss', 'content': 0.010986123234033585, 'timestamp': '2025-10-01 04:20:38.876740', 'step': 3165, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:38.919477', 'step': 3165, 'epoch': 2} {'type': 'loss', 'content': 0.004797541070729494, 'timestamp': '2025-10-01 04:20:38.930309', 'step': 3166, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:38.967864', 'step': 3166, 'epoch': 2} {'type': 'loss', 'content': 0.00078280468005687, 'timestamp': '2025-10-01 04:20:38.975983', 'step': 3167, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:39.018356', 'step': 3167, 'epoch': 2} {'type': 'loss', 'content': 0.00857126247137785, 'timestamp': '2025-10-01 04:20:39.051063', 'step': 3168, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:39.084549', 'step': 3168, 'epoch': 2} {'type': 'loss', 'content': 0.008295583538711071, 'timestamp': '2025-10-01 04:20:39.090995', 'step': 3169, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:20:39.124872', 'step': 3169, 'epoch': 2} {'type': 'loss', 'content': 0.003914573695510626, 'timestamp': '2025-10-01 04:20:39.129442', 'step': 3170, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:39.166541', 'step': 3170, 'epoch': 2} {'type': 'loss', 'content': 0.007120965979993343, 'timestamp': '2025-10-01 04:20:39.177329', 'step': 3171, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:39.219892', 'step': 3171, 'epoch': 2} {'type': 'loss', 'content': 0.0123467231169343, 'timestamp': '2025-10-01 04:20:39.253394', 'step': 3172, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:39.298037', 'step': 3172, 'epoch': 2} {'type': 'loss', 'content': 0.00926834437996149, 'timestamp': '2025-10-01 04:20:39.311503', 'step': 3173, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:39.349436', 'step': 3173, 'epoch': 2} {'type': 'loss', 'content': 0.0029002453666180372, 'timestamp': '2025-10-01 04:20:39.357048', 'step': 3174, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:39.392164', 'step': 3174, 'epoch': 2} {'type': 'loss', 'content': 0.011593769304454327, 'timestamp': '2025-10-01 04:20:39.404675', 'step': 3175, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:39.441149', 'step': 3175, 'epoch': 2} {'type': 'loss', 'content': 0.008349215611815453, 'timestamp': '2025-10-01 04:20:39.475666', 'step': 3176, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:39.509575', 'step': 3176, 'epoch': 2} {'type': 'loss', 'content': 0.006125275976955891, 'timestamp': '2025-10-01 04:20:39.518703', 'step': 3177, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:39.560047', 'step': 3177, 'epoch': 2} {'type': 'loss', 'content': 0.011403979733586311, 'timestamp': '2025-10-01 04:20:39.573620', 'step': 3178, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:39.608805', 'step': 3178, 'epoch': 2} {'type': 'loss', 'content': 0.05084272846579552, 'timestamp': '2025-10-01 04:20:39.621369', 'step': 3179, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:39.655063', 'step': 3179, 'epoch': 2} {'type': 'loss', 'content': 0.0066191344521939754, 'timestamp': '2025-10-01 04:20:39.687624', 'step': 3180, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:39.724617', 'step': 3180, 'epoch': 2} {'type': 'loss', 'content': 0.006180546712130308, 'timestamp': '2025-10-01 04:20:39.733022', 'step': 3181, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:39.766682', 'step': 3181, 'epoch': 2} {'type': 'loss', 'content': 0.0013216717634350061, 'timestamp': '2025-10-01 04:20:39.779459', 'step': 3182, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:39.816894', 'step': 3182, 'epoch': 2} {'type': 'loss', 'content': 0.0032385787926614285, 'timestamp': '2025-10-01 04:20:39.829482', 'step': 3183, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:39.874827', 'step': 3183, 'epoch': 2} {'type': 'loss', 'content': 0.014222674071788788, 'timestamp': '2025-10-01 04:20:39.908294', 'step': 3184, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:39.943016', 'step': 3184, 'epoch': 2} {'type': 'loss', 'content': 0.008449282497167587, 'timestamp': '2025-10-01 04:20:39.951439', 'step': 3185, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:39.983209', 'step': 3185, 'epoch': 2} {'type': 'loss', 'content': 0.004331082571297884, 'timestamp': '2025-10-01 04:20:39.996926', 'step': 3186, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:40.028608', 'step': 3186, 'epoch': 2} {'type': 'loss', 'content': 0.0069956062361598015, 'timestamp': '2025-10-01 04:20:40.039408', 'step': 3187, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:40.073805', 'step': 3187, 'epoch': 2} {'type': 'loss', 'content': 0.006052176933735609, 'timestamp': '2025-10-01 04:20:40.105474', 'step': 3188, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:40.142018', 'step': 3188, 'epoch': 2} {'type': 'loss', 'content': 0.007408862002193928, 'timestamp': '2025-10-01 04:20:40.152500', 'step': 3189, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:40.190858', 'step': 3189, 'epoch': 2} {'type': 'loss', 'content': 0.009968570433557034, 'timestamp': '2025-10-01 04:20:40.203425', 'step': 3190, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:40.240630', 'step': 3190, 'epoch': 2} {'type': 'loss', 'content': 0.006778594106435776, 'timestamp': '2025-10-01 04:20:40.252045', 'step': 3191, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:40.289418', 'step': 3191, 'epoch': 2} {'type': 'loss', 'content': 0.01695774309337139, 'timestamp': '2025-10-01 04:20:40.324328', 'step': 3192, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:40.359252', 'step': 3192, 'epoch': 2} {'type': 'loss', 'content': 0.0036079552955925465, 'timestamp': '2025-10-01 04:20:40.370355', 'step': 3193, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:40.404271', 'step': 3193, 'epoch': 2} {'type': 'loss', 'content': 0.0033906404860317707, 'timestamp': '2025-10-01 04:20:40.412573', 'step': 3194, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:40.446797', 'step': 3194, 'epoch': 2} {'type': 'loss', 'content': 0.010401012375950813, 'timestamp': '2025-10-01 04:20:40.459514', 'step': 3195, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:40.492572', 'step': 3195, 'epoch': 2} {'type': 'loss', 'content': 0.006304367445409298, 'timestamp': '2025-10-01 04:20:40.521334', 'step': 3196, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:40.556282', 'step': 3196, 'epoch': 2} {'type': 'loss', 'content': 0.009461999870836735, 'timestamp': '2025-10-01 04:20:40.565545', 'step': 3197, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:40.600802', 'step': 3197, 'epoch': 2} {'type': 'loss', 'content': 0.004450852982699871, 'timestamp': '2025-10-01 04:20:40.612522', 'step': 3198, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:40.647510', 'step': 3198, 'epoch': 2} {'type': 'loss', 'content': 0.004887860268354416, 'timestamp': '2025-10-01 04:20:40.655228', 'step': 3199, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:40.689000', 'step': 3199, 'epoch': 2} {'type': 'loss', 'content': 0.010595076717436314, 'timestamp': '2025-10-01 04:20:40.717864', 'step': 3200, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:40.754798', 'step': 3200, 'epoch': 2} {'type': 'loss', 'content': 0.005791753064841032, 'timestamp': '2025-10-01 04:20:40.759720', 'step': 3201, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:40.794595', 'step': 3201, 'epoch': 2} {'type': 'loss', 'content': 0.00757408095523715, 'timestamp': '2025-10-01 04:20:40.799618', 'step': 3202, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:40.833988', 'step': 3202, 'epoch': 2} {'type': 'loss', 'content': 0.005310310050845146, 'timestamp': '2025-10-01 04:20:40.841655', 'step': 3203, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:40.874136', 'step': 3203, 'epoch': 2} {'type': 'loss', 'content': 0.00607795687392354, 'timestamp': '2025-10-01 04:20:40.906031', 'step': 3204, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:40.939628', 'step': 3204, 'epoch': 2} {'type': 'loss', 'content': 0.004152228124439716, 'timestamp': '2025-10-01 04:20:40.947946', 'step': 3205, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:40.986304', 'step': 3205, 'epoch': 2} {'type': 'loss', 'content': 0.005493194330483675, 'timestamp': '2025-10-01 04:20:40.999043', 'step': 3206, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:41.041593', 'step': 3206, 'epoch': 2} {'type': 'loss', 'content': 0.00980697013437748, 'timestamp': '2025-10-01 04:20:41.055558', 'step': 3207, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:41.094215', 'step': 3207, 'epoch': 2} {'type': 'loss', 'content': 0.0034246111754328012, 'timestamp': '2025-10-01 04:20:41.123503', 'step': 3208, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:41.161395', 'step': 3208, 'epoch': 2} {'type': 'loss', 'content': 0.0030806653667241335, 'timestamp': '2025-10-01 04:20:41.167061', 'step': 3209, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:41.200692', 'step': 3209, 'epoch': 2} {'type': 'loss', 'content': 0.007587325759232044, 'timestamp': '2025-10-01 04:20:41.208828', 'step': 3210, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:41.243359', 'step': 3210, 'epoch': 2} {'type': 'loss', 'content': 0.00434391712769866, 'timestamp': '2025-10-01 04:20:41.254964', 'step': 3211, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:41.288187', 'step': 3211, 'epoch': 2} {'type': 'loss', 'content': 0.007034026551991701, 'timestamp': '2025-10-01 04:20:41.316833', 'step': 3212, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:41.355397', 'step': 3212, 'epoch': 2} {'type': 'loss', 'content': 0.007655165158212185, 'timestamp': '2025-10-01 04:20:41.363908', 'step': 3213, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:41.405863', 'step': 3213, 'epoch': 2} {'type': 'loss', 'content': 0.007228820119053125, 'timestamp': '2025-10-01 04:20:41.419866', 'step': 3214, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:41.454623', 'step': 3214, 'epoch': 2} {'type': 'loss', 'content': 0.0015621811617165804, 'timestamp': '2025-10-01 04:20:41.463109', 'step': 3215, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:41.499111', 'step': 3215, 'epoch': 2} {'type': 'loss', 'content': 0.0061201369389891624, 'timestamp': '2025-10-01 04:20:41.531684', 'step': 3216, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:41.565250', 'step': 3216, 'epoch': 2} {'type': 'loss', 'content': 0.006498689763247967, 'timestamp': '2025-10-01 04:20:41.574239', 'step': 3217, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:41.610562', 'step': 3217, 'epoch': 2} {'type': 'loss', 'content': 0.005431507248431444, 'timestamp': '2025-10-01 04:20:41.623338', 'step': 3218, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:41.659805', 'step': 3218, 'epoch': 2} {'type': 'loss', 'content': 0.002970114815980196, 'timestamp': '2025-10-01 04:20:41.670734', 'step': 3219, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:41.706987', 'step': 3219, 'epoch': 2} {'type': 'loss', 'content': 0.003139460226520896, 'timestamp': '2025-10-01 04:20:41.738829', 'step': 3220, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:20:44.226955', 'step': 3220, 'epoch': 2} {'type': 'pplx', 'content': 5.803911655195827, 'timestamp': '2025-10-01 04:20:44.232059', 'step': 3220, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:44.266142', 'step': 3220, 'epoch': 2} {'type': 'loss', 'content': 0.00232279603369534, 'timestamp': '2025-10-01 04:20:44.275542', 'step': 3221, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:44.321568', 'step': 3221, 'epoch': 2} {'type': 'loss', 'content': 0.010304458439350128, 'timestamp': '2025-10-01 04:20:44.332871', 'step': 3222, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:44.369736', 'step': 3222, 'epoch': 2} {'type': 'loss', 'content': 0.0013434410793706775, 'timestamp': '2025-10-01 04:20:44.377732', 'step': 3223, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:44.420287', 'step': 3223, 'epoch': 2} {'type': 'loss', 'content': 0.0037858604919165373, 'timestamp': '2025-10-01 04:20:44.452666', 'step': 3224, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:44.494377', 'step': 3224, 'epoch': 2} {'type': 'loss', 'content': 0.005810546688735485, 'timestamp': '2025-10-01 04:20:44.507745', 'step': 3225, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:44.544651', 'step': 3225, 'epoch': 2} {'type': 'loss', 'content': 0.002152160042896867, 'timestamp': '2025-10-01 04:20:44.552835', 'step': 3226, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:44.600967', 'step': 3226, 'epoch': 2} {'type': 'loss', 'content': 0.0174376480281353, 'timestamp': '2025-10-01 04:20:44.614985', 'step': 3227, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:44.659053', 'step': 3227, 'epoch': 2} {'type': 'loss', 'content': 0.0036259491462260485, 'timestamp': '2025-10-01 04:20:44.693557', 'step': 3228, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:44.739882', 'step': 3228, 'epoch': 2} {'type': 'loss', 'content': 0.008769424632191658, 'timestamp': '2025-10-01 04:20:44.750131', 'step': 3229, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:44.800364', 'step': 3229, 'epoch': 2} {'type': 'loss', 'content': 0.01888885349035263, 'timestamp': '2025-10-01 04:20:44.813953', 'step': 3230, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:44.861286', 'step': 3230, 'epoch': 2} {'type': 'loss', 'content': 0.002702511614188552, 'timestamp': '2025-10-01 04:20:44.872101', 'step': 3231, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:44.917222', 'step': 3231, 'epoch': 2} {'type': 'loss', 'content': 0.017246978357434273, 'timestamp': '2025-10-01 04:20:44.949798', 'step': 3232, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:44.993615', 'step': 3232, 'epoch': 2} {'type': 'loss', 'content': 0.004265878349542618, 'timestamp': '2025-10-01 04:20:45.002974', 'step': 3233, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:45.036212', 'step': 3233, 'epoch': 2} {'type': 'loss', 'content': 0.008270415477454662, 'timestamp': '2025-10-01 04:20:45.044311', 'step': 3234, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:45.099941', 'step': 3234, 'epoch': 2} {'type': 'loss', 'content': 0.003631237894296646, 'timestamp': '2025-10-01 04:20:45.107995', 'step': 3235, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:45.156961', 'step': 3235, 'epoch': 2} {'type': 'loss', 'content': 0.003176675410941243, 'timestamp': '2025-10-01 04:20:45.188498', 'step': 3236, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:45.244625', 'step': 3236, 'epoch': 2} {'type': 'loss', 'content': 0.006037975195795298, 'timestamp': '2025-10-01 04:20:45.257446', 'step': 3237, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:45.299152', 'step': 3237, 'epoch': 2} {'type': 'loss', 'content': 0.0034833133686333895, 'timestamp': '2025-10-01 04:20:45.310831', 'step': 3238, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:45.353064', 'step': 3238, 'epoch': 2} {'type': 'loss', 'content': 0.002660988364368677, 'timestamp': '2025-10-01 04:20:45.364644', 'step': 3239, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:20:45.411384', 'step': 3239, 'epoch': 2} {'type': 'loss', 'content': 0.008020762354135513, 'timestamp': '2025-10-01 04:20:45.446300', 'step': 3240, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:45.490628', 'step': 3240, 'epoch': 2} {'type': 'loss', 'content': 0.003303792793303728, 'timestamp': '2025-10-01 04:20:45.499944', 'step': 3241, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:45.545435', 'step': 3241, 'epoch': 2} {'type': 'loss', 'content': 0.0028881486505270004, 'timestamp': '2025-10-01 04:20:45.557029', 'step': 3242, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:45.606437', 'step': 3242, 'epoch': 2} {'type': 'loss', 'content': 0.003846205538138747, 'timestamp': '2025-10-01 04:20:45.618980', 'step': 3243, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:45.676977', 'step': 3243, 'epoch': 2} {'type': 'loss', 'content': 0.0037206143606454134, 'timestamp': '2025-10-01 04:20:45.708868', 'step': 3244, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:45.763201', 'step': 3244, 'epoch': 2} {'type': 'loss', 'content': 0.0030986035708338022, 'timestamp': '2025-10-01 04:20:45.771792', 'step': 3245, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:45.816503', 'step': 3245, 'epoch': 2} {'type': 'loss', 'content': 0.0016635332722216845, 'timestamp': '2025-10-01 04:20:45.826377', 'step': 3246, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:45.896003', 'step': 3246, 'epoch': 2} {'type': 'loss', 'content': 0.001033489010296762, 'timestamp': '2025-10-01 04:20:45.906831', 'step': 3247, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:45.971280', 'step': 3247, 'epoch': 2} {'type': 'loss', 'content': 0.0023437109775841236, 'timestamp': '2025-10-01 04:20:46.000572', 'step': 3248, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:46.057895', 'step': 3248, 'epoch': 2} {'type': 'loss', 'content': 0.004705995786935091, 'timestamp': '2025-10-01 04:20:46.070778', 'step': 3249, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:46.126966', 'step': 3249, 'epoch': 2} {'type': 'loss', 'content': 0.009837440215051174, 'timestamp': '2025-10-01 04:20:46.135379', 'step': 3250, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:46.189415', 'step': 3250, 'epoch': 2} {'type': 'loss', 'content': 0.0022800632286816835, 'timestamp': '2025-10-01 04:20:46.201023', 'step': 3251, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:46.268589', 'step': 3251, 'epoch': 2} {'type': 'loss', 'content': 0.015152368694543839, 'timestamp': '2025-10-01 04:20:46.302355', 'step': 3252, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:46.359609', 'step': 3252, 'epoch': 2} {'type': 'loss', 'content': 0.007684213109314442, 'timestamp': '2025-10-01 04:20:46.368744', 'step': 3253, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:46.413396', 'step': 3253, 'epoch': 2} {'type': 'loss', 'content': 0.009332445450127125, 'timestamp': '2025-10-01 04:20:46.426011', 'step': 3254, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:46.480728', 'step': 3254, 'epoch': 2} {'type': 'loss', 'content': 0.01466071791946888, 'timestamp': '2025-10-01 04:20:46.489022', 'step': 3255, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:46.548368', 'step': 3255, 'epoch': 2} {'type': 'loss', 'content': 0.015513403341174126, 'timestamp': '2025-10-01 04:20:46.580259', 'step': 3256, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:46.639536', 'step': 3256, 'epoch': 2} {'type': 'loss', 'content': 0.002291190903633833, 'timestamp': '2025-10-01 04:20:46.650611', 'step': 3257, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:46.712849', 'step': 3257, 'epoch': 2} {'type': 'loss', 'content': 0.005131512880325317, 'timestamp': '2025-10-01 04:20:46.736283', 'step': 3258, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:46.791614', 'step': 3258, 'epoch': 2} {'type': 'loss', 'content': 0.0024837155360728502, 'timestamp': '2025-10-01 04:20:46.805195', 'step': 3259, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:46.854079', 'step': 3259, 'epoch': 2} {'type': 'loss', 'content': 0.005538976285606623, 'timestamp': '2025-10-01 04:20:46.886684', 'step': 3260, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:46.946753', 'step': 3260, 'epoch': 2} {'type': 'loss', 'content': 0.004172015935182571, 'timestamp': '2025-10-01 04:20:46.952523', 'step': 3261, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:47.007062', 'step': 3261, 'epoch': 2} {'type': 'loss', 'content': 0.000880545936524868, 'timestamp': '2025-10-01 04:20:47.018777', 'step': 3262, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:47.066246', 'step': 3262, 'epoch': 2} {'type': 'loss', 'content': 0.00857393629848957, 'timestamp': '2025-10-01 04:20:47.074363', 'step': 3263, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:47.123316', 'step': 3263, 'epoch': 2} {'type': 'loss', 'content': 0.00599947152659297, 'timestamp': '2025-10-01 04:20:47.155249', 'step': 3264, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:47.197448', 'step': 3264, 'epoch': 2} {'type': 'loss', 'content': 0.01165077742189169, 'timestamp': '2025-10-01 04:20:47.206584', 'step': 3265, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:47.259629', 'step': 3265, 'epoch': 2} {'type': 'loss', 'content': 0.007759997621178627, 'timestamp': '2025-10-01 04:20:47.267971', 'step': 3266, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:47.323265', 'step': 3266, 'epoch': 2} {'type': 'loss', 'content': 0.009602392092347145, 'timestamp': '2025-10-01 04:20:47.334689', 'step': 3267, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:47.386124', 'step': 3267, 'epoch': 2} {'type': 'loss', 'content': 0.002817842410877347, 'timestamp': '2025-10-01 04:20:47.415406', 'step': 3268, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:47.458484', 'step': 3268, 'epoch': 2} {'type': 'loss', 'content': 0.006427179090678692, 'timestamp': '2025-10-01 04:20:47.463890', 'step': 3269, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:47.510324', 'step': 3269, 'epoch': 2} {'type': 'loss', 'content': 0.0011261660838499665, 'timestamp': '2025-10-01 04:20:47.517822', 'step': 3270, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:20:47.587140', 'step': 3270, 'epoch': 2} {'type': 'loss', 'content': 0.002183279488235712, 'timestamp': '2025-10-01 04:20:47.595986', 'step': 3271, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:47.645197', 'step': 3271, 'epoch': 2} {'type': 'loss', 'content': 0.002481983508914709, 'timestamp': '2025-10-01 04:20:47.677038', 'step': 3272, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:47.719102', 'step': 3272, 'epoch': 2} {'type': 'loss', 'content': 0.013500469736754894, 'timestamp': '2025-10-01 04:20:47.730164', 'step': 3273, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:47.791799', 'step': 3273, 'epoch': 2} {'type': 'loss', 'content': 0.018686562776565552, 'timestamp': '2025-10-01 04:20:47.800029', 'step': 3274, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:20:47.841922', 'step': 3274, 'epoch': 2} {'type': 'loss', 'content': 0.009507223032414913, 'timestamp': '2025-10-01 04:20:47.846997', 'step': 3275, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:47.894753', 'step': 3275, 'epoch': 2} {'type': 'loss', 'content': 0.003884090343490243, 'timestamp': '2025-10-01 04:20:47.923214', 'step': 3276, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:20:47.981568', 'step': 3276, 'epoch': 2} {'type': 'loss', 'content': 0.0004371185787022114, 'timestamp': '2025-10-01 04:20:47.986939', 'step': 3277, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:48.044460', 'step': 3277, 'epoch': 2} {'type': 'loss', 'content': 0.0035328404046595097, 'timestamp': '2025-10-01 04:20:48.052028', 'step': 3278, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:48.107431', 'step': 3278, 'epoch': 2} {'type': 'loss', 'content': 0.005491653922945261, 'timestamp': '2025-10-01 04:20:48.115708', 'step': 3279, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:48.173054', 'step': 3279, 'epoch': 2} {'type': 'loss', 'content': 0.0033005704171955585, 'timestamp': '2025-10-01 04:20:48.204846', 'step': 3280, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:48.244225', 'step': 3280, 'epoch': 2} {'type': 'loss', 'content': 0.0063004824332892895, 'timestamp': '2025-10-01 04:20:48.250964', 'step': 3281, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:48.299524', 'step': 3281, 'epoch': 2} {'type': 'loss', 'content': 0.0021557784639298916, 'timestamp': '2025-10-01 04:20:48.310497', 'step': 3282, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:48.359619', 'step': 3282, 'epoch': 2} {'type': 'loss', 'content': 0.0011975150555372238, 'timestamp': '2025-10-01 04:20:48.371184', 'step': 3283, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:48.414234', 'step': 3283, 'epoch': 2} {'type': 'loss', 'content': 0.0017166842008009553, 'timestamp': '2025-10-01 04:20:48.443723', 'step': 3284, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:48.489965', 'step': 3284, 'epoch': 2} {'type': 'loss', 'content': 0.010976140387356281, 'timestamp': '2025-10-01 04:20:48.496068', 'step': 3285, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:48.548101', 'step': 3285, 'epoch': 2} {'type': 'loss', 'content': 0.006350715644657612, 'timestamp': '2025-10-01 04:20:48.556445', 'step': 3286, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:48.613407', 'step': 3286, 'epoch': 2} {'type': 'loss', 'content': 0.0015958389267325401, 'timestamp': '2025-10-01 04:20:48.621454', 'step': 3287, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:48.664092', 'step': 3287, 'epoch': 2} {'type': 'loss', 'content': 0.016637330874800682, 'timestamp': '2025-10-01 04:20:48.697587', 'step': 3288, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:48.746690', 'step': 3288, 'epoch': 2} {'type': 'loss', 'content': 0.007318527903407812, 'timestamp': '2025-10-01 04:20:48.753143', 'step': 3289, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:48.807286', 'step': 3289, 'epoch': 2} {'type': 'loss', 'content': 0.0007248566835187376, 'timestamp': '2025-10-01 04:20:48.815663', 'step': 3290, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:48.879319', 'step': 3290, 'epoch': 2} {'type': 'loss', 'content': 0.0005168112693354487, 'timestamp': '2025-10-01 04:20:48.892862', 'step': 3291, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:48.940233', 'step': 3291, 'epoch': 2} {'type': 'loss', 'content': 0.003808549139648676, 'timestamp': '2025-10-01 04:20:48.969449', 'step': 3292, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:49.008106', 'step': 3292, 'epoch': 2} {'type': 'loss', 'content': 0.003428783267736435, 'timestamp': '2025-10-01 04:20:49.017253', 'step': 3293, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:49.066093', 'step': 3293, 'epoch': 2} {'type': 'loss', 'content': 0.0018085906049236655, 'timestamp': '2025-10-01 04:20:49.078789', 'step': 3294, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:49.135173', 'step': 3294, 'epoch': 2} {'type': 'loss', 'content': 0.005331474356353283, 'timestamp': '2025-10-01 04:20:49.148738', 'step': 3295, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:49.196475', 'step': 3295, 'epoch': 2} {'type': 'loss', 'content': 0.003940998576581478, 'timestamp': '2025-10-01 04:20:49.225014', 'step': 3296, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:20:49.273084', 'step': 3296, 'epoch': 2} {'type': 'loss', 'content': 0.006566372234374285, 'timestamp': '2025-10-01 04:20:49.283187', 'step': 3297, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:49.333907', 'step': 3297, 'epoch': 2} {'type': 'loss', 'content': 0.0022260285913944244, 'timestamp': '2025-10-01 04:20:49.341206', 'step': 3298, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:49.387251', 'step': 3298, 'epoch': 2} {'type': 'loss', 'content': 0.0037162452936172485, 'timestamp': '2025-10-01 04:20:49.395437', 'step': 3299, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:49.443247', 'step': 3299, 'epoch': 2} {'type': 'loss', 'content': 0.004356463439762592, 'timestamp': '2025-10-01 04:20:49.472103', 'step': 3300, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:49.526659', 'step': 3300, 'epoch': 2} {'type': 'loss', 'content': 0.005988461896777153, 'timestamp': '2025-10-01 04:20:49.534953', 'step': 3301, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:49.589811', 'step': 3301, 'epoch': 2} {'type': 'loss', 'content': 0.005614362191408873, 'timestamp': '2025-10-01 04:20:49.600642', 'step': 3302, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:20:49.649177', 'step': 3302, 'epoch': 2} {'type': 'loss', 'content': 0.0060782600194215775, 'timestamp': '2025-10-01 04:20:49.654474', 'step': 3303, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:49.700280', 'step': 3303, 'epoch': 2} {'type': 'loss', 'content': 0.00016400389722548425, 'timestamp': '2025-10-01 04:20:49.728795', 'step': 3304, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:49.777532', 'step': 3304, 'epoch': 2} {'type': 'loss', 'content': 0.002570994198322296, 'timestamp': '2025-10-01 04:20:49.783313', 'step': 3305, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:49.832407', 'step': 3305, 'epoch': 2} {'type': 'loss', 'content': 0.005073315929621458, 'timestamp': '2025-10-01 04:20:49.840856', 'step': 3306, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:49.887986', 'step': 3306, 'epoch': 2} {'type': 'loss', 'content': 0.003587988903746009, 'timestamp': '2025-10-01 04:20:49.899469', 'step': 3307, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:49.957338', 'step': 3307, 'epoch': 2} {'type': 'loss', 'content': 0.01671769469976425, 'timestamp': '2025-10-01 04:20:49.986657', 'step': 3308, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:50.052028', 'step': 3308, 'epoch': 2} {'type': 'loss', 'content': 0.0030083528254181147, 'timestamp': '2025-10-01 04:20:50.062403', 'step': 3309, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:50.111806', 'step': 3309, 'epoch': 2} {'type': 'loss', 'content': 0.0002396831550868228, 'timestamp': '2025-10-01 04:20:50.119761', 'step': 3310, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:50.169419', 'step': 3310, 'epoch': 2} {'type': 'loss', 'content': 0.0055777146480977535, 'timestamp': '2025-10-01 04:20:50.182203', 'step': 3311, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:50.223862', 'step': 3311, 'epoch': 2} {'type': 'loss', 'content': 0.0038043942768126726, 'timestamp': '2025-10-01 04:20:50.257356', 'step': 3312, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:50.313976', 'step': 3312, 'epoch': 2} {'type': 'loss', 'content': 0.009556608274579048, 'timestamp': '2025-10-01 04:20:50.323114', 'step': 3313, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:50.377992', 'step': 3313, 'epoch': 2} {'type': 'loss', 'content': 0.002979888115078211, 'timestamp': '2025-10-01 04:20:50.389351', 'step': 3314, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:50.435289', 'step': 3314, 'epoch': 2} {'type': 'loss', 'content': 0.007070835214108229, 'timestamp': '2025-10-01 04:20:50.443501', 'step': 3315, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:50.491165', 'step': 3315, 'epoch': 2} {'type': 'loss', 'content': 0.004094443749636412, 'timestamp': '2025-10-01 04:20:50.519557', 'step': 3316, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:50.567057', 'step': 3316, 'epoch': 2} {'type': 'loss', 'content': 0.009296164847910404, 'timestamp': '2025-10-01 04:20:50.572431', 'step': 3317, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:50.628450', 'step': 3317, 'epoch': 2} {'type': 'loss', 'content': 0.00407500471919775, 'timestamp': '2025-10-01 04:20:50.639850', 'step': 3318, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:50.692241', 'step': 3318, 'epoch': 2} {'type': 'loss', 'content': 0.0016871878178790212, 'timestamp': '2025-10-01 04:20:50.702901', 'step': 3319, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:50.741787', 'step': 3319, 'epoch': 2} {'type': 'loss', 'content': 0.012511143460869789, 'timestamp': '2025-10-01 04:20:50.775317', 'step': 3320, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:50.820445', 'step': 3320, 'epoch': 2} {'type': 'loss', 'content': 0.0018414640799164772, 'timestamp': '2025-10-01 04:20:50.828800', 'step': 3321, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:50.878468', 'step': 3321, 'epoch': 2} {'type': 'loss', 'content': 0.01742539368569851, 'timestamp': '2025-10-01 04:20:50.886449', 'step': 3322, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:50.933453', 'step': 3322, 'epoch': 2} {'type': 'loss', 'content': 0.0015209410339593887, 'timestamp': '2025-10-01 04:20:50.941685', 'step': 3323, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:50.994479', 'step': 3323, 'epoch': 2} {'type': 'loss', 'content': 0.0020352660212665796, 'timestamp': '2025-10-01 04:20:51.026045', 'step': 3324, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:51.084328', 'step': 3324, 'epoch': 2} {'type': 'loss', 'content': 0.031238889321684837, 'timestamp': '2025-10-01 04:20:51.093630', 'step': 3325, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:51.142007', 'step': 3325, 'epoch': 2} {'type': 'loss', 'content': 0.0037314302753657103, 'timestamp': '2025-10-01 04:20:51.153418', 'step': 3326, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:51.195698', 'step': 3326, 'epoch': 2} {'type': 'loss', 'content': 0.004834675695747137, 'timestamp': '2025-10-01 04:20:51.207006', 'step': 3327, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:51.249762', 'step': 3327, 'epoch': 2} {'type': 'loss', 'content': 0.006235728971660137, 'timestamp': '2025-10-01 04:20:51.284741', 'step': 3328, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:51.331163', 'step': 3328, 'epoch': 2} {'type': 'loss', 'content': 0.01751502975821495, 'timestamp': '2025-10-01 04:20:51.344515', 'step': 3329, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:51.388686', 'step': 3329, 'epoch': 2} {'type': 'loss', 'content': 0.007124766707420349, 'timestamp': '2025-10-01 04:20:51.396798', 'step': 3330, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:51.441683', 'step': 3330, 'epoch': 2} {'type': 'loss', 'content': 0.003592984052374959, 'timestamp': '2025-10-01 04:20:51.454403', 'step': 3331, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:20:51.520019', 'step': 3331, 'epoch': 2} {'type': 'loss', 'content': 0.0034224512055516243, 'timestamp': '2025-10-01 04:20:51.558863', 'step': 3332, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:51.597815', 'step': 3332, 'epoch': 2} {'type': 'loss', 'content': 0.014889983460307121, 'timestamp': '2025-10-01 04:20:51.607905', 'step': 3333, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:51.653685', 'step': 3333, 'epoch': 2} {'type': 'loss', 'content': 0.004080453421920538, 'timestamp': '2025-10-01 04:20:51.666165', 'step': 3334, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:51.705884', 'step': 3334, 'epoch': 2} {'type': 'loss', 'content': 0.010305637493729591, 'timestamp': '2025-10-01 04:20:51.716636', 'step': 3335, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:20:54.400775', 'step': 3335, 'epoch': 2} {'type': 'pplx', 'content': 5.884339689584267, 'timestamp': '2025-10-01 04:20:54.405924', 'step': 3335, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:54.442932', 'step': 3335, 'epoch': 2} {'type': 'loss', 'content': 0.0036746319383382797, 'timestamp': '2025-10-01 04:20:54.473057', 'step': 3336, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:54.510499', 'step': 3336, 'epoch': 2} {'type': 'loss', 'content': 0.01057218573987484, 'timestamp': '2025-10-01 04:20:54.516189', 'step': 3337, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:20:54.562164', 'step': 3337, 'epoch': 2} {'type': 'loss', 'content': 0.0037643779069185257, 'timestamp': '2025-10-01 04:20:54.569522', 'step': 3338, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:54.611846', 'step': 3338, 'epoch': 2} {'type': 'loss', 'content': 0.005291206296533346, 'timestamp': '2025-10-01 04:20:54.623121', 'step': 3339, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:54.667252', 'step': 3339, 'epoch': 2} {'type': 'loss', 'content': 0.006397789344191551, 'timestamp': '2025-10-01 04:20:54.698802', 'step': 3340, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:54.743806', 'step': 3340, 'epoch': 2} {'type': 'loss', 'content': 0.0019468114478513598, 'timestamp': '2025-10-01 04:20:54.757088', 'step': 3341, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:54.804470', 'step': 3341, 'epoch': 2} {'type': 'loss', 'content': 0.006500633433461189, 'timestamp': '2025-10-01 04:20:54.811786', 'step': 3342, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:54.859778', 'step': 3342, 'epoch': 2} {'type': 'loss', 'content': 0.015371735207736492, 'timestamp': '2025-10-01 04:20:54.867285', 'step': 3343, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:54.914434', 'step': 3343, 'epoch': 2} {'type': 'loss', 'content': 0.009102726355195045, 'timestamp': '2025-10-01 04:20:54.945650', 'step': 3344, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:54.990235', 'step': 3344, 'epoch': 2} {'type': 'loss', 'content': 0.008985585533082485, 'timestamp': '2025-10-01 04:20:54.997578', 'step': 3345, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:55.047159', 'step': 3345, 'epoch': 2} {'type': 'loss', 'content': 0.002964999759569764, 'timestamp': '2025-10-01 04:20:55.058680', 'step': 3346, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:55.110968', 'step': 3346, 'epoch': 2} {'type': 'loss', 'content': 0.004666329361498356, 'timestamp': '2025-10-01 04:20:55.124509', 'step': 3347, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:55.172907', 'step': 3347, 'epoch': 2} {'type': 'loss', 'content': 0.0037155954632908106, 'timestamp': '2025-10-01 04:20:55.204290', 'step': 3348, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:55.249957', 'step': 3348, 'epoch': 2} {'type': 'loss', 'content': 0.006069741677492857, 'timestamp': '2025-10-01 04:20:55.259097', 'step': 3349, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:55.305500', 'step': 3349, 'epoch': 2} {'type': 'loss', 'content': 0.008814956061542034, 'timestamp': '2025-10-01 04:20:55.317093', 'step': 3350, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:55.366957', 'step': 3350, 'epoch': 2} {'type': 'loss', 'content': 0.003046920523047447, 'timestamp': '2025-10-01 04:20:55.374397', 'step': 3351, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:55.423949', 'step': 3351, 'epoch': 2} {'type': 'loss', 'content': 0.0013456109445542097, 'timestamp': '2025-10-01 04:20:55.453140', 'step': 3352, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:55.504736', 'step': 3352, 'epoch': 2} {'type': 'loss', 'content': 0.003308746265247464, 'timestamp': '2025-10-01 04:20:55.518138', 'step': 3353, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:55.564860', 'step': 3353, 'epoch': 2} {'type': 'loss', 'content': 0.0067789386957883835, 'timestamp': '2025-10-01 04:20:55.576620', 'step': 3354, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:55.614460', 'step': 3354, 'epoch': 2} {'type': 'loss', 'content': 0.0013830200769007206, 'timestamp': '2025-10-01 04:20:55.626034', 'step': 3355, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:55.667959', 'step': 3355, 'epoch': 2} {'type': 'loss', 'content': 0.004561470355838537, 'timestamp': '2025-10-01 04:20:55.699655', 'step': 3356, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:55.758392', 'step': 3356, 'epoch': 2} {'type': 'loss', 'content': 0.004553432576358318, 'timestamp': '2025-10-01 04:20:55.768500', 'step': 3357, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:55.817908', 'step': 3357, 'epoch': 2} {'type': 'loss', 'content': 0.0027432236820459366, 'timestamp': '2025-10-01 04:20:55.830462', 'step': 3358, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:55.899463', 'step': 3358, 'epoch': 2} {'type': 'loss', 'content': 0.004420625511556864, 'timestamp': '2025-10-01 04:20:55.911321', 'step': 3359, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:55.972268', 'step': 3359, 'epoch': 2} {'type': 'loss', 'content': 0.009271621704101562, 'timestamp': '2025-10-01 04:20:56.004000', 'step': 3360, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:56.055172', 'step': 3360, 'epoch': 2} {'type': 'loss', 'content': 0.00382146961055696, 'timestamp': '2025-10-01 04:20:56.065468', 'step': 3361, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:56.110971', 'step': 3361, 'epoch': 2} {'type': 'loss', 'content': 0.00382082792930305, 'timestamp': '2025-10-01 04:20:56.118658', 'step': 3362, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:56.166134', 'step': 3362, 'epoch': 2} {'type': 'loss', 'content': 0.0007540455553680658, 'timestamp': '2025-10-01 04:20:56.173134', 'step': 3363, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:56.224297', 'step': 3363, 'epoch': 2} {'type': 'loss', 'content': 0.02217872627079487, 'timestamp': '2025-10-01 04:20:56.253125', 'step': 3364, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:20:56.303440', 'step': 3364, 'epoch': 2} {'type': 'loss', 'content': 0.005328684113919735, 'timestamp': '2025-10-01 04:20:56.309784', 'step': 3365, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:56.347678', 'step': 3365, 'epoch': 2} {'type': 'loss', 'content': 0.004206249024719, 'timestamp': '2025-10-01 04:20:56.359657', 'step': 3366, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:56.403705', 'step': 3366, 'epoch': 2} {'type': 'loss', 'content': 0.008591735735535622, 'timestamp': '2025-10-01 04:20:56.417291', 'step': 3367, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:56.462416', 'step': 3367, 'epoch': 2} {'type': 'loss', 'content': 0.005467223469167948, 'timestamp': '2025-10-01 04:20:56.491173', 'step': 3368, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:56.538735', 'step': 3368, 'epoch': 2} {'type': 'loss', 'content': 0.0030799005180597305, 'timestamp': '2025-10-01 04:20:56.545631', 'step': 3369, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:20:56.607604', 'step': 3369, 'epoch': 2} {'type': 'loss', 'content': 0.004815699532628059, 'timestamp': '2025-10-01 04:20:56.614697', 'step': 3370, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:56.663058', 'step': 3370, 'epoch': 2} {'type': 'loss', 'content': 0.0026014605537056923, 'timestamp': '2025-10-01 04:20:56.673624', 'step': 3371, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:56.720265', 'step': 3371, 'epoch': 2} {'type': 'loss', 'content': 0.006404256913810968, 'timestamp': '2025-10-01 04:20:56.754722', 'step': 3372, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:56.797443', 'step': 3372, 'epoch': 2} {'type': 'loss', 'content': 0.006730062887072563, 'timestamp': '2025-10-01 04:20:56.805281', 'step': 3373, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:56.851167', 'step': 3373, 'epoch': 2} {'type': 'loss', 'content': 0.004159160424023867, 'timestamp': '2025-10-01 04:20:56.862262', 'step': 3374, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:56.911434', 'step': 3374, 'epoch': 2} {'type': 'loss', 'content': 0.0028081999626010656, 'timestamp': '2025-10-01 04:20:56.925449', 'step': 3375, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:56.971175', 'step': 3375, 'epoch': 2} {'type': 'loss', 'content': 0.0014097780222073197, 'timestamp': '2025-10-01 04:20:57.003241', 'step': 3376, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:57.045277', 'step': 3376, 'epoch': 2} {'type': 'loss', 'content': 0.0014837631024420261, 'timestamp': '2025-10-01 04:20:57.054268', 'step': 3377, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:57.100685', 'step': 3377, 'epoch': 2} {'type': 'loss', 'content': 0.009169837459921837, 'timestamp': '2025-10-01 04:20:57.113437', 'step': 3378, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:57.162179', 'step': 3378, 'epoch': 2} {'type': 'loss', 'content': 0.015446863137185574, 'timestamp': '2025-10-01 04:20:57.173763', 'step': 3379, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:57.222786', 'step': 3379, 'epoch': 2} {'type': 'loss', 'content': 0.012851693667471409, 'timestamp': '2025-10-01 04:20:57.256944', 'step': 3380, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:57.292904', 'step': 3380, 'epoch': 2} {'type': 'loss', 'content': 0.0032129427418112755, 'timestamp': '2025-10-01 04:20:57.305755', 'step': 3381, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:57.350119', 'step': 3381, 'epoch': 2} {'type': 'loss', 'content': 0.0037978561595082283, 'timestamp': '2025-10-01 04:20:57.362678', 'step': 3382, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:57.407029', 'step': 3382, 'epoch': 2} {'type': 'loss', 'content': 0.009818247519433498, 'timestamp': '2025-10-01 04:20:57.419770', 'step': 3383, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:57.461950', 'step': 3383, 'epoch': 2} {'type': 'loss', 'content': 0.010517618618905544, 'timestamp': '2025-10-01 04:20:57.495509', 'step': 3384, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:57.533521', 'step': 3384, 'epoch': 2} {'type': 'loss', 'content': 0.0035280860029160976, 'timestamp': '2025-10-01 04:20:57.542057', 'step': 3385, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:57.586428', 'step': 3385, 'epoch': 2} {'type': 'loss', 'content': 0.0005522941355593503, 'timestamp': '2025-10-01 04:20:57.600490', 'step': 3386, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:57.641927', 'step': 3386, 'epoch': 2} {'type': 'loss', 'content': 0.008391354233026505, 'timestamp': '2025-10-01 04:20:57.654548', 'step': 3387, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:57.693151', 'step': 3387, 'epoch': 2} {'type': 'loss', 'content': 0.0031950732227414846, 'timestamp': '2025-10-01 04:20:57.726636', 'step': 3388, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:57.765528', 'step': 3388, 'epoch': 2} {'type': 'loss', 'content': 0.00035758072044700384, 'timestamp': '2025-10-01 04:20:57.775896', 'step': 3389, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:57.819298', 'step': 3389, 'epoch': 2} {'type': 'loss', 'content': 0.0012964310590177774, 'timestamp': '2025-10-01 04:20:57.832919', 'step': 3390, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:57.872673', 'step': 3390, 'epoch': 2} {'type': 'loss', 'content': 0.007818169891834259, 'timestamp': '2025-10-01 04:20:57.885206', 'step': 3391, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:57.930520', 'step': 3391, 'epoch': 2} {'type': 'loss', 'content': 0.0014704506611451507, 'timestamp': '2025-10-01 04:20:57.964013', 'step': 3392, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:58.006507', 'step': 3392, 'epoch': 2} {'type': 'loss', 'content': 0.0005949065671302378, 'timestamp': '2025-10-01 04:20:58.019391', 'step': 3393, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:58.066505', 'step': 3393, 'epoch': 2} {'type': 'loss', 'content': 0.004601951688528061, 'timestamp': '2025-10-01 04:20:58.079273', 'step': 3394, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:58.120640', 'step': 3394, 'epoch': 2} {'type': 'loss', 'content': 0.0003592149878386408, 'timestamp': '2025-10-01 04:20:58.132250', 'step': 3395, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:58.179554', 'step': 3395, 'epoch': 2} {'type': 'loss', 'content': 0.008689827285706997, 'timestamp': '2025-10-01 04:20:58.214086', 'step': 3396, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:58.256689', 'step': 3396, 'epoch': 2} {'type': 'loss', 'content': 0.011584867723286152, 'timestamp': '2025-10-01 04:20:58.267846', 'step': 3397, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:58.318205', 'step': 3397, 'epoch': 2} {'type': 'loss', 'content': 0.00432134373113513, 'timestamp': '2025-10-01 04:20:58.329641', 'step': 3398, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:58.379794', 'step': 3398, 'epoch': 2} {'type': 'loss', 'content': 0.0010442063212394714, 'timestamp': '2025-10-01 04:20:58.393372', 'step': 3399, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:58.432754', 'step': 3399, 'epoch': 2} {'type': 'loss', 'content': 0.0008119798731058836, 'timestamp': '2025-10-01 04:20:58.462898', 'step': 3400, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:58.505305', 'step': 3400, 'epoch': 2} {'type': 'loss', 'content': 0.0007995080086402595, 'timestamp': '2025-10-01 04:20:58.516745', 'step': 3401, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:58.567089', 'step': 3401, 'epoch': 2} {'type': 'loss', 'content': 0.0010993783362209797, 'timestamp': '2025-10-01 04:20:58.578416', 'step': 3402, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:58.632322', 'step': 3402, 'epoch': 2} {'type': 'loss', 'content': 0.008750466629862785, 'timestamp': '2025-10-01 04:20:58.646364', 'step': 3403, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:20:58.686570', 'step': 3403, 'epoch': 2} {'type': 'loss', 'content': 0.0002882694243453443, 'timestamp': '2025-10-01 04:20:58.715733', 'step': 3404, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:58.755641', 'step': 3404, 'epoch': 2} {'type': 'loss', 'content': 0.0003267854335717857, 'timestamp': '2025-10-01 04:20:58.763926', 'step': 3405, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:20:58.807543', 'step': 3405, 'epoch': 2} {'type': 'loss', 'content': 0.002110755071043968, 'timestamp': '2025-10-01 04:20:58.821873', 'step': 3406, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:58.860878', 'step': 3406, 'epoch': 2} {'type': 'loss', 'content': 0.0035047801211476326, 'timestamp': '2025-10-01 04:20:58.872454', 'step': 3407, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:58.909803', 'step': 3407, 'epoch': 2} {'type': 'loss', 'content': 0.0006540483445860445, 'timestamp': '2025-10-01 04:20:58.941619', 'step': 3408, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:58.983810', 'step': 3408, 'epoch': 2} {'type': 'loss', 'content': 0.008842585608363152, 'timestamp': '2025-10-01 04:20:58.993036', 'step': 3409, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:59.025923', 'step': 3409, 'epoch': 2} {'type': 'loss', 'content': 0.003731707576662302, 'timestamp': '2025-10-01 04:20:59.038508', 'step': 3410, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:59.089948', 'step': 3410, 'epoch': 2} {'type': 'loss', 'content': 0.004696611315011978, 'timestamp': '2025-10-01 04:20:59.103547', 'step': 3411, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:59.150166', 'step': 3411, 'epoch': 2} {'type': 'loss', 'content': 0.019505584612488747, 'timestamp': '2025-10-01 04:20:59.182034', 'step': 3412, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:20:59.224522', 'step': 3412, 'epoch': 2} {'type': 'loss', 'content': 0.004343100357800722, 'timestamp': '2025-10-01 04:20:59.236451', 'step': 3413, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:59.282384', 'step': 3413, 'epoch': 2} {'type': 'loss', 'content': 0.004824060946702957, 'timestamp': '2025-10-01 04:20:59.295205', 'step': 3414, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:59.340775', 'step': 3414, 'epoch': 2} {'type': 'loss', 'content': 0.009430307894945145, 'timestamp': '2025-10-01 04:20:59.352321', 'step': 3415, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:59.399358', 'step': 3415, 'epoch': 2} {'type': 'loss', 'content': 0.018778562545776367, 'timestamp': '2025-10-01 04:20:59.433066', 'step': 3416, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:20:59.474388', 'step': 3416, 'epoch': 2} {'type': 'loss', 'content': 0.0020682301837950945, 'timestamp': '2025-10-01 04:20:59.485365', 'step': 3417, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:59.535856', 'step': 3417, 'epoch': 2} {'type': 'loss', 'content': 0.003330179024487734, 'timestamp': '2025-10-01 04:20:59.548453', 'step': 3418, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:20:59.593399', 'step': 3418, 'epoch': 2} {'type': 'loss', 'content': 0.007191121578216553, 'timestamp': '2025-10-01 04:20:59.605890', 'step': 3419, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:59.650591', 'step': 3419, 'epoch': 2} {'type': 'loss', 'content': 0.0008199995500035584, 'timestamp': '2025-10-01 04:20:59.685579', 'step': 3420, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:20:59.730069', 'step': 3420, 'epoch': 2} {'type': 'loss', 'content': 0.0010395839344710112, 'timestamp': '2025-10-01 04:20:59.742955', 'step': 3421, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:20:59.792597', 'step': 3421, 'epoch': 2} {'type': 'loss', 'content': 0.0003241751983296126, 'timestamp': '2025-10-01 04:20:59.808667', 'step': 3422, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:20:59.860543', 'step': 3422, 'epoch': 2} {'type': 'loss', 'content': 0.0009663720265962183, 'timestamp': '2025-10-01 04:20:59.876351', 'step': 3423, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:20:59.922575', 'step': 3423, 'epoch': 2} {'type': 'loss', 'content': 0.007445126306265593, 'timestamp': '2025-10-01 04:20:59.957585', 'step': 3424, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:20:59.997700', 'step': 3424, 'epoch': 2} {'type': 'loss', 'content': 0.006585016380995512, 'timestamp': '2025-10-01 04:21:00.006963', 'step': 3425, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:00.052020', 'step': 3425, 'epoch': 2} {'type': 'loss', 'content': 0.010316824540495872, 'timestamp': '2025-10-01 04:21:00.063727', 'step': 3426, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:00.107573', 'step': 3426, 'epoch': 2} {'type': 'loss', 'content': 0.010239421389997005, 'timestamp': '2025-10-01 04:21:00.118428', 'step': 3427, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:00.159828', 'step': 3427, 'epoch': 2} {'type': 'loss', 'content': 0.012397690676152706, 'timestamp': '2025-10-01 04:21:00.193524', 'step': 3428, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:00.235126', 'step': 3428, 'epoch': 2} {'type': 'loss', 'content': 0.007370943669229746, 'timestamp': '2025-10-01 04:21:00.243672', 'step': 3429, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:00.290528', 'step': 3429, 'epoch': 2} {'type': 'loss', 'content': 0.003678940236568451, 'timestamp': '2025-10-01 04:21:00.303076', 'step': 3430, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:00.336851', 'step': 3430, 'epoch': 2} {'type': 'loss', 'content': 0.01892003044486046, 'timestamp': '2025-10-01 04:21:00.347587', 'step': 3431, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:00.383910', 'step': 3431, 'epoch': 2} {'type': 'loss', 'content': 0.00471879355609417, 'timestamp': '2025-10-01 04:21:00.417590', 'step': 3432, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:00.457930', 'step': 3432, 'epoch': 2} {'type': 'loss', 'content': 0.00705791637301445, 'timestamp': '2025-10-01 04:21:00.470738', 'step': 3433, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:00.509370', 'step': 3433, 'epoch': 2} {'type': 'loss', 'content': 0.015243514440953732, 'timestamp': '2025-10-01 04:21:00.520918', 'step': 3434, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:00.565052', 'step': 3434, 'epoch': 2} {'type': 'loss', 'content': 0.0028298539109528065, 'timestamp': '2025-10-01 04:21:00.574576', 'step': 3435, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:00.618008', 'step': 3435, 'epoch': 2} {'type': 'loss', 'content': 0.0022668312303721905, 'timestamp': '2025-10-01 04:21:00.649702', 'step': 3436, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:00.686979', 'step': 3436, 'epoch': 2} {'type': 'loss', 'content': 0.007029287051409483, 'timestamp': '2025-10-01 04:21:00.696318', 'step': 3437, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:00.734264', 'step': 3437, 'epoch': 2} {'type': 'loss', 'content': 0.0020515096839517355, 'timestamp': '2025-10-01 04:21:00.746881', 'step': 3438, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:00.809020', 'step': 3438, 'epoch': 2} {'type': 'loss', 'content': 0.00671619176864624, 'timestamp': '2025-10-01 04:21:00.823018', 'step': 3439, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:00.860749', 'step': 3439, 'epoch': 2} {'type': 'loss', 'content': 0.0004347034264355898, 'timestamp': '2025-10-01 04:21:00.890456', 'step': 3440, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:00.932281', 'step': 3440, 'epoch': 2} {'type': 'loss', 'content': 0.0013966427650302649, 'timestamp': '2025-10-01 04:21:00.942556', 'step': 3441, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:00.988870', 'step': 3441, 'epoch': 2} {'type': 'loss', 'content': 0.01549519132822752, 'timestamp': '2025-10-01 04:21:01.002913', 'step': 3442, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:01.040669', 'step': 3442, 'epoch': 2} {'type': 'loss', 'content': 0.005171158816665411, 'timestamp': '2025-10-01 04:21:01.051536', 'step': 3443, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:01.090863', 'step': 3443, 'epoch': 2} {'type': 'loss', 'content': 0.00283059268258512, 'timestamp': '2025-10-01 04:21:01.124385', 'step': 3444, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:01.167590', 'step': 3444, 'epoch': 2} {'type': 'loss', 'content': 0.00998830795288086, 'timestamp': '2025-10-01 04:21:01.181174', 'step': 3445, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:01.221690', 'step': 3445, 'epoch': 2} {'type': 'loss', 'content': 0.007938687689602375, 'timestamp': '2025-10-01 04:21:01.234435', 'step': 3446, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:01.276056', 'step': 3446, 'epoch': 2} {'type': 'loss', 'content': 0.0017149104969576001, 'timestamp': '2025-10-01 04:21:01.289605', 'step': 3447, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:01.331361', 'step': 3447, 'epoch': 2} {'type': 'loss', 'content': 0.0014055409701541066, 'timestamp': '2025-10-01 04:21:01.365907', 'step': 3448, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:01.406780', 'step': 3448, 'epoch': 2} {'type': 'loss', 'content': 0.020550504326820374, 'timestamp': '2025-10-01 04:21:01.417636', 'step': 3449, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:01.459687', 'step': 3449, 'epoch': 2} {'type': 'loss', 'content': 0.010313997976481915, 'timestamp': '2025-10-01 04:21:01.473734', 'step': 3450, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:21:04.069948', 'step': 3450, 'epoch': 2} {'type': 'pplx', 'content': 6.180839115789414, 'timestamp': '2025-10-01 04:21:04.072147', 'step': 3450, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:04.109594', 'step': 3450, 'epoch': 2} {'type': 'loss', 'content': 0.011143877170979977, 'timestamp': '2025-10-01 04:21:04.123130', 'step': 3451, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:04.160401', 'step': 3451, 'epoch': 2} {'type': 'loss', 'content': 0.0024744817055761814, 'timestamp': '2025-10-01 04:21:04.193886', 'step': 3452, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:04.242314', 'step': 3452, 'epoch': 2} {'type': 'loss', 'content': 0.010676765814423561, 'timestamp': '2025-10-01 04:21:04.248226', 'step': 3453, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:04.292220', 'step': 3453, 'epoch': 2} {'type': 'loss', 'content': 0.015052909031510353, 'timestamp': '2025-10-01 04:21:04.303832', 'step': 3454, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:04.349516', 'step': 3454, 'epoch': 2} {'type': 'loss', 'content': 0.008177290670573711, 'timestamp': '2025-10-01 04:21:04.362302', 'step': 3455, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:21:04.417406', 'step': 3455, 'epoch': 2} {'type': 'loss', 'content': 0.009180535562336445, 'timestamp': '2025-10-01 04:21:04.454758', 'step': 3456, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:04.492794', 'step': 3456, 'epoch': 2} {'type': 'loss', 'content': 0.003851070301607251, 'timestamp': '2025-10-01 04:21:04.505607', 'step': 3457, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:21:04.560423', 'step': 3457, 'epoch': 2} {'type': 'loss', 'content': 0.0023840030189603567, 'timestamp': '2025-10-01 04:21:04.576254', 'step': 3458, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:04.620057', 'step': 3458, 'epoch': 2} {'type': 'loss', 'content': 0.011942200362682343, 'timestamp': '2025-10-01 04:21:04.632676', 'step': 3459, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:04.679269', 'step': 3459, 'epoch': 2} {'type': 'loss', 'content': 0.0059167081490159035, 'timestamp': '2025-10-01 04:21:04.712965', 'step': 3460, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:04.757331', 'step': 3460, 'epoch': 2} {'type': 'loss', 'content': 0.01357619185000658, 'timestamp': '2025-10-01 04:21:04.766810', 'step': 3461, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:04.812828', 'step': 3461, 'epoch': 2} {'type': 'loss', 'content': 0.014917567372322083, 'timestamp': '2025-10-01 04:21:04.825483', 'step': 3462, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:04.876524', 'step': 3462, 'epoch': 2} {'type': 'loss', 'content': 0.005105736665427685, 'timestamp': '2025-10-01 04:21:04.890543', 'step': 3463, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:21:04.940141', 'step': 3463, 'epoch': 2} {'type': 'loss', 'content': 0.012164900079369545, 'timestamp': '2025-10-01 04:21:04.970751', 'step': 3464, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:21:05.018570', 'step': 3464, 'epoch': 2} {'type': 'loss', 'content': 0.013260414823889732, 'timestamp': '2025-10-01 04:21:05.033870', 'step': 3465, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:21:05.071214', 'step': 3465, 'epoch': 2} {'type': 'loss', 'content': 0.0030844013672322035, 'timestamp': '2025-10-01 04:21:05.078357', 'step': 3466, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:21:05.134346', 'step': 3466, 'epoch': 2} {'type': 'loss', 'content': 0.0025625769048929214, 'timestamp': '2025-10-01 04:21:05.150608', 'step': 3467, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:05.197330', 'step': 3467, 'epoch': 2} {'type': 'loss', 'content': 0.004864360671490431, 'timestamp': '2025-10-01 04:21:05.225706', 'step': 3468, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:05.269167', 'step': 3468, 'epoch': 2} {'type': 'loss', 'content': 0.002173231914639473, 'timestamp': '2025-10-01 04:21:05.278460', 'step': 3469, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:05.324608', 'step': 3469, 'epoch': 2} {'type': 'loss', 'content': 0.011275903321802616, 'timestamp': '2025-10-01 04:21:05.336399', 'step': 3470, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:05.378530', 'step': 3470, 'epoch': 2} {'type': 'loss', 'content': 0.01018246728926897, 'timestamp': '2025-10-01 04:21:05.389591', 'step': 3471, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:05.431714', 'step': 3471, 'epoch': 2} {'type': 'loss', 'content': 0.00573386438190937, 'timestamp': '2025-10-01 04:21:05.465239', 'step': 3472, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:05.514447', 'step': 3472, 'epoch': 2} {'type': 'loss', 'content': 0.0057820891961455345, 'timestamp': '2025-10-01 04:21:05.519724', 'step': 3473, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:05.552873', 'step': 3473, 'epoch': 2} {'type': 'loss', 'content': 0.0022748271003365517, 'timestamp': '2025-10-01 04:21:05.565448', 'step': 3474, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:05.614686', 'step': 3474, 'epoch': 2} {'type': 'loss', 'content': 0.0048403688706457615, 'timestamp': '2025-10-01 04:21:05.626359', 'step': 3475, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:05.673447', 'step': 3475, 'epoch': 2} {'type': 'loss', 'content': 0.02874894067645073, 'timestamp': '2025-10-01 04:21:05.702822', 'step': 3476, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:21:05.752603', 'step': 3476, 'epoch': 2} {'type': 'loss', 'content': 0.04195503145456314, 'timestamp': '2025-10-01 04:21:05.759162', 'step': 3477, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:05.807212', 'step': 3477, 'epoch': 2} {'type': 'loss', 'content': 0.0041847690008580685, 'timestamp': '2025-10-01 04:21:05.819774', 'step': 3478, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:05.867348', 'step': 3478, 'epoch': 2} {'type': 'loss', 'content': 0.006701535079628229, 'timestamp': '2025-10-01 04:21:05.883465', 'step': 3479, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:05.929619', 'step': 3479, 'epoch': 2} {'type': 'loss', 'content': 0.006351891905069351, 'timestamp': '2025-10-01 04:21:05.959009', 'step': 3480, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:05.992367', 'step': 3480, 'epoch': 2} {'type': 'loss', 'content': 0.002570952521637082, 'timestamp': '2025-10-01 04:21:06.003441', 'step': 3481, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:06.052983', 'step': 3481, 'epoch': 2} {'type': 'loss', 'content': 0.013853223994374275, 'timestamp': '2025-10-01 04:21:06.065588', 'step': 3482, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:06.112614', 'step': 3482, 'epoch': 2} {'type': 'loss', 'content': 0.01432775054126978, 'timestamp': '2025-10-01 04:21:06.123498', 'step': 3483, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:06.177142', 'step': 3483, 'epoch': 2} {'type': 'loss', 'content': 0.005327321123331785, 'timestamp': '2025-10-01 04:21:06.212111', 'step': 3484, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:06.255308', 'step': 3484, 'epoch': 2} {'type': 'loss', 'content': 0.012041251175105572, 'timestamp': '2025-10-01 04:21:06.264676', 'step': 3485, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:06.312821', 'step': 3485, 'epoch': 2} {'type': 'loss', 'content': 0.0028958169277757406, 'timestamp': '2025-10-01 04:21:06.325325', 'step': 3486, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:06.365133', 'step': 3486, 'epoch': 2} {'type': 'loss', 'content': 0.025005092844367027, 'timestamp': '2025-10-01 04:21:06.377948', 'step': 3487, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:06.431246', 'step': 3487, 'epoch': 2} {'type': 'loss', 'content': 0.0033536949194967747, 'timestamp': '2025-10-01 04:21:06.464740', 'step': 3488, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:06.520800', 'step': 3488, 'epoch': 2} {'type': 'loss', 'content': 0.009406721219420433, 'timestamp': '2025-10-01 04:21:06.534119', 'step': 3489, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:06.582867', 'step': 3489, 'epoch': 2} {'type': 'loss', 'content': 0.00903734564781189, 'timestamp': '2025-10-01 04:21:06.596400', 'step': 3490, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:06.652465', 'step': 3490, 'epoch': 2} {'type': 'loss', 'content': 0.0029826986137777567, 'timestamp': '2025-10-01 04:21:06.666028', 'step': 3491, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:06.714185', 'step': 3491, 'epoch': 2} {'type': 'loss', 'content': 0.005055807530879974, 'timestamp': '2025-10-01 04:21:06.747723', 'step': 3492, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:06.800860', 'step': 3492, 'epoch': 2} {'type': 'loss', 'content': 0.006921333260834217, 'timestamp': '2025-10-01 04:21:06.811364', 'step': 3493, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:06.856407', 'step': 3493, 'epoch': 2} {'type': 'loss', 'content': 0.0014069892931729555, 'timestamp': '2025-10-01 04:21:06.867255', 'step': 3494, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:06.917551', 'step': 3494, 'epoch': 2} {'type': 'loss', 'content': 0.005927248392254114, 'timestamp': '2025-10-01 04:21:06.925531', 'step': 3495, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:06.982454', 'step': 3495, 'epoch': 2} {'type': 'loss', 'content': 0.015501566231250763, 'timestamp': '2025-10-01 04:21:07.014281', 'step': 3496, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:07.066801', 'step': 3496, 'epoch': 2} {'type': 'loss', 'content': 0.0025473537389189005, 'timestamp': '2025-10-01 04:21:07.080129', 'step': 3497, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:07.130516', 'step': 3497, 'epoch': 2} {'type': 'loss', 'content': 0.009828455746173859, 'timestamp': '2025-10-01 04:21:07.138410', 'step': 3498, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:07.197587', 'step': 3498, 'epoch': 2} {'type': 'loss', 'content': 0.009051754139363766, 'timestamp': '2025-10-01 04:21:07.211654', 'step': 3499, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:07.261039', 'step': 3499, 'epoch': 2} {'type': 'loss', 'content': 0.0047249398194253445, 'timestamp': '2025-10-01 04:21:07.290366', 'step': 3500, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 3500', 'timestamp': '2025-10-01 04:21:12.860605', 'step': 3500, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:12.903074', 'step': 3500, 'epoch': 2} {'type': 'loss', 'content': 0.002356200013309717, 'timestamp': '2025-10-01 04:21:12.911929', 'step': 3501, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:12.971417', 'step': 3501, 'epoch': 2} {'type': 'loss', 'content': 0.0012341502588242292, 'timestamp': '2025-10-01 04:21:12.984097', 'step': 3502, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:13.026850', 'step': 3502, 'epoch': 2} {'type': 'loss', 'content': 0.0033824192360043526, 'timestamp': '2025-10-01 04:21:13.034590', 'step': 3503, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:13.086084', 'step': 3503, 'epoch': 2} {'type': 'loss', 'content': 0.0010419373866170645, 'timestamp': '2025-10-01 04:21:13.120532', 'step': 3504, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:13.156973', 'step': 3504, 'epoch': 2} {'type': 'loss', 'content': 0.014888651669025421, 'timestamp': '2025-10-01 04:21:13.167363', 'step': 3505, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:13.222575', 'step': 3505, 'epoch': 2} {'type': 'loss', 'content': 0.006338586565107107, 'timestamp': '2025-10-01 04:21:13.230994', 'step': 3506, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:13.279626', 'step': 3506, 'epoch': 2} {'type': 'loss', 'content': 0.004382327664643526, 'timestamp': '2025-10-01 04:21:13.290450', 'step': 3507, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:13.338557', 'step': 3507, 'epoch': 2} {'type': 'loss', 'content': 0.013099217787384987, 'timestamp': '2025-10-01 04:21:13.367785', 'step': 3508, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:13.415290', 'step': 3508, 'epoch': 2} {'type': 'loss', 'content': 0.0067933169193565845, 'timestamp': '2025-10-01 04:21:13.426498', 'step': 3509, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:13.484641', 'step': 3509, 'epoch': 2} {'type': 'loss', 'content': 0.0035203148145228624, 'timestamp': '2025-10-01 04:21:13.498199', 'step': 3510, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:13.556816', 'step': 3510, 'epoch': 2} {'type': 'loss', 'content': 0.005642816890031099, 'timestamp': '2025-10-01 04:21:13.570749', 'step': 3511, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:21:13.620259', 'step': 3511, 'epoch': 2} {'type': 'loss', 'content': 0.0064141773618757725, 'timestamp': '2025-10-01 04:21:13.657581', 'step': 3512, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:13.705547', 'step': 3512, 'epoch': 2} {'type': 'loss', 'content': 0.005579173099249601, 'timestamp': '2025-10-01 04:21:13.718425', 'step': 3513, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:13.765151', 'step': 3513, 'epoch': 2} {'type': 'loss', 'content': 0.010117200203239918, 'timestamp': '2025-10-01 04:21:13.777959', 'step': 3514, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:13.827988', 'step': 3514, 'epoch': 2} {'type': 'loss', 'content': 0.0029709292575716972, 'timestamp': '2025-10-01 04:21:13.839339', 'step': 3515, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:13.890746', 'step': 3515, 'epoch': 2} {'type': 'loss', 'content': 0.005443285685032606, 'timestamp': '2025-10-01 04:21:13.925734', 'step': 3516, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:13.973057', 'step': 3516, 'epoch': 2} {'type': 'loss', 'content': 0.009365163743495941, 'timestamp': '2025-10-01 04:21:13.986598', 'step': 3517, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:14.028790', 'step': 3517, 'epoch': 2} {'type': 'loss', 'content': 0.003940186463296413, 'timestamp': '2025-10-01 04:21:14.037186', 'step': 3518, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:14.095315', 'step': 3518, 'epoch': 2} {'type': 'loss', 'content': 0.0021791516337543726, 'timestamp': '2025-10-01 04:21:14.108086', 'step': 3519, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:14.156799', 'step': 3519, 'epoch': 2} {'type': 'loss', 'content': 0.0011362936347723007, 'timestamp': '2025-10-01 04:21:14.186117', 'step': 3520, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:14.229416', 'step': 3520, 'epoch': 2} {'type': 'loss', 'content': 0.00495500210672617, 'timestamp': '2025-10-01 04:21:14.240838', 'step': 3521, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:14.285573', 'step': 3521, 'epoch': 2} {'type': 'loss', 'content': 0.08984890580177307, 'timestamp': '2025-10-01 04:21:14.293789', 'step': 3522, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:14.344297', 'step': 3522, 'epoch': 2} {'type': 'loss', 'content': 0.006633015349507332, 'timestamp': '2025-10-01 04:21:14.355119', 'step': 3523, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:14.408333', 'step': 3523, 'epoch': 2} {'type': 'loss', 'content': 0.009272017516195774, 'timestamp': '2025-10-01 04:21:14.437282', 'step': 3524, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:14.477534', 'step': 3524, 'epoch': 2} {'type': 'loss', 'content': 0.006005698814988136, 'timestamp': '2025-10-01 04:21:14.488014', 'step': 3525, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:21:14.543870', 'step': 3525, 'epoch': 2} {'type': 'loss', 'content': 0.007930831983685493, 'timestamp': '2025-10-01 04:21:14.561226', 'step': 3526, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:21:14.610454', 'step': 3526, 'epoch': 2} {'type': 'loss', 'content': 0.008113951422274113, 'timestamp': '2025-10-01 04:21:14.626790', 'step': 3527, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:14.664661', 'step': 3527, 'epoch': 2} {'type': 'loss', 'content': 0.02410772815346718, 'timestamp': '2025-10-01 04:21:14.699163', 'step': 3528, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:14.742967', 'step': 3528, 'epoch': 2} {'type': 'loss', 'content': 0.010740519501268864, 'timestamp': '2025-10-01 04:21:14.754802', 'step': 3529, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:14.803820', 'step': 3529, 'epoch': 2} {'type': 'loss', 'content': 0.007737560197710991, 'timestamp': '2025-10-01 04:21:14.817417', 'step': 3530, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:14.855546', 'step': 3530, 'epoch': 2} {'type': 'loss', 'content': 0.008408408612012863, 'timestamp': '2025-10-01 04:21:14.869112', 'step': 3531, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:14.916923', 'step': 3531, 'epoch': 2} {'type': 'loss', 'content': 0.011354845017194748, 'timestamp': '2025-10-01 04:21:14.951474', 'step': 3532, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:15.018861', 'step': 3532, 'epoch': 2} {'type': 'loss', 'content': 0.006866615731269121, 'timestamp': '2025-10-01 04:21:15.032202', 'step': 3533, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:15.073109', 'step': 3533, 'epoch': 2} {'type': 'loss', 'content': 0.0038457929622381926, 'timestamp': '2025-10-01 04:21:15.083974', 'step': 3534, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:15.126372', 'step': 3534, 'epoch': 2} {'type': 'loss', 'content': 0.0068074907176196575, 'timestamp': '2025-10-01 04:21:15.140446', 'step': 3535, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:15.188329', 'step': 3535, 'epoch': 2} {'type': 'loss', 'content': 0.006661117542535067, 'timestamp': '2025-10-01 04:21:15.221897', 'step': 3536, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:15.279140', 'step': 3536, 'epoch': 2} {'type': 'loss', 'content': 0.006107887253165245, 'timestamp': '2025-10-01 04:21:15.292510', 'step': 3537, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:15.339278', 'step': 3537, 'epoch': 2} {'type': 'loss', 'content': 0.010051233693957329, 'timestamp': '2025-10-01 04:21:15.352077', 'step': 3538, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:15.389356', 'step': 3538, 'epoch': 2} {'type': 'loss', 'content': 0.00453915586695075, 'timestamp': '2025-10-01 04:21:15.397749', 'step': 3539, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:15.442069', 'step': 3539, 'epoch': 2} {'type': 'loss', 'content': 0.007447835057973862, 'timestamp': '2025-10-01 04:21:15.476593', 'step': 3540, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:21:15.521619', 'step': 3540, 'epoch': 2} {'type': 'loss', 'content': 0.003138076514005661, 'timestamp': '2025-10-01 04:21:15.536923', 'step': 3541, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:15.578455', 'step': 3541, 'epoch': 2} {'type': 'loss', 'content': 0.008174720220267773, 'timestamp': '2025-10-01 04:21:15.590998', 'step': 3542, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:15.635390', 'step': 3542, 'epoch': 2} {'type': 'loss', 'content': 0.004435270093381405, 'timestamp': '2025-10-01 04:21:15.648160', 'step': 3543, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:15.693183', 'step': 3543, 'epoch': 2} {'type': 'loss', 'content': 0.004019815940409899, 'timestamp': '2025-10-01 04:21:15.726692', 'step': 3544, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:15.773395', 'step': 3544, 'epoch': 2} {'type': 'loss', 'content': 0.0024687405675649643, 'timestamp': '2025-10-01 04:21:15.782744', 'step': 3545, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:15.824231', 'step': 3545, 'epoch': 2} {'type': 'loss', 'content': 0.012097933329641819, 'timestamp': '2025-10-01 04:21:15.835863', 'step': 3546, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:15.868813', 'step': 3546, 'epoch': 2} {'type': 'loss', 'content': 0.003984780050814152, 'timestamp': '2025-10-01 04:21:15.879689', 'step': 3547, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:15.913524', 'step': 3547, 'epoch': 2} {'type': 'loss', 'content': 0.008230150677263737, 'timestamp': '2025-10-01 04:21:15.945280', 'step': 3548, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:15.994457', 'step': 3548, 'epoch': 2} {'type': 'loss', 'content': 0.007387492805719376, 'timestamp': '2025-10-01 04:21:16.003609', 'step': 3549, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:16.045684', 'step': 3549, 'epoch': 2} {'type': 'loss', 'content': 0.005485896486788988, 'timestamp': '2025-10-01 04:21:16.057325', 'step': 3550, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:16.100543', 'step': 3550, 'epoch': 2} {'type': 'loss', 'content': 0.009977423585951328, 'timestamp': '2025-10-01 04:21:16.108604', 'step': 3551, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:16.176058', 'step': 3551, 'epoch': 2} {'type': 'loss', 'content': 0.016925528645515442, 'timestamp': '2025-10-01 04:21:16.211030', 'step': 3552, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:16.270352', 'step': 3552, 'epoch': 2} {'type': 'loss', 'content': 0.002512283157557249, 'timestamp': '2025-10-01 04:21:16.283748', 'step': 3553, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:16.338197', 'step': 3553, 'epoch': 2} {'type': 'loss', 'content': 0.0019997514318674803, 'timestamp': '2025-10-01 04:21:16.351010', 'step': 3554, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:16.392933', 'step': 3554, 'epoch': 2} {'type': 'loss', 'content': 0.004384097643196583, 'timestamp': '2025-10-01 04:21:16.403868', 'step': 3555, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:16.460498', 'step': 3555, 'epoch': 2} {'type': 'loss', 'content': 0.0030208348762243986, 'timestamp': '2025-10-01 04:21:16.493047', 'step': 3556, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:16.540468', 'step': 3556, 'epoch': 2} {'type': 'loss', 'content': 0.003076892113313079, 'timestamp': '2025-10-01 04:21:16.553834', 'step': 3557, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:16.604353', 'step': 3557, 'epoch': 2} {'type': 'loss', 'content': 0.004394651390612125, 'timestamp': '2025-10-01 04:21:16.618430', 'step': 3558, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:21:16.670208', 'step': 3558, 'epoch': 2} {'type': 'loss', 'content': 0.003945488017052412, 'timestamp': '2025-10-01 04:21:16.686779', 'step': 3559, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:16.736170', 'step': 3559, 'epoch': 2} {'type': 'loss', 'content': 0.0025650588795542717, 'timestamp': '2025-10-01 04:21:16.768817', 'step': 3560, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:16.813372', 'step': 3560, 'epoch': 2} {'type': 'loss', 'content': 0.0033456459641456604, 'timestamp': '2025-10-01 04:21:16.823959', 'step': 3561, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:16.870026', 'step': 3561, 'epoch': 2} {'type': 'loss', 'content': 0.0029470669105648994, 'timestamp': '2025-10-01 04:21:16.882764', 'step': 3562, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:16.925829', 'step': 3562, 'epoch': 2} {'type': 'loss', 'content': 0.004917456302791834, 'timestamp': '2025-10-01 04:21:16.936766', 'step': 3563, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:16.974292', 'step': 3563, 'epoch': 2} {'type': 'loss', 'content': 0.007657179608941078, 'timestamp': '2025-10-01 04:21:17.007980', 'step': 3564, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:17.048166', 'step': 3564, 'epoch': 2} {'type': 'loss', 'content': 0.008127033710479736, 'timestamp': '2025-10-01 04:21:17.057387', 'step': 3565, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:21:19.838012', 'step': 3565, 'epoch': 2} {'type': 'pplx', 'content': 5.852938150975257, 'timestamp': '2025-10-01 04:21:19.841618', 'step': 3565, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:19.888574', 'step': 3565, 'epoch': 2} {'type': 'loss', 'content': 0.011512434110045433, 'timestamp': '2025-10-01 04:21:19.901734', 'step': 3566, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:19.954246', 'step': 3566, 'epoch': 2} {'type': 'loss', 'content': 0.0009366077720187604, 'timestamp': '2025-10-01 04:21:19.967853', 'step': 3567, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:20.011426', 'step': 3567, 'epoch': 2} {'type': 'loss', 'content': 0.0028857439756393433, 'timestamp': '2025-10-01 04:21:20.045691', 'step': 3568, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:20.087141', 'step': 3568, 'epoch': 2} {'type': 'loss', 'content': 0.009246950969099998, 'timestamp': '2025-10-01 04:21:20.099633', 'step': 3569, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:20.145129', 'step': 3569, 'epoch': 2} {'type': 'loss', 'content': 0.005562372040003538, 'timestamp': '2025-10-01 04:21:20.158728', 'step': 3570, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:20.208855', 'step': 3570, 'epoch': 2} {'type': 'loss', 'content': 0.002368408953770995, 'timestamp': '2025-10-01 04:21:20.217193', 'step': 3571, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:20.258864', 'step': 3571, 'epoch': 2} {'type': 'loss', 'content': 0.0026756622828543186, 'timestamp': '2025-10-01 04:21:20.293746', 'step': 3572, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:20.332000', 'step': 3572, 'epoch': 2} {'type': 'loss', 'content': 0.0028482030611485243, 'timestamp': '2025-10-01 04:21:20.340606', 'step': 3573, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:20.382470', 'step': 3573, 'epoch': 2} {'type': 'loss', 'content': 0.00388348032720387, 'timestamp': '2025-10-01 04:21:20.389775', 'step': 3574, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:20.440003', 'step': 3574, 'epoch': 2} {'type': 'loss', 'content': 0.004181555937975645, 'timestamp': '2025-10-01 04:21:20.451427', 'step': 3575, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:20.492008', 'step': 3575, 'epoch': 2} {'type': 'loss', 'content': 0.0025545007083564997, 'timestamp': '2025-10-01 04:21:20.529234', 'step': 3576, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:20.563405', 'step': 3576, 'epoch': 2} {'type': 'loss', 'content': 0.012850345112383366, 'timestamp': '2025-10-01 04:21:20.575543', 'step': 3577, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:20.630883', 'step': 3577, 'epoch': 2} {'type': 'loss', 'content': 0.0023591588251292706, 'timestamp': '2025-10-01 04:21:20.643461', 'step': 3578, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:20.688149', 'step': 3578, 'epoch': 2} {'type': 'loss', 'content': 0.0021441220305860043, 'timestamp': '2025-10-01 04:21:20.700680', 'step': 3579, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:20.755275', 'step': 3579, 'epoch': 2} {'type': 'loss', 'content': 0.005983383860439062, 'timestamp': '2025-10-01 04:21:20.790277', 'step': 3580, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:20.836708', 'step': 3580, 'epoch': 2} {'type': 'loss', 'content': 0.007465463597327471, 'timestamp': '2025-10-01 04:21:20.850013', 'step': 3581, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:20.893218', 'step': 3581, 'epoch': 2} {'type': 'loss', 'content': 0.0031465000938624144, 'timestamp': '2025-10-01 04:21:20.900722', 'step': 3582, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:21:20.939704', 'step': 3582, 'epoch': 2} {'type': 'loss', 'content': 0.01339349802583456, 'timestamp': '2025-10-01 04:21:20.944177', 'step': 3583, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:21:20.985044', 'step': 3583, 'epoch': 2} {'type': 'loss', 'content': 0.001871186075732112, 'timestamp': '2025-10-01 04:21:21.011941', 'step': 3584, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:21.054178', 'step': 3584, 'epoch': 2} {'type': 'loss', 'content': 0.003986815921962261, 'timestamp': '2025-10-01 04:21:21.064366', 'step': 3585, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:21.104688', 'step': 3585, 'epoch': 2} {'type': 'loss', 'content': 0.0018133093835785985, 'timestamp': '2025-10-01 04:21:21.115628', 'step': 3586, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:21.157082', 'step': 3586, 'epoch': 2} {'type': 'loss', 'content': 0.0017748401733115315, 'timestamp': '2025-10-01 04:21:21.167870', 'step': 3587, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:21.210181', 'step': 3587, 'epoch': 2} {'type': 'loss', 'content': 0.006242212373763323, 'timestamp': '2025-10-01 04:21:21.239134', 'step': 3588, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:21.290534', 'step': 3588, 'epoch': 2} {'type': 'loss', 'content': 0.0009158995817415416, 'timestamp': '2025-10-01 04:21:21.301327', 'step': 3589, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:21.353059', 'step': 3589, 'epoch': 2} {'type': 'loss', 'content': 0.0027336899656802416, 'timestamp': '2025-10-01 04:21:21.361351', 'step': 3590, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:21.400272', 'step': 3590, 'epoch': 2} {'type': 'loss', 'content': 0.004959358833730221, 'timestamp': '2025-10-01 04:21:21.413056', 'step': 3591, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:21.467288', 'step': 3591, 'epoch': 2} {'type': 'loss', 'content': 0.003454524790868163, 'timestamp': '2025-10-01 04:21:21.501032', 'step': 3592, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:21.538315', 'step': 3592, 'epoch': 2} {'type': 'loss', 'content': 0.03133634850382805, 'timestamp': '2025-10-01 04:21:21.547532', 'step': 3593, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:21.589849', 'step': 3593, 'epoch': 2} {'type': 'loss', 'content': 0.007856866344809532, 'timestamp': '2025-10-01 04:21:21.603388', 'step': 3594, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:21.648374', 'step': 3594, 'epoch': 2} {'type': 'loss', 'content': 0.016919678077101707, 'timestamp': '2025-10-01 04:21:21.656611', 'step': 3595, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:21.692845', 'step': 3595, 'epoch': 2} {'type': 'loss', 'content': 0.00734949903562665, 'timestamp': '2025-10-01 04:21:21.724675', 'step': 3596, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:21.779715', 'step': 3596, 'epoch': 2} {'type': 'loss', 'content': 0.001403686124831438, 'timestamp': '2025-10-01 04:21:21.786525', 'step': 3597, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:21.843996', 'step': 3597, 'epoch': 2} {'type': 'loss', 'content': 0.008136210031807423, 'timestamp': '2025-10-01 04:21:21.852401', 'step': 3598, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:21.897572', 'step': 3598, 'epoch': 2} {'type': 'loss', 'content': 0.007268601097166538, 'timestamp': '2025-10-01 04:21:21.911176', 'step': 3599, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:21.956849', 'step': 3599, 'epoch': 2} {'type': 'loss', 'content': 0.008646498434245586, 'timestamp': '2025-10-01 04:21:21.989509', 'step': 3600, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:22.043572', 'step': 3600, 'epoch': 2} {'type': 'loss', 'content': 0.00513711292296648, 'timestamp': '2025-10-01 04:21:22.053290', 'step': 3601, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:22.113518', 'step': 3601, 'epoch': 2} {'type': 'loss', 'content': 0.005385591182857752, 'timestamp': '2025-10-01 04:21:22.127111', 'step': 3602, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-10-01 04:21:22.184231', 'step': 3602, 'epoch': 2} {'type': 'loss', 'content': 0.013840852305293083, 'timestamp': '2025-10-01 04:21:22.201947', 'step': 3603, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:22.246562', 'step': 3603, 'epoch': 2} {'type': 'loss', 'content': 0.00506893265992403, 'timestamp': '2025-10-01 04:21:22.281511', 'step': 3604, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:21:22.336014', 'step': 3604, 'epoch': 2} {'type': 'loss', 'content': 0.008357024751603603, 'timestamp': '2025-10-01 04:21:22.351337', 'step': 3605, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:22.395269', 'step': 3605, 'epoch': 2} {'type': 'loss', 'content': 0.02101152390241623, 'timestamp': '2025-10-01 04:21:22.408064', 'step': 3606, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:22.452863', 'step': 3606, 'epoch': 2} {'type': 'loss', 'content': 0.003687090240418911, 'timestamp': '2025-10-01 04:21:22.465644', 'step': 3607, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:22.532474', 'step': 3607, 'epoch': 2} {'type': 'loss', 'content': 0.0041946908459067345, 'timestamp': '2025-10-01 04:21:22.568192', 'step': 3608, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:22.631004', 'step': 3608, 'epoch': 2} {'type': 'loss', 'content': 0.005259398370981216, 'timestamp': '2025-10-01 04:21:22.644931', 'step': 3609, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:22.702842', 'step': 3609, 'epoch': 2} {'type': 'loss', 'content': 0.01051033940166235, 'timestamp': '2025-10-01 04:21:22.715372', 'step': 3610, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:22.765716', 'step': 3610, 'epoch': 2} {'type': 'loss', 'content': 0.004030001815408468, 'timestamp': '2025-10-01 04:21:22.779739', 'step': 3611, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:22.823265', 'step': 3611, 'epoch': 2} {'type': 'loss', 'content': 0.0038656650576740503, 'timestamp': '2025-10-01 04:21:22.855151', 'step': 3612, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:22.894607', 'step': 3612, 'epoch': 2} {'type': 'loss', 'content': 0.010231290943920612, 'timestamp': '2025-10-01 04:21:22.906080', 'step': 3613, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:22.959604', 'step': 3613, 'epoch': 2} {'type': 'loss', 'content': 0.009555571712553501, 'timestamp': '2025-10-01 04:21:22.973627', 'step': 3614, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:23.018260', 'step': 3614, 'epoch': 2} {'type': 'loss', 'content': 0.0036950737703591585, 'timestamp': '2025-10-01 04:21:23.031005', 'step': 3615, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:23.074531', 'step': 3615, 'epoch': 2} {'type': 'loss', 'content': 0.005088642239570618, 'timestamp': '2025-10-01 04:21:23.106278', 'step': 3616, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:23.145901', 'step': 3616, 'epoch': 2} {'type': 'loss', 'content': 0.007594781927764416, 'timestamp': '2025-10-01 04:21:23.155241', 'step': 3617, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:23.208962', 'step': 3617, 'epoch': 2} {'type': 'loss', 'content': 0.0033215859439224005, 'timestamp': '2025-10-01 04:21:23.220549', 'step': 3618, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:23.266783', 'step': 3618, 'epoch': 2} {'type': 'loss', 'content': 0.0033614691346883774, 'timestamp': '2025-10-01 04:21:23.275105', 'step': 3619, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:23.331907', 'step': 3619, 'epoch': 2} {'type': 'loss', 'content': 0.0020737173035740852, 'timestamp': '2025-10-01 04:21:23.366875', 'step': 3620, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:23.417332', 'step': 3620, 'epoch': 2} {'type': 'loss', 'content': 0.006450573913753033, 'timestamp': '2025-10-01 04:21:23.432948', 'step': 3621, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:23.485175', 'step': 3621, 'epoch': 2} {'type': 'loss', 'content': 0.004080698825418949, 'timestamp': '2025-10-01 04:21:23.499174', 'step': 3622, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:23.544479', 'step': 3622, 'epoch': 2} {'type': 'loss', 'content': 0.0026450776495039463, 'timestamp': '2025-10-01 04:21:23.558570', 'step': 3623, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:23.602423', 'step': 3623, 'epoch': 2} {'type': 'loss', 'content': 0.00608130544424057, 'timestamp': '2025-10-01 04:21:23.636180', 'step': 3624, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:23.672900', 'step': 3624, 'epoch': 2} {'type': 'loss', 'content': 0.01088705100119114, 'timestamp': '2025-10-01 04:21:23.683203', 'step': 3625, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:23.721592', 'step': 3625, 'epoch': 2} {'type': 'loss', 'content': 0.004070811439305544, 'timestamp': '2025-10-01 04:21:23.734346', 'step': 3626, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:23.790083', 'step': 3626, 'epoch': 2} {'type': 'loss', 'content': 0.00380131253041327, 'timestamp': '2025-10-01 04:21:23.806156', 'step': 3627, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:23.845052', 'step': 3627, 'epoch': 2} {'type': 'loss', 'content': 0.002175417961552739, 'timestamp': '2025-10-01 04:21:23.873982', 'step': 3628, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:21:23.926185', 'step': 3628, 'epoch': 2} {'type': 'loss', 'content': 0.013972057960927486, 'timestamp': '2025-10-01 04:21:23.942052', 'step': 3629, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:23.995513', 'step': 3629, 'epoch': 2} {'type': 'loss', 'content': 0.012729805894196033, 'timestamp': '2025-10-01 04:21:24.007125', 'step': 3630, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:24.059230', 'step': 3630, 'epoch': 2} {'type': 'loss', 'content': 0.0030193079728633165, 'timestamp': '2025-10-01 04:21:24.071015', 'step': 3631, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:24.118805', 'step': 3631, 'epoch': 2} {'type': 'loss', 'content': 0.007514115888625383, 'timestamp': '2025-10-01 04:21:24.150631', 'step': 3632, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:21:24.195281', 'step': 3632, 'epoch': 2} {'type': 'loss', 'content': 0.0022967944387346506, 'timestamp': '2025-10-01 04:21:24.211188', 'step': 3633, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:24.250889', 'step': 3633, 'epoch': 2} {'type': 'loss', 'content': 0.010498822666704655, 'timestamp': '2025-10-01 04:21:24.264934', 'step': 3634, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:24.315466', 'step': 3634, 'epoch': 2} {'type': 'loss', 'content': 0.006436318624764681, 'timestamp': '2025-10-01 04:21:24.329481', 'step': 3635, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:21:24.376360', 'step': 3635, 'epoch': 2} {'type': 'loss', 'content': 0.004777480382472277, 'timestamp': '2025-10-01 04:21:24.414772', 'step': 3636, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:24.450685', 'step': 3636, 'epoch': 2} {'type': 'loss', 'content': 0.011695281602442265, 'timestamp': '2025-10-01 04:21:24.459703', 'step': 3637, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:24.498123', 'step': 3637, 'epoch': 2} {'type': 'loss', 'content': 0.0035629987251013517, 'timestamp': '2025-10-01 04:21:24.510687', 'step': 3638, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:24.561360', 'step': 3638, 'epoch': 2} {'type': 'loss', 'content': 0.004373285453766584, 'timestamp': '2025-10-01 04:21:24.575308', 'step': 3639, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:24.627462', 'step': 3639, 'epoch': 2} {'type': 'loss', 'content': 0.002190364059060812, 'timestamp': '2025-10-01 04:21:24.662595', 'step': 3640, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:24.712335', 'step': 3640, 'epoch': 2} {'type': 'loss', 'content': 0.010532655753195286, 'timestamp': '2025-10-01 04:21:24.721552', 'step': 3641, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:24.773626', 'step': 3641, 'epoch': 2} {'type': 'loss', 'content': 0.0035297528374940157, 'timestamp': '2025-10-01 04:21:24.787150', 'step': 3642, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:24.830852', 'step': 3642, 'epoch': 2} {'type': 'loss', 'content': 0.007361642085015774, 'timestamp': '2025-10-01 04:21:24.844820', 'step': 3643, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:24.886666', 'step': 3643, 'epoch': 2} {'type': 'loss', 'content': 0.005647050216794014, 'timestamp': '2025-10-01 04:21:24.920189', 'step': 3644, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:24.961197', 'step': 3644, 'epoch': 2} {'type': 'loss', 'content': 0.007834755815565586, 'timestamp': '2025-10-01 04:21:24.971635', 'step': 3645, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:25.010243', 'step': 3645, 'epoch': 2} {'type': 'loss', 'content': 0.0030830081086605787, 'timestamp': '2025-10-01 04:21:25.021763', 'step': 3646, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:25.058213', 'step': 3646, 'epoch': 2} {'type': 'loss', 'content': 0.003513108240440488, 'timestamp': '2025-10-01 04:21:25.069087', 'step': 3647, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:25.108291', 'step': 3647, 'epoch': 2} {'type': 'loss', 'content': 0.006579185836017132, 'timestamp': '2025-10-01 04:21:25.140710', 'step': 3648, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:25.180251', 'step': 3648, 'epoch': 2} {'type': 'loss', 'content': 0.005885055288672447, 'timestamp': '2025-10-01 04:21:25.186095', 'step': 3649, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:25.228848', 'step': 3649, 'epoch': 2} {'type': 'loss', 'content': 0.003572633722797036, 'timestamp': '2025-10-01 04:21:25.235935', 'step': 3650, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:25.283853', 'step': 3650, 'epoch': 2} {'type': 'loss', 'content': 0.004030601587146521, 'timestamp': '2025-10-01 04:21:25.297345', 'step': 3651, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:25.339677', 'step': 3651, 'epoch': 2} {'type': 'loss', 'content': 0.023808851838111877, 'timestamp': '2025-10-01 04:21:25.373331', 'step': 3652, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:25.411720', 'step': 3652, 'epoch': 2} {'type': 'loss', 'content': 0.008621822111308575, 'timestamp': '2025-10-01 04:21:25.424756', 'step': 3653, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:25.463141', 'step': 3653, 'epoch': 2} {'type': 'loss', 'content': 0.011664393357932568, 'timestamp': '2025-10-01 04:21:25.475935', 'step': 3654, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:25.514457', 'step': 3654, 'epoch': 2} {'type': 'loss', 'content': 0.005966162774711847, 'timestamp': '2025-10-01 04:21:25.528004', 'step': 3655, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:25.564827', 'step': 3655, 'epoch': 2} {'type': 'loss', 'content': 0.011967264115810394, 'timestamp': '2025-10-01 04:21:25.598342', 'step': 3656, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:25.662457', 'step': 3656, 'epoch': 2} {'type': 'loss', 'content': 0.004900925327092409, 'timestamp': '2025-10-01 04:21:25.676003', 'step': 3657, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:25.711738', 'step': 3657, 'epoch': 2} {'type': 'loss', 'content': 0.0031652122270315886, 'timestamp': '2025-10-01 04:21:25.724271', 'step': 3658, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 17560600598464}, 'timestamp': '2025-10-01 04:21:25.788462', 'step': 3658, 'epoch': 2} {'type': 'loss', 'content': 0.0034585276152938604, 'timestamp': '2025-10-01 04:21:25.809642', 'step': 3659, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:25.851972', 'step': 3659, 'epoch': 2} {'type': 'loss', 'content': 0.00712274806573987, 'timestamp': '2025-10-01 04:21:25.886414', 'step': 3660, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:25.939372', 'step': 3660, 'epoch': 2} {'type': 'loss', 'content': 0.00122780108358711, 'timestamp': '2025-10-01 04:21:25.944311', 'step': 3661, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:25.996394', 'step': 3661, 'epoch': 2} {'type': 'loss', 'content': 0.0005794846219941974, 'timestamp': '2025-10-01 04:21:26.008968', 'step': 3662, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:26.052229', 'step': 3662, 'epoch': 2} {'type': 'loss', 'content': 0.002142628887668252, 'timestamp': '2025-10-01 04:21:26.065817', 'step': 3663, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:26.107459', 'step': 3663, 'epoch': 2} {'type': 'loss', 'content': 0.0008109890623018146, 'timestamp': '2025-10-01 04:21:26.141132', 'step': 3664, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:26.186585', 'step': 3664, 'epoch': 2} {'type': 'loss', 'content': 0.002143657999113202, 'timestamp': '2025-10-01 04:21:26.199399', 'step': 3665, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:26.235837', 'step': 3665, 'epoch': 2} {'type': 'loss', 'content': 0.010006860829889774, 'timestamp': '2025-10-01 04:21:26.247311', 'step': 3666, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:26.291815', 'step': 3666, 'epoch': 2} {'type': 'loss', 'content': 0.014249571599066257, 'timestamp': '2025-10-01 04:21:26.304348', 'step': 3667, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:26.352142', 'step': 3667, 'epoch': 2} {'type': 'loss', 'content': 0.021893944591283798, 'timestamp': '2025-10-01 04:21:26.387113', 'step': 3668, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:26.429533', 'step': 3668, 'epoch': 2} {'type': 'loss', 'content': 0.008589619770646095, 'timestamp': '2025-10-01 04:21:26.442371', 'step': 3669, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:26.485233', 'step': 3669, 'epoch': 2} {'type': 'loss', 'content': 0.009280269034206867, 'timestamp': '2025-10-01 04:21:26.498030', 'step': 3670, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:26.545597', 'step': 3670, 'epoch': 2} {'type': 'loss', 'content': 0.008715203031897545, 'timestamp': '2025-10-01 04:21:26.559630', 'step': 3671, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:26.602024', 'step': 3671, 'epoch': 2} {'type': 'loss', 'content': 0.0057623316533863544, 'timestamp': '2025-10-01 04:21:26.635544', 'step': 3672, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:26.679396', 'step': 3672, 'epoch': 2} {'type': 'loss', 'content': 0.018947254866361618, 'timestamp': '2025-10-01 04:21:26.689651', 'step': 3673, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:26.732261', 'step': 3673, 'epoch': 2} {'type': 'loss', 'content': 0.013083810918033123, 'timestamp': '2025-10-01 04:21:26.743869', 'step': 3674, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:26.790142', 'step': 3674, 'epoch': 2} {'type': 'loss', 'content': 0.026693126186728477, 'timestamp': '2025-10-01 04:21:26.800953', 'step': 3675, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:26.845035', 'step': 3675, 'epoch': 2} {'type': 'loss', 'content': 0.002509724348783493, 'timestamp': '2025-10-01 04:21:26.880151', 'step': 3676, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:26.925333', 'step': 3676, 'epoch': 2} {'type': 'loss', 'content': 0.0050446405075490475, 'timestamp': '2025-10-01 04:21:26.938654', 'step': 3677, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:21:26.992797', 'step': 3677, 'epoch': 2} {'type': 'loss', 'content': 0.007949820719659328, 'timestamp': '2025-10-01 04:21:27.010749', 'step': 3678, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:27.060589', 'step': 3678, 'epoch': 2} {'type': 'loss', 'content': 0.0069472696632146835, 'timestamp': '2025-10-01 04:21:27.082256', 'step': 3679, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:27.130799', 'step': 3679, 'epoch': 2} {'type': 'loss', 'content': 0.001824391190893948, 'timestamp': '2025-10-01 04:21:27.165292', 'step': 3680, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:21:29.854443', 'step': 3680, 'epoch': 2} {'type': 'pplx', 'content': 5.963162021021418, 'timestamp': '2025-10-01 04:21:29.868259', 'step': 3680, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:29.916133', 'step': 3680, 'epoch': 2} {'type': 'loss', 'content': 0.000695497845299542, 'timestamp': '2025-10-01 04:21:29.930544', 'step': 3681, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:29.980810', 'step': 3681, 'epoch': 2} {'type': 'loss', 'content': 0.005683867260813713, 'timestamp': '2025-10-01 04:21:29.994416', 'step': 3682, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:30.053061', 'step': 3682, 'epoch': 2} {'type': 'loss', 'content': 0.009371207095682621, 'timestamp': '2025-10-01 04:21:30.073346', 'step': 3683, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:30.135617', 'step': 3683, 'epoch': 2} {'type': 'loss', 'content': 0.006398250348865986, 'timestamp': '2025-10-01 04:21:30.170671', 'step': 3684, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:30.232448', 'step': 3684, 'epoch': 2} {'type': 'loss', 'content': 0.004283849149942398, 'timestamp': '2025-10-01 04:21:30.256677', 'step': 3685, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:30.313554', 'step': 3685, 'epoch': 2} {'type': 'loss', 'content': 0.010360103100538254, 'timestamp': '2025-10-01 04:21:30.326621', 'step': 3686, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:30.382428', 'step': 3686, 'epoch': 2} {'type': 'loss', 'content': 0.011265738867223263, 'timestamp': '2025-10-01 04:21:30.400131', 'step': 3687, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:30.450933', 'step': 3687, 'epoch': 2} {'type': 'loss', 'content': 0.013687366619706154, 'timestamp': '2025-10-01 04:21:30.488897', 'step': 3688, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:30.553875', 'step': 3688, 'epoch': 3} {'type': 'loss', 'content': 0.02698073908686638, 'timestamp': '2025-10-01 04:21:30.566756', 'step': 3689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:30.613231', 'step': 3689, 'epoch': 3} {'type': 'loss', 'content': 0.0046708970330655575, 'timestamp': '2025-10-01 04:21:30.630158', 'step': 3690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:30.677993', 'step': 3690, 'epoch': 3} {'type': 'loss', 'content': 0.010345847345888615, 'timestamp': '2025-10-01 04:21:30.691109', 'step': 3691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:30.744598', 'step': 3691, 'epoch': 3} {'type': 'loss', 'content': 0.004108492285013199, 'timestamp': '2025-10-01 04:21:30.777013', 'step': 3692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:30.842056', 'step': 3692, 'epoch': 3} {'type': 'loss', 'content': 0.005783985834568739, 'timestamp': '2025-10-01 04:21:30.856081', 'step': 3693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:30.912839', 'step': 3693, 'epoch': 3} {'type': 'loss', 'content': 0.0045420024544000626, 'timestamp': '2025-10-01 04:21:30.921958', 'step': 3694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:30.978372', 'step': 3694, 'epoch': 3} {'type': 'loss', 'content': 0.0038130097091197968, 'timestamp': '2025-10-01 04:21:30.992089', 'step': 3695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:31.046728', 'step': 3695, 'epoch': 3} {'type': 'loss', 'content': 0.0056780981831252575, 'timestamp': '2025-10-01 04:21:31.080396', 'step': 3696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:21:31.138908', 'step': 3696, 'epoch': 3} {'type': 'loss', 'content': 0.004821539390832186, 'timestamp': '2025-10-01 04:21:31.155809', 'step': 3697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:31.200806', 'step': 3697, 'epoch': 3} {'type': 'loss', 'content': 0.009917207062244415, 'timestamp': '2025-10-01 04:21:31.212384', 'step': 3698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:31.262475', 'step': 3698, 'epoch': 3} {'type': 'loss', 'content': 0.0021955675911158323, 'timestamp': '2025-10-01 04:21:31.276053', 'step': 3699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:31.331684', 'step': 3699, 'epoch': 3} {'type': 'loss', 'content': 0.004807520192116499, 'timestamp': '2025-10-01 04:21:31.366142', 'step': 3700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:31.410465', 'step': 3700, 'epoch': 3} {'type': 'loss', 'content': 0.006146770901978016, 'timestamp': '2025-10-01 04:21:31.417412', 'step': 3701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:31.474353', 'step': 3701, 'epoch': 3} {'type': 'loss', 'content': 0.008152900263667107, 'timestamp': '2025-10-01 04:21:31.482619', 'step': 3702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:31.531682', 'step': 3702, 'epoch': 3} {'type': 'loss', 'content': 0.009212502278387547, 'timestamp': '2025-10-01 04:21:31.543131', 'step': 3703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:31.592463', 'step': 3703, 'epoch': 3} {'type': 'loss', 'content': 0.01338356826454401, 'timestamp': '2025-10-01 04:21:31.627068', 'step': 3704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:31.659617', 'step': 3704, 'epoch': 3} {'type': 'loss', 'content': 0.012891402468085289, 'timestamp': '2025-10-01 04:21:31.668134', 'step': 3705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:31.710406', 'step': 3705, 'epoch': 3} {'type': 'loss', 'content': 0.00879686325788498, 'timestamp': '2025-10-01 04:21:31.723213', 'step': 3706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:31.772514', 'step': 3706, 'epoch': 3} {'type': 'loss', 'content': 0.009560851380228996, 'timestamp': '2025-10-01 04:21:31.780160', 'step': 3707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:31.826948', 'step': 3707, 'epoch': 3} {'type': 'loss', 'content': 0.005689837504178286, 'timestamp': '2025-10-01 04:21:31.856283', 'step': 3708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:31.894505', 'step': 3708, 'epoch': 3} {'type': 'loss', 'content': 0.01599104516208172, 'timestamp': '2025-10-01 04:21:31.901452', 'step': 3709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:31.945919', 'step': 3709, 'epoch': 3} {'type': 'loss', 'content': 0.0022669904865324497, 'timestamp': '2025-10-01 04:21:31.954208', 'step': 3710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:31.997872', 'step': 3710, 'epoch': 3} {'type': 'loss', 'content': 0.005011002998799086, 'timestamp': '2025-10-01 04:21:32.005416', 'step': 3711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:32.049199', 'step': 3711, 'epoch': 3} {'type': 'loss', 'content': 0.005658212583512068, 'timestamp': '2025-10-01 04:21:32.077835', 'step': 3712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:32.122571', 'step': 3712, 'epoch': 3} {'type': 'loss', 'content': 0.006361310835927725, 'timestamp': '2025-10-01 04:21:32.135900', 'step': 3713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:32.180037', 'step': 3713, 'epoch': 3} {'type': 'loss', 'content': 0.015064243227243423, 'timestamp': '2025-10-01 04:21:32.187553', 'step': 3714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:32.236903', 'step': 3714, 'epoch': 3} {'type': 'loss', 'content': 0.011290229856967926, 'timestamp': '2025-10-01 04:21:32.250936', 'step': 3715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:32.291078', 'step': 3715, 'epoch': 3} {'type': 'loss', 'content': 0.01136233564466238, 'timestamp': '2025-10-01 04:21:32.324793', 'step': 3716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:32.370107', 'step': 3716, 'epoch': 3} {'type': 'loss', 'content': 0.00702950544655323, 'timestamp': '2025-10-01 04:21:32.381230', 'step': 3717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:32.419744', 'step': 3717, 'epoch': 3} {'type': 'loss', 'content': 0.006085939705371857, 'timestamp': '2025-10-01 04:21:32.433387', 'step': 3718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:32.478342', 'step': 3718, 'epoch': 3} {'type': 'loss', 'content': 0.009192436002194881, 'timestamp': '2025-10-01 04:21:32.489725', 'step': 3719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:32.531822', 'step': 3719, 'epoch': 3} {'type': 'loss', 'content': 0.003275879193097353, 'timestamp': '2025-10-01 04:21:32.561237', 'step': 3720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:32.600464', 'step': 3720, 'epoch': 3} {'type': 'loss', 'content': 0.0040464578196406364, 'timestamp': '2025-10-01 04:21:32.613837', 'step': 3721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:32.663252', 'step': 3721, 'epoch': 3} {'type': 'loss', 'content': 0.006304399576038122, 'timestamp': '2025-10-01 04:21:32.675874', 'step': 3722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:32.727133', 'step': 3722, 'epoch': 3} {'type': 'loss', 'content': 0.01022473443299532, 'timestamp': '2025-10-01 04:21:32.741190', 'step': 3723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:32.793563', 'step': 3723, 'epoch': 3} {'type': 'loss', 'content': 0.013354051858186722, 'timestamp': '2025-10-01 04:21:32.828506', 'step': 3724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:32.875037', 'step': 3724, 'epoch': 3} {'type': 'loss', 'content': 0.0020749038085341454, 'timestamp': '2025-10-01 04:21:32.883862', 'step': 3725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:32.931702', 'step': 3725, 'epoch': 3} {'type': 'loss', 'content': 0.004935049451887608, 'timestamp': '2025-10-01 04:21:32.943192', 'step': 3726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:32.980697', 'step': 3726, 'epoch': 3} {'type': 'loss', 'content': 0.007339076604694128, 'timestamp': '2025-10-01 04:21:32.988037', 'step': 3727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:33.029499', 'step': 3727, 'epoch': 3} {'type': 'loss', 'content': 0.005236686673015356, 'timestamp': '2025-10-01 04:21:33.058511', 'step': 3728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:33.096713', 'step': 3728, 'epoch': 3} {'type': 'loss', 'content': 0.01565227471292019, 'timestamp': '2025-10-01 04:21:33.103879', 'step': 3729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:33.145611', 'step': 3729, 'epoch': 3} {'type': 'loss', 'content': 0.009821142069995403, 'timestamp': '2025-10-01 04:21:33.156334', 'step': 3730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:21:33.213401', 'step': 3730, 'epoch': 3} {'type': 'loss', 'content': 0.008819508366286755, 'timestamp': '2025-10-01 04:21:33.231396', 'step': 3731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:33.271763', 'step': 3731, 'epoch': 3} {'type': 'loss', 'content': 0.005135006736963987, 'timestamp': '2025-10-01 04:21:33.306698', 'step': 3732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:33.350979', 'step': 3732, 'epoch': 3} {'type': 'loss', 'content': 0.009660299867391586, 'timestamp': '2025-10-01 04:21:33.366566', 'step': 3733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:33.412547', 'step': 3733, 'epoch': 3} {'type': 'loss', 'content': 0.006446721963584423, 'timestamp': '2025-10-01 04:21:33.425235', 'step': 3734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:33.474444', 'step': 3734, 'epoch': 3} {'type': 'loss', 'content': 0.006669782102108002, 'timestamp': '2025-10-01 04:21:33.485787', 'step': 3735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:33.532446', 'step': 3735, 'epoch': 3} {'type': 'loss', 'content': 0.011971898376941681, 'timestamp': '2025-10-01 04:21:33.567775', 'step': 3736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:33.615152', 'step': 3736, 'epoch': 3} {'type': 'loss', 'content': 0.009500562213361263, 'timestamp': '2025-10-01 04:21:33.621901', 'step': 3737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:33.661536', 'step': 3737, 'epoch': 3} {'type': 'loss', 'content': 0.01449498999863863, 'timestamp': '2025-10-01 04:21:33.673082', 'step': 3738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:33.716470', 'step': 3738, 'epoch': 3} {'type': 'loss', 'content': 0.005854697898030281, 'timestamp': '2025-10-01 04:21:33.726925', 'step': 3739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:33.773989', 'step': 3739, 'epoch': 3} {'type': 'loss', 'content': 0.006733085494488478, 'timestamp': '2025-10-01 04:21:33.808621', 'step': 3740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:33.847854', 'step': 3740, 'epoch': 3} {'type': 'loss', 'content': 0.006285593844950199, 'timestamp': '2025-10-01 04:21:33.861192', 'step': 3741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:33.902849', 'step': 3741, 'epoch': 3} {'type': 'loss', 'content': 0.007295776624232531, 'timestamp': '2025-10-01 04:21:33.914447', 'step': 3742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:33.959178', 'step': 3742, 'epoch': 3} {'type': 'loss', 'content': 0.007840866222977638, 'timestamp': '2025-10-01 04:21:33.970729', 'step': 3743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:34.007901', 'step': 3743, 'epoch': 3} {'type': 'loss', 'content': 0.004712915979325771, 'timestamp': '2025-10-01 04:21:34.041208', 'step': 3744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:34.079623', 'step': 3744, 'epoch': 3} {'type': 'loss', 'content': 0.008143024519085884, 'timestamp': '2025-10-01 04:21:34.090682', 'step': 3745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:34.126004', 'step': 3745, 'epoch': 3} {'type': 'loss', 'content': 0.008052135817706585, 'timestamp': '2025-10-01 04:21:34.137610', 'step': 3746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:34.179339', 'step': 3746, 'epoch': 3} {'type': 'loss', 'content': 0.004795056767761707, 'timestamp': '2025-10-01 04:21:34.188940', 'step': 3747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:34.232695', 'step': 3747, 'epoch': 3} {'type': 'loss', 'content': 0.006835413631051779, 'timestamp': '2025-10-01 04:21:34.264163', 'step': 3748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:34.303281', 'step': 3748, 'epoch': 3} {'type': 'loss', 'content': 0.004591739736497402, 'timestamp': '2025-10-01 04:21:34.313211', 'step': 3749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:34.358314', 'step': 3749, 'epoch': 3} {'type': 'loss', 'content': 0.005950805265456438, 'timestamp': '2025-10-01 04:21:34.366677', 'step': 3750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:34.420628', 'step': 3750, 'epoch': 3} {'type': 'loss', 'content': 0.010333477519452572, 'timestamp': '2025-10-01 04:21:34.433260', 'step': 3751, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:34.487343', 'step': 3751, 'epoch': 3} {'type': 'loss', 'content': 0.008263938128948212, 'timestamp': '2025-10-01 04:21:34.522337', 'step': 3752, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:34.564842', 'step': 3752, 'epoch': 3} {'type': 'loss', 'content': 0.009499086067080498, 'timestamp': '2025-10-01 04:21:34.578417', 'step': 3753, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:34.628531', 'step': 3753, 'epoch': 3} {'type': 'loss', 'content': 0.011588684283196926, 'timestamp': '2025-10-01 04:21:34.636272', 'step': 3754, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:34.681867', 'step': 3754, 'epoch': 3} {'type': 'loss', 'content': 0.010373840108513832, 'timestamp': '2025-10-01 04:21:34.694413', 'step': 3755, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:34.734016', 'step': 3755, 'epoch': 3} {'type': 'loss', 'content': 0.006424376741051674, 'timestamp': '2025-10-01 04:21:34.767683', 'step': 3756, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:34.813916', 'step': 3756, 'epoch': 3} {'type': 'loss', 'content': 0.008625333197414875, 'timestamp': '2025-10-01 04:21:34.823268', 'step': 3757, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:34.868030', 'step': 3757, 'epoch': 3} {'type': 'loss', 'content': 0.006540013011544943, 'timestamp': '2025-10-01 04:21:34.882087', 'step': 3758, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:34.920905', 'step': 3758, 'epoch': 3} {'type': 'loss', 'content': 0.006726275198161602, 'timestamp': '2025-10-01 04:21:34.929514', 'step': 3759, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:34.975986', 'step': 3759, 'epoch': 3} {'type': 'loss', 'content': 0.0029816841706633568, 'timestamp': '2025-10-01 04:21:35.009532', 'step': 3760, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:35.046163', 'step': 3760, 'epoch': 3} {'type': 'loss', 'content': 0.006205371581017971, 'timestamp': '2025-10-01 04:21:35.054479', 'step': 3761, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:35.097401', 'step': 3761, 'epoch': 3} {'type': 'loss', 'content': 0.0077186767011880875, 'timestamp': '2025-10-01 04:21:35.110194', 'step': 3762, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:35.146150', 'step': 3762, 'epoch': 3} {'type': 'loss', 'content': 0.005855043418705463, 'timestamp': '2025-10-01 04:21:35.157474', 'step': 3763, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:35.194232', 'step': 3763, 'epoch': 3} {'type': 'loss', 'content': 0.0077878618612885475, 'timestamp': '2025-10-01 04:21:35.225912', 'step': 3764, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:35.269971', 'step': 3764, 'epoch': 3} {'type': 'loss', 'content': 0.006303573027253151, 'timestamp': '2025-10-01 04:21:35.280529', 'step': 3765, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:35.318249', 'step': 3765, 'epoch': 3} {'type': 'loss', 'content': 0.005393424071371555, 'timestamp': '2025-10-01 04:21:35.330965', 'step': 3766, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:35.366553', 'step': 3766, 'epoch': 3} {'type': 'loss', 'content': 0.012819443829357624, 'timestamp': '2025-10-01 04:21:35.378182', 'step': 3767, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:35.436951', 'step': 3767, 'epoch': 3} {'type': 'loss', 'content': 0.013666943646967411, 'timestamp': '2025-10-01 04:21:35.466812', 'step': 3768, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:35.510428', 'step': 3768, 'epoch': 3} {'type': 'loss', 'content': 0.007845673710107803, 'timestamp': '2025-10-01 04:21:35.520058', 'step': 3769, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:35.563547', 'step': 3769, 'epoch': 3} {'type': 'loss', 'content': 0.005483326967805624, 'timestamp': '2025-10-01 04:21:35.571855', 'step': 3770, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:35.619842', 'step': 3770, 'epoch': 3} {'type': 'loss', 'content': 0.005489562638103962, 'timestamp': '2025-10-01 04:21:35.633362', 'step': 3771, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:21:35.680356', 'step': 3771, 'epoch': 3} {'type': 'loss', 'content': 0.004718263167887926, 'timestamp': '2025-10-01 04:21:35.717637', 'step': 3772, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:35.759175', 'step': 3772, 'epoch': 3} {'type': 'loss', 'content': 0.0102782491594553, 'timestamp': '2025-10-01 04:21:35.772040', 'step': 3773, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:35.816850', 'step': 3773, 'epoch': 3} {'type': 'loss', 'content': 0.0056166634894907475, 'timestamp': '2025-10-01 04:21:35.830833', 'step': 3774, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:35.876251', 'step': 3774, 'epoch': 3} {'type': 'loss', 'content': 0.0023278058506548405, 'timestamp': '2025-10-01 04:21:35.890255', 'step': 3775, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:35.932893', 'step': 3775, 'epoch': 3} {'type': 'loss', 'content': 0.005771087482571602, 'timestamp': '2025-10-01 04:21:35.967421', 'step': 3776, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:36.006624', 'step': 3776, 'epoch': 3} {'type': 'loss', 'content': 0.009895231574773788, 'timestamp': '2025-10-01 04:21:36.020008', 'step': 3777, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:36.062155', 'step': 3777, 'epoch': 3} {'type': 'loss', 'content': 0.0039160228334367275, 'timestamp': '2025-10-01 04:21:36.071177', 'step': 3778, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:36.113427', 'step': 3778, 'epoch': 3} {'type': 'loss', 'content': 0.0038676320109516382, 'timestamp': '2025-10-01 04:21:36.124239', 'step': 3779, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:36.163061', 'step': 3779, 'epoch': 3} {'type': 'loss', 'content': 0.006585745140910149, 'timestamp': '2025-10-01 04:21:36.192271', 'step': 3780, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:36.232399', 'step': 3780, 'epoch': 3} {'type': 'loss', 'content': 0.012533853761851788, 'timestamp': '2025-10-01 04:21:36.240931', 'step': 3781, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:36.283626', 'step': 3781, 'epoch': 3} {'type': 'loss', 'content': 0.005867802072316408, 'timestamp': '2025-10-01 04:21:36.295458', 'step': 3782, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:36.338333', 'step': 3782, 'epoch': 3} {'type': 'loss', 'content': 0.008939079940319061, 'timestamp': '2025-10-01 04:21:36.352389', 'step': 3783, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:36.393012', 'step': 3783, 'epoch': 3} {'type': 'loss', 'content': 0.006224760785698891, 'timestamp': '2025-10-01 04:21:36.426851', 'step': 3784, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:36.470256', 'step': 3784, 'epoch': 3} {'type': 'loss', 'content': 0.012876718305051327, 'timestamp': '2025-10-01 04:21:36.478602', 'step': 3785, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:36.525157', 'step': 3785, 'epoch': 3} {'type': 'loss', 'content': 0.0026953043416142464, 'timestamp': '2025-10-01 04:21:36.537952', 'step': 3786, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:36.588864', 'step': 3786, 'epoch': 3} {'type': 'loss', 'content': 0.01639711670577526, 'timestamp': '2025-10-01 04:21:36.602897', 'step': 3787, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:36.653189', 'step': 3787, 'epoch': 3} {'type': 'loss', 'content': 0.00399809330701828, 'timestamp': '2025-10-01 04:21:36.687729', 'step': 3788, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:36.729464', 'step': 3788, 'epoch': 3} {'type': 'loss', 'content': 0.001002951292321086, 'timestamp': '2025-10-01 04:21:36.742879', 'step': 3789, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:36.776423', 'step': 3789, 'epoch': 3} {'type': 'loss', 'content': 0.0017315756995230913, 'timestamp': '2025-10-01 04:21:36.789114', 'step': 3790, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:36.836524', 'step': 3790, 'epoch': 3} {'type': 'loss', 'content': 0.0022892530541867018, 'timestamp': '2025-10-01 04:21:36.849052', 'step': 3791, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:36.898333', 'step': 3791, 'epoch': 3} {'type': 'loss', 'content': 0.002275864128023386, 'timestamp': '2025-10-01 04:21:36.927176', 'step': 3792, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:36.967692', 'step': 3792, 'epoch': 3} {'type': 'loss', 'content': 0.0007483239751309156, 'timestamp': '2025-10-01 04:21:36.976006', 'step': 3793, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:37.025789', 'step': 3793, 'epoch': 3} {'type': 'loss', 'content': 0.002579609164968133, 'timestamp': '2025-10-01 04:21:37.039813', 'step': 3794, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:37.082909', 'step': 3794, 'epoch': 3} {'type': 'loss', 'content': 0.001384778879582882, 'timestamp': '2025-10-01 04:21:37.095607', 'step': 3795, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:21:39.719563', 'step': 3795, 'epoch': 3} {'type': 'pplx', 'content': 5.920190783568409, 'timestamp': '2025-10-01 04:21:39.723259', 'step': 3795, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:39.758842', 'step': 3795, 'epoch': 3} {'type': 'loss', 'content': 0.013693811371922493, 'timestamp': '2025-10-01 04:21:39.789390', 'step': 3796, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:39.835807', 'step': 3796, 'epoch': 3} {'type': 'loss', 'content': 0.0044151837937533855, 'timestamp': '2025-10-01 04:21:39.841991', 'step': 3797, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:39.886618', 'step': 3797, 'epoch': 3} {'type': 'loss', 'content': 0.017463091760873795, 'timestamp': '2025-10-01 04:21:39.899365', 'step': 3798, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:21:39.951060', 'step': 3798, 'epoch': 3} {'type': 'loss', 'content': 0.005681238137185574, 'timestamp': '2025-10-01 04:21:39.967308', 'step': 3799, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:40.010856', 'step': 3799, 'epoch': 3} {'type': 'loss', 'content': 0.010169201530516148, 'timestamp': '2025-10-01 04:21:40.044309', 'step': 3800, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:40.097992', 'step': 3800, 'epoch': 3} {'type': 'loss', 'content': 0.002855896484106779, 'timestamp': '2025-10-01 04:21:40.111573', 'step': 3801, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:40.160858', 'step': 3801, 'epoch': 3} {'type': 'loss', 'content': 0.00683237798511982, 'timestamp': '2025-10-01 04:21:40.169407', 'step': 3802, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:40.211530', 'step': 3802, 'epoch': 3} {'type': 'loss', 'content': 0.007955810986459255, 'timestamp': '2025-10-01 04:21:40.222454', 'step': 3803, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:40.275716', 'step': 3803, 'epoch': 3} {'type': 'loss', 'content': 0.003038405440747738, 'timestamp': '2025-10-01 04:21:40.312590', 'step': 3804, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:40.355137', 'step': 3804, 'epoch': 3} {'type': 'loss', 'content': 0.004363365471363068, 'timestamp': '2025-10-01 04:21:40.363237', 'step': 3805, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:40.407549', 'step': 3805, 'epoch': 3} {'type': 'loss', 'content': 0.008155022747814655, 'timestamp': '2025-10-01 04:21:40.420060', 'step': 3806, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:40.467602', 'step': 3806, 'epoch': 3} {'type': 'loss', 'content': 0.00979082565754652, 'timestamp': '2025-10-01 04:21:40.478958', 'step': 3807, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:40.531178', 'step': 3807, 'epoch': 3} {'type': 'loss', 'content': 0.005278818309307098, 'timestamp': '2025-10-01 04:21:40.566118', 'step': 3808, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:40.612866', 'step': 3808, 'epoch': 3} {'type': 'loss', 'content': 0.0034789559431374073, 'timestamp': '2025-10-01 04:21:40.626208', 'step': 3809, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:40.670309', 'step': 3809, 'epoch': 3} {'type': 'loss', 'content': 0.005489254370331764, 'timestamp': '2025-10-01 04:21:40.683054', 'step': 3810, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:40.725305', 'step': 3810, 'epoch': 3} {'type': 'loss', 'content': 0.00514249550178647, 'timestamp': '2025-10-01 04:21:40.739329', 'step': 3811, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:40.798819', 'step': 3811, 'epoch': 3} {'type': 'loss', 'content': 0.0048544760793447495, 'timestamp': '2025-10-01 04:21:40.835762', 'step': 3812, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:40.891441', 'step': 3812, 'epoch': 3} {'type': 'loss', 'content': 0.005835839547216892, 'timestamp': '2025-10-01 04:21:40.907080', 'step': 3813, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:21:40.965579', 'step': 3813, 'epoch': 3} {'type': 'loss', 'content': 0.008851534686982632, 'timestamp': '2025-10-01 04:21:40.981420', 'step': 3814, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:41.024478', 'step': 3814, 'epoch': 3} {'type': 'loss', 'content': 0.006555199157446623, 'timestamp': '2025-10-01 04:21:41.035865', 'step': 3815, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 16611393146432}, 'timestamp': '2025-10-01 04:21:41.102185', 'step': 3815, 'epoch': 3} {'type': 'loss', 'content': 0.009754362516105175, 'timestamp': '2025-10-01 04:21:41.142629', 'step': 3816, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:21:41.194622', 'step': 3816, 'epoch': 3} {'type': 'loss', 'content': 0.00460690027102828, 'timestamp': '2025-10-01 04:21:41.211428', 'step': 3817, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:41.262755', 'step': 3817, 'epoch': 3} {'type': 'loss', 'content': 0.0028491229750216007, 'timestamp': '2025-10-01 04:21:41.276912', 'step': 3818, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:41.320127', 'step': 3818, 'epoch': 3} {'type': 'loss', 'content': 0.009219329804182053, 'timestamp': '2025-10-01 04:21:41.332683', 'step': 3819, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:41.378682', 'step': 3819, 'epoch': 3} {'type': 'loss', 'content': 0.007629296276718378, 'timestamp': '2025-10-01 04:21:41.413155', 'step': 3820, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:21:41.468734', 'step': 3820, 'epoch': 3} {'type': 'loss', 'content': 0.002886310452595353, 'timestamp': '2025-10-01 04:21:41.485476', 'step': 3821, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-10-01 04:21:41.532412', 'step': 3821, 'epoch': 3} {'type': 'loss', 'content': 0.005417125299572945, 'timestamp': '2025-10-01 04:21:41.550196', 'step': 3822, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:41.601952', 'step': 3822, 'epoch': 3} {'type': 'loss', 'content': 0.011424657888710499, 'timestamp': '2025-10-01 04:21:41.615929', 'step': 3823, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:41.652992', 'step': 3823, 'epoch': 3} {'type': 'loss', 'content': 0.004510477185249329, 'timestamp': '2025-10-01 04:21:41.681523', 'step': 3824, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:41.721316', 'step': 3824, 'epoch': 3} {'type': 'loss', 'content': 0.0035503145772963762, 'timestamp': '2025-10-01 04:21:41.729697', 'step': 3825, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:41.767601', 'step': 3825, 'epoch': 3} {'type': 'loss', 'content': 0.006845823023468256, 'timestamp': '2025-10-01 04:21:41.780405', 'step': 3826, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:41.826859', 'step': 3826, 'epoch': 3} {'type': 'loss', 'content': 0.005940115079283714, 'timestamp': '2025-10-01 04:21:41.838445', 'step': 3827, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:41.878045', 'step': 3827, 'epoch': 3} {'type': 'loss', 'content': 0.012975228019058704, 'timestamp': '2025-10-01 04:21:41.907117', 'step': 3828, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:41.945948', 'step': 3828, 'epoch': 3} {'type': 'loss', 'content': 0.003947144839912653, 'timestamp': '2025-10-01 04:21:41.952461', 'step': 3829, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:42.003014', 'step': 3829, 'epoch': 3} {'type': 'loss', 'content': 0.012637635692954063, 'timestamp': '2025-10-01 04:21:42.016522', 'step': 3830, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:21:42.070785', 'step': 3830, 'epoch': 3} {'type': 'loss', 'content': 0.011283905245363712, 'timestamp': '2025-10-01 04:21:42.087180', 'step': 3831, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:42.132436', 'step': 3831, 'epoch': 3} {'type': 'loss', 'content': 0.005232776049524546, 'timestamp': '2025-10-01 04:21:42.164646', 'step': 3832, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:42.215384', 'step': 3832, 'epoch': 3} {'type': 'loss', 'content': 0.009843096137046814, 'timestamp': '2025-10-01 04:21:42.225119', 'step': 3833, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:42.271044', 'step': 3833, 'epoch': 3} {'type': 'loss', 'content': 0.008104391396045685, 'timestamp': '2025-10-01 04:21:42.279678', 'step': 3834, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:42.321441', 'step': 3834, 'epoch': 3} {'type': 'loss', 'content': 0.007726176176220179, 'timestamp': '2025-10-01 04:21:42.328856', 'step': 3835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:42.375379', 'step': 3835, 'epoch': 3} {'type': 'loss', 'content': 0.007057244423776865, 'timestamp': '2025-10-01 04:21:42.403711', 'step': 3836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:42.443931', 'step': 3836, 'epoch': 3} {'type': 'loss', 'content': 0.0067809708416461945, 'timestamp': '2025-10-01 04:21:42.454882', 'step': 3837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:42.489476', 'step': 3837, 'epoch': 3} {'type': 'loss', 'content': 0.008209925144910812, 'timestamp': '2025-10-01 04:21:42.500830', 'step': 3838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:42.537301', 'step': 3838, 'epoch': 3} {'type': 'loss', 'content': 0.006265724077820778, 'timestamp': '2025-10-01 04:21:42.548624', 'step': 3839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:42.589848', 'step': 3839, 'epoch': 3} {'type': 'loss', 'content': 0.008741269819438457, 'timestamp': '2025-10-01 04:21:42.624925', 'step': 3840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:42.664600', 'step': 3840, 'epoch': 3} {'type': 'loss', 'content': 0.00582159636542201, 'timestamp': '2025-10-01 04:21:42.675352', 'step': 3841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:42.717408', 'step': 3841, 'epoch': 3} {'type': 'loss', 'content': 0.004883139859884977, 'timestamp': '2025-10-01 04:21:42.731436', 'step': 3842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:42.775086', 'step': 3842, 'epoch': 3} {'type': 'loss', 'content': 0.010044214315712452, 'timestamp': '2025-10-01 04:21:42.789094', 'step': 3843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:21:42.839051', 'step': 3843, 'epoch': 3} {'type': 'loss', 'content': 0.005302182864397764, 'timestamp': '2025-10-01 04:21:42.875812', 'step': 3844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:42.911737', 'step': 3844, 'epoch': 3} {'type': 'loss', 'content': 0.00982626061886549, 'timestamp': '2025-10-01 04:21:42.921894', 'step': 3845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:42.963763', 'step': 3845, 'epoch': 3} {'type': 'loss', 'content': 0.0033476154785603285, 'timestamp': '2025-10-01 04:21:42.977768', 'step': 3846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:43.021751', 'step': 3846, 'epoch': 3} {'type': 'loss', 'content': 0.008563864044845104, 'timestamp': '2025-10-01 04:21:43.034034', 'step': 3847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:43.080364', 'step': 3847, 'epoch': 3} {'type': 'loss', 'content': 0.0062912749126553535, 'timestamp': '2025-10-01 04:21:43.115525', 'step': 3848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:43.152918', 'step': 3848, 'epoch': 3} {'type': 'loss', 'content': 0.006278780288994312, 'timestamp': '2025-10-01 04:21:43.158884', 'step': 3849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:43.192175', 'step': 3849, 'epoch': 3} {'type': 'loss', 'content': 0.005297094117850065, 'timestamp': '2025-10-01 04:21:43.204911', 'step': 3850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:43.243986', 'step': 3850, 'epoch': 3} {'type': 'loss', 'content': 0.0068575600162148476, 'timestamp': '2025-10-01 04:21:43.255355', 'step': 3851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:43.294358', 'step': 3851, 'epoch': 3} {'type': 'loss', 'content': 0.007542989682406187, 'timestamp': '2025-10-01 04:21:43.328094', 'step': 3852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:43.376975', 'step': 3852, 'epoch': 3} {'type': 'loss', 'content': 0.009979772381484509, 'timestamp': '2025-10-01 04:21:43.389277', 'step': 3853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:43.437197', 'step': 3853, 'epoch': 3} {'type': 'loss', 'content': 0.0062700598500669, 'timestamp': '2025-10-01 04:21:43.447884', 'step': 3854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:43.491478', 'step': 3854, 'epoch': 3} {'type': 'loss', 'content': 0.005252262577414513, 'timestamp': '2025-10-01 04:21:43.503981', 'step': 3855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:43.543890', 'step': 3855, 'epoch': 3} {'type': 'loss', 'content': 0.00958793330937624, 'timestamp': '2025-10-01 04:21:43.573376', 'step': 3856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:43.612186', 'step': 3856, 'epoch': 3} {'type': 'loss', 'content': 0.01129075139760971, 'timestamp': '2025-10-01 04:21:43.623667', 'step': 3857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:43.670545', 'step': 3857, 'epoch': 3} {'type': 'loss', 'content': 0.007336131762713194, 'timestamp': '2025-10-01 04:21:43.679074', 'step': 3858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:43.717229', 'step': 3858, 'epoch': 3} {'type': 'loss', 'content': 0.0012963026529178023, 'timestamp': '2025-10-01 04:21:43.731268', 'step': 3859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:43.776627', 'step': 3859, 'epoch': 3} {'type': 'loss', 'content': 0.002206779783591628, 'timestamp': '2025-10-01 04:21:43.805565', 'step': 3860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:43.848321', 'step': 3860, 'epoch': 3} {'type': 'loss', 'content': 0.006879172287881374, 'timestamp': '2025-10-01 04:21:43.858540', 'step': 3861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:43.892962', 'step': 3861, 'epoch': 3} {'type': 'loss', 'content': 0.007915803231298923, 'timestamp': '2025-10-01 04:21:43.903725', 'step': 3862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:43.945223', 'step': 3862, 'epoch': 3} {'type': 'loss', 'content': 0.007328977342694998, 'timestamp': '2025-10-01 04:21:43.952847', 'step': 3863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:43.993706', 'step': 3863, 'epoch': 3} {'type': 'loss', 'content': 0.00448673777282238, 'timestamp': '2025-10-01 04:21:44.027193', 'step': 3864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:44.070644', 'step': 3864, 'epoch': 3} {'type': 'loss', 'content': 0.005047598388046026, 'timestamp': '2025-10-01 04:21:44.083917', 'step': 3865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:44.119381', 'step': 3865, 'epoch': 3} {'type': 'loss', 'content': 0.007395552471280098, 'timestamp': '2025-10-01 04:21:44.129030', 'step': 3866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:21:44.180328', 'step': 3866, 'epoch': 3} {'type': 'loss', 'content': 0.001688328804448247, 'timestamp': '2025-10-01 04:21:44.196647', 'step': 3867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:44.242814', 'step': 3867, 'epoch': 3} {'type': 'loss', 'content': 0.006219188682734966, 'timestamp': '2025-10-01 04:21:44.277250', 'step': 3868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:44.320938', 'step': 3868, 'epoch': 3} {'type': 'loss', 'content': 0.006195826455950737, 'timestamp': '2025-10-01 04:21:44.333610', 'step': 3869, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:44.373005', 'step': 3869, 'epoch': 3} {'type': 'loss', 'content': 0.006788821890950203, 'timestamp': '2025-10-01 04:21:44.384609', 'step': 3870, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:44.433642', 'step': 3870, 'epoch': 3} {'type': 'loss', 'content': 0.010092318058013916, 'timestamp': '2025-10-01 04:21:44.441964', 'step': 3871, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:44.492750', 'step': 3871, 'epoch': 3} {'type': 'loss', 'content': 0.0028680639807134867, 'timestamp': '2025-10-01 04:21:44.526405', 'step': 3872, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:21:44.570566', 'step': 3872, 'epoch': 3} {'type': 'loss', 'content': 0.0071935043670237064, 'timestamp': '2025-10-01 04:21:44.574352', 'step': 3873, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:44.620352', 'step': 3873, 'epoch': 3} {'type': 'loss', 'content': 0.0031867825891822577, 'timestamp': '2025-10-01 04:21:44.631216', 'step': 3874, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:44.678838', 'step': 3874, 'epoch': 3} {'type': 'loss', 'content': 0.006500164046883583, 'timestamp': '2025-10-01 04:21:44.692837', 'step': 3875, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:44.744348', 'step': 3875, 'epoch': 3} {'type': 'loss', 'content': 0.009594959206879139, 'timestamp': '2025-10-01 04:21:44.773304', 'step': 3876, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:44.816451', 'step': 3876, 'epoch': 3} {'type': 'loss', 'content': 0.006768261082470417, 'timestamp': '2025-10-01 04:21:44.829317', 'step': 3877, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:44.872203', 'step': 3877, 'epoch': 3} {'type': 'loss', 'content': 0.005373574327677488, 'timestamp': '2025-10-01 04:21:44.884951', 'step': 3878, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:44.934324', 'step': 3878, 'epoch': 3} {'type': 'loss', 'content': 0.0041663614101707935, 'timestamp': '2025-10-01 04:21:44.950434', 'step': 3879, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:44.994517', 'step': 3879, 'epoch': 3} {'type': 'loss', 'content': 0.002968120388686657, 'timestamp': '2025-10-01 04:21:45.028179', 'step': 3880, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:45.071478', 'step': 3880, 'epoch': 3} {'type': 'loss', 'content': 0.006960436701774597, 'timestamp': '2025-10-01 04:21:45.080936', 'step': 3881, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:45.120441', 'step': 3881, 'epoch': 3} {'type': 'loss', 'content': 0.0015443864976987243, 'timestamp': '2025-10-01 04:21:45.132787', 'step': 3882, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:45.199756', 'step': 3882, 'epoch': 3} {'type': 'loss', 'content': 0.006900917273014784, 'timestamp': '2025-10-01 04:21:45.215716', 'step': 3883, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:45.264905', 'step': 3883, 'epoch': 3} {'type': 'loss', 'content': 0.006999525241553783, 'timestamp': '2025-10-01 04:21:45.299871', 'step': 3884, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:45.349060', 'step': 3884, 'epoch': 3} {'type': 'loss', 'content': 0.006143415812402964, 'timestamp': '2025-10-01 04:21:45.357490', 'step': 3885, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:45.400585', 'step': 3885, 'epoch': 3} {'type': 'loss', 'content': 0.004637688864022493, 'timestamp': '2025-10-01 04:21:45.411023', 'step': 3886, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:45.449394', 'step': 3886, 'epoch': 3} {'type': 'loss', 'content': 0.011323896236717701, 'timestamp': '2025-10-01 04:21:45.459810', 'step': 3887, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:45.501419', 'step': 3887, 'epoch': 3} {'type': 'loss', 'content': 0.003829208668321371, 'timestamp': '2025-10-01 04:21:45.533135', 'step': 3888, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:45.583456', 'step': 3888, 'epoch': 3} {'type': 'loss', 'content': 0.005798206198960543, 'timestamp': '2025-10-01 04:21:45.589416', 'step': 3889, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:45.635685', 'step': 3889, 'epoch': 3} {'type': 'loss', 'content': 0.004710289184004068, 'timestamp': '2025-10-01 04:21:45.649222', 'step': 3890, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:45.696700', 'step': 3890, 'epoch': 3} {'type': 'loss', 'content': 0.0029786215163767338, 'timestamp': '2025-10-01 04:21:45.710725', 'step': 3891, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:45.755461', 'step': 3891, 'epoch': 3} {'type': 'loss', 'content': 0.00428601261228323, 'timestamp': '2025-10-01 04:21:45.790420', 'step': 3892, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:45.838002', 'step': 3892, 'epoch': 3} {'type': 'loss', 'content': 0.005602055694907904, 'timestamp': '2025-10-01 04:21:45.850816', 'step': 3893, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:45.893076', 'step': 3893, 'epoch': 3} {'type': 'loss', 'content': 0.007251392118632793, 'timestamp': '2025-10-01 04:21:45.905588', 'step': 3894, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:45.954122', 'step': 3894, 'epoch': 3} {'type': 'loss', 'content': 0.008004963397979736, 'timestamp': '2025-10-01 04:21:45.967681', 'step': 3895, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:46.023122', 'step': 3895, 'epoch': 3} {'type': 'loss', 'content': 0.004561685957014561, 'timestamp': '2025-10-01 04:21:46.058101', 'step': 3896, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:46.100674', 'step': 3896, 'epoch': 3} {'type': 'loss', 'content': 0.008317393250763416, 'timestamp': '2025-10-01 04:21:46.109075', 'step': 3897, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:46.153693', 'step': 3897, 'epoch': 3} {'type': 'loss', 'content': 0.007390536367893219, 'timestamp': '2025-10-01 04:21:46.167699', 'step': 3898, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:46.212321', 'step': 3898, 'epoch': 3} {'type': 'loss', 'content': 0.0123180216178298, 'timestamp': '2025-10-01 04:21:46.220746', 'step': 3899, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:46.260549', 'step': 3899, 'epoch': 3} {'type': 'loss', 'content': 0.006009929347783327, 'timestamp': '2025-10-01 04:21:46.295024', 'step': 3900, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:46.346097', 'step': 3900, 'epoch': 3} {'type': 'loss', 'content': 0.004409028682857752, 'timestamp': '2025-10-01 04:21:46.356588', 'step': 3901, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:46.394353', 'step': 3901, 'epoch': 3} {'type': 'loss', 'content': 0.006092773750424385, 'timestamp': '2025-10-01 04:21:46.405816', 'step': 3902, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:46.448927', 'step': 3902, 'epoch': 3} {'type': 'loss', 'content': 0.005507196765393019, 'timestamp': '2025-10-01 04:21:46.461472', 'step': 3903, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:46.507055', 'step': 3903, 'epoch': 3} {'type': 'loss', 'content': 0.005520245525985956, 'timestamp': '2025-10-01 04:21:46.542073', 'step': 3904, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:46.586869', 'step': 3904, 'epoch': 3} {'type': 'loss', 'content': 0.0037713726051151752, 'timestamp': '2025-10-01 04:21:46.599740', 'step': 3905, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:46.643216', 'step': 3905, 'epoch': 3} {'type': 'loss', 'content': 0.006941889878362417, 'timestamp': '2025-10-01 04:21:46.655981', 'step': 3906, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:46.705418', 'step': 3906, 'epoch': 3} {'type': 'loss', 'content': 0.0026663634926080704, 'timestamp': '2025-10-01 04:21:46.719023', 'step': 3907, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:46.763006', 'step': 3907, 'epoch': 3} {'type': 'loss', 'content': 0.003992387559264898, 'timestamp': '2025-10-01 04:21:46.796397', 'step': 3908, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:46.840945', 'step': 3908, 'epoch': 3} {'type': 'loss', 'content': 0.00826653465628624, 'timestamp': '2025-10-01 04:21:46.853819', 'step': 3909, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:46.902317', 'step': 3909, 'epoch': 3} {'type': 'loss', 'content': 0.0022964158561080694, 'timestamp': '2025-10-01 04:21:46.913093', 'step': 3910, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:21:49.581386', 'step': 3910, 'epoch': 3} {'type': 'pplx', 'content': 5.839724768063334, 'timestamp': '2025-10-01 04:21:49.587038', 'step': 3910, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:49.625134', 'step': 3910, 'epoch': 3} {'type': 'loss', 'content': 0.004357689060270786, 'timestamp': '2025-10-01 04:21:49.637183', 'step': 3911, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:49.680635', 'step': 3911, 'epoch': 3} {'type': 'loss', 'content': 0.01052537839859724, 'timestamp': '2025-10-01 04:21:49.714948', 'step': 3912, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:49.761115', 'step': 3912, 'epoch': 3} {'type': 'loss', 'content': 0.007060512900352478, 'timestamp': '2025-10-01 04:21:49.772223', 'step': 3913, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:49.819071', 'step': 3913, 'epoch': 3} {'type': 'loss', 'content': 0.0053184786811470985, 'timestamp': '2025-10-01 04:21:49.830429', 'step': 3914, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:49.871800', 'step': 3914, 'epoch': 3} {'type': 'loss', 'content': 0.004902233369648457, 'timestamp': '2025-10-01 04:21:49.879767', 'step': 3915, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:21:49.949915', 'step': 3915, 'epoch': 3} {'type': 'loss', 'content': 0.004571522586047649, 'timestamp': '2025-10-01 04:21:49.988046', 'step': 3916, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:50.030004', 'step': 3916, 'epoch': 3} {'type': 'loss', 'content': 0.00507276551797986, 'timestamp': '2025-10-01 04:21:50.042837', 'step': 3917, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:50.080340', 'step': 3917, 'epoch': 3} {'type': 'loss', 'content': 0.01796695403754711, 'timestamp': '2025-10-01 04:21:50.092886', 'step': 3918, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:50.136215', 'step': 3918, 'epoch': 3} {'type': 'loss', 'content': 0.006453292444348335, 'timestamp': '2025-10-01 04:21:50.144532', 'step': 3919, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:50.185025', 'step': 3919, 'epoch': 3} {'type': 'loss', 'content': 0.007986822165548801, 'timestamp': '2025-10-01 04:21:50.218461', 'step': 3920, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:50.263588', 'step': 3920, 'epoch': 3} {'type': 'loss', 'content': 0.01686868630349636, 'timestamp': '2025-10-01 04:21:50.272080', 'step': 3921, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:50.325680', 'step': 3921, 'epoch': 3} {'type': 'loss', 'content': 0.005332557018846273, 'timestamp': '2025-10-01 04:21:50.338432', 'step': 3922, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:50.384800', 'step': 3922, 'epoch': 3} {'type': 'loss', 'content': 0.001777208992280066, 'timestamp': '2025-10-01 04:21:50.401817', 'step': 3923, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:50.460386', 'step': 3923, 'epoch': 3} {'type': 'loss', 'content': 0.003371636150404811, 'timestamp': '2025-10-01 04:21:50.494862', 'step': 3924, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:50.535330', 'step': 3924, 'epoch': 3} {'type': 'loss', 'content': 0.0061006611213088036, 'timestamp': '2025-10-01 04:21:50.544355', 'step': 3925, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:50.579750', 'step': 3925, 'epoch': 3} {'type': 'loss', 'content': 0.009890899062156677, 'timestamp': '2025-10-01 04:21:50.592233', 'step': 3926, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:50.635649', 'step': 3926, 'epoch': 3} {'type': 'loss', 'content': 0.010139167308807373, 'timestamp': '2025-10-01 04:21:50.648169', 'step': 3927, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:50.691135', 'step': 3927, 'epoch': 3} {'type': 'loss', 'content': 0.006483972072601318, 'timestamp': '2025-10-01 04:21:50.725583', 'step': 3928, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:50.772542', 'step': 3928, 'epoch': 3} {'type': 'loss', 'content': 0.014142896980047226, 'timestamp': '2025-10-01 04:21:50.785901', 'step': 3929, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:50.829046', 'step': 3929, 'epoch': 3} {'type': 'loss', 'content': 0.008921366184949875, 'timestamp': '2025-10-01 04:21:50.840511', 'step': 3930, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:50.879321', 'step': 3930, 'epoch': 3} {'type': 'loss', 'content': 0.003010173561051488, 'timestamp': '2025-10-01 04:21:50.890864', 'step': 3931, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:50.942106', 'step': 3931, 'epoch': 3} {'type': 'loss', 'content': 0.005212696269154549, 'timestamp': '2025-10-01 04:21:50.977115', 'step': 3932, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:51.020627', 'step': 3932, 'epoch': 3} {'type': 'loss', 'content': 0.014917776919901371, 'timestamp': '2025-10-01 04:21:51.034011', 'step': 3933, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:21:51.092716', 'step': 3933, 'epoch': 3} {'type': 'loss', 'content': 0.002226614858955145, 'timestamp': '2025-10-01 04:21:51.108456', 'step': 3934, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:51.159467', 'step': 3934, 'epoch': 3} {'type': 'loss', 'content': 0.002463407116010785, 'timestamp': '2025-10-01 04:21:51.172968', 'step': 3935, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:51.225145', 'step': 3935, 'epoch': 3} {'type': 'loss', 'content': 0.005115607753396034, 'timestamp': '2025-10-01 04:21:51.259479', 'step': 3936, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:51.303564', 'step': 3936, 'epoch': 3} {'type': 'loss', 'content': 0.005467408336699009, 'timestamp': '2025-10-01 04:21:51.317091', 'step': 3937, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:51.373972', 'step': 3937, 'epoch': 3} {'type': 'loss', 'content': 0.005886225029826164, 'timestamp': '2025-10-01 04:21:51.389886', 'step': 3938, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:51.437977', 'step': 3938, 'epoch': 3} {'type': 'loss', 'content': 0.003188173985108733, 'timestamp': '2025-10-01 04:21:51.452027', 'step': 3939, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:51.502988', 'step': 3939, 'epoch': 3} {'type': 'loss', 'content': 0.0029654563404619694, 'timestamp': '2025-10-01 04:21:51.539948', 'step': 3940, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:51.579780', 'step': 3940, 'epoch': 3} {'type': 'loss', 'content': 0.006373301148414612, 'timestamp': '2025-10-01 04:21:51.589919', 'step': 3941, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:51.637683', 'step': 3941, 'epoch': 3} {'type': 'loss', 'content': 0.005012688226997852, 'timestamp': '2025-10-01 04:21:51.651201', 'step': 3942, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:51.692531', 'step': 3942, 'epoch': 3} {'type': 'loss', 'content': 0.007268612273037434, 'timestamp': '2025-10-01 04:21:51.706590', 'step': 3943, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:51.742122', 'step': 3943, 'epoch': 3} {'type': 'loss', 'content': 0.0058811623603105545, 'timestamp': '2025-10-01 04:21:51.774594', 'step': 3944, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:51.826921', 'step': 3944, 'epoch': 3} {'type': 'loss', 'content': 0.005982053931802511, 'timestamp': '2025-10-01 04:21:51.840197', 'step': 3945, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:51.880793', 'step': 3945, 'epoch': 3} {'type': 'loss', 'content': 0.003882664255797863, 'timestamp': '2025-10-01 04:21:51.893256', 'step': 3946, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:51.947428', 'step': 3946, 'epoch': 3} {'type': 'loss', 'content': 0.011562162078917027, 'timestamp': '2025-10-01 04:21:51.961004', 'step': 3947, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:52.005759', 'step': 3947, 'epoch': 3} {'type': 'loss', 'content': 0.014229918830096722, 'timestamp': '2025-10-01 04:21:52.040914', 'step': 3948, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:21:52.100205', 'step': 3948, 'epoch': 3} {'type': 'loss', 'content': 0.005639976821839809, 'timestamp': '2025-10-01 04:21:52.117378', 'step': 3949, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:52.169105', 'step': 3949, 'epoch': 3} {'type': 'loss', 'content': 0.0014610253274440765, 'timestamp': '2025-10-01 04:21:52.182634', 'step': 3950, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:52.227394', 'step': 3950, 'epoch': 3} {'type': 'loss', 'content': 0.003693748265504837, 'timestamp': '2025-10-01 04:21:52.235538', 'step': 3951, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:52.277531', 'step': 3951, 'epoch': 3} {'type': 'loss', 'content': 0.00797701720148325, 'timestamp': '2025-10-01 04:21:52.309186', 'step': 3952, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:52.351572', 'step': 3952, 'epoch': 3} {'type': 'loss', 'content': 0.010840168222784996, 'timestamp': '2025-10-01 04:21:52.361781', 'step': 3953, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:52.409929', 'step': 3953, 'epoch': 3} {'type': 'loss', 'content': 0.0038654720410704613, 'timestamp': '2025-10-01 04:21:52.422665', 'step': 3954, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:52.462544', 'step': 3954, 'epoch': 3} {'type': 'loss', 'content': 0.004548898432403803, 'timestamp': '2025-10-01 04:21:52.476037', 'step': 3955, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:52.512299', 'step': 3955, 'epoch': 3} {'type': 'loss', 'content': 0.0032744272612035275, 'timestamp': '2025-10-01 04:21:52.546040', 'step': 3956, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:21:52.583152', 'step': 3956, 'epoch': 3} {'type': 'loss', 'content': 0.002379852579906583, 'timestamp': '2025-10-01 04:21:52.594192', 'step': 3957, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:52.643542', 'step': 3957, 'epoch': 3} {'type': 'loss', 'content': 0.005453142803162336, 'timestamp': '2025-10-01 04:21:52.657111', 'step': 3958, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:21:52.716188', 'step': 3958, 'epoch': 3} {'type': 'loss', 'content': 0.009838271886110306, 'timestamp': '2025-10-01 04:21:52.732688', 'step': 3959, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:52.792830', 'step': 3959, 'epoch': 3} {'type': 'loss', 'content': 0.003469709539785981, 'timestamp': '2025-10-01 04:21:52.829777', 'step': 3960, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:52.871554', 'step': 3960, 'epoch': 3} {'type': 'loss', 'content': 0.002748725935816765, 'timestamp': '2025-10-01 04:21:52.884959', 'step': 3961, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:52.929529', 'step': 3961, 'epoch': 3} {'type': 'loss', 'content': 0.0019808292854577303, 'timestamp': '2025-10-01 04:21:52.940249', 'step': 3962, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:21:52.995046', 'step': 3962, 'epoch': 3} {'type': 'loss', 'content': 0.003518739715218544, 'timestamp': '2025-10-01 04:21:53.011395', 'step': 3963, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:21:53.062969', 'step': 3963, 'epoch': 3} {'type': 'loss', 'content': 0.003578239120543003, 'timestamp': '2025-10-01 04:21:53.097948', 'step': 3964, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:53.149790', 'step': 3964, 'epoch': 3} {'type': 'loss', 'content': 0.0023242479655891657, 'timestamp': '2025-10-01 04:21:53.160145', 'step': 3965, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:21:53.212022', 'step': 3965, 'epoch': 3} {'type': 'loss', 'content': 0.004066049586981535, 'timestamp': '2025-10-01 04:21:53.221592', 'step': 3966, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:53.277215', 'step': 3966, 'epoch': 3} {'type': 'loss', 'content': 0.0036848580930382013, 'timestamp': '2025-10-01 04:21:53.288313', 'step': 3967, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:53.332085', 'step': 3967, 'epoch': 3} {'type': 'loss', 'content': 0.0013571695890277624, 'timestamp': '2025-10-01 04:21:53.363875', 'step': 3968, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:53.415870', 'step': 3968, 'epoch': 3} {'type': 'loss', 'content': 0.009095735847949982, 'timestamp': '2025-10-01 04:21:53.429159', 'step': 3969, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:53.473555', 'step': 3969, 'epoch': 3} {'type': 'loss', 'content': 0.001621878589503467, 'timestamp': '2025-10-01 04:21:53.484279', 'step': 3970, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:53.528042', 'step': 3970, 'epoch': 3} {'type': 'loss', 'content': 0.007703746203333139, 'timestamp': '2025-10-01 04:21:53.542211', 'step': 3971, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:53.587950', 'step': 3971, 'epoch': 3} {'type': 'loss', 'content': 0.0031901104375720024, 'timestamp': '2025-10-01 04:21:53.621372', 'step': 3972, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:21:53.668921', 'step': 3972, 'epoch': 3} {'type': 'loss', 'content': 0.009260383434593678, 'timestamp': '2025-10-01 04:21:53.684245', 'step': 3973, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:53.734412', 'step': 3973, 'epoch': 3} {'type': 'loss', 'content': 0.007291245274245739, 'timestamp': '2025-10-01 04:21:53.750425', 'step': 3974, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:53.793529', 'step': 3974, 'epoch': 3} {'type': 'loss', 'content': 0.0049786875024437904, 'timestamp': '2025-10-01 04:21:53.804890', 'step': 3975, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:53.849608', 'step': 3975, 'epoch': 3} {'type': 'loss', 'content': 0.006085373926907778, 'timestamp': '2025-10-01 04:21:53.881310', 'step': 3976, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:53.917948', 'step': 3976, 'epoch': 3} {'type': 'loss', 'content': 0.002700099488720298, 'timestamp': '2025-10-01 04:21:53.930335', 'step': 3977, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:53.976614', 'step': 3977, 'epoch': 3} {'type': 'loss', 'content': 0.008996649645268917, 'timestamp': '2025-10-01 04:21:53.990160', 'step': 3978, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:54.040684', 'step': 3978, 'epoch': 3} {'type': 'loss', 'content': 0.005770283751189709, 'timestamp': '2025-10-01 04:21:54.054225', 'step': 3979, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:54.101856', 'step': 3979, 'epoch': 3} {'type': 'loss', 'content': 0.007909988053143024, 'timestamp': '2025-10-01 04:21:54.135303', 'step': 3980, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:54.178924', 'step': 3980, 'epoch': 3} {'type': 'loss', 'content': 0.003248956985771656, 'timestamp': '2025-10-01 04:21:54.188262', 'step': 3981, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:21:54.228675', 'step': 3981, 'epoch': 3} {'type': 'loss', 'content': 0.0023408562410622835, 'timestamp': '2025-10-01 04:21:54.241223', 'step': 3982, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:54.280648', 'step': 3982, 'epoch': 3} {'type': 'loss', 'content': 0.0007053704466670752, 'timestamp': '2025-10-01 04:21:54.288210', 'step': 3983, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:54.336706', 'step': 3983, 'epoch': 3} {'type': 'loss', 'content': 0.005955597385764122, 'timestamp': '2025-10-01 04:21:54.373745', 'step': 3984, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:54.419799', 'step': 3984, 'epoch': 3} {'type': 'loss', 'content': 0.002642560051754117, 'timestamp': '2025-10-01 04:21:54.426117', 'step': 3985, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:21:54.470392', 'step': 3985, 'epoch': 3} {'type': 'loss', 'content': 0.003556240350008011, 'timestamp': '2025-10-01 04:21:54.477826', 'step': 3986, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:21:54.521288', 'step': 3986, 'epoch': 3} {'type': 'loss', 'content': 0.0020851942244917154, 'timestamp': '2025-10-01 04:21:54.532174', 'step': 3987, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:54.568563', 'step': 3987, 'epoch': 3} {'type': 'loss', 'content': 0.0054181222803890705, 'timestamp': '2025-10-01 04:21:54.601009', 'step': 3988, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:54.648388', 'step': 3988, 'epoch': 3} {'type': 'loss', 'content': 0.012517843395471573, 'timestamp': '2025-10-01 04:21:54.661909', 'step': 3989, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:54.705686', 'step': 3989, 'epoch': 3} {'type': 'loss', 'content': 0.003964308649301529, 'timestamp': '2025-10-01 04:21:54.714920', 'step': 3990, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:21:54.763495', 'step': 3990, 'epoch': 3} {'type': 'loss', 'content': 0.005212290212512016, 'timestamp': '2025-10-01 04:21:54.770940', 'step': 3991, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:21:54.808892', 'step': 3991, 'epoch': 3} {'type': 'loss', 'content': 0.002421075478196144, 'timestamp': '2025-10-01 04:21:54.841381', 'step': 3992, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:21:54.880518', 'step': 3992, 'epoch': 3} {'type': 'loss', 'content': 0.002844829112291336, 'timestamp': '2025-10-01 04:21:54.893399', 'step': 3993, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:54.935882', 'step': 3993, 'epoch': 3} {'type': 'loss', 'content': 0.005474701523780823, 'timestamp': '2025-10-01 04:21:54.943867', 'step': 3994, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:21:54.983647', 'step': 3994, 'epoch': 3} {'type': 'loss', 'content': 0.009113478474318981, 'timestamp': '2025-10-01 04:21:54.991460', 'step': 3995, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:55.046330', 'step': 3995, 'epoch': 3} {'type': 'loss', 'content': 0.0035934862680733204, 'timestamp': '2025-10-01 04:21:55.081165', 'step': 3996, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:21:55.126864', 'step': 3996, 'epoch': 3} {'type': 'loss', 'content': 0.009463516995310783, 'timestamp': '2025-10-01 04:21:55.142466', 'step': 3997, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:21:55.190808', 'step': 3997, 'epoch': 3} {'type': 'loss', 'content': 0.0054005784913897514, 'timestamp': '2025-10-01 04:21:55.207996', 'step': 3998, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:21:55.279606', 'step': 3998, 'epoch': 3} {'type': 'loss', 'content': 0.007956825196743011, 'timestamp': '2025-10-01 04:21:55.293606', 'step': 3999, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:21:55.340044', 'step': 3999, 'epoch': 3} {'type': 'loss', 'content': 0.0042536500841379166, 'timestamp': '2025-10-01 04:21:55.375215', 'step': 4000, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 4000', 'timestamp': '2025-10-01 04:22:01.048867', 'step': 4000, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:01.085732', 'step': 4000, 'epoch': 3} {'type': 'loss', 'content': 0.005294458009302616, 'timestamp': '2025-10-01 04:22:01.092460', 'step': 4001, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:01.138700', 'step': 4001, 'epoch': 3} {'type': 'loss', 'content': 0.00946174655109644, 'timestamp': '2025-10-01 04:22:01.152666', 'step': 4002, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:01.201605', 'step': 4002, 'epoch': 3} {'type': 'loss', 'content': 0.005702061578631401, 'timestamp': '2025-10-01 04:22:01.214168', 'step': 4003, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:01.274104', 'step': 4003, 'epoch': 3} {'type': 'loss', 'content': 0.0025700535625219345, 'timestamp': '2025-10-01 04:22:01.308632', 'step': 4004, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:01.369578', 'step': 4004, 'epoch': 3} {'type': 'loss', 'content': 0.0030462751165032387, 'timestamp': '2025-10-01 04:22:01.383197', 'step': 4005, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:01.434189', 'step': 4005, 'epoch': 3} {'type': 'loss', 'content': 0.003450506366789341, 'timestamp': '2025-10-01 04:22:01.447770', 'step': 4006, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:01.498841', 'step': 4006, 'epoch': 3} {'type': 'loss', 'content': 0.009651977568864822, 'timestamp': '2025-10-01 04:22:01.511570', 'step': 4007, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:01.568447', 'step': 4007, 'epoch': 3} {'type': 'loss', 'content': 0.005251831840723753, 'timestamp': '2025-10-01 04:22:01.603406', 'step': 4008, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:01.640920', 'step': 4008, 'epoch': 3} {'type': 'loss', 'content': 0.003374854801222682, 'timestamp': '2025-10-01 04:22:01.649277', 'step': 4009, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:01.707157', 'step': 4009, 'epoch': 3} {'type': 'loss', 'content': 0.004814501386135817, 'timestamp': '2025-10-01 04:22:01.719940', 'step': 4010, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:01.773823', 'step': 4010, 'epoch': 3} {'type': 'loss', 'content': 0.00974239595234394, 'timestamp': '2025-10-01 04:22:01.781421', 'step': 4011, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:01.832688', 'step': 4011, 'epoch': 3} {'type': 'loss', 'content': 0.009027737192809582, 'timestamp': '2025-10-01 04:22:01.861483', 'step': 4012, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:01.905734', 'step': 4012, 'epoch': 3} {'type': 'loss', 'content': 0.0016402070177718997, 'timestamp': '2025-10-01 04:22:01.912530', 'step': 4013, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:01.949156', 'step': 4013, 'epoch': 3} {'type': 'loss', 'content': 0.005706467665731907, 'timestamp': '2025-10-01 04:22:01.956657', 'step': 4014, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:02.001246', 'step': 4014, 'epoch': 3} {'type': 'loss', 'content': 0.0037947460077703, 'timestamp': '2025-10-01 04:22:02.013997', 'step': 4015, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:22:02.047914', 'step': 4015, 'epoch': 3} {'type': 'loss', 'content': 0.0011354503221809864, 'timestamp': '2025-10-01 04:22:02.074664', 'step': 4016, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:02.124923', 'step': 4016, 'epoch': 3} {'type': 'loss', 'content': 0.0029890364967286587, 'timestamp': '2025-10-01 04:22:02.138300', 'step': 4017, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:02.181187', 'step': 4017, 'epoch': 3} {'type': 'loss', 'content': 0.003457897575572133, 'timestamp': '2025-10-01 04:22:02.192095', 'step': 4018, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:02.246891', 'step': 4018, 'epoch': 3} {'type': 'loss', 'content': 0.0037044628988951445, 'timestamp': '2025-10-01 04:22:02.260910', 'step': 4019, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:22:02.318575', 'step': 4019, 'epoch': 3} {'type': 'loss', 'content': 0.006951749790459871, 'timestamp': '2025-10-01 04:22:02.355400', 'step': 4020, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:02.410862', 'step': 4020, 'epoch': 3} {'type': 'loss', 'content': 0.00672274362295866, 'timestamp': '2025-10-01 04:22:02.424251', 'step': 4021, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:02.466295', 'step': 4021, 'epoch': 3} {'type': 'loss', 'content': 0.004746371414512396, 'timestamp': '2025-10-01 04:22:02.479889', 'step': 4022, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:02.529176', 'step': 4022, 'epoch': 3} {'type': 'loss', 'content': 0.003702593268826604, 'timestamp': '2025-10-01 04:22:02.543370', 'step': 4023, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:02.592484', 'step': 4023, 'epoch': 3} {'type': 'loss', 'content': 0.003959006164222956, 'timestamp': '2025-10-01 04:22:02.626972', 'step': 4024, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:02.669562', 'step': 4024, 'epoch': 3} {'type': 'loss', 'content': 0.007566820364445448, 'timestamp': '2025-10-01 04:22:02.678869', 'step': 4025, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:22:05.400318', 'step': 4025, 'epoch': 3} {'type': 'pplx', 'content': 5.898660951229456, 'timestamp': '2025-10-01 04:22:05.405115', 'step': 4025, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:05.445082', 'step': 4025, 'epoch': 3} {'type': 'loss', 'content': 0.0020642371382564306, 'timestamp': '2025-10-01 04:22:05.458649', 'step': 4026, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:05.505938', 'step': 4026, 'epoch': 3} {'type': 'loss', 'content': 0.001972567057237029, 'timestamp': '2025-10-01 04:22:05.515049', 'step': 4027, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:05.558594', 'step': 4027, 'epoch': 3} {'type': 'loss', 'content': 0.002096639247611165, 'timestamp': '2025-10-01 04:22:05.587623', 'step': 4028, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:05.635148', 'step': 4028, 'epoch': 3} {'type': 'loss', 'content': 0.0028044255450367928, 'timestamp': '2025-10-01 04:22:05.643857', 'step': 4029, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:05.688211', 'step': 4029, 'epoch': 3} {'type': 'loss', 'content': 0.0021916457917541265, 'timestamp': '2025-10-01 04:22:05.701244', 'step': 4030, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:05.740819', 'step': 4030, 'epoch': 3} {'type': 'loss', 'content': 0.001907055382616818, 'timestamp': '2025-10-01 04:22:05.750678', 'step': 4031, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:05.791632', 'step': 4031, 'epoch': 3} {'type': 'loss', 'content': 0.007362797856330872, 'timestamp': '2025-10-01 04:22:05.823414', 'step': 4032, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:05.863479', 'step': 4032, 'epoch': 3} {'type': 'loss', 'content': 0.009867505170404911, 'timestamp': '2025-10-01 04:22:05.869272', 'step': 4033, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:05.907534', 'step': 4033, 'epoch': 3} {'type': 'loss', 'content': 0.0027310277801007032, 'timestamp': '2025-10-01 04:22:05.915709', 'step': 4034, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:05.961673', 'step': 4034, 'epoch': 3} {'type': 'loss', 'content': 0.00649982038885355, 'timestamp': '2025-10-01 04:22:05.969976', 'step': 4035, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:06.016963', 'step': 4035, 'epoch': 3} {'type': 'loss', 'content': 0.003822494763880968, 'timestamp': '2025-10-01 04:22:06.049482', 'step': 4036, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:06.094903', 'step': 4036, 'epoch': 3} {'type': 'loss', 'content': 0.007370095234364271, 'timestamp': '2025-10-01 04:22:06.108310', 'step': 4037, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:06.150348', 'step': 4037, 'epoch': 3} {'type': 'loss', 'content': 0.005830608308315277, 'timestamp': '2025-10-01 04:22:06.162090', 'step': 4038, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:06.203523', 'step': 4038, 'epoch': 3} {'type': 'loss', 'content': 0.006647130008786917, 'timestamp': '2025-10-01 04:22:06.215024', 'step': 4039, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:06.260461', 'step': 4039, 'epoch': 3} {'type': 'loss', 'content': 0.001578525290824473, 'timestamp': '2025-10-01 04:22:06.294958', 'step': 4040, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:06.336555', 'step': 4040, 'epoch': 3} {'type': 'loss', 'content': 0.005431648343801498, 'timestamp': '2025-10-01 04:22:06.349466', 'step': 4041, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:06.396047', 'step': 4041, 'epoch': 3} {'type': 'loss', 'content': 0.003575842594727874, 'timestamp': '2025-10-01 04:22:06.410074', 'step': 4042, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:06.455659', 'step': 4042, 'epoch': 3} {'type': 'loss', 'content': 0.007028443273156881, 'timestamp': '2025-10-01 04:22:06.469869', 'step': 4043, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:06.514688', 'step': 4043, 'epoch': 3} {'type': 'loss', 'content': 0.005563113372772932, 'timestamp': '2025-10-01 04:22:06.549235', 'step': 4044, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:06.590693', 'step': 4044, 'epoch': 3} {'type': 'loss', 'content': 0.001840126235038042, 'timestamp': '2025-10-01 04:22:06.601132', 'step': 4045, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:06.640446', 'step': 4045, 'epoch': 3} {'type': 'loss', 'content': 0.006613203324377537, 'timestamp': '2025-10-01 04:22:06.653033', 'step': 4046, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:06.700606', 'step': 4046, 'epoch': 3} {'type': 'loss', 'content': 0.0021520627196878195, 'timestamp': '2025-10-01 04:22:06.710380', 'step': 4047, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:22:06.766438', 'step': 4047, 'epoch': 3} {'type': 'loss', 'content': 0.004538105335086584, 'timestamp': '2025-10-01 04:22:06.803446', 'step': 4048, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:06.852428', 'step': 4048, 'epoch': 3} {'type': 'loss', 'content': 0.005180913023650646, 'timestamp': '2025-10-01 04:22:06.865786', 'step': 4049, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:06.911988', 'step': 4049, 'epoch': 3} {'type': 'loss', 'content': 0.0066529144532978535, 'timestamp': '2025-10-01 04:22:06.924819', 'step': 4050, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:22:06.975061', 'step': 4050, 'epoch': 3} {'type': 'loss', 'content': 0.008856681175529957, 'timestamp': '2025-10-01 04:22:06.991129', 'step': 4051, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:22:07.055386', 'step': 4051, 'epoch': 3} {'type': 'loss', 'content': 0.004478821065276861, 'timestamp': '2025-10-01 04:22:07.093910', 'step': 4052, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:07.146114', 'step': 4052, 'epoch': 3} {'type': 'loss', 'content': 0.008157907985150814, 'timestamp': '2025-10-01 04:22:07.157506', 'step': 4053, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:07.204848', 'step': 4053, 'epoch': 3} {'type': 'loss', 'content': 0.011038596741855145, 'timestamp': '2025-10-01 04:22:07.217612', 'step': 4054, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:07.261798', 'step': 4054, 'epoch': 3} {'type': 'loss', 'content': 0.0047259279526770115, 'timestamp': '2025-10-01 04:22:07.271016', 'step': 4055, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:07.311507', 'step': 4055, 'epoch': 3} {'type': 'loss', 'content': 0.008237821981310844, 'timestamp': '2025-10-01 04:22:07.344977', 'step': 4056, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:07.383856', 'step': 4056, 'epoch': 3} {'type': 'loss', 'content': 0.008062207140028477, 'timestamp': '2025-10-01 04:22:07.394645', 'step': 4057, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:07.434391', 'step': 4057, 'epoch': 3} {'type': 'loss', 'content': 0.005449914839118719, 'timestamp': '2025-10-01 04:22:07.442422', 'step': 4058, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:07.483700', 'step': 4058, 'epoch': 3} {'type': 'loss', 'content': 0.007627742365002632, 'timestamp': '2025-10-01 04:22:07.494702', 'step': 4059, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:07.538010', 'step': 4059, 'epoch': 3} {'type': 'loss', 'content': 0.010227284394204617, 'timestamp': '2025-10-01 04:22:07.569830', 'step': 4060, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:07.608156', 'step': 4060, 'epoch': 3} {'type': 'loss', 'content': 0.00814125593751669, 'timestamp': '2025-10-01 04:22:07.618402', 'step': 4061, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:07.659722', 'step': 4061, 'epoch': 3} {'type': 'loss', 'content': 0.005088137928396463, 'timestamp': '2025-10-01 04:22:07.673911', 'step': 4062, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:07.722326', 'step': 4062, 'epoch': 3} {'type': 'loss', 'content': 0.005096712149679661, 'timestamp': '2025-10-01 04:22:07.736329', 'step': 4063, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:07.776132', 'step': 4063, 'epoch': 3} {'type': 'loss', 'content': 0.0004924583481624722, 'timestamp': '2025-10-01 04:22:07.807583', 'step': 4064, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:07.850703', 'step': 4064, 'epoch': 3} {'type': 'loss', 'content': 0.0043891193345189095, 'timestamp': '2025-10-01 04:22:07.856363', 'step': 4065, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:07.899175', 'step': 4065, 'epoch': 3} {'type': 'loss', 'content': 0.009185709059238434, 'timestamp': '2025-10-01 04:22:07.911910', 'step': 4066, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:07.951584', 'step': 4066, 'epoch': 3} {'type': 'loss', 'content': 0.0029773563146591187, 'timestamp': '2025-10-01 04:22:07.959402', 'step': 4067, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:08.001402', 'step': 4067, 'epoch': 3} {'type': 'loss', 'content': 0.009250526316463947, 'timestamp': '2025-10-01 04:22:08.034780', 'step': 4068, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:08.079466', 'step': 4068, 'epoch': 3} {'type': 'loss', 'content': 0.0009133016574196517, 'timestamp': '2025-10-01 04:22:08.086213', 'step': 4069, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:08.132277', 'step': 4069, 'epoch': 3} {'type': 'loss', 'content': 0.009127314202487469, 'timestamp': '2025-10-01 04:22:08.144849', 'step': 4070, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:08.195806', 'step': 4070, 'epoch': 3} {'type': 'loss', 'content': 0.0042390376329422, 'timestamp': '2025-10-01 04:22:08.210711', 'step': 4071, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:08.270090', 'step': 4071, 'epoch': 3} {'type': 'loss', 'content': 0.0010975259356200695, 'timestamp': '2025-10-01 04:22:08.301364', 'step': 4072, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:08.340708', 'step': 4072, 'epoch': 3} {'type': 'loss', 'content': 0.006118766497820616, 'timestamp': '2025-10-01 04:22:08.349780', 'step': 4073, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:08.394819', 'step': 4073, 'epoch': 3} {'type': 'loss', 'content': 0.005468964111059904, 'timestamp': '2025-10-01 04:22:08.402847', 'step': 4074, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:08.443312', 'step': 4074, 'epoch': 3} {'type': 'loss', 'content': 0.00869242288172245, 'timestamp': '2025-10-01 04:22:08.457347', 'step': 4075, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:08.493624', 'step': 4075, 'epoch': 3} {'type': 'loss', 'content': 0.003010945161804557, 'timestamp': '2025-10-01 04:22:08.522237', 'step': 4076, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:08.560681', 'step': 4076, 'epoch': 3} {'type': 'loss', 'content': 0.005359400063753128, 'timestamp': '2025-10-01 04:22:08.570495', 'step': 4077, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:08.613803', 'step': 4077, 'epoch': 3} {'type': 'loss', 'content': 0.0032695510890334845, 'timestamp': '2025-10-01 04:22:08.626289', 'step': 4078, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:08.662798', 'step': 4078, 'epoch': 3} {'type': 'loss', 'content': 0.0047582704573869705, 'timestamp': '2025-10-01 04:22:08.673632', 'step': 4079, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:08.714261', 'step': 4079, 'epoch': 3} {'type': 'loss', 'content': 0.003358026035130024, 'timestamp': '2025-10-01 04:22:08.747968', 'step': 4080, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:08.788237', 'step': 4080, 'epoch': 3} {'type': 'loss', 'content': 0.012069402262568474, 'timestamp': '2025-10-01 04:22:08.801071', 'step': 4081, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:08.843297', 'step': 4081, 'epoch': 3} {'type': 'loss', 'content': 0.0038407177198678255, 'timestamp': '2025-10-01 04:22:08.856078', 'step': 4082, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:08.899150', 'step': 4082, 'epoch': 3} {'type': 'loss', 'content': 0.006149406544864178, 'timestamp': '2025-10-01 04:22:08.908299', 'step': 4083, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:08.956810', 'step': 4083, 'epoch': 3} {'type': 'loss', 'content': 0.0030754453036934137, 'timestamp': '2025-10-01 04:22:08.991620', 'step': 4084, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:09.030863', 'step': 4084, 'epoch': 3} {'type': 'loss', 'content': 0.011341019533574581, 'timestamp': '2025-10-01 04:22:09.041222', 'step': 4085, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:09.091309', 'step': 4085, 'epoch': 3} {'type': 'loss', 'content': 0.005612129345536232, 'timestamp': '2025-10-01 04:22:09.104869', 'step': 4086, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:09.171297', 'step': 4086, 'epoch': 3} {'type': 'loss', 'content': 0.003415448125451803, 'timestamp': '2025-10-01 04:22:09.185450', 'step': 4087, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:09.229552', 'step': 4087, 'epoch': 3} {'type': 'loss', 'content': 0.00666961632668972, 'timestamp': '2025-10-01 04:22:09.263239', 'step': 4088, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:09.310925', 'step': 4088, 'epoch': 3} {'type': 'loss', 'content': 0.006907173898071051, 'timestamp': '2025-10-01 04:22:09.323834', 'step': 4089, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:09.372095', 'step': 4089, 'epoch': 3} {'type': 'loss', 'content': 0.007343146484345198, 'timestamp': '2025-10-01 04:22:09.382621', 'step': 4090, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:09.427283', 'step': 4090, 'epoch': 3} {'type': 'loss', 'content': 0.010336176492273808, 'timestamp': '2025-10-01 04:22:09.434795', 'step': 4091, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:09.480691', 'step': 4091, 'epoch': 3} {'type': 'loss', 'content': 0.008009220473468304, 'timestamp': '2025-10-01 04:22:09.511113', 'step': 4092, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:09.555566', 'step': 4092, 'epoch': 3} {'type': 'loss', 'content': 0.0048915427178144455, 'timestamp': '2025-10-01 04:22:09.563364', 'step': 4093, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:09.606796', 'step': 4093, 'epoch': 3} {'type': 'loss', 'content': 0.005842886865139008, 'timestamp': '2025-10-01 04:22:09.615244', 'step': 4094, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:22:09.659842', 'step': 4094, 'epoch': 3} {'type': 'loss', 'content': 0.02300436794757843, 'timestamp': '2025-10-01 04:22:09.670638', 'step': 4095, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:09.721593', 'step': 4095, 'epoch': 3} {'type': 'loss', 'content': 0.006876409985125065, 'timestamp': '2025-10-01 04:22:09.749784', 'step': 4096, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:22:09.813378', 'step': 4096, 'epoch': 3} {'type': 'loss', 'content': 0.008503127843141556, 'timestamp': '2025-10-01 04:22:09.830947', 'step': 4097, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:09.874899', 'step': 4097, 'epoch': 3} {'type': 'loss', 'content': 0.004703718237578869, 'timestamp': '2025-10-01 04:22:09.888957', 'step': 4098, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:22:09.934435', 'step': 4098, 'epoch': 3} {'type': 'loss', 'content': 0.013144118711352348, 'timestamp': '2025-10-01 04:22:09.939473', 'step': 4099, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:09.981280', 'step': 4099, 'epoch': 3} {'type': 'loss', 'content': 0.006982049904763699, 'timestamp': '2025-10-01 04:22:10.015325', 'step': 4100, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:10.059283', 'step': 4100, 'epoch': 3} {'type': 'loss', 'content': 0.0060974652878940105, 'timestamp': '2025-10-01 04:22:10.064903', 'step': 4101, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:10.110592', 'step': 4101, 'epoch': 3} {'type': 'loss', 'content': 0.005732825491577387, 'timestamp': '2025-10-01 04:22:10.124632', 'step': 4102, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-10-01 04:22:10.183364', 'step': 4102, 'epoch': 3} {'type': 'loss', 'content': 0.0042778546921908855, 'timestamp': '2025-10-01 04:22:10.201156', 'step': 4103, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:10.249515', 'step': 4103, 'epoch': 3} {'type': 'loss', 'content': 0.008323310874402523, 'timestamp': '2025-10-01 04:22:10.278712', 'step': 4104, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:10.323514', 'step': 4104, 'epoch': 3} {'type': 'loss', 'content': 0.012524610385298729, 'timestamp': '2025-10-01 04:22:10.329340', 'step': 4105, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:10.376430', 'step': 4105, 'epoch': 3} {'type': 'loss', 'content': 0.003543686820194125, 'timestamp': '2025-10-01 04:22:10.384606', 'step': 4106, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:10.433265', 'step': 4106, 'epoch': 3} {'type': 'loss', 'content': 0.007255967706441879, 'timestamp': '2025-10-01 04:22:10.441656', 'step': 4107, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:10.484248', 'step': 4107, 'epoch': 3} {'type': 'loss', 'content': 0.006084497552365065, 'timestamp': '2025-10-01 04:22:10.519157', 'step': 4108, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:10.566538', 'step': 4108, 'epoch': 3} {'type': 'loss', 'content': 0.004911665804684162, 'timestamp': '2025-10-01 04:22:10.572211', 'step': 4109, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:10.620438', 'step': 4109, 'epoch': 3} {'type': 'loss', 'content': 0.00566836865618825, 'timestamp': '2025-10-01 04:22:10.634446', 'step': 4110, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:10.686686', 'step': 4110, 'epoch': 3} {'type': 'loss', 'content': 0.00631115585565567, 'timestamp': '2025-10-01 04:22:10.700271', 'step': 4111, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:10.749591', 'step': 4111, 'epoch': 3} {'type': 'loss', 'content': 0.006609178613871336, 'timestamp': '2025-10-01 04:22:10.784554', 'step': 4112, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:22:10.840795', 'step': 4112, 'epoch': 3} {'type': 'loss', 'content': 0.005407432094216347, 'timestamp': '2025-10-01 04:22:10.857668', 'step': 4113, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:10.906234', 'step': 4113, 'epoch': 3} {'type': 'loss', 'content': 0.0037685553543269634, 'timestamp': '2025-10-01 04:22:10.917090', 'step': 4114, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:10.965980', 'step': 4114, 'epoch': 3} {'type': 'loss', 'content': 0.020283052697777748, 'timestamp': '2025-10-01 04:22:10.974303', 'step': 4115, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:11.023145', 'step': 4115, 'epoch': 3} {'type': 'loss', 'content': 0.006402190309017897, 'timestamp': '2025-10-01 04:22:11.055452', 'step': 4116, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:11.094163', 'step': 4116, 'epoch': 3} {'type': 'loss', 'content': 0.009551385417580605, 'timestamp': '2025-10-01 04:22:11.103095', 'step': 4117, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:22:11.156040', 'step': 4117, 'epoch': 3} {'type': 'loss', 'content': 0.005368416663259268, 'timestamp': '2025-10-01 04:22:11.172148', 'step': 4118, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:11.217739', 'step': 4118, 'epoch': 3} {'type': 'loss', 'content': 0.0076238736510276794, 'timestamp': '2025-10-01 04:22:11.231874', 'step': 4119, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:11.288166', 'step': 4119, 'epoch': 3} {'type': 'loss', 'content': 0.0041812751442193985, 'timestamp': '2025-10-01 04:22:11.323119', 'step': 4120, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:11.371839', 'step': 4120, 'epoch': 3} {'type': 'loss', 'content': 0.003642105031758547, 'timestamp': '2025-10-01 04:22:11.381154', 'step': 4121, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:22:11.435792', 'step': 4121, 'epoch': 3} {'type': 'loss', 'content': 0.006128326058387756, 'timestamp': '2025-10-01 04:22:11.447982', 'step': 4122, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:11.502367', 'step': 4122, 'epoch': 3} {'type': 'loss', 'content': 0.010363324545323849, 'timestamp': '2025-10-01 04:22:11.514993', 'step': 4123, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:11.575485', 'step': 4123, 'epoch': 3} {'type': 'loss', 'content': 0.004702799022197723, 'timestamp': '2025-10-01 04:22:11.608160', 'step': 4124, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:11.666920', 'step': 4124, 'epoch': 3} {'type': 'loss', 'content': 0.0123340655118227, 'timestamp': '2025-10-01 04:22:11.679827', 'step': 4125, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:11.731455', 'step': 4125, 'epoch': 3} {'type': 'loss', 'content': 0.0030719658825546503, 'timestamp': '2025-10-01 04:22:11.739480', 'step': 4126, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:11.792371', 'step': 4126, 'epoch': 3} {'type': 'loss', 'content': 0.005457690451294184, 'timestamp': '2025-10-01 04:22:11.804083', 'step': 4127, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:11.863430', 'step': 4127, 'epoch': 3} {'type': 'loss', 'content': 0.0019527450203895569, 'timestamp': '2025-10-01 04:22:11.896065', 'step': 4128, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:11.964836', 'step': 4128, 'epoch': 3} {'type': 'loss', 'content': 0.009599453769624233, 'timestamp': '2025-10-01 04:22:11.975942', 'step': 4129, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:12.028024', 'step': 4129, 'epoch': 3} {'type': 'loss', 'content': 0.0172463059425354, 'timestamp': '2025-10-01 04:22:12.036272', 'step': 4130, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:12.090912', 'step': 4130, 'epoch': 3} {'type': 'loss', 'content': 0.00862100999802351, 'timestamp': '2025-10-01 04:22:12.103651', 'step': 4131, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:12.160617', 'step': 4131, 'epoch': 3} {'type': 'loss', 'content': 0.0067465766333043575, 'timestamp': '2025-10-01 04:22:12.195592', 'step': 4132, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:12.250877', 'step': 4132, 'epoch': 3} {'type': 'loss', 'content': 0.005544332787394524, 'timestamp': '2025-10-01 04:22:12.264282', 'step': 4133, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:12.320177', 'step': 4133, 'epoch': 3} {'type': 'loss', 'content': 0.00807325728237629, 'timestamp': '2025-10-01 04:22:12.334279', 'step': 4134, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:22:12.397380', 'step': 4134, 'epoch': 3} {'type': 'loss', 'content': 0.0048148054629564285, 'timestamp': '2025-10-01 04:22:12.413892', 'step': 4135, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:12.473343', 'step': 4135, 'epoch': 3} {'type': 'loss', 'content': 0.0065632229670882225, 'timestamp': '2025-10-01 04:22:12.509763', 'step': 4136, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:12.555371', 'step': 4136, 'epoch': 3} {'type': 'loss', 'content': 0.005432798992842436, 'timestamp': '2025-10-01 04:22:12.568208', 'step': 4137, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:22:12.630044', 'step': 4137, 'epoch': 3} {'type': 'loss', 'content': 0.008962566964328289, 'timestamp': '2025-10-01 04:22:12.646223', 'step': 4138, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:12.697618', 'step': 4138, 'epoch': 3} {'type': 'loss', 'content': 0.012327460572123528, 'timestamp': '2025-10-01 04:22:12.708529', 'step': 4139, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:12.760870', 'step': 4139, 'epoch': 3} {'type': 'loss', 'content': 0.001992779318243265, 'timestamp': '2025-10-01 04:22:12.793258', 'step': 4140, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:22:15.585003', 'step': 4140, 'epoch': 3} {'type': 'pplx', 'content': 5.917168531157638, 'timestamp': '2025-10-01 04:22:15.590520', 'step': 4140, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:15.626962', 'step': 4140, 'epoch': 3} {'type': 'loss', 'content': 0.013613116927444935, 'timestamp': '2025-10-01 04:22:15.635284', 'step': 4141, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:15.686554', 'step': 4141, 'epoch': 3} {'type': 'loss', 'content': 0.004260324873030186, 'timestamp': '2025-10-01 04:22:15.700134', 'step': 4142, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:15.754674', 'step': 4142, 'epoch': 3} {'type': 'loss', 'content': 0.0024001775309443474, 'timestamp': '2025-10-01 04:22:15.768864', 'step': 4143, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:15.823559', 'step': 4143, 'epoch': 3} {'type': 'loss', 'content': 0.0071576121263206005, 'timestamp': '2025-10-01 04:22:15.857242', 'step': 4144, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:15.905159', 'step': 4144, 'epoch': 3} {'type': 'loss', 'content': 0.006839216221123934, 'timestamp': '2025-10-01 04:22:15.915561', 'step': 4145, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:15.965233', 'step': 4145, 'epoch': 3} {'type': 'loss', 'content': 0.008451160043478012, 'timestamp': '2025-10-01 04:22:15.977975', 'step': 4146, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:16.024649', 'step': 4146, 'epoch': 3} {'type': 'loss', 'content': 0.0027516642585396767, 'timestamp': '2025-10-01 04:22:16.032886', 'step': 4147, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:22:16.080375', 'step': 4147, 'epoch': 3} {'type': 'loss', 'content': 0.006120120640844107, 'timestamp': '2025-10-01 04:22:16.117266', 'step': 4148, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:22:16.166012', 'step': 4148, 'epoch': 3} {'type': 'loss', 'content': 0.0018730685114860535, 'timestamp': '2025-10-01 04:22:16.181660', 'step': 4149, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:16.241137', 'step': 4149, 'epoch': 3} {'type': 'loss', 'content': 0.006524836644530296, 'timestamp': '2025-10-01 04:22:16.255185', 'step': 4150, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:16.298199', 'step': 4150, 'epoch': 3} {'type': 'loss', 'content': 0.009152116253972054, 'timestamp': '2025-10-01 04:22:16.310973', 'step': 4151, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:16.358138', 'step': 4151, 'epoch': 3} {'type': 'loss', 'content': 0.005474897567182779, 'timestamp': '2025-10-01 04:22:16.393266', 'step': 4152, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:16.437513', 'step': 4152, 'epoch': 3} {'type': 'loss', 'content': 0.002718254690989852, 'timestamp': '2025-10-01 04:22:16.442844', 'step': 4153, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:16.489170', 'step': 4153, 'epoch': 3} {'type': 'loss', 'content': 0.003023600671440363, 'timestamp': '2025-10-01 04:22:16.503090', 'step': 4154, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:16.547675', 'step': 4154, 'epoch': 3} {'type': 'loss', 'content': 0.006308500189334154, 'timestamp': '2025-10-01 04:22:16.555979', 'step': 4155, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:16.603397', 'step': 4155, 'epoch': 3} {'type': 'loss', 'content': 0.00520367594435811, 'timestamp': '2025-10-01 04:22:16.632453', 'step': 4156, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:22:16.694627', 'step': 4156, 'epoch': 3} {'type': 'loss', 'content': 0.005836689844727516, 'timestamp': '2025-10-01 04:22:16.711805', 'step': 4157, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:16.765208', 'step': 4157, 'epoch': 3} {'type': 'loss', 'content': 0.015569687820971012, 'timestamp': '2025-10-01 04:22:16.773689', 'step': 4158, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:16.826765', 'step': 4158, 'epoch': 3} {'type': 'loss', 'content': 0.00576051464304328, 'timestamp': '2025-10-01 04:22:16.834650', 'step': 4159, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:16.890276', 'step': 4159, 'epoch': 3} {'type': 'loss', 'content': 0.004776860121637583, 'timestamp': '2025-10-01 04:22:16.922348', 'step': 4160, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:16.972027', 'step': 4160, 'epoch': 3} {'type': 'loss', 'content': 0.007326411549001932, 'timestamp': '2025-10-01 04:22:16.983256', 'step': 4161, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:22:17.036986', 'step': 4161, 'epoch': 3} {'type': 'loss', 'content': 0.004407929722219706, 'timestamp': '2025-10-01 04:22:17.053323', 'step': 4162, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:17.096360', 'step': 4162, 'epoch': 3} {'type': 'loss', 'content': 0.007858099415898323, 'timestamp': '2025-10-01 04:22:17.108913', 'step': 4163, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:17.156659', 'step': 4163, 'epoch': 3} {'type': 'loss', 'content': 0.008644106797873974, 'timestamp': '2025-10-01 04:22:17.190199', 'step': 4164, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:17.238315', 'step': 4164, 'epoch': 3} {'type': 'loss', 'content': 0.006930166389793158, 'timestamp': '2025-10-01 04:22:17.246778', 'step': 4165, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:17.299533', 'step': 4165, 'epoch': 3} {'type': 'loss', 'content': 0.0038496297784149647, 'timestamp': '2025-10-01 04:22:17.307915', 'step': 4166, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:17.353187', 'step': 4166, 'epoch': 3} {'type': 'loss', 'content': 0.0036294120363891125, 'timestamp': '2025-10-01 04:22:17.361124', 'step': 4167, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:17.403175', 'step': 4167, 'epoch': 3} {'type': 'loss', 'content': 0.00484378170222044, 'timestamp': '2025-10-01 04:22:17.436688', 'step': 4168, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:17.485693', 'step': 4168, 'epoch': 3} {'type': 'loss', 'content': 0.00413028709590435, 'timestamp': '2025-10-01 04:22:17.498543', 'step': 4169, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:17.543105', 'step': 4169, 'epoch': 3} {'type': 'loss', 'content': 0.012814853340387344, 'timestamp': '2025-10-01 04:22:17.555882', 'step': 4170, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:17.600927', 'step': 4170, 'epoch': 3} {'type': 'loss', 'content': 0.005623261444270611, 'timestamp': '2025-10-01 04:22:17.614912', 'step': 4171, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:17.666107', 'step': 4171, 'epoch': 3} {'type': 'loss', 'content': 0.0043925936333835125, 'timestamp': '2025-10-01 04:22:17.698786', 'step': 4172, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:17.754963', 'step': 4172, 'epoch': 3} {'type': 'loss', 'content': 0.0014770793495699763, 'timestamp': '2025-10-01 04:22:17.764186', 'step': 4173, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:17.811052', 'step': 4173, 'epoch': 3} {'type': 'loss', 'content': 0.006645291578024626, 'timestamp': '2025-10-01 04:22:17.822741', 'step': 4174, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:17.872896', 'step': 4174, 'epoch': 3} {'type': 'loss', 'content': 0.007615561131387949, 'timestamp': '2025-10-01 04:22:17.883997', 'step': 4175, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:22:17.942899', 'step': 4175, 'epoch': 3} {'type': 'loss', 'content': 0.007910758256912231, 'timestamp': '2025-10-01 04:22:17.979627', 'step': 4176, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:18.034759', 'step': 4176, 'epoch': 3} {'type': 'loss', 'content': 0.005652237683534622, 'timestamp': '2025-10-01 04:22:18.047630', 'step': 4177, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:18.096478', 'step': 4177, 'epoch': 3} {'type': 'loss', 'content': 0.005612634122371674, 'timestamp': '2025-10-01 04:22:18.110496', 'step': 4178, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:18.161533', 'step': 4178, 'epoch': 3} {'type': 'loss', 'content': 0.0031780777499079704, 'timestamp': '2025-10-01 04:22:18.170015', 'step': 4179, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:18.212365', 'step': 4179, 'epoch': 3} {'type': 'loss', 'content': 0.007611333392560482, 'timestamp': '2025-10-01 04:22:18.244071', 'step': 4180, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:18.292816', 'step': 4180, 'epoch': 3} {'type': 'loss', 'content': 0.005093053914606571, 'timestamp': '2025-10-01 04:22:18.304040', 'step': 4181, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:18.356951', 'step': 4181, 'epoch': 3} {'type': 'loss', 'content': 0.003347428049892187, 'timestamp': '2025-10-01 04:22:18.371029', 'step': 4182, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:18.420774', 'step': 4182, 'epoch': 3} {'type': 'loss', 'content': 0.003900351235643029, 'timestamp': '2025-10-01 04:22:18.434780', 'step': 4183, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:18.488832', 'step': 4183, 'epoch': 3} {'type': 'loss', 'content': 0.002436827402561903, 'timestamp': '2025-10-01 04:22:18.523863', 'step': 4184, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:18.580389', 'step': 4184, 'epoch': 3} {'type': 'loss', 'content': 0.004921661224216223, 'timestamp': '2025-10-01 04:22:18.593832', 'step': 4185, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:18.645020', 'step': 4185, 'epoch': 3} {'type': 'loss', 'content': 0.011028957553207874, 'timestamp': '2025-10-01 04:22:18.657582', 'step': 4186, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:18.707446', 'step': 4186, 'epoch': 3} {'type': 'loss', 'content': 0.0048931739293038845, 'timestamp': '2025-10-01 04:22:18.715839', 'step': 4187, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:18.758339', 'step': 4187, 'epoch': 3} {'type': 'loss', 'content': 0.007155683822929859, 'timestamp': '2025-10-01 04:22:18.791861', 'step': 4188, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:18.834187', 'step': 4188, 'epoch': 3} {'type': 'loss', 'content': 0.006311376579105854, 'timestamp': '2025-10-01 04:22:18.843335', 'step': 4189, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:18.885864', 'step': 4189, 'epoch': 3} {'type': 'loss', 'content': 0.00446103373542428, 'timestamp': '2025-10-01 04:22:18.894264', 'step': 4190, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:22:18.944647', 'step': 4190, 'epoch': 3} {'type': 'loss', 'content': 0.0027842437848448753, 'timestamp': '2025-10-01 04:22:18.960424', 'step': 4191, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:19.004955', 'step': 4191, 'epoch': 3} {'type': 'loss', 'content': 0.005400232505053282, 'timestamp': '2025-10-01 04:22:19.034315', 'step': 4192, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:19.088640', 'step': 4192, 'epoch': 3} {'type': 'loss', 'content': 0.004840423818677664, 'timestamp': '2025-10-01 04:22:19.101530', 'step': 4193, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:19.154795', 'step': 4193, 'epoch': 3} {'type': 'loss', 'content': 0.007588986307382584, 'timestamp': '2025-10-01 04:22:19.166280', 'step': 4194, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:19.229674', 'step': 4194, 'epoch': 3} {'type': 'loss', 'content': 0.002714746166020632, 'timestamp': '2025-10-01 04:22:19.243778', 'step': 4195, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:19.293258', 'step': 4195, 'epoch': 3} {'type': 'loss', 'content': 0.0018153444398194551, 'timestamp': '2025-10-01 04:22:19.327862', 'step': 4196, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:19.379124', 'step': 4196, 'epoch': 3} {'type': 'loss', 'content': 0.0020221699960529804, 'timestamp': '2025-10-01 04:22:19.390374', 'step': 4197, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:19.439522', 'step': 4197, 'epoch': 3} {'type': 'loss', 'content': 0.0014591705985367298, 'timestamp': '2025-10-01 04:22:19.452025', 'step': 4198, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:22:19.503796', 'step': 4198, 'epoch': 3} {'type': 'loss', 'content': 0.005283168982714415, 'timestamp': '2025-10-01 04:22:19.519910', 'step': 4199, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:19.569625', 'step': 4199, 'epoch': 3} {'type': 'loss', 'content': 0.006497543305158615, 'timestamp': '2025-10-01 04:22:19.604746', 'step': 4200, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:19.652713', 'step': 4200, 'epoch': 3} {'type': 'loss', 'content': 0.001634647836908698, 'timestamp': '2025-10-01 04:22:19.661148', 'step': 4201, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:19.715617', 'step': 4201, 'epoch': 3} {'type': 'loss', 'content': 0.003639001166447997, 'timestamp': '2025-10-01 04:22:19.729665', 'step': 4202, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:19.773146', 'step': 4202, 'epoch': 3} {'type': 'loss', 'content': 0.005222144536674023, 'timestamp': '2025-10-01 04:22:19.785672', 'step': 4203, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:19.823113', 'step': 4203, 'epoch': 3} {'type': 'loss', 'content': 0.006335406098514795, 'timestamp': '2025-10-01 04:22:19.855503', 'step': 4204, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:19.891142', 'step': 4204, 'epoch': 3} {'type': 'loss', 'content': 0.005709010176360607, 'timestamp': '2025-10-01 04:22:19.898522', 'step': 4205, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:19.938140', 'step': 4205, 'epoch': 3} {'type': 'loss', 'content': 0.005102238617837429, 'timestamp': '2025-10-01 04:22:19.949108', 'step': 4206, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:19.988352', 'step': 4206, 'epoch': 3} {'type': 'loss', 'content': 0.005280550103634596, 'timestamp': '2025-10-01 04:22:19.999879', 'step': 4207, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:20.049480', 'step': 4207, 'epoch': 3} {'type': 'loss', 'content': 0.0042456346563994884, 'timestamp': '2025-10-01 04:22:20.078937', 'step': 4208, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:20.134885', 'step': 4208, 'epoch': 3} {'type': 'loss', 'content': 0.013480653055012226, 'timestamp': '2025-10-01 04:22:20.142910', 'step': 4209, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:20.188384', 'step': 4209, 'epoch': 3} {'type': 'loss', 'content': 0.012360784225165844, 'timestamp': '2025-10-01 04:22:20.195990', 'step': 4210, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:20.241845', 'step': 4210, 'epoch': 3} {'type': 'loss', 'content': 0.006369257345795631, 'timestamp': '2025-10-01 04:22:20.255745', 'step': 4211, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:20.309229', 'step': 4211, 'epoch': 3} {'type': 'loss', 'content': 0.003228358458727598, 'timestamp': '2025-10-01 04:22:20.338788', 'step': 4212, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:20.384926', 'step': 4212, 'epoch': 3} {'type': 'loss', 'content': 0.0021818773820996284, 'timestamp': '2025-10-01 04:22:20.398434', 'step': 4213, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:20.447272', 'step': 4213, 'epoch': 3} {'type': 'loss', 'content': 0.003697713604196906, 'timestamp': '2025-10-01 04:22:20.460062', 'step': 4214, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:20.501928', 'step': 4214, 'epoch': 3} {'type': 'loss', 'content': 0.008032994344830513, 'timestamp': '2025-10-01 04:22:20.510266', 'step': 4215, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:20.551474', 'step': 4215, 'epoch': 3} {'type': 'loss', 'content': 0.012107309885323048, 'timestamp': '2025-10-01 04:22:20.584966', 'step': 4216, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:20.632783', 'step': 4216, 'epoch': 3} {'type': 'loss', 'content': 0.006143786944448948, 'timestamp': '2025-10-01 04:22:20.646303', 'step': 4217, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:20.694107', 'step': 4217, 'epoch': 3} {'type': 'loss', 'content': 0.0028584797400981188, 'timestamp': '2025-10-01 04:22:20.706873', 'step': 4218, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:20.754603', 'step': 4218, 'epoch': 3} {'type': 'loss', 'content': 0.001885659876279533, 'timestamp': '2025-10-01 04:22:20.768668', 'step': 4219, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:20.812558', 'step': 4219, 'epoch': 3} {'type': 'loss', 'content': 0.002386491047218442, 'timestamp': '2025-10-01 04:22:20.841464', 'step': 4220, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:20.905226', 'step': 4220, 'epoch': 3} {'type': 'loss', 'content': 0.00872388668358326, 'timestamp': '2025-10-01 04:22:20.911092', 'step': 4221, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:20.954897', 'step': 4221, 'epoch': 3} {'type': 'loss', 'content': 0.00853246171027422, 'timestamp': '2025-10-01 04:22:20.963340', 'step': 4222, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:21.007582', 'step': 4222, 'epoch': 3} {'type': 'loss', 'content': 0.0033285755198448896, 'timestamp': '2025-10-01 04:22:21.021203', 'step': 4223, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:21.064385', 'step': 4223, 'epoch': 3} {'type': 'loss', 'content': 0.0019786865450441837, 'timestamp': '2025-10-01 04:22:21.093374', 'step': 4224, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:21.135628', 'step': 4224, 'epoch': 3} {'type': 'loss', 'content': 0.004117278847843409, 'timestamp': '2025-10-01 04:22:21.140972', 'step': 4225, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:21.187746', 'step': 4225, 'epoch': 3} {'type': 'loss', 'content': 0.005257939454168081, 'timestamp': '2025-10-01 04:22:21.195795', 'step': 4226, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:21.245612', 'step': 4226, 'epoch': 3} {'type': 'loss', 'content': 0.006142887752503157, 'timestamp': '2025-10-01 04:22:21.259203', 'step': 4227, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:21.301093', 'step': 4227, 'epoch': 3} {'type': 'loss', 'content': 0.004112836439162493, 'timestamp': '2025-10-01 04:22:21.330538', 'step': 4228, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:21.373602', 'step': 4228, 'epoch': 3} {'type': 'loss', 'content': 0.0024964436888694763, 'timestamp': '2025-10-01 04:22:21.386925', 'step': 4229, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:21.432549', 'step': 4229, 'epoch': 3} {'type': 'loss', 'content': 0.005816625896841288, 'timestamp': '2025-10-01 04:22:21.446598', 'step': 4230, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:21.495984', 'step': 4230, 'epoch': 3} {'type': 'loss', 'content': 0.014218274503946304, 'timestamp': '2025-10-01 04:22:21.509537', 'step': 4231, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:21.568879', 'step': 4231, 'epoch': 3} {'type': 'loss', 'content': 0.004670611582696438, 'timestamp': '2025-10-01 04:22:21.603868', 'step': 4232, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:21.652318', 'step': 4232, 'epoch': 3} {'type': 'loss', 'content': 0.005887454375624657, 'timestamp': '2025-10-01 04:22:21.663014', 'step': 4233, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:21.709481', 'step': 4233, 'epoch': 3} {'type': 'loss', 'content': 0.004608998075127602, 'timestamp': '2025-10-01 04:22:21.721267', 'step': 4234, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:21.763810', 'step': 4234, 'epoch': 3} {'type': 'loss', 'content': 0.003439784748479724, 'timestamp': '2025-10-01 04:22:21.776572', 'step': 4235, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:21.819255', 'step': 4235, 'epoch': 3} {'type': 'loss', 'content': 0.004998174961656332, 'timestamp': '2025-10-01 04:22:21.852971', 'step': 4236, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:21.897788', 'step': 4236, 'epoch': 3} {'type': 'loss', 'content': 0.006165375467389822, 'timestamp': '2025-10-01 04:22:21.906295', 'step': 4237, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:21.950607', 'step': 4237, 'epoch': 3} {'type': 'loss', 'content': 0.006723262369632721, 'timestamp': '2025-10-01 04:22:21.959074', 'step': 4238, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:22.000467', 'step': 4238, 'epoch': 3} {'type': 'loss', 'content': 0.004797711968421936, 'timestamp': '2025-10-01 04:22:22.007799', 'step': 4239, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:22.054795', 'step': 4239, 'epoch': 3} {'type': 'loss', 'content': 0.005106324329972267, 'timestamp': '2025-10-01 04:22:22.084064', 'step': 4240, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:22.120254', 'step': 4240, 'epoch': 3} {'type': 'loss', 'content': 0.0054281544871628284, 'timestamp': '2025-10-01 04:22:22.134343', 'step': 4241, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:22.176037', 'step': 4241, 'epoch': 3} {'type': 'loss', 'content': 0.007330107036978006, 'timestamp': '2025-10-01 04:22:22.183608', 'step': 4242, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:22:22.231001', 'step': 4242, 'epoch': 3} {'type': 'loss', 'content': 0.005962714087218046, 'timestamp': '2025-10-01 04:22:22.246865', 'step': 4243, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:22.297290', 'step': 4243, 'epoch': 3} {'type': 'loss', 'content': 0.002431190339848399, 'timestamp': '2025-10-01 04:22:22.332171', 'step': 4244, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:22.393114', 'step': 4244, 'epoch': 3} {'type': 'loss', 'content': 0.0026457698550075293, 'timestamp': '2025-10-01 04:22:22.399038', 'step': 4245, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:22:22.431062', 'step': 4245, 'epoch': 3} {'type': 'loss', 'content': 0.01150007825344801, 'timestamp': '2025-10-01 04:22:22.437020', 'step': 4246, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:22.487499', 'step': 4246, 'epoch': 3} {'type': 'loss', 'content': 0.004941039253026247, 'timestamp': '2025-10-01 04:22:22.495456', 'step': 4247, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:22.544823', 'step': 4247, 'epoch': 3} {'type': 'loss', 'content': 0.006880153901875019, 'timestamp': '2025-10-01 04:22:22.578465', 'step': 4248, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:22.615921', 'step': 4248, 'epoch': 3} {'type': 'loss', 'content': 0.0026835862081497908, 'timestamp': '2025-10-01 04:22:22.621736', 'step': 4249, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:22.665467', 'step': 4249, 'epoch': 3} {'type': 'loss', 'content': 0.007213926874101162, 'timestamp': '2025-10-01 04:22:22.678057', 'step': 4250, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:22.716111', 'step': 4250, 'epoch': 3} {'type': 'loss', 'content': 0.0023390422575175762, 'timestamp': '2025-10-01 04:22:22.724556', 'step': 4251, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:22.763535', 'step': 4251, 'epoch': 3} {'type': 'loss', 'content': 0.00620240205898881, 'timestamp': '2025-10-01 04:22:22.796885', 'step': 4252, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:22.834136', 'step': 4252, 'epoch': 3} {'type': 'loss', 'content': 0.00990772433578968, 'timestamp': '2025-10-01 04:22:22.841564', 'step': 4253, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:22.883973', 'step': 4253, 'epoch': 3} {'type': 'loss', 'content': 0.007946731522679329, 'timestamp': '2025-10-01 04:22:22.894997', 'step': 4254, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:22.937657', 'step': 4254, 'epoch': 3} {'type': 'loss', 'content': 0.005293496884405613, 'timestamp': '2025-10-01 04:22:22.951657', 'step': 4255, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:22:25.841388', 'step': 4255, 'epoch': 3} {'type': 'pplx', 'content': 5.990666905837665, 'timestamp': '2025-10-01 04:22:25.848439', 'step': 4255, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:22:25.890374', 'step': 4255, 'epoch': 3} {'type': 'loss', 'content': 0.0008204663172364235, 'timestamp': '2025-10-01 04:22:25.914988', 'step': 4256, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:25.960432', 'step': 4256, 'epoch': 3} {'type': 'loss', 'content': 0.0030027623288333416, 'timestamp': '2025-10-01 04:22:25.968454', 'step': 4257, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:26.016793', 'step': 4257, 'epoch': 3} {'type': 'loss', 'content': 0.00993004534393549, 'timestamp': '2025-10-01 04:22:26.025022', 'step': 4258, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:26.073447', 'step': 4258, 'epoch': 3} {'type': 'loss', 'content': 0.0077980696223676205, 'timestamp': '2025-10-01 04:22:26.087462', 'step': 4259, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:26.138604', 'step': 4259, 'epoch': 3} {'type': 'loss', 'content': 0.002822862472385168, 'timestamp': '2025-10-01 04:22:26.173590', 'step': 4260, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:26.222766', 'step': 4260, 'epoch': 3} {'type': 'loss', 'content': 0.002723793964833021, 'timestamp': '2025-10-01 04:22:26.235633', 'step': 4261, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:26.299639', 'step': 4261, 'epoch': 3} {'type': 'loss', 'content': 0.0028680087998509407, 'timestamp': '2025-10-01 04:22:26.307957', 'step': 4262, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:26.357982', 'step': 4262, 'epoch': 3} {'type': 'loss', 'content': 0.0019245354924350977, 'timestamp': '2025-10-01 04:22:26.365881', 'step': 4263, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:26.415614', 'step': 4263, 'epoch': 3} {'type': 'loss', 'content': 0.0027670483104884624, 'timestamp': '2025-10-01 04:22:26.448017', 'step': 4264, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:22:26.498832', 'step': 4264, 'epoch': 3} {'type': 'loss', 'content': 0.005966589320451021, 'timestamp': '2025-10-01 04:22:26.514842', 'step': 4265, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:26.559407', 'step': 4265, 'epoch': 3} {'type': 'loss', 'content': 0.007864401675760746, 'timestamp': '2025-10-01 04:22:26.571025', 'step': 4266, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:26.619375', 'step': 4266, 'epoch': 3} {'type': 'loss', 'content': 0.003821858437731862, 'timestamp': '2025-10-01 04:22:26.631832', 'step': 4267, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:26.683915', 'step': 4267, 'epoch': 3} {'type': 'loss', 'content': 0.0038813194260001183, 'timestamp': '2025-10-01 04:22:26.713078', 'step': 4268, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 17085996872448}, 'timestamp': '2025-10-01 04:22:26.773027', 'step': 4268, 'epoch': 3} {'type': 'loss', 'content': 0.0034527545794844627, 'timestamp': '2025-10-01 04:22:26.792427', 'step': 4269, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:26.843698', 'step': 4269, 'epoch': 3} {'type': 'loss', 'content': 0.003933602478355169, 'timestamp': '2025-10-01 04:22:26.857801', 'step': 4270, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:26.904334', 'step': 4270, 'epoch': 3} {'type': 'loss', 'content': 0.010453340597450733, 'timestamp': '2025-10-01 04:22:26.915004', 'step': 4271, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:26.961603', 'step': 4271, 'epoch': 3} {'type': 'loss', 'content': 0.005423793103545904, 'timestamp': '2025-10-01 04:22:26.993420', 'step': 4272, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:27.038127', 'step': 4272, 'epoch': 3} {'type': 'loss', 'content': 0.008656603284180164, 'timestamp': '2025-10-01 04:22:27.049266', 'step': 4273, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:27.102386', 'step': 4273, 'epoch': 3} {'type': 'loss', 'content': 0.0009675725013948977, 'timestamp': '2025-10-01 04:22:27.116455', 'step': 4274, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:27.171145', 'step': 4274, 'epoch': 3} {'type': 'loss', 'content': 0.0030769521836191416, 'timestamp': '2025-10-01 04:22:27.185146', 'step': 4275, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:27.234600', 'step': 4275, 'epoch': 3} {'type': 'loss', 'content': 0.0096583915874362, 'timestamp': '2025-10-01 04:22:27.265802', 'step': 4276, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:27.313806', 'step': 4276, 'epoch': 3} {'type': 'loss', 'content': 0.0017127582104876637, 'timestamp': '2025-10-01 04:22:27.322975', 'step': 4277, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:27.367796', 'step': 4277, 'epoch': 3} {'type': 'loss', 'content': 0.00716383894905448, 'timestamp': '2025-10-01 04:22:27.379327', 'step': 4278, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:27.424983', 'step': 4278, 'epoch': 3} {'type': 'loss', 'content': 0.008541286922991276, 'timestamp': '2025-10-01 04:22:27.436508', 'step': 4279, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:27.479706', 'step': 4279, 'epoch': 3} {'type': 'loss', 'content': 0.005353893153369427, 'timestamp': '2025-10-01 04:22:27.511446', 'step': 4280, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:22:27.561451', 'step': 4280, 'epoch': 3} {'type': 'loss', 'content': 0.004160082433372736, 'timestamp': '2025-10-01 04:22:27.578356', 'step': 4281, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:27.620930', 'step': 4281, 'epoch': 3} {'type': 'loss', 'content': 0.0031805182807147503, 'timestamp': '2025-10-01 04:22:27.628413', 'step': 4282, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:27.675391', 'step': 4282, 'epoch': 3} {'type': 'loss', 'content': 0.005822019185870886, 'timestamp': '2025-10-01 04:22:27.686156', 'step': 4283, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:27.738104', 'step': 4283, 'epoch': 3} {'type': 'loss', 'content': 0.006066902540624142, 'timestamp': '2025-10-01 04:22:27.767954', 'step': 4284, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:27.813791', 'step': 4284, 'epoch': 3} {'type': 'loss', 'content': 0.001116434345021844, 'timestamp': '2025-10-01 04:22:27.825737', 'step': 4285, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:27.875915', 'step': 4285, 'epoch': 3} {'type': 'loss', 'content': 0.008018722757697105, 'timestamp': '2025-10-01 04:22:27.887341', 'step': 4286, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:27.938719', 'step': 4286, 'epoch': 3} {'type': 'loss', 'content': 0.00853513553738594, 'timestamp': '2025-10-01 04:22:27.952227', 'step': 4287, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:27.999804', 'step': 4287, 'epoch': 3} {'type': 'loss', 'content': 0.005777149926871061, 'timestamp': '2025-10-01 04:22:28.031084', 'step': 4288, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:28.072639', 'step': 4288, 'epoch': 3} {'type': 'loss', 'content': 0.01096270140260458, 'timestamp': '2025-10-01 04:22:28.082667', 'step': 4289, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:28.128800', 'step': 4289, 'epoch': 3} {'type': 'loss', 'content': 0.0048272027634084225, 'timestamp': '2025-10-01 04:22:28.136382', 'step': 4290, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:28.184684', 'step': 4290, 'epoch': 3} {'type': 'loss', 'content': 0.005774823948740959, 'timestamp': '2025-10-01 04:22:28.192621', 'step': 4291, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:28.238041', 'step': 4291, 'epoch': 3} {'type': 'loss', 'content': 0.004754411056637764, 'timestamp': '2025-10-01 04:22:28.270502', 'step': 4292, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:28.311037', 'step': 4292, 'epoch': 3} {'type': 'loss', 'content': 0.0038338645827025175, 'timestamp': '2025-10-01 04:22:28.316793', 'step': 4293, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:28.354226', 'step': 4293, 'epoch': 3} {'type': 'loss', 'content': 0.002847907366231084, 'timestamp': '2025-10-01 04:22:28.365124', 'step': 4294, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:28.401681', 'step': 4294, 'epoch': 3} {'type': 'loss', 'content': 0.003644203068688512, 'timestamp': '2025-10-01 04:22:28.415666', 'step': 4295, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:28.453658', 'step': 4295, 'epoch': 3} {'type': 'loss', 'content': 0.005938881076872349, 'timestamp': '2025-10-01 04:22:28.482140', 'step': 4296, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:28.518982', 'step': 4296, 'epoch': 3} {'type': 'loss', 'content': 0.00693379295989871, 'timestamp': '2025-10-01 04:22:28.528052', 'step': 4297, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:28.573049', 'step': 4297, 'epoch': 3} {'type': 'loss', 'content': 0.011410829611122608, 'timestamp': '2025-10-01 04:22:28.583527', 'step': 4298, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:28.637623', 'step': 4298, 'epoch': 3} {'type': 'loss', 'content': 0.002445804886519909, 'timestamp': '2025-10-01 04:22:28.649039', 'step': 4299, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:28.705355', 'step': 4299, 'epoch': 3} {'type': 'loss', 'content': 0.008198847994208336, 'timestamp': '2025-10-01 04:22:28.737914', 'step': 4300, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:28.781985', 'step': 4300, 'epoch': 3} {'type': 'loss', 'content': 0.0010678486432880163, 'timestamp': '2025-10-01 04:22:28.787306', 'step': 4301, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:28.827479', 'step': 4301, 'epoch': 3} {'type': 'loss', 'content': 0.007874487899243832, 'timestamp': '2025-10-01 04:22:28.839026', 'step': 4302, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:28.880804', 'step': 4302, 'epoch': 3} {'type': 'loss', 'content': 0.0010819978779181838, 'timestamp': '2025-10-01 04:22:28.892383', 'step': 4303, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:28.931937', 'step': 4303, 'epoch': 3} {'type': 'loss', 'content': 0.008256263099610806, 'timestamp': '2025-10-01 04:22:28.965642', 'step': 4304, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:29.007768', 'step': 4304, 'epoch': 3} {'type': 'loss', 'content': 0.0018726121634244919, 'timestamp': '2025-10-01 04:22:29.016101', 'step': 4305, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:29.065128', 'step': 4305, 'epoch': 3} {'type': 'loss', 'content': 0.006179199554026127, 'timestamp': '2025-10-01 04:22:29.075992', 'step': 4306, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:29.119068', 'step': 4306, 'epoch': 3} {'type': 'loss', 'content': 0.0038736695423722267, 'timestamp': '2025-10-01 04:22:29.130618', 'step': 4307, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:29.178639', 'step': 4307, 'epoch': 3} {'type': 'loss', 'content': 0.0039551216177642345, 'timestamp': '2025-10-01 04:22:29.207943', 'step': 4308, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:29.260685', 'step': 4308, 'epoch': 3} {'type': 'loss', 'content': 0.004531411919742823, 'timestamp': '2025-10-01 04:22:29.274018', 'step': 4309, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:22:29.333097', 'step': 4309, 'epoch': 3} {'type': 'loss', 'content': 0.005633550696074963, 'timestamp': '2025-10-01 04:22:29.350560', 'step': 4310, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:29.398400', 'step': 4310, 'epoch': 3} {'type': 'loss', 'content': 0.00610339967533946, 'timestamp': '2025-10-01 04:22:29.412694', 'step': 4311, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:29.466475', 'step': 4311, 'epoch': 3} {'type': 'loss', 'content': 0.008874920196831226, 'timestamp': '2025-10-01 04:22:29.501400', 'step': 4312, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:29.556997', 'step': 4312, 'epoch': 3} {'type': 'loss', 'content': 0.006323575507849455, 'timestamp': '2025-10-01 04:22:29.565276', 'step': 4313, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:29.630560', 'step': 4313, 'epoch': 3} {'type': 'loss', 'content': 0.007020341698080301, 'timestamp': '2025-10-01 04:22:29.644600', 'step': 4314, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:22:29.720073', 'step': 4314, 'epoch': 3} {'type': 'loss', 'content': 0.001439164043404162, 'timestamp': '2025-10-01 04:22:29.737374', 'step': 4315, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:29.795054', 'step': 4315, 'epoch': 3} {'type': 'loss', 'content': 0.006692590191960335, 'timestamp': '2025-10-01 04:22:29.829940', 'step': 4316, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:29.887044', 'step': 4316, 'epoch': 3} {'type': 'loss', 'content': 0.005388326942920685, 'timestamp': '2025-10-01 04:22:29.900591', 'step': 4317, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:22:29.957764', 'step': 4317, 'epoch': 3} {'type': 'loss', 'content': 0.003564318409189582, 'timestamp': '2025-10-01 04:22:29.974307', 'step': 4318, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:22:30.036572', 'step': 4318, 'epoch': 3} {'type': 'loss', 'content': 0.002061659935861826, 'timestamp': '2025-10-01 04:22:30.054057', 'step': 4319, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:30.111615', 'step': 4319, 'epoch': 3} {'type': 'loss', 'content': 0.004964157473295927, 'timestamp': '2025-10-01 04:22:30.146823', 'step': 4320, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:22:30.198108', 'step': 4320, 'epoch': 3} {'type': 'loss', 'content': 0.0037471724208444357, 'timestamp': '2025-10-01 04:22:30.214134', 'step': 4321, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:30.262727', 'step': 4321, 'epoch': 3} {'type': 'loss', 'content': 0.021193915978074074, 'timestamp': '2025-10-01 04:22:30.270868', 'step': 4322, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:30.315738', 'step': 4322, 'epoch': 3} {'type': 'loss', 'content': 0.001246259082108736, 'timestamp': '2025-10-01 04:22:30.323911', 'step': 4323, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:30.366860', 'step': 4323, 'epoch': 3} {'type': 'loss', 'content': 0.005574243143200874, 'timestamp': '2025-10-01 04:22:30.405272', 'step': 4324, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:30.447791', 'step': 4324, 'epoch': 3} {'type': 'loss', 'content': 0.003588765161111951, 'timestamp': '2025-10-01 04:22:30.458158', 'step': 4325, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:30.501168', 'step': 4325, 'epoch': 3} {'type': 'loss', 'content': 0.003143772715702653, 'timestamp': '2025-10-01 04:22:30.513757', 'step': 4326, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:30.563682', 'step': 4326, 'epoch': 3} {'type': 'loss', 'content': 0.004772980231791735, 'timestamp': '2025-10-01 04:22:30.576441', 'step': 4327, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:30.618285', 'step': 4327, 'epoch': 3} {'type': 'loss', 'content': 0.002133280737325549, 'timestamp': '2025-10-01 04:22:30.649975', 'step': 4328, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:30.689512', 'step': 4328, 'epoch': 3} {'type': 'loss', 'content': 0.004378051962703466, 'timestamp': '2025-10-01 04:22:30.702381', 'step': 4329, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:30.753461', 'step': 4329, 'epoch': 3} {'type': 'loss', 'content': 0.0062160007655620575, 'timestamp': '2025-10-01 04:22:30.767043', 'step': 4330, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:30.811146', 'step': 4330, 'epoch': 3} {'type': 'loss', 'content': 0.004291733261197805, 'timestamp': '2025-10-01 04:22:30.819181', 'step': 4331, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:30.870044', 'step': 4331, 'epoch': 3} {'type': 'loss', 'content': 0.003975988365709782, 'timestamp': '2025-10-01 04:22:30.898247', 'step': 4332, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:30.937631', 'step': 4332, 'epoch': 3} {'type': 'loss', 'content': 0.0043129874393343925, 'timestamp': '2025-10-01 04:22:30.946689', 'step': 4333, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:30.996802', 'step': 4333, 'epoch': 3} {'type': 'loss', 'content': 0.0029328016098588705, 'timestamp': '2025-10-01 04:22:31.009298', 'step': 4334, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:31.052318', 'step': 4334, 'epoch': 3} {'type': 'loss', 'content': 0.004240902606397867, 'timestamp': '2025-10-01 04:22:31.060107', 'step': 4335, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:31.103207', 'step': 4335, 'epoch': 3} {'type': 'loss', 'content': 0.002475401619449258, 'timestamp': '2025-10-01 04:22:31.136692', 'step': 4336, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:31.177267', 'step': 4336, 'epoch': 3} {'type': 'loss', 'content': 0.004595813807100058, 'timestamp': '2025-10-01 04:22:31.184083', 'step': 4337, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:31.228350', 'step': 4337, 'epoch': 3} {'type': 'loss', 'content': 0.0010594056220725179, 'timestamp': '2025-10-01 04:22:31.239306', 'step': 4338, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:31.281619', 'step': 4338, 'epoch': 3} {'type': 'loss', 'content': 0.004363698419183493, 'timestamp': '2025-10-01 04:22:31.289038', 'step': 4339, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:31.331816', 'step': 4339, 'epoch': 3} {'type': 'loss', 'content': 0.004368562251329422, 'timestamp': '2025-10-01 04:22:31.363632', 'step': 4340, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:31.401007', 'step': 4340, 'epoch': 3} {'type': 'loss', 'content': 0.0018032328225672245, 'timestamp': '2025-10-01 04:22:31.409451', 'step': 4341, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:31.447822', 'step': 4341, 'epoch': 3} {'type': 'loss', 'content': 0.0075434972532093525, 'timestamp': '2025-10-01 04:22:31.460553', 'step': 4342, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:31.505087', 'step': 4342, 'epoch': 3} {'type': 'loss', 'content': 0.004310798365622759, 'timestamp': '2025-10-01 04:22:31.519336', 'step': 4343, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:31.567723', 'step': 4343, 'epoch': 3} {'type': 'loss', 'content': 0.0032361438497900963, 'timestamp': '2025-10-01 04:22:31.601867', 'step': 4344, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:31.644462', 'step': 4344, 'epoch': 3} {'type': 'loss', 'content': 0.005337737966328859, 'timestamp': '2025-10-01 04:22:31.650098', 'step': 4345, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:31.706699', 'step': 4345, 'epoch': 3} {'type': 'loss', 'content': 0.0032337354496121407, 'timestamp': '2025-10-01 04:22:31.716123', 'step': 4346, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:31.769462', 'step': 4346, 'epoch': 3} {'type': 'loss', 'content': 0.0071181911043822765, 'timestamp': '2025-10-01 04:22:31.780498', 'step': 4347, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:31.827314', 'step': 4347, 'epoch': 3} {'type': 'loss', 'content': 0.017937885597348213, 'timestamp': '2025-10-01 04:22:31.860441', 'step': 4348, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:31.896684', 'step': 4348, 'epoch': 3} {'type': 'loss', 'content': 0.005546420346945524, 'timestamp': '2025-10-01 04:22:31.905149', 'step': 4349, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:31.953952', 'step': 4349, 'epoch': 3} {'type': 'loss', 'content': 0.003178501036018133, 'timestamp': '2025-10-01 04:22:31.966678', 'step': 4350, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:22:32.006663', 'step': 4350, 'epoch': 3} {'type': 'loss', 'content': 0.001436619320884347, 'timestamp': '2025-10-01 04:22:32.017890', 'step': 4351, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:32.066800', 'step': 4351, 'epoch': 3} {'type': 'loss', 'content': 0.0025395690463483334, 'timestamp': '2025-10-01 04:22:32.102307', 'step': 4352, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:32.150892', 'step': 4352, 'epoch': 3} {'type': 'loss', 'content': 0.003889989573508501, 'timestamp': '2025-10-01 04:22:32.161346', 'step': 4353, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:32.206479', 'step': 4353, 'epoch': 3} {'type': 'loss', 'content': 0.0038142246194183826, 'timestamp': '2025-10-01 04:22:32.217569', 'step': 4354, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:32.258269', 'step': 4354, 'epoch': 3} {'type': 'loss', 'content': 0.003159706946462393, 'timestamp': '2025-10-01 04:22:32.271853', 'step': 4355, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:32.316754', 'step': 4355, 'epoch': 3} {'type': 'loss', 'content': 0.005124642513692379, 'timestamp': '2025-10-01 04:22:32.346070', 'step': 4356, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:32.380914', 'step': 4356, 'epoch': 3} {'type': 'loss', 'content': 0.00405614310875535, 'timestamp': '2025-10-01 04:22:32.389482', 'step': 4357, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:22:32.438684', 'step': 4357, 'epoch': 3} {'type': 'loss', 'content': 0.00383318355306983, 'timestamp': '2025-10-01 04:22:32.454765', 'step': 4358, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:32.501254', 'step': 4358, 'epoch': 3} {'type': 'loss', 'content': 0.001238912926055491, 'timestamp': '2025-10-01 04:22:32.514819', 'step': 4359, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:32.569795', 'step': 4359, 'epoch': 3} {'type': 'loss', 'content': 0.006316446233540773, 'timestamp': '2025-10-01 04:22:32.604799', 'step': 4360, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:32.661906', 'step': 4360, 'epoch': 3} {'type': 'loss', 'content': 0.0040861270390450954, 'timestamp': '2025-10-01 04:22:32.675231', 'step': 4361, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:32.722973', 'step': 4361, 'epoch': 3} {'type': 'loss', 'content': 0.0028759948909282684, 'timestamp': '2025-10-01 04:22:32.735524', 'step': 4362, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:32.784525', 'step': 4362, 'epoch': 3} {'type': 'loss', 'content': 0.00490927416831255, 'timestamp': '2025-10-01 04:22:32.792810', 'step': 4363, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:32.839426', 'step': 4363, 'epoch': 3} {'type': 'loss', 'content': 0.003976278938353062, 'timestamp': '2025-10-01 04:22:32.874383', 'step': 4364, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:32.920731', 'step': 4364, 'epoch': 3} {'type': 'loss', 'content': 0.01000644639134407, 'timestamp': '2025-10-01 04:22:32.933605', 'step': 4365, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:32.985811', 'step': 4365, 'epoch': 3} {'type': 'loss', 'content': 0.0038587814196944237, 'timestamp': '2025-10-01 04:22:32.999753', 'step': 4366, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:33.049435', 'step': 4366, 'epoch': 3} {'type': 'loss', 'content': 0.0035041647497564554, 'timestamp': '2025-10-01 04:22:33.056889', 'step': 4367, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:33.108222', 'step': 4367, 'epoch': 3} {'type': 'loss', 'content': 0.002340652048587799, 'timestamp': '2025-10-01 04:22:33.137267', 'step': 4368, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:22:33.181286', 'step': 4368, 'epoch': 3} {'type': 'loss', 'content': 0.00969999935477972, 'timestamp': '2025-10-01 04:22:33.196559', 'step': 4369, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:33.245826', 'step': 4369, 'epoch': 3} {'type': 'loss', 'content': 0.0015985369682312012, 'timestamp': '2025-10-01 04:22:33.256868', 'step': 4370, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:22:36.152294', 'step': 4370, 'epoch': 3} {'type': 'pplx', 'content': 5.97715336305907, 'timestamp': '2025-10-01 04:22:36.157122', 'step': 4370, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:36.192793', 'step': 4370, 'epoch': 3} {'type': 'loss', 'content': 0.00771109014749527, 'timestamp': '2025-10-01 04:22:36.199826', 'step': 4371, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:36.250332', 'step': 4371, 'epoch': 3} {'type': 'loss', 'content': 0.0026406990364193916, 'timestamp': '2025-10-01 04:22:36.282577', 'step': 4372, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:36.331734', 'step': 4372, 'epoch': 3} {'type': 'loss', 'content': 0.005381821654736996, 'timestamp': '2025-10-01 04:22:36.337637', 'step': 4373, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:36.383560', 'step': 4373, 'epoch': 3} {'type': 'loss', 'content': 0.007869486697018147, 'timestamp': '2025-10-01 04:22:36.396072', 'step': 4374, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:36.443677', 'step': 4374, 'epoch': 3} {'type': 'loss', 'content': 0.004388319794088602, 'timestamp': '2025-10-01 04:22:36.451604', 'step': 4375, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:36.507949', 'step': 4375, 'epoch': 3} {'type': 'loss', 'content': 0.0036104139871895313, 'timestamp': '2025-10-01 04:22:36.541667', 'step': 4376, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:36.592780', 'step': 4376, 'epoch': 3} {'type': 'loss', 'content': 0.001794837531633675, 'timestamp': '2025-10-01 04:22:36.606146', 'step': 4377, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:36.647050', 'step': 4377, 'epoch': 3} {'type': 'loss', 'content': 0.002092750510200858, 'timestamp': '2025-10-01 04:22:36.657824', 'step': 4378, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:36.707122', 'step': 4378, 'epoch': 3} {'type': 'loss', 'content': 0.005108834244310856, 'timestamp': '2025-10-01 04:22:36.717952', 'step': 4379, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:36.770858', 'step': 4379, 'epoch': 3} {'type': 'loss', 'content': 0.0012070629745721817, 'timestamp': '2025-10-01 04:22:36.803334', 'step': 4380, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:36.849893', 'step': 4380, 'epoch': 3} {'type': 'loss', 'content': 0.0014626404736191034, 'timestamp': '2025-10-01 04:22:36.858180', 'step': 4381, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:36.918988', 'step': 4381, 'epoch': 3} {'type': 'loss', 'content': 0.004221557639539242, 'timestamp': '2025-10-01 04:22:36.926552', 'step': 4382, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:36.982061', 'step': 4382, 'epoch': 3} {'type': 'loss', 'content': 0.00448877178132534, 'timestamp': '2025-10-01 04:22:36.993363', 'step': 4383, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:22:37.062144', 'step': 4383, 'epoch': 3} {'type': 'loss', 'content': 0.005113864317536354, 'timestamp': '2025-10-01 04:22:37.101336', 'step': 4384, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:22:37.150702', 'step': 4384, 'epoch': 3} {'type': 'loss', 'content': 0.0006909265648573637, 'timestamp': '2025-10-01 04:22:37.160045', 'step': 4385, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:37.209562', 'step': 4385, 'epoch': 3} {'type': 'loss', 'content': 0.006013736128807068, 'timestamp': '2025-10-01 04:22:37.217035', 'step': 4386, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:37.260975', 'step': 4386, 'epoch': 3} {'type': 'loss', 'content': 0.006908543407917023, 'timestamp': '2025-10-01 04:22:37.269100', 'step': 4387, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:37.320343', 'step': 4387, 'epoch': 3} {'type': 'loss', 'content': 0.004467893857508898, 'timestamp': '2025-10-01 04:22:37.352782', 'step': 4388, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:37.390654', 'step': 4388, 'epoch': 3} {'type': 'loss', 'content': 0.0076827420853078365, 'timestamp': '2025-10-01 04:22:37.397908', 'step': 4389, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:37.440071', 'step': 4389, 'epoch': 3} {'type': 'loss', 'content': 0.008598397485911846, 'timestamp': '2025-10-01 04:22:37.451397', 'step': 4390, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:37.505170', 'step': 4390, 'epoch': 3} {'type': 'loss', 'content': 0.004322738386690617, 'timestamp': '2025-10-01 04:22:37.517632', 'step': 4391, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:37.568567', 'step': 4391, 'epoch': 3} {'type': 'loss', 'content': 0.002431736560538411, 'timestamp': '2025-10-01 04:22:37.596972', 'step': 4392, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:37.646519', 'step': 4392, 'epoch': 3} {'type': 'loss', 'content': 0.0060173943638801575, 'timestamp': '2025-10-01 04:22:37.655572', 'step': 4393, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:37.694498', 'step': 4393, 'epoch': 3} {'type': 'loss', 'content': 0.0012290967861190438, 'timestamp': '2025-10-01 04:22:37.702008', 'step': 4394, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:37.746044', 'step': 4394, 'epoch': 3} {'type': 'loss', 'content': 0.004877561703324318, 'timestamp': '2025-10-01 04:22:37.755356', 'step': 4395, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:37.807899', 'step': 4395, 'epoch': 3} {'type': 'loss', 'content': 0.007359648589044809, 'timestamp': '2025-10-01 04:22:37.836849', 'step': 4396, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:37.884600', 'step': 4396, 'epoch': 3} {'type': 'loss', 'content': 0.0022778611164540052, 'timestamp': '2025-10-01 04:22:37.892151', 'step': 4397, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:37.935234', 'step': 4397, 'epoch': 3} {'type': 'loss', 'content': 0.0012331405887380242, 'timestamp': '2025-10-01 04:22:37.942727', 'step': 4398, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:37.990188', 'step': 4398, 'epoch': 3} {'type': 'loss', 'content': 0.003893211716786027, 'timestamp': '2025-10-01 04:22:37.998147', 'step': 4399, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:38.055924', 'step': 4399, 'epoch': 3} {'type': 'loss', 'content': 0.0029770159162580967, 'timestamp': '2025-10-01 04:22:38.087535', 'step': 4400, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:38.127686', 'step': 4400, 'epoch': 3} {'type': 'loss', 'content': 0.0036013484932482243, 'timestamp': '2025-10-01 04:22:38.135461', 'step': 4401, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:38.179615', 'step': 4401, 'epoch': 3} {'type': 'loss', 'content': 0.0032264580950140953, 'timestamp': '2025-10-01 04:22:38.190436', 'step': 4402, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:22:38.243462', 'step': 4402, 'epoch': 3} {'type': 'loss', 'content': 0.0012543355114758015, 'timestamp': '2025-10-01 04:22:38.249781', 'step': 4403, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:22:38.301137', 'step': 4403, 'epoch': 3} {'type': 'loss', 'content': 0.009242878295481205, 'timestamp': '2025-10-01 04:22:38.327053', 'step': 4404, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:38.371050', 'step': 4404, 'epoch': 3} {'type': 'loss', 'content': 0.0007656366797164083, 'timestamp': '2025-10-01 04:22:38.376325', 'step': 4405, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:38.414002', 'step': 4405, 'epoch': 3} {'type': 'loss', 'content': 0.03132671117782593, 'timestamp': '2025-10-01 04:22:38.421417', 'step': 4406, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:38.459564', 'step': 4406, 'epoch': 3} {'type': 'loss', 'content': 0.004999412689357996, 'timestamp': '2025-10-01 04:22:38.467171', 'step': 4407, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:38.509542', 'step': 4407, 'epoch': 3} {'type': 'loss', 'content': 0.002057774690911174, 'timestamp': '2025-10-01 04:22:38.538076', 'step': 4408, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:38.583698', 'step': 4408, 'epoch': 3} {'type': 'loss', 'content': 0.001789300818927586, 'timestamp': '2025-10-01 04:22:38.589584', 'step': 4409, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:38.634529', 'step': 4409, 'epoch': 3} {'type': 'loss', 'content': 0.0005116586107760668, 'timestamp': '2025-10-01 04:22:38.641944', 'step': 4410, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:38.686600', 'step': 4410, 'epoch': 3} {'type': 'loss', 'content': 0.0025419036392122507, 'timestamp': '2025-10-01 04:22:38.694146', 'step': 4411, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:22:38.753689', 'step': 4411, 'epoch': 3} {'type': 'loss', 'content': 0.000993579626083374, 'timestamp': '2025-10-01 04:22:38.790644', 'step': 4412, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:38.832001', 'step': 4412, 'epoch': 3} {'type': 'loss', 'content': 0.0021159392781555653, 'timestamp': '2025-10-01 04:22:38.838532', 'step': 4413, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:38.881800', 'step': 4413, 'epoch': 3} {'type': 'loss', 'content': 0.0024633673019707203, 'timestamp': '2025-10-01 04:22:38.889341', 'step': 4414, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:38.929361', 'step': 4414, 'epoch': 3} {'type': 'loss', 'content': 0.004866219125688076, 'timestamp': '2025-10-01 04:22:38.941024', 'step': 4415, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:38.988049', 'step': 4415, 'epoch': 3} {'type': 'loss', 'content': 0.0028167397249490023, 'timestamp': '2025-10-01 04:22:39.022984', 'step': 4416, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:39.075242', 'step': 4416, 'epoch': 3} {'type': 'loss', 'content': 0.003124905750155449, 'timestamp': '2025-10-01 04:22:39.084268', 'step': 4417, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:39.130499', 'step': 4417, 'epoch': 3} {'type': 'loss', 'content': 0.0021465723402798176, 'timestamp': '2025-10-01 04:22:39.137895', 'step': 4418, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:39.172359', 'step': 4418, 'epoch': 3} {'type': 'loss', 'content': 0.004392168018966913, 'timestamp': '2025-10-01 04:22:39.180295', 'step': 4419, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:39.228629', 'step': 4419, 'epoch': 3} {'type': 'loss', 'content': 0.004314699210226536, 'timestamp': '2025-10-01 04:22:39.260558', 'step': 4420, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:39.299077', 'step': 4420, 'epoch': 3} {'type': 'loss', 'content': 0.0006865004543215036, 'timestamp': '2025-10-01 04:22:39.305137', 'step': 4421, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:39.345315', 'step': 4421, 'epoch': 3} {'type': 'loss', 'content': 0.0016007455997169018, 'timestamp': '2025-10-01 04:22:39.358029', 'step': 4422, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:39.396660', 'step': 4422, 'epoch': 3} {'type': 'loss', 'content': 0.005868961568921804, 'timestamp': '2025-10-01 04:22:39.408380', 'step': 4423, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:39.450920', 'step': 4423, 'epoch': 3} {'type': 'loss', 'content': 0.004030468408018351, 'timestamp': '2025-10-01 04:22:39.479481', 'step': 4424, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:22:39.528697', 'step': 4424, 'epoch': 3} {'type': 'loss', 'content': 0.002642478095367551, 'timestamp': '2025-10-01 04:22:39.543934', 'step': 4425, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:39.578548', 'step': 4425, 'epoch': 3} {'type': 'loss', 'content': 0.010294117033481598, 'timestamp': '2025-10-01 04:22:39.586824', 'step': 4426, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:39.623855', 'step': 4426, 'epoch': 3} {'type': 'loss', 'content': 0.0024090006481856108, 'timestamp': '2025-10-01 04:22:39.634534', 'step': 4427, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:39.676400', 'step': 4427, 'epoch': 3} {'type': 'loss', 'content': 0.00898026768118143, 'timestamp': '2025-10-01 04:22:39.708288', 'step': 4428, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:39.760313', 'step': 4428, 'epoch': 3} {'type': 'loss', 'content': 0.011247357353568077, 'timestamp': '2025-10-01 04:22:39.768682', 'step': 4429, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:22:39.829994', 'step': 4429, 'epoch': 3} {'type': 'loss', 'content': 0.006415958516299725, 'timestamp': '2025-10-01 04:22:39.847441', 'step': 4430, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:39.898167', 'step': 4430, 'epoch': 3} {'type': 'loss', 'content': 0.0003272869798820466, 'timestamp': '2025-10-01 04:22:39.910710', 'step': 4431, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:39.962396', 'step': 4431, 'epoch': 3} {'type': 'loss', 'content': 0.0030690310522913933, 'timestamp': '2025-10-01 04:22:39.991381', 'step': 4432, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:40.043016', 'step': 4432, 'epoch': 3} {'type': 'loss', 'content': 0.006653620861470699, 'timestamp': '2025-10-01 04:22:40.054090', 'step': 4433, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:40.108376', 'step': 4433, 'epoch': 3} {'type': 'loss', 'content': 0.005080343224108219, 'timestamp': '2025-10-01 04:22:40.116711', 'step': 4434, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:40.164932', 'step': 4434, 'epoch': 3} {'type': 'loss', 'content': 0.006405284162610769, 'timestamp': '2025-10-01 04:22:40.172737', 'step': 4435, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:40.220579', 'step': 4435, 'epoch': 3} {'type': 'loss', 'content': 0.002584701869636774, 'timestamp': '2025-10-01 04:22:40.249909', 'step': 4436, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:40.298321', 'step': 4436, 'epoch': 3} {'type': 'loss', 'content': 0.0035252717789262533, 'timestamp': '2025-10-01 04:22:40.307534', 'step': 4437, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:40.358908', 'step': 4437, 'epoch': 3} {'type': 'loss', 'content': 0.0026096836663782597, 'timestamp': '2025-10-01 04:22:40.370438', 'step': 4438, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:40.406242', 'step': 4438, 'epoch': 3} {'type': 'loss', 'content': 0.003275781637057662, 'timestamp': '2025-10-01 04:22:40.417597', 'step': 4439, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:40.457972', 'step': 4439, 'epoch': 3} {'type': 'loss', 'content': 0.007334516383707523, 'timestamp': '2025-10-01 04:22:40.492907', 'step': 4440, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:40.538857', 'step': 4440, 'epoch': 3} {'type': 'loss', 'content': 0.006189171690493822, 'timestamp': '2025-10-01 04:22:40.547791', 'step': 4441, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:40.595068', 'step': 4441, 'epoch': 3} {'type': 'loss', 'content': 0.0031126239337027073, 'timestamp': '2025-10-01 04:22:40.606651', 'step': 4442, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:40.656231', 'step': 4442, 'epoch': 3} {'type': 'loss', 'content': 0.005287133622914553, 'timestamp': '2025-10-01 04:22:40.667850', 'step': 4443, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:40.716427', 'step': 4443, 'epoch': 3} {'type': 'loss', 'content': 0.0025730945635586977, 'timestamp': '2025-10-01 04:22:40.750247', 'step': 4444, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:40.799508', 'step': 4444, 'epoch': 3} {'type': 'loss', 'content': 0.0038187410682439804, 'timestamp': '2025-10-01 04:22:40.807988', 'step': 4445, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:40.852562', 'step': 4445, 'epoch': 3} {'type': 'loss', 'content': 0.0014202359598129988, 'timestamp': '2025-10-01 04:22:40.863861', 'step': 4446, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:40.903495', 'step': 4446, 'epoch': 3} {'type': 'loss', 'content': 0.005726667586714029, 'timestamp': '2025-10-01 04:22:40.917468', 'step': 4447, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:40.971715', 'step': 4447, 'epoch': 3} {'type': 'loss', 'content': 0.014204002916812897, 'timestamp': '2025-10-01 04:22:41.006740', 'step': 4448, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:41.059987', 'step': 4448, 'epoch': 3} {'type': 'loss', 'content': 0.00339250429533422, 'timestamp': '2025-10-01 04:22:41.073286', 'step': 4449, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:41.131261', 'step': 4449, 'epoch': 3} {'type': 'loss', 'content': 0.0007084971875883639, 'timestamp': '2025-10-01 04:22:41.142102', 'step': 4450, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:41.193963', 'step': 4450, 'epoch': 3} {'type': 'loss', 'content': 0.002337252488359809, 'timestamp': '2025-10-01 04:22:41.207968', 'step': 4451, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:41.253898', 'step': 4451, 'epoch': 3} {'type': 'loss', 'content': 0.006088781636208296, 'timestamp': '2025-10-01 04:22:41.286273', 'step': 4452, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:41.338868', 'step': 4452, 'epoch': 3} {'type': 'loss', 'content': 0.004133738111704588, 'timestamp': '2025-10-01 04:22:41.349040', 'step': 4453, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 18984411776512}, 'timestamp': '2025-10-01 04:22:41.407369', 'step': 4453, 'epoch': 3} {'type': 'loss', 'content': 0.0034643535036593676, 'timestamp': '2025-10-01 04:22:41.429306', 'step': 4454, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:41.478906', 'step': 4454, 'epoch': 3} {'type': 'loss', 'content': 0.0014107805909588933, 'timestamp': '2025-10-01 04:22:41.489777', 'step': 4455, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:22:41.541633', 'step': 4455, 'epoch': 3} {'type': 'loss', 'content': 0.0018816562369465828, 'timestamp': '2025-10-01 04:22:41.567348', 'step': 4456, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:41.615891', 'step': 4456, 'epoch': 3} {'type': 'loss', 'content': 0.0061798011884093285, 'timestamp': '2025-10-01 04:22:41.620787', 'step': 4457, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:41.668929', 'step': 4457, 'epoch': 3} {'type': 'loss', 'content': 0.0035044963005930185, 'timestamp': '2025-10-01 04:22:41.679695', 'step': 4458, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:41.725863', 'step': 4458, 'epoch': 3} {'type': 'loss', 'content': 0.007699693087488413, 'timestamp': '2025-10-01 04:22:41.748875', 'step': 4459, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:41.798183', 'step': 4459, 'epoch': 3} {'type': 'loss', 'content': 0.0005517601384781301, 'timestamp': '2025-10-01 04:22:41.833151', 'step': 4460, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:41.872745', 'step': 4460, 'epoch': 3} {'type': 'loss', 'content': 0.004926334600895643, 'timestamp': '2025-10-01 04:22:41.881029', 'step': 4461, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:41.927969', 'step': 4461, 'epoch': 3} {'type': 'loss', 'content': 0.003012654837220907, 'timestamp': '2025-10-01 04:22:41.941673', 'step': 4462, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:41.993916', 'step': 4462, 'epoch': 3} {'type': 'loss', 'content': 0.0021120617166161537, 'timestamp': '2025-10-01 04:22:42.007354', 'step': 4463, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:42.064015', 'step': 4463, 'epoch': 3} {'type': 'loss', 'content': 0.0020889276638627052, 'timestamp': '2025-10-01 04:22:42.098926', 'step': 4464, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:42.144960', 'step': 4464, 'epoch': 3} {'type': 'loss', 'content': 0.005269672721624374, 'timestamp': '2025-10-01 04:22:42.158455', 'step': 4465, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:42.206958', 'step': 4465, 'epoch': 3} {'type': 'loss', 'content': 0.002283100038766861, 'timestamp': '2025-10-01 04:22:42.215088', 'step': 4466, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:42.258368', 'step': 4466, 'epoch': 3} {'type': 'loss', 'content': 0.0059283701702952385, 'timestamp': '2025-10-01 04:22:42.266110', 'step': 4467, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:42.305867', 'step': 4467, 'epoch': 3} {'type': 'loss', 'content': 0.004353041760623455, 'timestamp': '2025-10-01 04:22:42.339518', 'step': 4468, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:22:42.386091', 'step': 4468, 'epoch': 3} {'type': 'loss', 'content': 0.007089049555361271, 'timestamp': '2025-10-01 04:22:42.401676', 'step': 4469, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:42.446608', 'step': 4469, 'epoch': 3} {'type': 'loss', 'content': 0.0011453673942014575, 'timestamp': '2025-10-01 04:22:42.454853', 'step': 4470, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:42.506939', 'step': 4470, 'epoch': 3} {'type': 'loss', 'content': 0.002702338621020317, 'timestamp': '2025-10-01 04:22:42.520442', 'step': 4471, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:42.572304', 'step': 4471, 'epoch': 3} {'type': 'loss', 'content': 0.03961280360817909, 'timestamp': '2025-10-01 04:22:42.606050', 'step': 4472, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:42.650200', 'step': 4472, 'epoch': 3} {'type': 'loss', 'content': 0.00449209101498127, 'timestamp': '2025-10-01 04:22:42.658593', 'step': 4473, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:42.704849', 'step': 4473, 'epoch': 3} {'type': 'loss', 'content': 0.0014613433741033077, 'timestamp': '2025-10-01 04:22:42.712485', 'step': 4474, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-10-01 04:22:42.777028', 'step': 4474, 'epoch': 3} {'type': 'loss', 'content': 0.0031853073742240667, 'timestamp': '2025-10-01 04:22:42.796171', 'step': 4475, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:42.838544', 'step': 4475, 'epoch': 3} {'type': 'loss', 'content': 0.006478056777268648, 'timestamp': '2025-10-01 04:22:42.867026', 'step': 4476, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:42.911480', 'step': 4476, 'epoch': 3} {'type': 'loss', 'content': 0.0010468451073393226, 'timestamp': '2025-10-01 04:22:42.917388', 'step': 4477, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:42.972338', 'step': 4477, 'epoch': 3} {'type': 'loss', 'content': 0.00545223243534565, 'timestamp': '2025-10-01 04:22:42.986323', 'step': 4478, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:43.029258', 'step': 4478, 'epoch': 3} {'type': 'loss', 'content': 0.00516932550817728, 'timestamp': '2025-10-01 04:22:43.037333', 'step': 4479, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:43.082158', 'step': 4479, 'epoch': 3} {'type': 'loss', 'content': 0.004243725910782814, 'timestamp': '2025-10-01 04:22:43.111307', 'step': 4480, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:43.155633', 'step': 4480, 'epoch': 3} {'type': 'loss', 'content': 0.0015784947900101542, 'timestamp': '2025-10-01 04:22:43.166621', 'step': 4481, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:43.221569', 'step': 4481, 'epoch': 3} {'type': 'loss', 'content': 0.0036339187063276768, 'timestamp': '2025-10-01 04:22:43.232450', 'step': 4482, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:43.282079', 'step': 4482, 'epoch': 3} {'type': 'loss', 'content': 0.002432120032608509, 'timestamp': '2025-10-01 04:22:43.290073', 'step': 4483, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:43.325374', 'step': 4483, 'epoch': 3} {'type': 'loss', 'content': 0.0038058904465287924, 'timestamp': '2025-10-01 04:22:43.356934', 'step': 4484, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:43.393635', 'step': 4484, 'epoch': 3} {'type': 'loss', 'content': 0.005920307710766792, 'timestamp': '2025-10-01 04:22:43.404431', 'step': 4485, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:22:46.241977', 'step': 4485, 'epoch': 3} {'type': 'pplx', 'content': 5.839870003491799, 'timestamp': '2025-10-01 04:22:46.251256', 'step': 4485, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:46.292437', 'step': 4485, 'epoch': 3} {'type': 'loss', 'content': 0.003124330658465624, 'timestamp': '2025-10-01 04:22:46.302416', 'step': 4486, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:46.349680', 'step': 4486, 'epoch': 3} {'type': 'loss', 'content': 0.002984036458656192, 'timestamp': '2025-10-01 04:22:46.357577', 'step': 4487, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:46.400751', 'step': 4487, 'epoch': 3} {'type': 'loss', 'content': 0.00233078608289361, 'timestamp': '2025-10-01 04:22:46.429296', 'step': 4488, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:46.476167', 'step': 4488, 'epoch': 3} {'type': 'loss', 'content': 0.0022284584119915962, 'timestamp': '2025-10-01 04:22:46.486311', 'step': 4489, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:46.534763', 'step': 4489, 'epoch': 3} {'type': 'loss', 'content': 0.0011683037737384439, 'timestamp': '2025-10-01 04:22:46.543008', 'step': 4490, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:46.591344', 'step': 4490, 'epoch': 3} {'type': 'loss', 'content': 0.0009661922813393176, 'timestamp': '2025-10-01 04:22:46.603882', 'step': 4491, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:46.652574', 'step': 4491, 'epoch': 3} {'type': 'loss', 'content': 0.003951190505176783, 'timestamp': '2025-10-01 04:22:46.684418', 'step': 4492, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:46.731817', 'step': 4492, 'epoch': 3} {'type': 'loss', 'content': 0.006488875951617956, 'timestamp': '2025-10-01 04:22:46.740151', 'step': 4493, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:46.784044', 'step': 4493, 'epoch': 3} {'type': 'loss', 'content': 0.0017644702456891537, 'timestamp': '2025-10-01 04:22:46.792105', 'step': 4494, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:22:46.852925', 'step': 4494, 'epoch': 3} {'type': 'loss', 'content': 0.0036000267136842012, 'timestamp': '2025-10-01 04:22:46.868756', 'step': 4495, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:46.914031', 'step': 4495, 'epoch': 3} {'type': 'loss', 'content': 0.0031648287549614906, 'timestamp': '2025-10-01 04:22:46.942739', 'step': 4496, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:46.986208', 'step': 4496, 'epoch': 3} {'type': 'loss', 'content': 0.001870179665274918, 'timestamp': '2025-10-01 04:22:46.999657', 'step': 4497, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:22:47.054021', 'step': 4497, 'epoch': 3} {'type': 'loss', 'content': 0.0017189126228913665, 'timestamp': '2025-10-01 04:22:47.071549', 'step': 4498, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:47.125263', 'step': 4498, 'epoch': 3} {'type': 'loss', 'content': 0.002658865414559841, 'timestamp': '2025-10-01 04:22:47.141083', 'step': 4499, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:47.194823', 'step': 4499, 'epoch': 3} {'type': 'loss', 'content': 0.0052988044917583466, 'timestamp': '2025-10-01 04:22:47.226670', 'step': 4500, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 4500', 'timestamp': '2025-10-01 04:22:52.740082', 'step': 4500, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:52.787922', 'step': 4500, 'epoch': 3} {'type': 'loss', 'content': 0.0015157569432631135, 'timestamp': '2025-10-01 04:22:52.801087', 'step': 4501, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:52.873528', 'step': 4501, 'epoch': 3} {'type': 'loss', 'content': 0.007292396854609251, 'timestamp': '2025-10-01 04:22:52.884720', 'step': 4502, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:52.955117', 'step': 4502, 'epoch': 3} {'type': 'loss', 'content': 0.0027140267193317413, 'timestamp': '2025-10-01 04:22:52.963486', 'step': 4503, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:53.027456', 'step': 4503, 'epoch': 3} {'type': 'loss', 'content': 0.0017444654367864132, 'timestamp': '2025-10-01 04:22:53.062598', 'step': 4504, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:53.108877', 'step': 4504, 'epoch': 3} {'type': 'loss', 'content': 0.004234980791807175, 'timestamp': '2025-10-01 04:22:53.119837', 'step': 4505, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:53.167814', 'step': 4505, 'epoch': 3} {'type': 'loss', 'content': 0.003975946921855211, 'timestamp': '2025-10-01 04:22:53.179307', 'step': 4506, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:53.235310', 'step': 4506, 'epoch': 3} {'type': 'loss', 'content': 0.01014590635895729, 'timestamp': '2025-10-01 04:22:53.249522', 'step': 4507, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:53.291502', 'step': 4507, 'epoch': 3} {'type': 'loss', 'content': 0.00722027150914073, 'timestamp': '2025-10-01 04:22:53.320482', 'step': 4508, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:53.376067', 'step': 4508, 'epoch': 3} {'type': 'loss', 'content': 0.001971011282876134, 'timestamp': '2025-10-01 04:22:53.386921', 'step': 4509, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:53.446790', 'step': 4509, 'epoch': 3} {'type': 'loss', 'content': 0.003174201585352421, 'timestamp': '2025-10-01 04:22:53.454902', 'step': 4510, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:53.506125', 'step': 4510, 'epoch': 3} {'type': 'loss', 'content': 0.002916463417932391, 'timestamp': '2025-10-01 04:22:53.514088', 'step': 4511, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:53.565297', 'step': 4511, 'epoch': 3} {'type': 'loss', 'content': 0.0012195173185318708, 'timestamp': '2025-10-01 04:22:53.597241', 'step': 4512, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:53.650845', 'step': 4512, 'epoch': 3} {'type': 'loss', 'content': 0.011909237131476402, 'timestamp': '2025-10-01 04:22:53.656858', 'step': 4513, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:53.708657', 'step': 4513, 'epoch': 3} {'type': 'loss', 'content': 0.0007921972428448498, 'timestamp': '2025-10-01 04:22:53.721442', 'step': 4514, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:53.775251', 'step': 4514, 'epoch': 3} {'type': 'loss', 'content': 0.0013254779623821378, 'timestamp': '2025-10-01 04:22:53.788039', 'step': 4515, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:53.858934', 'step': 4515, 'epoch': 3} {'type': 'loss', 'content': 0.0032736649736762047, 'timestamp': '2025-10-01 04:22:53.891416', 'step': 4516, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:53.949797', 'step': 4516, 'epoch': 3} {'type': 'loss', 'content': 0.002140364143997431, 'timestamp': '2025-10-01 04:22:53.955455', 'step': 4517, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:54.008469', 'step': 4517, 'epoch': 3} {'type': 'loss', 'content': 0.004744696896523237, 'timestamp': '2025-10-01 04:22:54.020926', 'step': 4518, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:54.076732', 'step': 4518, 'epoch': 3} {'type': 'loss', 'content': 0.006604358553886414, 'timestamp': '2025-10-01 04:22:54.090754', 'step': 4519, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:54.135229', 'step': 4519, 'epoch': 3} {'type': 'loss', 'content': 0.008320217952132225, 'timestamp': '2025-10-01 04:22:54.167001', 'step': 4520, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:54.214295', 'step': 4520, 'epoch': 3} {'type': 'loss', 'content': 0.010712097398936749, 'timestamp': '2025-10-01 04:22:54.220329', 'step': 4521, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:54.282036', 'step': 4521, 'epoch': 3} {'type': 'loss', 'content': 0.004045157227665186, 'timestamp': '2025-10-01 04:22:54.293572', 'step': 4522, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:54.362056', 'step': 4522, 'epoch': 3} {'type': 'loss', 'content': 0.02311292476952076, 'timestamp': '2025-10-01 04:22:54.373013', 'step': 4523, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:54.423972', 'step': 4523, 'epoch': 3} {'type': 'loss', 'content': 0.0032463029492646456, 'timestamp': '2025-10-01 04:22:54.457452', 'step': 4524, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:54.507022', 'step': 4524, 'epoch': 3} {'type': 'loss', 'content': 0.002869298681616783, 'timestamp': '2025-10-01 04:22:54.516356', 'step': 4525, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:54.562333', 'step': 4525, 'epoch': 3} {'type': 'loss', 'content': 0.005234246142208576, 'timestamp': '2025-10-01 04:22:54.570670', 'step': 4526, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:54.617329', 'step': 4526, 'epoch': 3} {'type': 'loss', 'content': 0.007434080354869366, 'timestamp': '2025-10-01 04:22:54.624870', 'step': 4527, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:54.666248', 'step': 4527, 'epoch': 3} {'type': 'loss', 'content': 0.006294552702456713, 'timestamp': '2025-10-01 04:22:54.695232', 'step': 4528, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:54.754205', 'step': 4528, 'epoch': 3} {'type': 'loss', 'content': 0.00285795284435153, 'timestamp': '2025-10-01 04:22:54.764612', 'step': 4529, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:54.810618', 'step': 4529, 'epoch': 3} {'type': 'loss', 'content': 0.007996376603841782, 'timestamp': '2025-10-01 04:22:54.823419', 'step': 4530, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:54.875375', 'step': 4530, 'epoch': 3} {'type': 'loss', 'content': 0.0006767441518604755, 'timestamp': '2025-10-01 04:22:54.889351', 'step': 4531, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:54.955681', 'step': 4531, 'epoch': 3} {'type': 'loss', 'content': 0.0016859783791005611, 'timestamp': '2025-10-01 04:22:54.984548', 'step': 4532, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:55.038553', 'step': 4532, 'epoch': 3} {'type': 'loss', 'content': 0.007151003926992416, 'timestamp': '2025-10-01 04:22:55.044690', 'step': 4533, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:55.100443', 'step': 4533, 'epoch': 3} {'type': 'loss', 'content': 0.00400347588583827, 'timestamp': '2025-10-01 04:22:55.111377', 'step': 4534, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:55.166156', 'step': 4534, 'epoch': 3} {'type': 'loss', 'content': 0.002640200313180685, 'timestamp': '2025-10-01 04:22:55.177942', 'step': 4535, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:55.228777', 'step': 4535, 'epoch': 3} {'type': 'loss', 'content': 0.005768063943833113, 'timestamp': '2025-10-01 04:22:55.258166', 'step': 4536, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 15662185694400}, 'timestamp': '2025-10-01 04:22:55.325867', 'step': 4536, 'epoch': 3} {'type': 'loss', 'content': 0.005276952870190144, 'timestamp': '2025-10-01 04:22:55.345007', 'step': 4537, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:55.394970', 'step': 4537, 'epoch': 3} {'type': 'loss', 'content': 0.0014834296889603138, 'timestamp': '2025-10-01 04:22:55.407739', 'step': 4538, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:55.458072', 'step': 4538, 'epoch': 3} {'type': 'loss', 'content': 0.011031442321836948, 'timestamp': '2025-10-01 04:22:55.472159', 'step': 4539, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:55.520122', 'step': 4539, 'epoch': 3} {'type': 'loss', 'content': 0.002428436418995261, 'timestamp': '2025-10-01 04:22:55.554722', 'step': 4540, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:55.608978', 'step': 4540, 'epoch': 3} {'type': 'loss', 'content': 0.002278492320328951, 'timestamp': '2025-10-01 04:22:55.619491', 'step': 4541, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:55.669736', 'step': 4541, 'epoch': 3} {'type': 'loss', 'content': 0.0012135558063164353, 'timestamp': '2025-10-01 04:22:55.683769', 'step': 4542, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:55.730239', 'step': 4542, 'epoch': 3} {'type': 'loss', 'content': 0.000731511099729687, 'timestamp': '2025-10-01 04:22:55.742745', 'step': 4543, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:55.798752', 'step': 4543, 'epoch': 3} {'type': 'loss', 'content': 0.0012557662557810545, 'timestamp': '2025-10-01 04:22:55.832248', 'step': 4544, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:55.871275', 'step': 4544, 'epoch': 3} {'type': 'loss', 'content': 0.008615415543317795, 'timestamp': '2025-10-01 04:22:55.881632', 'step': 4545, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:55.927939', 'step': 4545, 'epoch': 3} {'type': 'loss', 'content': 0.0036831737961620092, 'timestamp': '2025-10-01 04:22:55.940522', 'step': 4546, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:55.997688', 'step': 4546, 'epoch': 3} {'type': 'loss', 'content': 0.00797025766223669, 'timestamp': '2025-10-01 04:22:56.011881', 'step': 4547, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:56.061967', 'step': 4547, 'epoch': 3} {'type': 'loss', 'content': 0.001341190654784441, 'timestamp': '2025-10-01 04:22:56.091290', 'step': 4548, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:56.152802', 'step': 4548, 'epoch': 3} {'type': 'loss', 'content': 0.0037668542936444283, 'timestamp': '2025-10-01 04:22:56.166297', 'step': 4549, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:56.205419', 'step': 4549, 'epoch': 3} {'type': 'loss', 'content': 0.0024854557123035192, 'timestamp': '2025-10-01 04:22:56.218204', 'step': 4550, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:56.273386', 'step': 4550, 'epoch': 3} {'type': 'loss', 'content': 0.0014433779288083315, 'timestamp': '2025-10-01 04:22:56.281499', 'step': 4551, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:56.325472', 'step': 4551, 'epoch': 3} {'type': 'loss', 'content': 0.0052131954580545425, 'timestamp': '2025-10-01 04:22:56.354700', 'step': 4552, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:56.395683', 'step': 4552, 'epoch': 3} {'type': 'loss', 'content': 0.004093370400369167, 'timestamp': '2025-10-01 04:22:56.406768', 'step': 4553, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:56.450134', 'step': 4553, 'epoch': 3} {'type': 'loss', 'content': 0.0038811122067272663, 'timestamp': '2025-10-01 04:22:56.464188', 'step': 4554, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:56.513073', 'step': 4554, 'epoch': 3} {'type': 'loss', 'content': 0.004123773891478777, 'timestamp': '2025-10-01 04:22:56.525666', 'step': 4555, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:56.577794', 'step': 4555, 'epoch': 3} {'type': 'loss', 'content': 0.0017884231638163328, 'timestamp': '2025-10-01 04:22:56.611293', 'step': 4556, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:56.653649', 'step': 4556, 'epoch': 3} {'type': 'loss', 'content': 0.0017098251264542341, 'timestamp': '2025-10-01 04:22:56.664426', 'step': 4557, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:56.709338', 'step': 4557, 'epoch': 3} {'type': 'loss', 'content': 0.0036045582965016365, 'timestamp': '2025-10-01 04:22:56.721871', 'step': 4558, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:56.761536', 'step': 4558, 'epoch': 3} {'type': 'loss', 'content': 0.003566620871424675, 'timestamp': '2025-10-01 04:22:56.773237', 'step': 4559, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:56.814749', 'step': 4559, 'epoch': 3} {'type': 'loss', 'content': 0.002887391485273838, 'timestamp': '2025-10-01 04:22:56.843716', 'step': 4560, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:56.897216', 'step': 4560, 'epoch': 3} {'type': 'loss', 'content': 0.0075920806266367435, 'timestamp': '2025-10-01 04:22:56.910796', 'step': 4561, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:22:56.977809', 'step': 4561, 'epoch': 3} {'type': 'loss', 'content': 0.0029249198269098997, 'timestamp': '2025-10-01 04:22:56.991796', 'step': 4562, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:57.053404', 'step': 4562, 'epoch': 3} {'type': 'loss', 'content': 0.0017354931915178895, 'timestamp': '2025-10-01 04:22:57.066152', 'step': 4563, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:57.122927', 'step': 4563, 'epoch': 3} {'type': 'loss', 'content': 0.006077379919588566, 'timestamp': '2025-10-01 04:22:57.157534', 'step': 4564, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:57.207113', 'step': 4564, 'epoch': 3} {'type': 'loss', 'content': 0.0009074280969798565, 'timestamp': '2025-10-01 04:22:57.217476', 'step': 4565, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:57.259181', 'step': 4565, 'epoch': 3} {'type': 'loss', 'content': 0.003388401586562395, 'timestamp': '2025-10-01 04:22:57.270062', 'step': 4566, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:57.315065', 'step': 4566, 'epoch': 3} {'type': 'loss', 'content': 0.0023988992907106876, 'timestamp': '2025-10-01 04:22:57.322711', 'step': 4567, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:22:57.372434', 'step': 4567, 'epoch': 3} {'type': 'loss', 'content': 0.002602166961878538, 'timestamp': '2025-10-01 04:22:57.407618', 'step': 4568, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:57.462857', 'step': 4568, 'epoch': 3} {'type': 'loss', 'content': 0.004300771746784449, 'timestamp': '2025-10-01 04:22:57.473355', 'step': 4569, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:57.535083', 'step': 4569, 'epoch': 3} {'type': 'loss', 'content': 0.0067619821056723595, 'timestamp': '2025-10-01 04:22:57.549158', 'step': 4570, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:57.601791', 'step': 4570, 'epoch': 3} {'type': 'loss', 'content': 0.007754541467875242, 'timestamp': '2025-10-01 04:22:57.614351', 'step': 4571, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:57.670824', 'step': 4571, 'epoch': 3} {'type': 'loss', 'content': 0.0017012072494253516, 'timestamp': '2025-10-01 04:22:57.703398', 'step': 4572, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:57.756032', 'step': 4572, 'epoch': 3} {'type': 'loss', 'content': 0.00494743138551712, 'timestamp': '2025-10-01 04:22:57.761860', 'step': 4573, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:22:57.829315', 'step': 4573, 'epoch': 3} {'type': 'loss', 'content': 0.008700964041054249, 'timestamp': '2025-10-01 04:22:57.843317', 'step': 4574, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:57.892798', 'step': 4574, 'epoch': 3} {'type': 'loss', 'content': 0.0022333890665322542, 'timestamp': '2025-10-01 04:22:57.905392', 'step': 4575, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:22:57.972386', 'step': 4575, 'epoch': 3} {'type': 'loss', 'content': 0.001991432160139084, 'timestamp': '2025-10-01 04:22:58.009078', 'step': 4576, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:22:58.067376', 'step': 4576, 'epoch': 3} {'type': 'loss', 'content': 0.002795493695884943, 'timestamp': '2025-10-01 04:22:58.078454', 'step': 4577, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:58.131094', 'step': 4577, 'epoch': 3} {'type': 'loss', 'content': 0.00312325032427907, 'timestamp': '2025-10-01 04:22:58.143649', 'step': 4578, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:22:58.194226', 'step': 4578, 'epoch': 3} {'type': 'loss', 'content': 0.0017396288458257914, 'timestamp': '2025-10-01 04:22:58.199148', 'step': 4579, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:58.255181', 'step': 4579, 'epoch': 3} {'type': 'loss', 'content': 0.0006885019829496741, 'timestamp': '2025-10-01 04:22:58.283638', 'step': 4580, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:58.339131', 'step': 4580, 'epoch': 3} {'type': 'loss', 'content': 0.0015844326699152589, 'timestamp': '2025-10-01 04:22:58.344383', 'step': 4581, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:58.382064', 'step': 4581, 'epoch': 3} {'type': 'loss', 'content': 0.0029582371935248375, 'timestamp': '2025-10-01 04:22:58.390425', 'step': 4582, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:58.433250', 'step': 4582, 'epoch': 3} {'type': 'loss', 'content': 0.00021285646653268486, 'timestamp': '2025-10-01 04:22:58.444077', 'step': 4583, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:58.498995', 'step': 4583, 'epoch': 3} {'type': 'loss', 'content': 0.0008210684754885733, 'timestamp': '2025-10-01 04:22:58.531579', 'step': 4584, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:58.583899', 'step': 4584, 'epoch': 3} {'type': 'loss', 'content': 0.003986839205026627, 'timestamp': '2025-10-01 04:22:58.593113', 'step': 4585, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:22:58.653013', 'step': 4585, 'epoch': 3} {'type': 'loss', 'content': 0.0066764880903065205, 'timestamp': '2025-10-01 04:22:58.666525', 'step': 4586, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:58.716855', 'step': 4586, 'epoch': 3} {'type': 'loss', 'content': 0.0021772675681859255, 'timestamp': '2025-10-01 04:22:58.728500', 'step': 4587, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:58.779586', 'step': 4587, 'epoch': 3} {'type': 'loss', 'content': 0.0009396339301019907, 'timestamp': '2025-10-01 04:22:58.807979', 'step': 4588, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:58.853178', 'step': 4588, 'epoch': 3} {'type': 'loss', 'content': 0.0027676380705088377, 'timestamp': '2025-10-01 04:22:58.862349', 'step': 4589, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:22:58.911039', 'step': 4589, 'epoch': 3} {'type': 'loss', 'content': 0.005033126566559076, 'timestamp': '2025-10-01 04:22:58.922679', 'step': 4590, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:58.981463', 'step': 4590, 'epoch': 3} {'type': 'loss', 'content': 0.0020180244464427233, 'timestamp': '2025-10-01 04:22:58.992480', 'step': 4591, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:22:59.033545', 'step': 4591, 'epoch': 3} {'type': 'loss', 'content': 0.0015107388608157635, 'timestamp': '2025-10-01 04:22:59.062040', 'step': 4592, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:22:59.108204', 'step': 4592, 'epoch': 3} {'type': 'loss', 'content': 0.0021236054599285126, 'timestamp': '2025-10-01 04:22:59.118642', 'step': 4593, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:22:59.161816', 'step': 4593, 'epoch': 3} {'type': 'loss', 'content': 0.000295362580800429, 'timestamp': '2025-10-01 04:22:59.166855', 'step': 4594, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:59.217956', 'step': 4594, 'epoch': 3} {'type': 'loss', 'content': 0.0006073531112633646, 'timestamp': '2025-10-01 04:22:59.226343', 'step': 4595, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:22:59.277691', 'step': 4595, 'epoch': 3} {'type': 'loss', 'content': 0.004292263649404049, 'timestamp': '2025-10-01 04:22:59.303454', 'step': 4596, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:22:59.367746', 'step': 4596, 'epoch': 3} {'type': 'loss', 'content': 0.002775833709165454, 'timestamp': '2025-10-01 04:22:59.373017', 'step': 4597, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:22:59.429036', 'step': 4597, 'epoch': 3} {'type': 'loss', 'content': 0.0013087184634059668, 'timestamp': '2025-10-01 04:22:59.436933', 'step': 4598, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:22:59.494203', 'step': 4598, 'epoch': 3} {'type': 'loss', 'content': 0.002921068575233221, 'timestamp': '2025-10-01 04:22:59.502353', 'step': 4599, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:22:59.542737', 'step': 4599, 'epoch': 3} {'type': 'loss', 'content': 0.0005559622077271342, 'timestamp': '2025-10-01 04:22:59.574597', 'step': 4600, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:23:02.955100', 'step': 4600, 'epoch': 3} {'type': 'pplx', 'content': 5.8172618779844365, 'timestamp': '2025-10-01 04:23:02.958741', 'step': 4600, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:02.996819', 'step': 4600, 'epoch': 3} {'type': 'loss', 'content': 0.0005481796688400209, 'timestamp': '2025-10-01 04:23:03.004386', 'step': 4601, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:03.061447', 'step': 4601, 'epoch': 3} {'type': 'loss', 'content': 0.0008748734835535288, 'timestamp': '2025-10-01 04:23:03.072296', 'step': 4602, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:23:03.142275', 'step': 4602, 'epoch': 3} {'type': 'loss', 'content': 0.005472549702972174, 'timestamp': '2025-10-01 04:23:03.160181', 'step': 4603, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:03.218321', 'step': 4603, 'epoch': 3} {'type': 'loss', 'content': 0.005068325437605381, 'timestamp': '2025-10-01 04:23:03.251981', 'step': 4604, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:03.301533', 'step': 4604, 'epoch': 3} {'type': 'loss', 'content': 0.028034459799528122, 'timestamp': '2025-10-01 04:23:03.314335', 'step': 4605, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:03.363973', 'step': 4605, 'epoch': 3} {'type': 'loss', 'content': 0.0012317642103880644, 'timestamp': '2025-10-01 04:23:03.375098', 'step': 4606, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:03.416538', 'step': 4606, 'epoch': 3} {'type': 'loss', 'content': 0.0007249437621794641, 'timestamp': '2025-10-01 04:23:03.427406', 'step': 4607, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:03.478059', 'step': 4607, 'epoch': 3} {'type': 'loss', 'content': 0.0013745810138061643, 'timestamp': '2025-10-01 04:23:03.510754', 'step': 4608, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:03.566105', 'step': 4608, 'epoch': 3} {'type': 'loss', 'content': 0.002619256032630801, 'timestamp': '2025-10-01 04:23:03.577364', 'step': 4609, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:03.629593', 'step': 4609, 'epoch': 3} {'type': 'loss', 'content': 0.0028676651418209076, 'timestamp': '2025-10-01 04:23:03.642184', 'step': 4610, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:23:03.708557', 'step': 4610, 'epoch': 3} {'type': 'loss', 'content': 0.0011679199524223804, 'timestamp': '2025-10-01 04:23:03.725837', 'step': 4611, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:03.771737', 'step': 4611, 'epoch': 3} {'type': 'loss', 'content': 0.0023691230453550816, 'timestamp': '2025-10-01 04:23:03.803529', 'step': 4612, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:03.870052', 'step': 4612, 'epoch': 3} {'type': 'loss', 'content': 0.000603407621383667, 'timestamp': '2025-10-01 04:23:03.879795', 'step': 4613, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:03.954543', 'step': 4613, 'epoch': 3} {'type': 'loss', 'content': 0.0003012138477060944, 'timestamp': '2025-10-01 04:23:03.963608', 'step': 4614, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:04.025177', 'step': 4614, 'epoch': 3} {'type': 'loss', 'content': 0.002456010552123189, 'timestamp': '2025-10-01 04:23:04.036008', 'step': 4615, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:04.096618', 'step': 4615, 'epoch': 3} {'type': 'loss', 'content': 7.373586413450539e-05, 'timestamp': '2025-10-01 04:23:04.127206', 'step': 4616, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:04.187290', 'step': 4616, 'epoch': 3} {'type': 'loss', 'content': 0.0016673555364832282, 'timestamp': '2025-10-01 04:23:04.198992', 'step': 4617, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:04.273834', 'step': 4617, 'epoch': 3} {'type': 'loss', 'content': 0.0030930901411920786, 'timestamp': '2025-10-01 04:23:04.287842', 'step': 4618, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:04.349696', 'step': 4618, 'epoch': 3} {'type': 'loss', 'content': 0.00022783088206779212, 'timestamp': '2025-10-01 04:23:04.360041', 'step': 4619, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:04.415214', 'step': 4619, 'epoch': 3} {'type': 'loss', 'content': 0.0001414766302332282, 'timestamp': '2025-10-01 04:23:04.444466', 'step': 4620, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:04.509344', 'step': 4620, 'epoch': 3} {'type': 'loss', 'content': 0.0035861164797097445, 'timestamp': '2025-10-01 04:23:04.520400', 'step': 4621, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:04.584275', 'step': 4621, 'epoch': 3} {'type': 'loss', 'content': 0.00685293972492218, 'timestamp': '2025-10-01 04:23:04.589238', 'step': 4622, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:04.648965', 'step': 4622, 'epoch': 3} {'type': 'loss', 'content': 0.0003698310174513608, 'timestamp': '2025-10-01 04:23:04.662921', 'step': 4623, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:04.729445', 'step': 4623, 'epoch': 3} {'type': 'loss', 'content': 0.001613875851035118, 'timestamp': '2025-10-01 04:23:04.763920', 'step': 4624, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:23:04.824562', 'step': 4624, 'epoch': 3} {'type': 'loss', 'content': 0.005695657338947058, 'timestamp': '2025-10-01 04:23:04.840131', 'step': 4625, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:04.881250', 'step': 4625, 'epoch': 3} {'type': 'loss', 'content': 0.004531265236437321, 'timestamp': '2025-10-01 04:23:04.893061', 'step': 4626, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:04.943593', 'step': 4626, 'epoch': 3} {'type': 'loss', 'content': 0.007674043066799641, 'timestamp': '2025-10-01 04:23:04.957645', 'step': 4627, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:23:05.016908', 'step': 4627, 'epoch': 3} {'type': 'loss', 'content': 0.00672054523602128, 'timestamp': '2025-10-01 04:23:05.055120', 'step': 4628, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:05.099057', 'step': 4628, 'epoch': 3} {'type': 'loss', 'content': 0.0013367017963901162, 'timestamp': '2025-10-01 04:23:05.112173', 'step': 4629, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:05.160103', 'step': 4629, 'epoch': 3} {'type': 'loss', 'content': 0.0012233088491484523, 'timestamp': '2025-10-01 04:23:05.171242', 'step': 4630, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:05.210600', 'step': 4630, 'epoch': 3} {'type': 'loss', 'content': 0.002121256198734045, 'timestamp': '2025-10-01 04:23:05.224633', 'step': 4631, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:05.270459', 'step': 4631, 'epoch': 3} {'type': 'loss', 'content': 0.002516039414331317, 'timestamp': '2025-10-01 04:23:05.302233', 'step': 4632, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:05.345819', 'step': 4632, 'epoch': 3} {'type': 'loss', 'content': 0.006630541756749153, 'timestamp': '2025-10-01 04:23:05.360108', 'step': 4633, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:23:05.412762', 'step': 4633, 'epoch': 3} {'type': 'loss', 'content': 0.0018081674352288246, 'timestamp': '2025-10-01 04:23:05.428643', 'step': 4634, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:05.466827', 'step': 4634, 'epoch': 3} {'type': 'loss', 'content': 0.0040318891406059265, 'timestamp': '2025-10-01 04:23:05.479625', 'step': 4635, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:05.531106', 'step': 4635, 'epoch': 3} {'type': 'loss', 'content': 0.0056265657767653465, 'timestamp': '2025-10-01 04:23:05.566073', 'step': 4636, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:05.618345', 'step': 4636, 'epoch': 3} {'type': 'loss', 'content': 0.004820076748728752, 'timestamp': '2025-10-01 04:23:05.628656', 'step': 4637, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:05.669102', 'step': 4637, 'epoch': 3} {'type': 'loss', 'content': 0.00042217233567498624, 'timestamp': '2025-10-01 04:23:05.682433', 'step': 4638, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:05.729705', 'step': 4638, 'epoch': 3} {'type': 'loss', 'content': 0.0007663873257115483, 'timestamp': '2025-10-01 04:23:05.737694', 'step': 4639, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:05.773302', 'step': 4639, 'epoch': 3} {'type': 'loss', 'content': 0.005183704197406769, 'timestamp': '2025-10-01 04:23:05.808186', 'step': 4640, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:05.858134', 'step': 4640, 'epoch': 3} {'type': 'loss', 'content': 0.0007436895393766463, 'timestamp': '2025-10-01 04:23:05.873388', 'step': 4641, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:05.916899', 'step': 4641, 'epoch': 3} {'type': 'loss', 'content': 0.0007140880334191024, 'timestamp': '2025-10-01 04:23:05.928578', 'step': 4642, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:05.983676', 'step': 4642, 'epoch': 3} {'type': 'loss', 'content': 0.0023184206802397966, 'timestamp': '2025-10-01 04:23:05.997974', 'step': 4643, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:06.045202', 'step': 4643, 'epoch': 3} {'type': 'loss', 'content': 0.0026147228199988604, 'timestamp': '2025-10-01 04:23:06.077800', 'step': 4644, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:06.123582', 'step': 4644, 'epoch': 3} {'type': 'loss', 'content': 0.0004082793602719903, 'timestamp': '2025-10-01 04:23:06.135981', 'step': 4645, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:06.183618', 'step': 4645, 'epoch': 3} {'type': 'loss', 'content': 0.0004313488316256553, 'timestamp': '2025-10-01 04:23:06.196272', 'step': 4646, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:06.243320', 'step': 4646, 'epoch': 3} {'type': 'loss', 'content': 0.0019822425674647093, 'timestamp': '2025-10-01 04:23:06.256146', 'step': 4647, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:06.309045', 'step': 4647, 'epoch': 3} {'type': 'loss', 'content': 0.0050553809851408005, 'timestamp': '2025-10-01 04:23:06.348496', 'step': 4648, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:06.384610', 'step': 4648, 'epoch': 3} {'type': 'loss', 'content': 0.0032993583008646965, 'timestamp': '2025-10-01 04:23:06.399573', 'step': 4649, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:06.444723', 'step': 4649, 'epoch': 3} {'type': 'loss', 'content': 0.0003009256615769118, 'timestamp': '2025-10-01 04:23:06.452379', 'step': 4650, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:06.497651', 'step': 4650, 'epoch': 3} {'type': 'loss', 'content': 0.0021451732609421015, 'timestamp': '2025-10-01 04:23:06.509154', 'step': 4651, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:06.553604', 'step': 4651, 'epoch': 3} {'type': 'loss', 'content': 4.2509331251494586e-05, 'timestamp': '2025-10-01 04:23:06.582010', 'step': 4652, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:23:06.632284', 'step': 4652, 'epoch': 3} {'type': 'loss', 'content': 0.003211280796676874, 'timestamp': '2025-10-01 04:23:06.645879', 'step': 4653, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:06.682139', 'step': 4653, 'epoch': 3} {'type': 'loss', 'content': 0.0024309689179062843, 'timestamp': '2025-10-01 04:23:06.690605', 'step': 4654, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:06.734809', 'step': 4654, 'epoch': 3} {'type': 'loss', 'content': 0.0009190264972858131, 'timestamp': '2025-10-01 04:23:06.742167', 'step': 4655, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:06.781400', 'step': 4655, 'epoch': 3} {'type': 'loss', 'content': 0.0032037473283708096, 'timestamp': '2025-10-01 04:23:06.816345', 'step': 4656, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:06.866504', 'step': 4656, 'epoch': 3} {'type': 'loss', 'content': 0.0005168642965145409, 'timestamp': '2025-10-01 04:23:06.880033', 'step': 4657, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:06.928464', 'step': 4657, 'epoch': 3} {'type': 'loss', 'content': 0.0007295518298633397, 'timestamp': '2025-10-01 04:23:06.941005', 'step': 4658, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:06.978030', 'step': 4658, 'epoch': 3} {'type': 'loss', 'content': 0.0010004183277487755, 'timestamp': '2025-10-01 04:23:06.990911', 'step': 4659, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:07.042129', 'step': 4659, 'epoch': 3} {'type': 'loss', 'content': 0.0034235657658427954, 'timestamp': '2025-10-01 04:23:07.077102', 'step': 4660, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 15662185694400}, 'timestamp': '2025-10-01 04:23:07.135037', 'step': 4660, 'epoch': 3} {'type': 'loss', 'content': 0.004927926231175661, 'timestamp': '2025-10-01 04:23:07.154207', 'step': 4661, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:23:07.213839', 'step': 4661, 'epoch': 3} {'type': 'loss', 'content': 0.0006186141399666667, 'timestamp': '2025-10-01 04:23:07.229723', 'step': 4662, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:07.277573', 'step': 4662, 'epoch': 3} {'type': 'loss', 'content': 0.0018141487380489707, 'timestamp': '2025-10-01 04:23:07.289081', 'step': 4663, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:07.338971', 'step': 4663, 'epoch': 3} {'type': 'loss', 'content': 0.004006293136626482, 'timestamp': '2025-10-01 04:23:07.373984', 'step': 4664, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:07.418616', 'step': 4664, 'epoch': 3} {'type': 'loss', 'content': 0.0013808824587613344, 'timestamp': '2025-10-01 04:23:07.431585', 'step': 4665, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:07.481695', 'step': 4665, 'epoch': 3} {'type': 'loss', 'content': 0.01819748431444168, 'timestamp': '2025-10-01 04:23:07.494494', 'step': 4666, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:23:07.547979', 'step': 4666, 'epoch': 3} {'type': 'loss', 'content': 0.0018214844167232513, 'timestamp': '2025-10-01 04:23:07.563747', 'step': 4667, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:07.615427', 'step': 4667, 'epoch': 3} {'type': 'loss', 'content': 0.005904864054173231, 'timestamp': '2025-10-01 04:23:07.650490', 'step': 4668, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:07.688913', 'step': 4668, 'epoch': 3} {'type': 'loss', 'content': 0.00011462967086117715, 'timestamp': '2025-10-01 04:23:07.704916', 'step': 4669, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:07.756591', 'step': 4669, 'epoch': 3} {'type': 'loss', 'content': 0.00034902553306892514, 'timestamp': '2025-10-01 04:23:07.770135', 'step': 4670, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:07.819809', 'step': 4670, 'epoch': 3} {'type': 'loss', 'content': 5.125632378621958e-05, 'timestamp': '2025-10-01 04:23:07.831666', 'step': 4671, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:07.880047', 'step': 4671, 'epoch': 3} {'type': 'loss', 'content': 0.001495245611295104, 'timestamp': '2025-10-01 04:23:07.913789', 'step': 4672, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:07.952417', 'step': 4672, 'epoch': 3} {'type': 'loss', 'content': 0.0008956571109592915, 'timestamp': '2025-10-01 04:23:07.963383', 'step': 4673, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:08.013148', 'step': 4673, 'epoch': 3} {'type': 'loss', 'content': 0.0004708691849373281, 'timestamp': '2025-10-01 04:23:08.027840', 'step': 4674, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:08.074004', 'step': 4674, 'epoch': 3} {'type': 'loss', 'content': 0.004678243771195412, 'timestamp': '2025-10-01 04:23:08.085345', 'step': 4675, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-10-01 04:23:08.136947', 'step': 4675, 'epoch': 3} {'type': 'loss', 'content': 0.0006964110070839524, 'timestamp': '2025-10-01 04:23:08.177155', 'step': 4676, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:08.215839', 'step': 4676, 'epoch': 3} {'type': 'loss', 'content': 0.0009655052563175559, 'timestamp': '2025-10-01 04:23:08.227661', 'step': 4677, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:08.276966', 'step': 4677, 'epoch': 3} {'type': 'loss', 'content': 5.096415043226443e-05, 'timestamp': '2025-10-01 04:23:08.289735', 'step': 4678, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:23:08.349246', 'step': 4678, 'epoch': 3} {'type': 'loss', 'content': 0.0006081221508793533, 'timestamp': '2025-10-01 04:23:08.365750', 'step': 4679, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:08.412561', 'step': 4679, 'epoch': 3} {'type': 'loss', 'content': 0.00061324198031798, 'timestamp': '2025-10-01 04:23:08.447012', 'step': 4680, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:08.487269', 'step': 4680, 'epoch': 3} {'type': 'loss', 'content': 0.004515441134572029, 'timestamp': '2025-10-01 04:23:08.497719', 'step': 4681, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:08.545263', 'step': 4681, 'epoch': 3} {'type': 'loss', 'content': 0.0024223290383815765, 'timestamp': '2025-10-01 04:23:08.562068', 'step': 4682, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:08.607518', 'step': 4682, 'epoch': 3} {'type': 'loss', 'content': 0.0008002884569577873, 'timestamp': '2025-10-01 04:23:08.622776', 'step': 4683, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:08.677081', 'step': 4683, 'epoch': 3} {'type': 'loss', 'content': 0.004487915430217981, 'timestamp': '2025-10-01 04:23:08.711577', 'step': 4684, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:08.746362', 'step': 4684, 'epoch': 3} {'type': 'loss', 'content': 0.0001842819037847221, 'timestamp': '2025-10-01 04:23:08.754605', 'step': 4685, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:08.791750', 'step': 4685, 'epoch': 3} {'type': 'loss', 'content': 0.0011658893199637532, 'timestamp': '2025-10-01 04:23:08.805834', 'step': 4686, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:08.850412', 'step': 4686, 'epoch': 3} {'type': 'loss', 'content': 0.008085338398814201, 'timestamp': '2025-10-01 04:23:08.858438', 'step': 4687, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:08.904423', 'step': 4687, 'epoch': 3} {'type': 'loss', 'content': 0.0027636829763650894, 'timestamp': '2025-10-01 04:23:08.936791', 'step': 4688, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:08.972508', 'step': 4688, 'epoch': 3} {'type': 'loss', 'content': 0.0017094842623919249, 'timestamp': '2025-10-01 04:23:08.983441', 'step': 4689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:09.024679', 'step': 4689, 'epoch': 3} {'type': 'loss', 'content': 0.0009196384344249964, 'timestamp': '2025-10-01 04:23:09.036225', 'step': 4690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:23:09.077637', 'step': 4690, 'epoch': 3} {'type': 'loss', 'content': 0.0062644644640386105, 'timestamp': '2025-10-01 04:23:09.085918', 'step': 4691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:09.138558', 'step': 4691, 'epoch': 3} {'type': 'loss', 'content': 0.00047644591541029513, 'timestamp': '2025-10-01 04:23:09.167117', 'step': 4692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:09.211381', 'step': 4692, 'epoch': 3} {'type': 'loss', 'content': 0.0001565329439472407, 'timestamp': '2025-10-01 04:23:09.216985', 'step': 4693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:09.266059', 'step': 4693, 'epoch': 3} {'type': 'loss', 'content': 0.0011630663648247719, 'timestamp': '2025-10-01 04:23:09.279571', 'step': 4694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:09.323931', 'step': 4694, 'epoch': 3} {'type': 'loss', 'content': 0.002214627806097269, 'timestamp': '2025-10-01 04:23:09.334657', 'step': 4695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:09.380327', 'step': 4695, 'epoch': 3} {'type': 'loss', 'content': 0.0033647031523287296, 'timestamp': '2025-10-01 04:23:09.415227', 'step': 4696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:09.457950', 'step': 4696, 'epoch': 3} {'type': 'loss', 'content': 0.00733291357755661, 'timestamp': '2025-10-01 04:23:09.471328', 'step': 4697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:09.504780', 'step': 4697, 'epoch': 3} {'type': 'loss', 'content': 0.004071072209626436, 'timestamp': '2025-10-01 04:23:09.515070', 'step': 4698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:09.565187', 'step': 4698, 'epoch': 3} {'type': 'loss', 'content': 0.000901603780221194, 'timestamp': '2025-10-01 04:23:09.573451', 'step': 4699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:09.615241', 'step': 4699, 'epoch': 3} {'type': 'loss', 'content': 0.002291906625032425, 'timestamp': '2025-10-01 04:23:09.647961', 'step': 4700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:09.688668', 'step': 4700, 'epoch': 3} {'type': 'loss', 'content': 0.0001700749999145046, 'timestamp': '2025-10-01 04:23:09.697697', 'step': 4701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:09.730934', 'step': 4701, 'epoch': 3} {'type': 'loss', 'content': 0.008707469329237938, 'timestamp': '2025-10-01 04:23:09.747392', 'step': 4702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:09.800082', 'step': 4702, 'epoch': 3} {'type': 'loss', 'content': 0.0008731597336009145, 'timestamp': '2025-10-01 04:23:09.814127', 'step': 4703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:09.861366', 'step': 4703, 'epoch': 3} {'type': 'loss', 'content': 0.006331549491733313, 'timestamp': '2025-10-01 04:23:09.894044', 'step': 4704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:09.935179', 'step': 4704, 'epoch': 3} {'type': 'loss', 'content': 0.0037822765298187733, 'timestamp': '2025-10-01 04:23:09.952286', 'step': 4705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:10.003680', 'step': 4705, 'epoch': 3} {'type': 'loss', 'content': 0.00022817014541942626, 'timestamp': '2025-10-01 04:23:10.013215', 'step': 4706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:23:10.073009', 'step': 4706, 'epoch': 3} {'type': 'loss', 'content': 0.0009712950559332967, 'timestamp': '2025-10-01 04:23:10.087186', 'step': 4707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 17560600598464}, 'timestamp': '2025-10-01 04:23:10.154094', 'step': 4707, 'epoch': 3} {'type': 'loss', 'content': 0.00412520207464695, 'timestamp': '2025-10-01 04:23:10.196344', 'step': 4708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:10.253300', 'step': 4708, 'epoch': 3} {'type': 'loss', 'content': 0.000453525863122195, 'timestamp': '2025-10-01 04:23:10.263516', 'step': 4709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:10.316525', 'step': 4709, 'epoch': 3} {'type': 'loss', 'content': 0.0008704562787897885, 'timestamp': '2025-10-01 04:23:10.330589', 'step': 4710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:10.362807', 'step': 4710, 'epoch': 3} {'type': 'loss', 'content': 0.002035569166764617, 'timestamp': '2025-10-01 04:23:10.371056', 'step': 4711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:10.408903', 'step': 4711, 'epoch': 3} {'type': 'loss', 'content': 0.0026916805654764175, 'timestamp': '2025-10-01 04:23:10.440704', 'step': 4712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:10.483036', 'step': 4712, 'epoch': 3} {'type': 'loss', 'content': 0.0009390160557813942, 'timestamp': '2025-10-01 04:23:10.488997', 'step': 4713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:10.520684', 'step': 4713, 'epoch': 3} {'type': 'loss', 'content': 0.003035308327525854, 'timestamp': '2025-10-01 04:23:10.528613', 'step': 4714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:10.565609', 'step': 4714, 'epoch': 3} {'type': 'loss', 'content': 0.0017804540693759918, 'timestamp': '2025-10-01 04:23:10.576461', 'step': 4715, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:23:13.123953', 'step': 4715, 'epoch': 3} {'type': 'pplx', 'content': 6.032945119280696, 'timestamp': '2025-10-01 04:23:13.131561', 'step': 4715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:13.175716', 'step': 4715, 'epoch': 3} {'type': 'loss', 'content': 0.013262028805911541, 'timestamp': '2025-10-01 04:23:13.208816', 'step': 4716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:13.245687', 'step': 4716, 'epoch': 3} {'type': 'loss', 'content': 0.00030436465749517083, 'timestamp': '2025-10-01 04:23:13.254576', 'step': 4717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:13.291516', 'step': 4717, 'epoch': 3} {'type': 'loss', 'content': 0.0009589921100996435, 'timestamp': '2025-10-01 04:23:13.304262', 'step': 4718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:13.346798', 'step': 4718, 'epoch': 3} {'type': 'loss', 'content': 0.005784152075648308, 'timestamp': '2025-10-01 04:23:13.354789', 'step': 4719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:13.441971', 'step': 4719, 'epoch': 3} {'type': 'loss', 'content': 0.0033182636834681034, 'timestamp': '2025-10-01 04:23:13.473638', 'step': 4720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:23:13.541220', 'step': 4720, 'epoch': 3} {'type': 'loss', 'content': 0.00412555830553174, 'timestamp': '2025-10-01 04:23:13.547027', 'step': 4721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:13.620336', 'step': 4721, 'epoch': 3} {'type': 'loss', 'content': 0.0010444001527503133, 'timestamp': '2025-10-01 04:23:13.628405', 'step': 4722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:13.699003', 'step': 4722, 'epoch': 3} {'type': 'loss', 'content': 0.004676435142755508, 'timestamp': '2025-10-01 04:23:13.712592', 'step': 4723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:13.776977', 'step': 4723, 'epoch': 3} {'type': 'loss', 'content': 0.00283204042352736, 'timestamp': '2025-10-01 04:23:13.806288', 'step': 4724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:13.875485', 'step': 4724, 'epoch': 3} {'type': 'loss', 'content': 0.0032740302849560976, 'timestamp': '2025-10-01 04:23:13.883827', 'step': 4725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:13.935476', 'step': 4725, 'epoch': 3} {'type': 'loss', 'content': 0.0030358589719980955, 'timestamp': '2025-10-01 04:23:13.943774', 'step': 4726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:13.984974', 'step': 4726, 'epoch': 3} {'type': 'loss', 'content': 0.0046774460934102535, 'timestamp': '2025-10-01 04:23:13.997550', 'step': 4727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:14.062792', 'step': 4727, 'epoch': 3} {'type': 'loss', 'content': 0.0103486068546772, 'timestamp': '2025-10-01 04:23:14.097305', 'step': 4728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:14.139665', 'step': 4728, 'epoch': 3} {'type': 'loss', 'content': 0.0009858858538791537, 'timestamp': '2025-10-01 04:23:14.145443', 'step': 4729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:14.194272', 'step': 4729, 'epoch': 3} {'type': 'loss', 'content': 0.012892410159111023, 'timestamp': '2025-10-01 04:23:14.202561', 'step': 4730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:14.244169', 'step': 4730, 'epoch': 3} {'type': 'loss', 'content': 0.0010288037592545152, 'timestamp': '2025-10-01 04:23:14.252535', 'step': 4731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:14.301803', 'step': 4731, 'epoch': 3} {'type': 'loss', 'content': 0.0001894195011118427, 'timestamp': '2025-10-01 04:23:14.335420', 'step': 4732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:23:14.384228', 'step': 4732, 'epoch': 3} {'type': 'loss', 'content': 0.0019442768534645438, 'timestamp': '2025-10-01 04:23:14.399549', 'step': 4733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:14.454529', 'step': 4733, 'epoch': 3} {'type': 'loss', 'content': 0.0076557728461921215, 'timestamp': '2025-10-01 04:23:14.468561', 'step': 4734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:14.503681', 'step': 4734, 'epoch': 3} {'type': 'loss', 'content': 0.0023784651421010494, 'timestamp': '2025-10-01 04:23:14.510888', 'step': 4735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:23:14.565861', 'step': 4735, 'epoch': 3} {'type': 'loss', 'content': 0.001712842145934701, 'timestamp': '2025-10-01 04:23:14.603313', 'step': 4736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:14.652699', 'step': 4736, 'epoch': 3} {'type': 'loss', 'content': 0.0033300973009318113, 'timestamp': '2025-10-01 04:23:14.665564', 'step': 4737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:14.716598', 'step': 4737, 'epoch': 3} {'type': 'loss', 'content': 0.0020110979676246643, 'timestamp': '2025-10-01 04:23:14.730592', 'step': 4738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:14.774104', 'step': 4738, 'epoch': 3} {'type': 'loss', 'content': 0.005838059354573488, 'timestamp': '2025-10-01 04:23:14.783111', 'step': 4739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:23:14.835928', 'step': 4739, 'epoch': 3} {'type': 'loss', 'content': 0.0040391855873167515, 'timestamp': '2025-10-01 04:23:14.871051', 'step': 4740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:14.914249', 'step': 4740, 'epoch': 3} {'type': 'loss', 'content': 0.0016707225004211068, 'timestamp': '2025-10-01 04:23:14.919978', 'step': 4741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:14.962565', 'step': 4741, 'epoch': 3} {'type': 'loss', 'content': 0.0005026211147196591, 'timestamp': '2025-10-01 04:23:14.970909', 'step': 4742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:15.015718', 'step': 4742, 'epoch': 3} {'type': 'loss', 'content': 0.0033131318632513285, 'timestamp': '2025-10-01 04:23:15.026497', 'step': 4743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:15.072319', 'step': 4743, 'epoch': 3} {'type': 'loss', 'content': 0.0006107672234065831, 'timestamp': '2025-10-01 04:23:15.105987', 'step': 4744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:23:15.160078', 'step': 4744, 'epoch': 3} {'type': 'loss', 'content': 0.0006733962218277156, 'timestamp': '2025-10-01 04:23:15.176107', 'step': 4745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:15.221472', 'step': 4745, 'epoch': 3} {'type': 'loss', 'content': 0.0052246758714318275, 'timestamp': '2025-10-01 04:23:15.232449', 'step': 4746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:15.272320', 'step': 4746, 'epoch': 3} {'type': 'loss', 'content': 0.003196112113073468, 'timestamp': '2025-10-01 04:23:15.280761', 'step': 4747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:15.332005', 'step': 4747, 'epoch': 3} {'type': 'loss', 'content': 0.00231133378110826, 'timestamp': '2025-10-01 04:23:15.360932', 'step': 4748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:15.414287', 'step': 4748, 'epoch': 3} {'type': 'loss', 'content': 0.004580761305987835, 'timestamp': '2025-10-01 04:23:15.427650', 'step': 4749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:15.474156', 'step': 4749, 'epoch': 3} {'type': 'loss', 'content': 0.0009525356581434608, 'timestamp': '2025-10-01 04:23:15.483577', 'step': 4750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:15.529106', 'step': 4750, 'epoch': 3} {'type': 'loss', 'content': 0.004659296479076147, 'timestamp': '2025-10-01 04:23:15.535634', 'step': 4751, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:15.587542', 'step': 4751, 'epoch': 3} {'type': 'loss', 'content': 0.0035503029357641935, 'timestamp': '2025-10-01 04:23:15.620064', 'step': 4752, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:15.667203', 'step': 4752, 'epoch': 3} {'type': 'loss', 'content': 0.0021698165219277143, 'timestamp': '2025-10-01 04:23:15.673251', 'step': 4753, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:15.721739', 'step': 4753, 'epoch': 3} {'type': 'loss', 'content': 0.0020651742815971375, 'timestamp': '2025-10-01 04:23:15.732584', 'step': 4754, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:15.778195', 'step': 4754, 'epoch': 3} {'type': 'loss', 'content': 0.003300589509308338, 'timestamp': '2025-10-01 04:23:15.783148', 'step': 4755, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:15.840459', 'step': 4755, 'epoch': 3} {'type': 'loss', 'content': 0.0025506296660751104, 'timestamp': '2025-10-01 04:23:15.875013', 'step': 4756, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:15.924155', 'step': 4756, 'epoch': 3} {'type': 'loss', 'content': 0.004953200928866863, 'timestamp': '2025-10-01 04:23:15.932571', 'step': 4757, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:15.981722', 'step': 4757, 'epoch': 3} {'type': 'loss', 'content': 0.0011750432895496488, 'timestamp': '2025-10-01 04:23:15.990007', 'step': 4758, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:16.041680', 'step': 4758, 'epoch': 3} {'type': 'loss', 'content': 0.002955620177090168, 'timestamp': '2025-10-01 04:23:16.046584', 'step': 4759, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:16.090088', 'step': 4759, 'epoch': 3} {'type': 'loss', 'content': 0.0011926627485081553, 'timestamp': '2025-10-01 04:23:16.121999', 'step': 4760, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:16.173536', 'step': 4760, 'epoch': 3} {'type': 'loss', 'content': 0.0041986447758972645, 'timestamp': '2025-10-01 04:23:16.182034', 'step': 4761, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:16.227374', 'step': 4761, 'epoch': 3} {'type': 'loss', 'content': 0.0019082068465650082, 'timestamp': '2025-10-01 04:23:16.234727', 'step': 4762, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:23:16.284963', 'step': 4762, 'epoch': 3} {'type': 'loss', 'content': 0.0005876366631127894, 'timestamp': '2025-10-01 04:23:16.297625', 'step': 4763, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:16.352865', 'step': 4763, 'epoch': 3} {'type': 'loss', 'content': 0.002388122957199812, 'timestamp': '2025-10-01 04:23:16.386577', 'step': 4764, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:23:16.428175', 'step': 4764, 'epoch': 3} {'type': 'loss', 'content': 0.006167857442051172, 'timestamp': '2025-10-01 04:23:16.445560', 'step': 4765, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:16.485677', 'step': 4765, 'epoch': 3} {'type': 'loss', 'content': 0.0010622577974572778, 'timestamp': '2025-10-01 04:23:16.490588', 'step': 4766, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:16.546068', 'step': 4766, 'epoch': 3} {'type': 'loss', 'content': 0.00019288391922600567, 'timestamp': '2025-10-01 04:23:16.564393', 'step': 4767, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:16.610351', 'step': 4767, 'epoch': 3} {'type': 'loss', 'content': 0.0018122660694643855, 'timestamp': '2025-10-01 04:23:16.639499', 'step': 4768, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:16.684529', 'step': 4768, 'epoch': 3} {'type': 'loss', 'content': 0.001247819047421217, 'timestamp': '2025-10-01 04:23:16.690064', 'step': 4769, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:16.745294', 'step': 4769, 'epoch': 3} {'type': 'loss', 'content': 0.003279162338003516, 'timestamp': '2025-10-01 04:23:16.752656', 'step': 4770, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:16.796344', 'step': 4770, 'epoch': 3} {'type': 'loss', 'content': 0.0011290283873677254, 'timestamp': '2025-10-01 04:23:16.810179', 'step': 4771, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:16.854311', 'step': 4771, 'epoch': 3} {'type': 'loss', 'content': 0.0026691488455981016, 'timestamp': '2025-10-01 04:23:16.882624', 'step': 4772, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:16.934278', 'step': 4772, 'epoch': 3} {'type': 'loss', 'content': 0.0014798991614952683, 'timestamp': '2025-10-01 04:23:16.939190', 'step': 4773, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:16.995574', 'step': 4773, 'epoch': 3} {'type': 'loss', 'content': 0.0012831002241000533, 'timestamp': '2025-10-01 04:23:17.007071', 'step': 4774, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:17.059368', 'step': 4774, 'epoch': 3} {'type': 'loss', 'content': 0.001655695610679686, 'timestamp': '2025-10-01 04:23:17.071946', 'step': 4775, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:23:17.123751', 'step': 4775, 'epoch': 3} {'type': 'loss', 'content': 0.0070608872920274734, 'timestamp': '2025-10-01 04:23:17.158861', 'step': 4776, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:17.202911', 'step': 4776, 'epoch': 3} {'type': 'loss', 'content': 0.0008231059764511883, 'timestamp': '2025-10-01 04:23:17.213167', 'step': 4777, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:17.257293', 'step': 4777, 'epoch': 3} {'type': 'loss', 'content': 0.002061838051304221, 'timestamp': '2025-10-01 04:23:17.268920', 'step': 4778, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:17.322841', 'step': 4778, 'epoch': 3} {'type': 'loss', 'content': 0.001343821408227086, 'timestamp': '2025-10-01 04:23:17.333446', 'step': 4779, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:17.382004', 'step': 4779, 'epoch': 3} {'type': 'loss', 'content': 0.000805669988039881, 'timestamp': '2025-10-01 04:23:17.410467', 'step': 4780, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:23:17.456748', 'step': 4780, 'epoch': 3} {'type': 'loss', 'content': 0.02133777365088463, 'timestamp': '2025-10-01 04:23:17.472003', 'step': 4781, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:17.520971', 'step': 4781, 'epoch': 3} {'type': 'loss', 'content': 0.002580334199592471, 'timestamp': '2025-10-01 04:23:17.534976', 'step': 4782, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:17.590920', 'step': 4782, 'epoch': 3} {'type': 'loss', 'content': 0.001459325198084116, 'timestamp': '2025-10-01 04:23:17.603601', 'step': 4783, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:17.654692', 'step': 4783, 'epoch': 3} {'type': 'loss', 'content': 0.0016503583174198866, 'timestamp': '2025-10-01 04:23:17.683911', 'step': 4784, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:17.736471', 'step': 4784, 'epoch': 3} {'type': 'loss', 'content': 0.0023265511263161898, 'timestamp': '2025-10-01 04:23:17.750840', 'step': 4785, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:17.797038', 'step': 4785, 'epoch': 3} {'type': 'loss', 'content': 0.00105811539106071, 'timestamp': '2025-10-01 04:23:17.808047', 'step': 4786, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:23:17.864441', 'step': 4786, 'epoch': 3} {'type': 'loss', 'content': 0.0016094543971121311, 'timestamp': '2025-10-01 04:23:17.878668', 'step': 4787, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:17.927591', 'step': 4787, 'epoch': 3} {'type': 'loss', 'content': 0.0009604722145013511, 'timestamp': '2025-10-01 04:23:17.962409', 'step': 4788, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:23:18.029395', 'step': 4788, 'epoch': 3} {'type': 'loss', 'content': 0.0032057748176157475, 'timestamp': '2025-10-01 04:23:18.046536', 'step': 4789, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:18.101447', 'step': 4789, 'epoch': 3} {'type': 'loss', 'content': 0.0007721529109403491, 'timestamp': '2025-10-01 04:23:18.112176', 'step': 4790, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:18.162148', 'step': 4790, 'epoch': 3} {'type': 'loss', 'content': 0.0005165711045265198, 'timestamp': '2025-10-01 04:23:18.172010', 'step': 4791, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:18.223171', 'step': 4791, 'epoch': 3} {'type': 'loss', 'content': 0.0038516372442245483, 'timestamp': '2025-10-01 04:23:18.252129', 'step': 4792, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:18.300393', 'step': 4792, 'epoch': 3} {'type': 'loss', 'content': 0.004658108577132225, 'timestamp': '2025-10-01 04:23:18.315370', 'step': 4793, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:18.358697', 'step': 4793, 'epoch': 3} {'type': 'loss', 'content': 0.0005281376652419567, 'timestamp': '2025-10-01 04:23:18.367562', 'step': 4794, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:18.406318', 'step': 4794, 'epoch': 3} {'type': 'loss', 'content': 0.0009487469214946032, 'timestamp': '2025-10-01 04:23:18.418829', 'step': 4795, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:18.466153', 'step': 4795, 'epoch': 3} {'type': 'loss', 'content': 0.0010233260691165924, 'timestamp': '2025-10-01 04:23:18.497114', 'step': 4796, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:18.550511', 'step': 4796, 'epoch': 3} {'type': 'loss', 'content': 0.003567192004993558, 'timestamp': '2025-10-01 04:23:18.556034', 'step': 4797, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:18.606288', 'step': 4797, 'epoch': 3} {'type': 'loss', 'content': 0.0013490754645317793, 'timestamp': '2025-10-01 04:23:18.619048', 'step': 4798, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:18.663872', 'step': 4798, 'epoch': 3} {'type': 'loss', 'content': 0.0024822638370096684, 'timestamp': '2025-10-01 04:23:18.674602', 'step': 4799, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:18.722589', 'step': 4799, 'epoch': 3} {'type': 'loss', 'content': 0.0007072230218909681, 'timestamp': '2025-10-01 04:23:18.752905', 'step': 4800, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:18.801887', 'step': 4800, 'epoch': 3} {'type': 'loss', 'content': 0.0018865527817979455, 'timestamp': '2025-10-01 04:23:18.815290', 'step': 4801, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:18.867168', 'step': 4801, 'epoch': 3} {'type': 'loss', 'content': 0.0013568942667916417, 'timestamp': '2025-10-01 04:23:18.878307', 'step': 4802, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:18.930425', 'step': 4802, 'epoch': 3} {'type': 'loss', 'content': 0.0013161146780475974, 'timestamp': '2025-10-01 04:23:18.941129', 'step': 4803, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:18.990215', 'step': 4803, 'epoch': 3} {'type': 'loss', 'content': 0.0008852147147990763, 'timestamp': '2025-10-01 04:23:19.021509', 'step': 4804, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:23:19.076787', 'step': 4804, 'epoch': 3} {'type': 'loss', 'content': 0.002249819226562977, 'timestamp': '2025-10-01 04:23:19.092848', 'step': 4805, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:19.148375', 'step': 4805, 'epoch': 3} {'type': 'loss', 'content': 0.002700617304071784, 'timestamp': '2025-10-01 04:23:19.155759', 'step': 4806, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:19.206356', 'step': 4806, 'epoch': 3} {'type': 'loss', 'content': 0.0003198760678060353, 'timestamp': '2025-10-01 04:23:19.219121', 'step': 4807, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:19.270933', 'step': 4807, 'epoch': 3} {'type': 'loss', 'content': 0.0008747085812501609, 'timestamp': '2025-10-01 04:23:19.302348', 'step': 4808, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:23:19.355936', 'step': 4808, 'epoch': 3} {'type': 'loss', 'content': 0.006467355415225029, 'timestamp': '2025-10-01 04:23:19.369484', 'step': 4809, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:23:19.421897', 'step': 4809, 'epoch': 3} {'type': 'loss', 'content': 0.002014380181208253, 'timestamp': '2025-10-01 04:23:19.436120', 'step': 4810, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:19.484907', 'step': 4810, 'epoch': 3} {'type': 'loss', 'content': 0.0022908993996679783, 'timestamp': '2025-10-01 04:23:19.497446', 'step': 4811, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:19.549345', 'step': 4811, 'epoch': 3} {'type': 'loss', 'content': 0.0002187347854487598, 'timestamp': '2025-10-01 04:23:19.578189', 'step': 4812, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:23:19.636693', 'step': 4812, 'epoch': 3} {'type': 'loss', 'content': 0.004814946558326483, 'timestamp': '2025-10-01 04:23:19.652108', 'step': 4813, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:19.712809', 'step': 4813, 'epoch': 3} {'type': 'loss', 'content': 0.0023718317970633507, 'timestamp': '2025-10-01 04:23:19.725539', 'step': 4814, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:23:19.778353', 'step': 4814, 'epoch': 3} {'type': 'loss', 'content': 0.001414129976183176, 'timestamp': '2025-10-01 04:23:19.794142', 'step': 4815, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:19.835958', 'step': 4815, 'epoch': 3} {'type': 'loss', 'content': 0.0008204428013414145, 'timestamp': '2025-10-01 04:23:19.865074', 'step': 4816, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:19.917798', 'step': 4816, 'epoch': 3} {'type': 'loss', 'content': 0.005787448491901159, 'timestamp': '2025-10-01 04:23:19.931211', 'step': 4817, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:19.981126', 'step': 4817, 'epoch': 3} {'type': 'loss', 'content': 0.0014672187389805913, 'timestamp': '2025-10-01 04:23:19.995121', 'step': 4818, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:20.051236', 'step': 4818, 'epoch': 3} {'type': 'loss', 'content': 0.0012166673550382257, 'timestamp': '2025-10-01 04:23:20.058745', 'step': 4819, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:20.108280', 'step': 4819, 'epoch': 3} {'type': 'loss', 'content': 0.0063219936564564705, 'timestamp': '2025-10-01 04:23:20.142825', 'step': 4820, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:20.184804', 'step': 4820, 'epoch': 3} {'type': 'loss', 'content': 0.0025782533921301365, 'timestamp': '2025-10-01 04:23:20.195610', 'step': 4821, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:20.246393', 'step': 4821, 'epoch': 3} {'type': 'loss', 'content': 0.000760015333071351, 'timestamp': '2025-10-01 04:23:20.258872', 'step': 4822, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:20.308103', 'step': 4822, 'epoch': 3} {'type': 'loss', 'content': 0.006945210043340921, 'timestamp': '2025-10-01 04:23:20.319336', 'step': 4823, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:20.370973', 'step': 4823, 'epoch': 3} {'type': 'loss', 'content': 0.0006799621041864157, 'timestamp': '2025-10-01 04:23:20.399294', 'step': 4824, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:20.451227', 'step': 4824, 'epoch': 3} {'type': 'loss', 'content': 0.0002588914940133691, 'timestamp': '2025-10-01 04:23:20.456219', 'step': 4825, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:20.500252', 'step': 4825, 'epoch': 3} {'type': 'loss', 'content': 0.005682771559804678, 'timestamp': '2025-10-01 04:23:20.512830', 'step': 4826, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:20.567540', 'step': 4826, 'epoch': 3} {'type': 'loss', 'content': 0.002423760714009404, 'timestamp': '2025-10-01 04:23:20.579010', 'step': 4827, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:20.621685', 'step': 4827, 'epoch': 3} {'type': 'loss', 'content': 0.0004428729589562863, 'timestamp': '2025-10-01 04:23:20.654268', 'step': 4828, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:20.707419', 'step': 4828, 'epoch': 3} {'type': 'loss', 'content': 0.00259587075561285, 'timestamp': '2025-10-01 04:23:20.720882', 'step': 4829, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:20.792137', 'step': 4829, 'epoch': 3} {'type': 'loss', 'content': 0.0032190887723118067, 'timestamp': '2025-10-01 04:23:20.800020', 'step': 4830, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:23:23.730089', 'step': 4830, 'epoch': 3} {'type': 'pplx', 'content': 6.0748657198063185, 'timestamp': '2025-10-01 04:23:23.733837', 'step': 4830, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:23.768601', 'step': 4830, 'epoch': 3} {'type': 'loss', 'content': 0.0009038069983944297, 'timestamp': '2025-10-01 04:23:23.779771', 'step': 4831, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:23.818961', 'step': 4831, 'epoch': 3} {'type': 'loss', 'content': 0.000989011488854885, 'timestamp': '2025-10-01 04:23:23.850949', 'step': 4832, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:23.887807', 'step': 4832, 'epoch': 3} {'type': 'loss', 'content': 0.00029252024251036346, 'timestamp': '2025-10-01 04:23:23.893575', 'step': 4833, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:23.937293', 'step': 4833, 'epoch': 3} {'type': 'loss', 'content': 0.0007896265597082675, 'timestamp': '2025-10-01 04:23:23.949829', 'step': 4834, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:23.989718', 'step': 4834, 'epoch': 3} {'type': 'loss', 'content': 0.0003486830973997712, 'timestamp': '2025-10-01 04:23:23.997042', 'step': 4835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:24.037790', 'step': 4835, 'epoch': 3} {'type': 'loss', 'content': 0.0047712805680930614, 'timestamp': '2025-10-01 04:23:24.071255', 'step': 4836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:24.115221', 'step': 4836, 'epoch': 3} {'type': 'loss', 'content': 0.0033169530797749758, 'timestamp': '2025-10-01 04:23:24.124309', 'step': 4837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:24.159223', 'step': 4837, 'epoch': 3} {'type': 'loss', 'content': 0.03875613212585449, 'timestamp': '2025-10-01 04:23:24.167211', 'step': 4838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:24.207799', 'step': 4838, 'epoch': 3} {'type': 'loss', 'content': 0.000937818898819387, 'timestamp': '2025-10-01 04:23:24.226083', 'step': 4839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:24.276802', 'step': 4839, 'epoch': 3} {'type': 'loss', 'content': 0.0001631501509109512, 'timestamp': '2025-10-01 04:23:24.309040', 'step': 4840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:23:24.374197', 'step': 4840, 'epoch': 3} {'type': 'loss', 'content': 0.0017157014226540923, 'timestamp': '2025-10-01 04:23:24.391095', 'step': 4841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:24.437249', 'step': 4841, 'epoch': 3} {'type': 'loss', 'content': 0.0010921858483925462, 'timestamp': '2025-10-01 04:23:24.450844', 'step': 4842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:24.497599', 'step': 4842, 'epoch': 3} {'type': 'loss', 'content': 0.0017649835208430886, 'timestamp': '2025-10-01 04:23:24.511638', 'step': 4843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:24.556155', 'step': 4843, 'epoch': 3} {'type': 'loss', 'content': 0.005355138331651688, 'timestamp': '2025-10-01 04:23:24.590667', 'step': 4844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:24.629243', 'step': 4844, 'epoch': 3} {'type': 'loss', 'content': 0.001025281730107963, 'timestamp': '2025-10-01 04:23:24.642623', 'step': 4845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:24.678832', 'step': 4845, 'epoch': 3} {'type': 'loss', 'content': 0.003115252126008272, 'timestamp': '2025-10-01 04:23:24.691512', 'step': 4846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:24.731974', 'step': 4846, 'epoch': 3} {'type': 'loss', 'content': 0.0005827572313137352, 'timestamp': '2025-10-01 04:23:24.744498', 'step': 4847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:24.788581', 'step': 4847, 'epoch': 3} {'type': 'loss', 'content': 0.001824975828640163, 'timestamp': '2025-10-01 04:23:24.823577', 'step': 4848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:23:24.871501', 'step': 4848, 'epoch': 3} {'type': 'loss', 'content': 0.0019131884910166264, 'timestamp': '2025-10-01 04:23:24.887385', 'step': 4849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:24.935467', 'step': 4849, 'epoch': 3} {'type': 'loss', 'content': 0.0011504155118018389, 'timestamp': '2025-10-01 04:23:24.947036', 'step': 4850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:24.986403', 'step': 4850, 'epoch': 3} {'type': 'loss', 'content': 0.0020960490219295025, 'timestamp': '2025-10-01 04:23:24.999232', 'step': 4851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:25.043982', 'step': 4851, 'epoch': 3} {'type': 'loss', 'content': 0.001548922504298389, 'timestamp': '2025-10-01 04:23:25.078984', 'step': 4852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:25.124114', 'step': 4852, 'epoch': 3} {'type': 'loss', 'content': 0.005359550006687641, 'timestamp': '2025-10-01 04:23:25.135350', 'step': 4853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:25.182197', 'step': 4853, 'epoch': 3} {'type': 'loss', 'content': 0.0011908698361366987, 'timestamp': '2025-10-01 04:23:25.195049', 'step': 4854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:25.242355', 'step': 4854, 'epoch': 3} {'type': 'loss', 'content': 0.005919248331338167, 'timestamp': '2025-10-01 04:23:25.255147', 'step': 4855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:25.299887', 'step': 4855, 'epoch': 3} {'type': 'loss', 'content': 0.004107283893972635, 'timestamp': '2025-10-01 04:23:25.333639', 'step': 4856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:25.386040', 'step': 4856, 'epoch': 3} {'type': 'loss', 'content': 0.0031350036151707172, 'timestamp': '2025-10-01 04:23:25.396906', 'step': 4857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:25.448715', 'step': 4857, 'epoch': 3} {'type': 'loss', 'content': 0.0035035412292927504, 'timestamp': '2025-10-01 04:23:25.462721', 'step': 4858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:23:25.518281', 'step': 4858, 'epoch': 3} {'type': 'loss', 'content': 0.003710674587637186, 'timestamp': '2025-10-01 04:23:25.534063', 'step': 4859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:25.584902', 'step': 4859, 'epoch': 3} {'type': 'loss', 'content': 0.004640538245439529, 'timestamp': '2025-10-01 04:23:25.619914', 'step': 4860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:23:25.678557', 'step': 4860, 'epoch': 3} {'type': 'loss', 'content': 0.00427450193092227, 'timestamp': '2025-10-01 04:23:25.696129', 'step': 4861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:25.743261', 'step': 4861, 'epoch': 3} {'type': 'loss', 'content': 0.008923573419451714, 'timestamp': '2025-10-01 04:23:25.756847', 'step': 4862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:25.797694', 'step': 4862, 'epoch': 3} {'type': 'loss', 'content': 0.0016046829987317324, 'timestamp': '2025-10-01 04:23:25.809066', 'step': 4863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:25.847742', 'step': 4863, 'epoch': 3} {'type': 'loss', 'content': 0.0024157478474080563, 'timestamp': '2025-10-01 04:23:25.881167', 'step': 4864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:25.918094', 'step': 4864, 'epoch': 3} {'type': 'loss', 'content': 0.004199175629764795, 'timestamp': '2025-10-01 04:23:25.930936', 'step': 4865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:25.972044', 'step': 4865, 'epoch': 3} {'type': 'loss', 'content': 0.009707994759082794, 'timestamp': '2025-10-01 04:23:25.986010', 'step': 4866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:26.022841', 'step': 4866, 'epoch': 3} {'type': 'loss', 'content': 0.0037647117860615253, 'timestamp': '2025-10-01 04:23:26.036852', 'step': 4867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:26.077537', 'step': 4867, 'epoch': 3} {'type': 'loss', 'content': 0.0007563737453892827, 'timestamp': '2025-10-01 04:23:26.112133', 'step': 4868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:26.146408', 'step': 4868, 'epoch': 3} {'type': 'loss', 'content': 0.0007786885253153741, 'timestamp': '2025-10-01 04:23:26.156864', 'step': 4869, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:26.198300', 'step': 4869, 'epoch': 3} {'type': 'loss', 'content': 0.0004907820257358253, 'timestamp': '2025-10-01 04:23:26.209767', 'step': 4870, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:26.250579', 'step': 4870, 'epoch': 3} {'type': 'loss', 'content': 0.0031428642105311155, 'timestamp': '2025-10-01 04:23:26.261959', 'step': 4871, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:26.295119', 'step': 4871, 'epoch': 3} {'type': 'loss', 'content': 0.004075809381902218, 'timestamp': '2025-10-01 04:23:26.330425', 'step': 4872, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:26.366982', 'step': 4872, 'epoch': 3} {'type': 'loss', 'content': 0.0026793149299919605, 'timestamp': '2025-10-01 04:23:26.374548', 'step': 4873, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:26.408938', 'step': 4873, 'epoch': 3} {'type': 'loss', 'content': 0.00625904044136405, 'timestamp': '2025-10-01 04:23:26.420449', 'step': 4874, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:23:26.464322', 'step': 4874, 'epoch': 3} {'type': 'loss', 'content': 0.013078570365905762, 'timestamp': '2025-10-01 04:23:26.481596', 'step': 4875, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:26.516696', 'step': 4875, 'epoch': 3} {'type': 'loss', 'content': 0.0017086457228288054, 'timestamp': '2025-10-01 04:23:26.550098', 'step': 4876, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:26.584214', 'step': 4876, 'epoch': 3} {'type': 'loss', 'content': 0.0008184687467291951, 'timestamp': '2025-10-01 04:23:26.597095', 'step': 4877, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:23:26.638152', 'step': 4877, 'epoch': 3} {'type': 'loss', 'content': 0.0032613433431833982, 'timestamp': '2025-10-01 04:23:26.653972', 'step': 4878, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 16136789420416}, 'timestamp': '2025-10-01 04:23:26.704155', 'step': 4878, 'epoch': 3} {'type': 'loss', 'content': 0.005578815005719662, 'timestamp': '2025-10-01 04:23:26.723442', 'step': 4879, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:23:26.762016', 'step': 4879, 'epoch': 3} {'type': 'loss', 'content': 0.0037632223684340715, 'timestamp': '2025-10-01 04:23:26.797190', 'step': 4880, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:26.833012', 'step': 4880, 'epoch': 3} {'type': 'loss', 'content': 0.016661658883094788, 'timestamp': '2025-10-01 04:23:26.843331', 'step': 4881, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:26.876406', 'step': 4881, 'epoch': 3} {'type': 'loss', 'content': 0.003657797584310174, 'timestamp': '2025-10-01 04:23:26.888899', 'step': 4882, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:26.929785', 'step': 4882, 'epoch': 3} {'type': 'loss', 'content': 0.004004451911896467, 'timestamp': '2025-10-01 04:23:26.943861', 'step': 4883, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:26.983683', 'step': 4883, 'epoch': 3} {'type': 'loss', 'content': 0.0031420201994478703, 'timestamp': '2025-10-01 04:23:27.018731', 'step': 4884, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:23:27.057967', 'step': 4884, 'epoch': 3} {'type': 'loss', 'content': 0.0015690167201682925, 'timestamp': '2025-10-01 04:23:27.073616', 'step': 4885, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:27.109798', 'step': 4885, 'epoch': 3} {'type': 'loss', 'content': 0.000928352412302047, 'timestamp': '2025-10-01 04:23:27.123775', 'step': 4886, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:27.158443', 'step': 4886, 'epoch': 3} {'type': 'loss', 'content': 0.002138555282726884, 'timestamp': '2025-10-01 04:23:27.170042', 'step': 4887, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:27.207192', 'step': 4887, 'epoch': 3} {'type': 'loss', 'content': 0.012630687095224857, 'timestamp': '2025-10-01 04:23:27.240915', 'step': 4888, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:23:27.282858', 'step': 4888, 'epoch': 3} {'type': 'loss', 'content': 0.006005304399877787, 'timestamp': '2025-10-01 04:23:27.298803', 'step': 4889, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:27.329918', 'step': 4889, 'epoch': 3} {'type': 'loss', 'content': 0.0074011944234371185, 'timestamp': '2025-10-01 04:23:27.341592', 'step': 4890, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:27.379086', 'step': 4890, 'epoch': 3} {'type': 'loss', 'content': 0.00637515215203166, 'timestamp': '2025-10-01 04:23:27.392673', 'step': 4891, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:23:27.438690', 'step': 4891, 'epoch': 3} {'type': 'loss', 'content': 0.0009628082043491304, 'timestamp': '2025-10-01 04:23:27.476057', 'step': 4892, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:27.512677', 'step': 4892, 'epoch': 3} {'type': 'loss', 'content': 0.004707301501184702, 'timestamp': '2025-10-01 04:23:27.526009', 'step': 4893, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:27.560342', 'step': 4893, 'epoch': 3} {'type': 'loss', 'content': 0.0012113797711208463, 'timestamp': '2025-10-01 04:23:27.573125', 'step': 4894, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:23:27.618495', 'step': 4894, 'epoch': 3} {'type': 'loss', 'content': 0.004888806026428938, 'timestamp': '2025-10-01 04:23:27.635947', 'step': 4895, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:23:27.673370', 'step': 4895, 'epoch': 3} {'type': 'loss', 'content': 0.004710851702839136, 'timestamp': '2025-10-01 04:23:27.708557', 'step': 4896, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:27.740967', 'step': 4896, 'epoch': 3} {'type': 'loss', 'content': 0.0024943335447460413, 'timestamp': '2025-10-01 04:23:27.746917', 'step': 4897, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:27.778670', 'step': 4897, 'epoch': 3} {'type': 'loss', 'content': 0.0009260174119845033, 'timestamp': '2025-10-01 04:23:27.785992', 'step': 4898, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:27.819236', 'step': 4898, 'epoch': 3} {'type': 'loss', 'content': 0.004244108218699694, 'timestamp': '2025-10-01 04:23:27.827519', 'step': 4899, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:27.859793', 'step': 4899, 'epoch': 3} {'type': 'loss', 'content': 0.0023835201282054186, 'timestamp': '2025-10-01 04:23:27.888901', 'step': 4900, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:27.920686', 'step': 4900, 'epoch': 3} {'type': 'loss', 'content': 0.0007834671414457262, 'timestamp': '2025-10-01 04:23:27.926618', 'step': 4901, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:27.959677', 'step': 4901, 'epoch': 3} {'type': 'loss', 'content': 0.00036225776420906186, 'timestamp': '2025-10-01 04:23:27.967848', 'step': 4902, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:28.001278', 'step': 4902, 'epoch': 3} {'type': 'loss', 'content': 0.004591373726725578, 'timestamp': '2025-10-01 04:23:28.009321', 'step': 4903, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:28.042549', 'step': 4903, 'epoch': 3} {'type': 'loss', 'content': 0.0012170650297775865, 'timestamp': '2025-10-01 04:23:28.074263', 'step': 4904, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:28.108584', 'step': 4904, 'epoch': 3} {'type': 'loss', 'content': 0.00207507680170238, 'timestamp': '2025-10-01 04:23:28.113686', 'step': 4905, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:28.147883', 'step': 4905, 'epoch': 3} {'type': 'loss', 'content': 0.001138175604864955, 'timestamp': '2025-10-01 04:23:28.158644', 'step': 4906, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:28.191388', 'step': 4906, 'epoch': 3} {'type': 'loss', 'content': 0.0019401168683543801, 'timestamp': '2025-10-01 04:23:28.204129', 'step': 4907, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:28.238889', 'step': 4907, 'epoch': 3} {'type': 'loss', 'content': 0.002929888665676117, 'timestamp': '2025-10-01 04:23:28.267992', 'step': 4908, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:28.300564', 'step': 4908, 'epoch': 3} {'type': 'loss', 'content': 0.001301384181715548, 'timestamp': '2025-10-01 04:23:28.306011', 'step': 4909, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:28.337821', 'step': 4909, 'epoch': 3} {'type': 'loss', 'content': 0.0006822739378549159, 'timestamp': '2025-10-01 04:23:28.348491', 'step': 4910, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:28.380795', 'step': 4910, 'epoch': 3} {'type': 'loss', 'content': 0.0001643660361878574, 'timestamp': '2025-10-01 04:23:28.388359', 'step': 4911, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:28.424053', 'step': 4911, 'epoch': 3} {'type': 'loss', 'content': 0.01018628478050232, 'timestamp': '2025-10-01 04:23:28.452573', 'step': 4912, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:28.485342', 'step': 4912, 'epoch': 3} {'type': 'loss', 'content': 0.0004940920625813305, 'timestamp': '2025-10-01 04:23:28.490763', 'step': 4913, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:28.528676', 'step': 4913, 'epoch': 3} {'type': 'loss', 'content': 0.005153608042746782, 'timestamp': '2025-10-01 04:23:28.536772', 'step': 4914, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:28.570840', 'step': 4914, 'epoch': 3} {'type': 'loss', 'content': 0.001834893599152565, 'timestamp': '2025-10-01 04:23:28.583307', 'step': 4915, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:28.616558', 'step': 4915, 'epoch': 3} {'type': 'loss', 'content': 0.002472679130733013, 'timestamp': '2025-10-01 04:23:28.645745', 'step': 4916, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:28.678251', 'step': 4916, 'epoch': 3} {'type': 'loss', 'content': 0.0014688960509374738, 'timestamp': '2025-10-01 04:23:28.683854', 'step': 4917, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:28.720668', 'step': 4917, 'epoch': 3} {'type': 'loss', 'content': 0.0024924431927502155, 'timestamp': '2025-10-01 04:23:28.733163', 'step': 4918, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:28.772509', 'step': 4918, 'epoch': 3} {'type': 'loss', 'content': 0.0011573422234505415, 'timestamp': '2025-10-01 04:23:28.786055', 'step': 4919, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:28.819556', 'step': 4919, 'epoch': 3} {'type': 'loss', 'content': 0.004286591894924641, 'timestamp': '2025-10-01 04:23:28.847969', 'step': 4920, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:28.881020', 'step': 4920, 'epoch': 3} {'type': 'loss', 'content': 0.0011885614367201924, 'timestamp': '2025-10-01 04:23:28.886655', 'step': 4921, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:28.920474', 'step': 4921, 'epoch': 3} {'type': 'loss', 'content': 0.0015370044857263565, 'timestamp': '2025-10-01 04:23:28.928659', 'step': 4922, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:28.962558', 'step': 4922, 'epoch': 3} {'type': 'loss', 'content': 0.0023568393662571907, 'timestamp': '2025-10-01 04:23:28.974229', 'step': 4923, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:29.010428', 'step': 4923, 'epoch': 3} {'type': 'loss', 'content': 0.002636388409882784, 'timestamp': '2025-10-01 04:23:29.042236', 'step': 4924, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:29.077822', 'step': 4924, 'epoch': 3} {'type': 'loss', 'content': 0.012414614669978619, 'timestamp': '2025-10-01 04:23:29.083012', 'step': 4925, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:29.117989', 'step': 4925, 'epoch': 3} {'type': 'loss', 'content': 0.0007292100926861167, 'timestamp': '2025-10-01 04:23:29.125512', 'step': 4926, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:29.162071', 'step': 4926, 'epoch': 3} {'type': 'loss', 'content': 0.005325693171471357, 'timestamp': '2025-10-01 04:23:29.169693', 'step': 4927, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:29.203844', 'step': 4927, 'epoch': 3} {'type': 'loss', 'content': 0.0014804060338065028, 'timestamp': '2025-10-01 04:23:29.237574', 'step': 4928, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:29.269711', 'step': 4928, 'epoch': 3} {'type': 'loss', 'content': 0.00042974951793439686, 'timestamp': '2025-10-01 04:23:29.278126', 'step': 4929, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:29.313257', 'step': 4929, 'epoch': 3} {'type': 'loss', 'content': 0.002204685704782605, 'timestamp': '2025-10-01 04:23:29.321195', 'step': 4930, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:29.357309', 'step': 4930, 'epoch': 3} {'type': 'loss', 'content': 0.00041305404738523066, 'timestamp': '2025-10-01 04:23:29.365206', 'step': 4931, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:29.397199', 'step': 4931, 'epoch': 3} {'type': 'loss', 'content': 0.00034294428769499063, 'timestamp': '2025-10-01 04:23:29.426354', 'step': 4932, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:29.461706', 'step': 4932, 'epoch': 3} {'type': 'loss', 'content': 0.0005460705142468214, 'timestamp': '2025-10-01 04:23:29.467005', 'step': 4933, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:29.499252', 'step': 4933, 'epoch': 3} {'type': 'loss', 'content': 0.0029431069269776344, 'timestamp': '2025-10-01 04:23:29.507628', 'step': 4934, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:29.543088', 'step': 4934, 'epoch': 3} {'type': 'loss', 'content': 0.003676038235425949, 'timestamp': '2025-10-01 04:23:29.554806', 'step': 4935, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:29.589627', 'step': 4935, 'epoch': 3} {'type': 'loss', 'content': 0.002750186249613762, 'timestamp': '2025-10-01 04:23:29.621486', 'step': 4936, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:29.656659', 'step': 4936, 'epoch': 3} {'type': 'loss', 'content': 0.0010609711753204465, 'timestamp': '2025-10-01 04:23:29.665870', 'step': 4937, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:29.702021', 'step': 4937, 'epoch': 3} {'type': 'loss', 'content': 0.0006708634900860488, 'timestamp': '2025-10-01 04:23:29.710012', 'step': 4938, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:29.755180', 'step': 4938, 'epoch': 3} {'type': 'loss', 'content': 0.015629233792424202, 'timestamp': '2025-10-01 04:23:29.763231', 'step': 4939, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:29.797417', 'step': 4939, 'epoch': 3} {'type': 'loss', 'content': 0.0008614884573034942, 'timestamp': '2025-10-01 04:23:29.826602', 'step': 4940, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:29.864258', 'step': 4940, 'epoch': 3} {'type': 'loss', 'content': 0.0035869821440428495, 'timestamp': '2025-10-01 04:23:29.873344', 'step': 4941, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:29.907418', 'step': 4941, 'epoch': 3} {'type': 'loss', 'content': 0.001186956069432199, 'timestamp': '2025-10-01 04:23:29.918098', 'step': 4942, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:29.953032', 'step': 4942, 'epoch': 3} {'type': 'loss', 'content': 0.0005830395966768265, 'timestamp': '2025-10-01 04:23:29.963893', 'step': 4943, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:29.998621', 'step': 4943, 'epoch': 3} {'type': 'loss', 'content': 0.003130279714241624, 'timestamp': '2025-10-01 04:23:30.031042', 'step': 4944, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:30.072770', 'step': 4944, 'epoch': 3} {'type': 'loss', 'content': 0.001972094876691699, 'timestamp': '2025-10-01 04:23:30.084745', 'step': 4945, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:23:32.476583', 'step': 4945, 'epoch': 3} {'type': 'pplx', 'content': 5.914002681294019, 'timestamp': '2025-10-01 04:23:32.483792', 'step': 4945, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:32.517960', 'step': 4945, 'epoch': 3} {'type': 'loss', 'content': 0.0005158825661055744, 'timestamp': '2025-10-01 04:23:32.528247', 'step': 4946, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:32.563212', 'step': 4946, 'epoch': 3} {'type': 'loss', 'content': 0.0012885574251413345, 'timestamp': '2025-10-01 04:23:32.572515', 'step': 4947, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:32.607395', 'step': 4947, 'epoch': 3} {'type': 'loss', 'content': 0.0005017142393626273, 'timestamp': '2025-10-01 04:23:32.639680', 'step': 4948, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:32.674061', 'step': 4948, 'epoch': 3} {'type': 'loss', 'content': 0.0015701594529673457, 'timestamp': '2025-10-01 04:23:32.683275', 'step': 4949, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:32.731411', 'step': 4949, 'epoch': 3} {'type': 'loss', 'content': 0.006334614474326372, 'timestamp': '2025-10-01 04:23:32.744092', 'step': 4950, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:32.778162', 'step': 4950, 'epoch': 3} {'type': 'loss', 'content': 0.0008757282048463821, 'timestamp': '2025-10-01 04:23:32.790893', 'step': 4951, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:32.823338', 'step': 4951, 'epoch': 3} {'type': 'loss', 'content': 0.0005912580527365208, 'timestamp': '2025-10-01 04:23:32.855877', 'step': 4952, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:32.903351', 'step': 4952, 'epoch': 3} {'type': 'loss', 'content': 0.001383747672662139, 'timestamp': '2025-10-01 04:23:32.912015', 'step': 4953, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:32.946682', 'step': 4953, 'epoch': 3} {'type': 'loss', 'content': 0.0015887700719758868, 'timestamp': '2025-10-01 04:23:32.958225', 'step': 4954, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:32.992577', 'step': 4954, 'epoch': 3} {'type': 'loss', 'content': 0.0021559959277510643, 'timestamp': '2025-10-01 04:23:33.005143', 'step': 4955, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:33.036978', 'step': 4955, 'epoch': 3} {'type': 'loss', 'content': 0.0022559985518455505, 'timestamp': '2025-10-01 04:23:33.066057', 'step': 4956, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:33.098730', 'step': 4956, 'epoch': 3} {'type': 'loss', 'content': 0.0015862067230045795, 'timestamp': '2025-10-01 04:23:33.111556', 'step': 4957, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:33.144032', 'step': 4957, 'epoch': 3} {'type': 'loss', 'content': 0.0008800755022093654, 'timestamp': '2025-10-01 04:23:33.155331', 'step': 4958, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:33.194928', 'step': 4958, 'epoch': 3} {'type': 'loss', 'content': 0.0027902114670723677, 'timestamp': '2025-10-01 04:23:33.207531', 'step': 4959, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:33.246920', 'step': 4959, 'epoch': 3} {'type': 'loss', 'content': 0.00419683987274766, 'timestamp': '2025-10-01 04:23:33.281517', 'step': 4960, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:33.323888', 'step': 4960, 'epoch': 3} {'type': 'loss', 'content': 0.0004148964071646333, 'timestamp': '2025-10-01 04:23:33.332333', 'step': 4961, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:33.368316', 'step': 4961, 'epoch': 3} {'type': 'loss', 'content': 0.0018366541480645537, 'timestamp': '2025-10-01 04:23:33.381042', 'step': 4962, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:33.415509', 'step': 4962, 'epoch': 3} {'type': 'loss', 'content': 0.0011206620838493109, 'timestamp': '2025-10-01 04:23:33.427966', 'step': 4963, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:33.462516', 'step': 4963, 'epoch': 3} {'type': 'loss', 'content': 0.005052568390965462, 'timestamp': '2025-10-01 04:23:33.496257', 'step': 4964, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:33.532685', 'step': 4964, 'epoch': 3} {'type': 'loss', 'content': 0.001208461239002645, 'timestamp': '2025-10-01 04:23:33.549173', 'step': 4965, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:33.587680', 'step': 4965, 'epoch': 3} {'type': 'loss', 'content': 0.0010471034329384565, 'timestamp': '2025-10-01 04:23:33.599317', 'step': 4966, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:33.631093', 'step': 4966, 'epoch': 3} {'type': 'loss', 'content': 0.00398636469617486, 'timestamp': '2025-10-01 04:23:33.642610', 'step': 4967, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:33.676738', 'step': 4967, 'epoch': 3} {'type': 'loss', 'content': 0.0020904908888041973, 'timestamp': '2025-10-01 04:23:33.710303', 'step': 4968, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:33.742812', 'step': 4968, 'epoch': 3} {'type': 'loss', 'content': 0.0001748788490658626, 'timestamp': '2025-10-01 04:23:33.751888', 'step': 4969, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:33.786510', 'step': 4969, 'epoch': 3} {'type': 'loss', 'content': 0.0011954822111874819, 'timestamp': '2025-10-01 04:23:33.798066', 'step': 4970, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:33.832026', 'step': 4970, 'epoch': 3} {'type': 'loss', 'content': 0.00106249435339123, 'timestamp': '2025-10-01 04:23:33.844765', 'step': 4971, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:33.878675', 'step': 4971, 'epoch': 3} {'type': 'loss', 'content': 0.0016057714819908142, 'timestamp': '2025-10-01 04:23:33.912654', 'step': 4972, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:33.945086', 'step': 4972, 'epoch': 3} {'type': 'loss', 'content': 0.0008940848056226969, 'timestamp': '2025-10-01 04:23:33.956138', 'step': 4973, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:33.991704', 'step': 4973, 'epoch': 3} {'type': 'loss', 'content': 0.0023749079555273056, 'timestamp': '2025-10-01 04:23:34.002483', 'step': 4974, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:34.033931', 'step': 4974, 'epoch': 3} {'type': 'loss', 'content': 0.012745884247124195, 'timestamp': '2025-10-01 04:23:34.042022', 'step': 4975, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:34.076106', 'step': 4975, 'epoch': 3} {'type': 'loss', 'content': 0.001509687746874988, 'timestamp': '2025-10-01 04:23:34.109598', 'step': 4976, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:34.144289', 'step': 4976, 'epoch': 3} {'type': 'loss', 'content': 0.0020068150479346514, 'timestamp': '2025-10-01 04:23:34.153496', 'step': 4977, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:34.188275', 'step': 4977, 'epoch': 3} {'type': 'loss', 'content': 0.0009552549454383552, 'timestamp': '2025-10-01 04:23:34.196551', 'step': 4978, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:34.228403', 'step': 4978, 'epoch': 3} {'type': 'loss', 'content': 0.0008998421835713089, 'timestamp': '2025-10-01 04:23:34.240965', 'step': 4979, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:34.277121', 'step': 4979, 'epoch': 3} {'type': 'loss', 'content': 0.00040062525658868253, 'timestamp': '2025-10-01 04:23:34.306188', 'step': 4980, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:34.338341', 'step': 4980, 'epoch': 3} {'type': 'loss', 'content': 0.0014152370858937502, 'timestamp': '2025-10-01 04:23:34.349495', 'step': 4981, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:34.383281', 'step': 4981, 'epoch': 3} {'type': 'loss', 'content': 0.001374799874611199, 'timestamp': '2025-10-01 04:23:34.395827', 'step': 4982, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:34.431354', 'step': 4982, 'epoch': 3} {'type': 'loss', 'content': 0.0008091229829005897, 'timestamp': '2025-10-01 04:23:34.443890', 'step': 4983, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:34.479292', 'step': 4983, 'epoch': 3} {'type': 'loss', 'content': 0.0006871750229038298, 'timestamp': '2025-10-01 04:23:34.513036', 'step': 4984, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:34.549738', 'step': 4984, 'epoch': 3} {'type': 'loss', 'content': 0.0020768067333847284, 'timestamp': '2025-10-01 04:23:34.562652', 'step': 4985, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:34.598596', 'step': 4985, 'epoch': 3} {'type': 'loss', 'content': 0.0005894150235690176, 'timestamp': '2025-10-01 04:23:34.611336', 'step': 4986, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:34.648797', 'step': 4986, 'epoch': 3} {'type': 'loss', 'content': 0.0008976446697488427, 'timestamp': '2025-10-01 04:23:34.659818', 'step': 4987, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:34.694910', 'step': 4987, 'epoch': 3} {'type': 'loss', 'content': 0.007799651939421892, 'timestamp': '2025-10-01 04:23:34.728498', 'step': 4988, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:34.767054', 'step': 4988, 'epoch': 3} {'type': 'loss', 'content': 0.000879481085576117, 'timestamp': '2025-10-01 04:23:34.779975', 'step': 4989, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:34.816304', 'step': 4989, 'epoch': 3} {'type': 'loss', 'content': 0.002684621373191476, 'timestamp': '2025-10-01 04:23:34.830299', 'step': 4990, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:34.867356', 'step': 4990, 'epoch': 3} {'type': 'loss', 'content': 0.0009340860997326672, 'timestamp': '2025-10-01 04:23:34.880151', 'step': 4991, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:34.920898', 'step': 4991, 'epoch': 3} {'type': 'loss', 'content': 0.0005005821585655212, 'timestamp': '2025-10-01 04:23:34.954622', 'step': 4992, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:34.990328', 'step': 4992, 'epoch': 3} {'type': 'loss', 'content': 0.004023905843496323, 'timestamp': '2025-10-01 04:23:34.999505', 'step': 4993, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:35.034366', 'step': 4993, 'epoch': 3} {'type': 'loss', 'content': 0.00043332501081749797, 'timestamp': '2025-10-01 04:23:35.047169', 'step': 4994, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:35.080360', 'step': 4994, 'epoch': 3} {'type': 'loss', 'content': 0.0006133014103397727, 'timestamp': '2025-10-01 04:23:35.093150', 'step': 4995, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:35.130289', 'step': 4995, 'epoch': 3} {'type': 'loss', 'content': 0.0006827837787568569, 'timestamp': '2025-10-01 04:23:35.163779', 'step': 4996, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:35.197293', 'step': 4996, 'epoch': 3} {'type': 'loss', 'content': 0.0013464888324961066, 'timestamp': '2025-10-01 04:23:35.206331', 'step': 4997, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:35.240686', 'step': 4997, 'epoch': 3} {'type': 'loss', 'content': 0.0012291495222598314, 'timestamp': '2025-10-01 04:23:35.253196', 'step': 4998, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:35.293776', 'step': 4998, 'epoch': 3} {'type': 'loss', 'content': 0.0006819346453994513, 'timestamp': '2025-10-01 04:23:35.306508', 'step': 4999, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:35.344223', 'step': 4999, 'epoch': 3} {'type': 'loss', 'content': 0.0010100057115778327, 'timestamp': '2025-10-01 04:23:35.376125', 'step': 5000, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 5000', 'timestamp': '2025-10-01 04:23:40.857583', 'step': 5000, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:40.908518', 'step': 5000, 'epoch': 3} {'type': 'loss', 'content': 0.0002641484315972775, 'timestamp': '2025-10-01 04:23:40.915071', 'step': 5001, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:40.960175', 'step': 5001, 'epoch': 3} {'type': 'loss', 'content': 0.0012800403637811542, 'timestamp': '2025-10-01 04:23:40.970622', 'step': 5002, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:41.025233', 'step': 5002, 'epoch': 3} {'type': 'loss', 'content': 0.0025587426498532295, 'timestamp': '2025-10-01 04:23:41.034255', 'step': 5003, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:41.075303', 'step': 5003, 'epoch': 3} {'type': 'loss', 'content': 0.0013088446576148272, 'timestamp': '2025-10-01 04:23:41.104018', 'step': 5004, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:41.154689', 'step': 5004, 'epoch': 3} {'type': 'loss', 'content': 0.015633055940270424, 'timestamp': '2025-10-01 04:23:41.161011', 'step': 5005, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:41.199740', 'step': 5005, 'epoch': 3} {'type': 'loss', 'content': 0.0011835114564746618, 'timestamp': '2025-10-01 04:23:41.207657', 'step': 5006, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:41.255518', 'step': 5006, 'epoch': 3} {'type': 'loss', 'content': 0.00033270727726630867, 'timestamp': '2025-10-01 04:23:41.263557', 'step': 5007, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:41.311091', 'step': 5007, 'epoch': 3} {'type': 'loss', 'content': 0.0008642624597996473, 'timestamp': '2025-10-01 04:23:41.341035', 'step': 5008, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:41.384845', 'step': 5008, 'epoch': 3} {'type': 'loss', 'content': 0.001988004893064499, 'timestamp': '2025-10-01 04:23:41.390814', 'step': 5009, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:41.444493', 'step': 5009, 'epoch': 3} {'type': 'loss', 'content': 0.005797721911221743, 'timestamp': '2025-10-01 04:23:41.455409', 'step': 5010, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:41.503630', 'step': 5010, 'epoch': 3} {'type': 'loss', 'content': 0.005160896107554436, 'timestamp': '2025-10-01 04:23:41.510937', 'step': 5011, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:41.558246', 'step': 5011, 'epoch': 3} {'type': 'loss', 'content': 0.0055002328008413315, 'timestamp': '2025-10-01 04:23:41.587337', 'step': 5012, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:41.635406', 'step': 5012, 'epoch': 3} {'type': 'loss', 'content': 0.0023785505909472704, 'timestamp': '2025-10-01 04:23:41.640954', 'step': 5013, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:23:41.694784', 'step': 5013, 'epoch': 3} {'type': 'loss', 'content': 0.0016664154827594757, 'timestamp': '2025-10-01 04:23:41.700214', 'step': 5014, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:41.744288', 'step': 5014, 'epoch': 3} {'type': 'loss', 'content': 0.00023772873100824654, 'timestamp': '2025-10-01 04:23:41.755242', 'step': 5015, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:41.801976', 'step': 5015, 'epoch': 3} {'type': 'loss', 'content': 0.005730912089347839, 'timestamp': '2025-10-01 04:23:41.835480', 'step': 5016, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:41.886121', 'step': 5016, 'epoch': 3} {'type': 'loss', 'content': 0.0010704733431339264, 'timestamp': '2025-10-01 04:23:41.899599', 'step': 5017, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:41.945724', 'step': 5017, 'epoch': 3} {'type': 'loss', 'content': 0.00033869390608742833, 'timestamp': '2025-10-01 04:23:41.953243', 'step': 5018, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:41.995969', 'step': 5018, 'epoch': 3} {'type': 'loss', 'content': 0.011293898336589336, 'timestamp': '2025-10-01 04:23:42.008467', 'step': 5019, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:42.049564', 'step': 5019, 'epoch': 3} {'type': 'loss', 'content': 0.0016091278521344066, 'timestamp': '2025-10-01 04:23:42.084058', 'step': 5020, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:42.132157', 'step': 5020, 'epoch': 3} {'type': 'loss', 'content': 0.001859484356828034, 'timestamp': '2025-10-01 04:23:42.142233', 'step': 5021, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:42.185001', 'step': 5021, 'epoch': 3} {'type': 'loss', 'content': 0.0004010664706584066, 'timestamp': '2025-10-01 04:23:42.198580', 'step': 5022, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:42.233238', 'step': 5022, 'epoch': 3} {'type': 'loss', 'content': 0.011298754252493382, 'timestamp': '2025-10-01 04:23:42.245838', 'step': 5023, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:42.281169', 'step': 5023, 'epoch': 3} {'type': 'loss', 'content': 0.0009659253410063684, 'timestamp': '2025-10-01 04:23:42.315652', 'step': 5024, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:42.365326', 'step': 5024, 'epoch': 3} {'type': 'loss', 'content': 0.00029482177342288196, 'timestamp': '2025-10-01 04:23:42.376375', 'step': 5025, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:42.426317', 'step': 5025, 'epoch': 3} {'type': 'loss', 'content': 0.0007340683368965983, 'timestamp': '2025-10-01 04:23:42.439020', 'step': 5026, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:42.492842', 'step': 5026, 'epoch': 3} {'type': 'loss', 'content': 0.0002357888442929834, 'timestamp': '2025-10-01 04:23:42.504557', 'step': 5027, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:42.553425', 'step': 5027, 'epoch': 3} {'type': 'loss', 'content': 0.004564209375530481, 'timestamp': '2025-10-01 04:23:42.586241', 'step': 5028, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:42.635398', 'step': 5028, 'epoch': 3} {'type': 'loss', 'content': 0.0035511315800249577, 'timestamp': '2025-10-01 04:23:42.642492', 'step': 5029, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:42.693921', 'step': 5029, 'epoch': 3} {'type': 'loss', 'content': 0.001182259526103735, 'timestamp': '2025-10-01 04:23:42.703938', 'step': 5030, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:42.758105', 'step': 5030, 'epoch': 3} {'type': 'loss', 'content': 0.0006530649843625724, 'timestamp': '2025-10-01 04:23:42.767533', 'step': 5031, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:42.817872', 'step': 5031, 'epoch': 3} {'type': 'loss', 'content': 0.0003737068618647754, 'timestamp': '2025-10-01 04:23:42.848149', 'step': 5032, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:42.901990', 'step': 5032, 'epoch': 3} {'type': 'loss', 'content': 0.0016070945421233773, 'timestamp': '2025-10-01 04:23:42.911322', 'step': 5033, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:42.957895', 'step': 5033, 'epoch': 3} {'type': 'loss', 'content': 0.0002718182804528624, 'timestamp': '2025-10-01 04:23:42.970414', 'step': 5034, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:43.019834', 'step': 5034, 'epoch': 3} {'type': 'loss', 'content': 0.00016606935241725296, 'timestamp': '2025-10-01 04:23:43.031447', 'step': 5035, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:43.083118', 'step': 5035, 'epoch': 3} {'type': 'loss', 'content': 0.0009565561776980758, 'timestamp': '2025-10-01 04:23:43.118053', 'step': 5036, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:43.160614', 'step': 5036, 'epoch': 3} {'type': 'loss', 'content': 4.982936297892593e-05, 'timestamp': '2025-10-01 04:23:43.171766', 'step': 5037, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:43.219499', 'step': 5037, 'epoch': 3} {'type': 'loss', 'content': 0.005019980017095804, 'timestamp': '2025-10-01 04:23:43.227662', 'step': 5038, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:43.279563', 'step': 5038, 'epoch': 3} {'type': 'loss', 'content': 0.00896834209561348, 'timestamp': '2025-10-01 04:23:43.292383', 'step': 5039, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:43.335071', 'step': 5039, 'epoch': 3} {'type': 'loss', 'content': 0.026699386537075043, 'timestamp': '2025-10-01 04:23:43.369953', 'step': 5040, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:43.428281', 'step': 5040, 'epoch': 3} {'type': 'loss', 'content': 0.00018809699395205826, 'timestamp': '2025-10-01 04:23:43.437589', 'step': 5041, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:43.488323', 'step': 5041, 'epoch': 3} {'type': 'loss', 'content': 0.0002859558444470167, 'timestamp': '2025-10-01 04:23:43.500099', 'step': 5042, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:43.544012', 'step': 5042, 'epoch': 3} {'type': 'loss', 'content': 0.003856535768136382, 'timestamp': '2025-10-01 04:23:43.551651', 'step': 5043, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:43.592052', 'step': 5043, 'epoch': 3} {'type': 'loss', 'content': 0.005525995045900345, 'timestamp': '2025-10-01 04:23:43.620845', 'step': 5044, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:43.661286', 'step': 5044, 'epoch': 3} {'type': 'loss', 'content': 0.0038833937142044306, 'timestamp': '2025-10-01 04:23:43.668262', 'step': 5045, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:43.716699', 'step': 5045, 'epoch': 3} {'type': 'loss', 'content': 0.0009300418896600604, 'timestamp': '2025-10-01 04:23:43.726858', 'step': 5046, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:43.776103', 'step': 5046, 'epoch': 3} {'type': 'loss', 'content': 0.0006603685906156898, 'timestamp': '2025-10-01 04:23:43.783719', 'step': 5047, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:43.830620', 'step': 5047, 'epoch': 3} {'type': 'loss', 'content': 0.00039071080391295254, 'timestamp': '2025-10-01 04:23:43.862270', 'step': 5048, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:43.900282', 'step': 5048, 'epoch': 3} {'type': 'loss', 'content': 0.0007633455679751933, 'timestamp': '2025-10-01 04:23:43.908539', 'step': 5049, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:43.946772', 'step': 5049, 'epoch': 3} {'type': 'loss', 'content': 0.0003527500375639647, 'timestamp': '2025-10-01 04:23:43.959498', 'step': 5050, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:44.009507', 'step': 5050, 'epoch': 3} {'type': 'loss', 'content': 0.00035557273076847196, 'timestamp': '2025-10-01 04:23:44.023551', 'step': 5051, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:44.077227', 'step': 5051, 'epoch': 3} {'type': 'loss', 'content': 0.0006486850907094777, 'timestamp': '2025-10-01 04:23:44.110029', 'step': 5052, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:44.160971', 'step': 5052, 'epoch': 3} {'type': 'loss', 'content': 0.0011405295226722956, 'timestamp': '2025-10-01 04:23:44.172036', 'step': 5053, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:44.224518', 'step': 5053, 'epoch': 3} {'type': 'loss', 'content': 0.0008664733031764627, 'timestamp': '2025-10-01 04:23:44.232213', 'step': 5054, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:44.277119', 'step': 5054, 'epoch': 3} {'type': 'loss', 'content': 0.001291898195631802, 'timestamp': '2025-10-01 04:23:44.287597', 'step': 5055, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:44.333205', 'step': 5055, 'epoch': 3} {'type': 'loss', 'content': 0.0007537714554928243, 'timestamp': '2025-10-01 04:23:44.362973', 'step': 5056, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:44.407160', 'step': 5056, 'epoch': 3} {'type': 'loss', 'content': 0.006482606288045645, 'timestamp': '2025-10-01 04:23:44.414890', 'step': 5057, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:44.465903', 'step': 5057, 'epoch': 3} {'type': 'loss', 'content': 0.0025675795041024685, 'timestamp': '2025-10-01 04:23:44.479830', 'step': 5058, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:44.528456', 'step': 5058, 'epoch': 3} {'type': 'loss', 'content': 0.0018452700460329652, 'timestamp': '2025-10-01 04:23:44.536102', 'step': 5059, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:44.581251', 'step': 5059, 'epoch': 3} {'type': 'loss', 'content': 0.002813103375956416, 'timestamp': '2025-10-01 04:23:44.613254', 'step': 5060, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:23:47.427976', 'step': 5060, 'epoch': 3} {'type': 'pplx', 'content': 5.8900051141004965, 'timestamp': '2025-10-01 04:23:47.432374', 'step': 5060, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:47.477377', 'step': 5060, 'epoch': 3} {'type': 'loss', 'content': 0.0006045867339707911, 'timestamp': '2025-10-01 04:23:47.485362', 'step': 5061, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:47.530317', 'step': 5061, 'epoch': 3} {'type': 'loss', 'content': 0.0028265202417969704, 'timestamp': '2025-10-01 04:23:47.543169', 'step': 5062, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:47.596324', 'step': 5062, 'epoch': 3} {'type': 'loss', 'content': 0.00020499348465818912, 'timestamp': '2025-10-01 04:23:47.607099', 'step': 5063, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:47.649752', 'step': 5063, 'epoch': 3} {'type': 'loss', 'content': 0.00031671542092226446, 'timestamp': '2025-10-01 04:23:47.681442', 'step': 5064, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:47.733579', 'step': 5064, 'epoch': 3} {'type': 'loss', 'content': 0.00013525615213438869, 'timestamp': '2025-10-01 04:23:47.744496', 'step': 5065, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:47.798216', 'step': 5065, 'epoch': 3} {'type': 'loss', 'content': 0.0022914272267371416, 'timestamp': '2025-10-01 04:23:47.809840', 'step': 5066, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:47.856856', 'step': 5066, 'epoch': 3} {'type': 'loss', 'content': 0.0023403214290738106, 'timestamp': '2025-10-01 04:23:47.866443', 'step': 5067, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:47.906197', 'step': 5067, 'epoch': 3} {'type': 'loss', 'content': 0.000981300836429, 'timestamp': '2025-10-01 04:23:47.938682', 'step': 5068, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:47.987045', 'step': 5068, 'epoch': 3} {'type': 'loss', 'content': 0.002661599777638912, 'timestamp': '2025-10-01 04:23:48.000526', 'step': 5069, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:48.046405', 'step': 5069, 'epoch': 3} {'type': 'loss', 'content': 0.008596166968345642, 'timestamp': '2025-10-01 04:23:48.054583', 'step': 5070, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:48.104257', 'step': 5070, 'epoch': 3} {'type': 'loss', 'content': 0.009258071891963482, 'timestamp': '2025-10-01 04:23:48.118216', 'step': 5071, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:48.169336', 'step': 5071, 'epoch': 3} {'type': 'loss', 'content': 0.00020561979908961803, 'timestamp': '2025-10-01 04:23:48.203869', 'step': 5072, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:48.242178', 'step': 5072, 'epoch': 3} {'type': 'loss', 'content': 0.003018441842868924, 'timestamp': '2025-10-01 04:23:48.257788', 'step': 5073, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:48.304557', 'step': 5073, 'epoch': 3} {'type': 'loss', 'content': 0.0009228725684806705, 'timestamp': '2025-10-01 04:23:48.319267', 'step': 5074, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:48.361305', 'step': 5074, 'epoch': 3} {'type': 'loss', 'content': 0.0002928674512077123, 'timestamp': '2025-10-01 04:23:48.376920', 'step': 5075, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:48.422624', 'step': 5075, 'epoch': 3} {'type': 'loss', 'content': 0.0004083875974174589, 'timestamp': '2025-10-01 04:23:48.458511', 'step': 5076, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:48.502130', 'step': 5076, 'epoch': 3} {'type': 'loss', 'content': 0.0001760945306159556, 'timestamp': '2025-10-01 04:23:48.517836', 'step': 5077, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:48.569278', 'step': 5077, 'epoch': 3} {'type': 'loss', 'content': 0.0031010503880679607, 'timestamp': '2025-10-01 04:23:48.578860', 'step': 5078, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:48.626598', 'step': 5078, 'epoch': 3} {'type': 'loss', 'content': 0.001466546207666397, 'timestamp': '2025-10-01 04:23:48.637930', 'step': 5079, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:48.683762', 'step': 5079, 'epoch': 3} {'type': 'loss', 'content': 0.0031001309398561716, 'timestamp': '2025-10-01 04:23:48.715285', 'step': 5080, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:48.775547', 'step': 5080, 'epoch': 3} {'type': 'loss', 'content': 0.0017316548619419336, 'timestamp': '2025-10-01 04:23:48.788454', 'step': 5081, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:48.848540', 'step': 5081, 'epoch': 3} {'type': 'loss', 'content': 0.001095621264539659, 'timestamp': '2025-10-01 04:23:48.859992', 'step': 5082, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:48.901040', 'step': 5082, 'epoch': 3} {'type': 'loss', 'content': 0.0002863180125132203, 'timestamp': '2025-10-01 04:23:48.912619', 'step': 5083, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:23:48.966542', 'step': 5083, 'epoch': 3} {'type': 'loss', 'content': 0.001527465065009892, 'timestamp': '2025-10-01 04:23:49.001940', 'step': 5084, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:49.062601', 'step': 5084, 'epoch': 3} {'type': 'loss', 'content': 0.0019079854246228933, 'timestamp': '2025-10-01 04:23:49.072476', 'step': 5085, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:49.113836', 'step': 5085, 'epoch': 3} {'type': 'loss', 'content': 0.0010226693702861667, 'timestamp': '2025-10-01 04:23:49.125445', 'step': 5086, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:49.168287', 'step': 5086, 'epoch': 3} {'type': 'loss', 'content': 0.00479503907263279, 'timestamp': '2025-10-01 04:23:49.179158', 'step': 5087, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:49.227927', 'step': 5087, 'epoch': 3} {'type': 'loss', 'content': 0.0005273462738841772, 'timestamp': '2025-10-01 04:23:49.259626', 'step': 5088, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:49.311243', 'step': 5088, 'epoch': 3} {'type': 'loss', 'content': 0.0004894079174846411, 'timestamp': '2025-10-01 04:23:49.323034', 'step': 5089, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:49.370169', 'step': 5089, 'epoch': 3} {'type': 'loss', 'content': 0.002332783304154873, 'timestamp': '2025-10-01 04:23:49.377525', 'step': 5090, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:49.414536', 'step': 5090, 'epoch': 3} {'type': 'loss', 'content': 0.0018249072600156069, 'timestamp': '2025-10-01 04:23:49.425159', 'step': 5091, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:49.477423', 'step': 5091, 'epoch': 3} {'type': 'loss', 'content': 0.002364028710871935, 'timestamp': '2025-10-01 04:23:49.509826', 'step': 5092, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:49.552258', 'step': 5092, 'epoch': 3} {'type': 'loss', 'content': 0.0003845764440484345, 'timestamp': '2025-10-01 04:23:49.565106', 'step': 5093, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:49.604078', 'step': 5093, 'epoch': 3} {'type': 'loss', 'content': 0.0006956516299396753, 'timestamp': '2025-10-01 04:23:49.612439', 'step': 5094, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:49.646474', 'step': 5094, 'epoch': 3} {'type': 'loss', 'content': 0.000660264806356281, 'timestamp': '2025-10-01 04:23:49.658077', 'step': 5095, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:49.706867', 'step': 5095, 'epoch': 3} {'type': 'loss', 'content': 0.0034557024482637644, 'timestamp': '2025-10-01 04:23:49.740589', 'step': 5096, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:49.783631', 'step': 5096, 'epoch': 3} {'type': 'loss', 'content': 0.001290423097088933, 'timestamp': '2025-10-01 04:23:49.792668', 'step': 5097, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:49.839912', 'step': 5097, 'epoch': 3} {'type': 'loss', 'content': 0.0025181539822369814, 'timestamp': '2025-10-01 04:23:49.852409', 'step': 5098, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:49.912007', 'step': 5098, 'epoch': 3} {'type': 'loss', 'content': 0.0008415449992753565, 'timestamp': '2025-10-01 04:23:49.922357', 'step': 5099, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:49.972696', 'step': 5099, 'epoch': 3} {'type': 'loss', 'content': 0.0016002104384824634, 'timestamp': '2025-10-01 04:23:50.004467', 'step': 5100, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:50.045968', 'step': 5100, 'epoch': 3} {'type': 'loss', 'content': 0.0024114931002259254, 'timestamp': '2025-10-01 04:23:50.067485', 'step': 5101, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:50.118930', 'step': 5101, 'epoch': 3} {'type': 'loss', 'content': 0.00034262178814969957, 'timestamp': '2025-10-01 04:23:50.131500', 'step': 5102, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:50.186357', 'step': 5102, 'epoch': 3} {'type': 'loss', 'content': 0.0008155244868248701, 'timestamp': '2025-10-01 04:23:50.199887', 'step': 5103, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:50.251624', 'step': 5103, 'epoch': 3} {'type': 'loss', 'content': 0.0011354422895237803, 'timestamp': '2025-10-01 04:23:50.286605', 'step': 5104, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:50.324359', 'step': 5104, 'epoch': 3} {'type': 'loss', 'content': 0.0018026743782684207, 'timestamp': '2025-10-01 04:23:50.329525', 'step': 5105, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:50.378581', 'step': 5105, 'epoch': 3} {'type': 'loss', 'content': 0.00017947757442016155, 'timestamp': '2025-10-01 04:23:50.390061', 'step': 5106, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:50.443183', 'step': 5106, 'epoch': 3} {'type': 'loss', 'content': 0.00015958289441186935, 'timestamp': '2025-10-01 04:23:50.451000', 'step': 5107, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:50.503186', 'step': 5107, 'epoch': 3} {'type': 'loss', 'content': 0.0003814694646280259, 'timestamp': '2025-10-01 04:23:50.534857', 'step': 5108, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:50.590503', 'step': 5108, 'epoch': 3} {'type': 'loss', 'content': 0.0006090670940466225, 'timestamp': '2025-10-01 04:23:50.599693', 'step': 5109, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:50.653996', 'step': 5109, 'epoch': 3} {'type': 'loss', 'content': 0.001843673991970718, 'timestamp': '2025-10-01 04:23:50.662298', 'step': 5110, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:50.710045', 'step': 5110, 'epoch': 3} {'type': 'loss', 'content': 0.0032943997066468, 'timestamp': '2025-10-01 04:23:50.721597', 'step': 5111, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:50.768368', 'step': 5111, 'epoch': 3} {'type': 'loss', 'content': 0.005068407393991947, 'timestamp': '2025-10-01 04:23:50.800537', 'step': 5112, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:50.844701', 'step': 5112, 'epoch': 3} {'type': 'loss', 'content': 0.0003900574811268598, 'timestamp': '2025-10-01 04:23:50.853308', 'step': 5113, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:50.898948', 'step': 5113, 'epoch': 3} {'type': 'loss', 'content': 0.0009577356977388263, 'timestamp': '2025-10-01 04:23:50.906306', 'step': 5114, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:23:50.957935', 'step': 5114, 'epoch': 3} {'type': 'loss', 'content': 0.0011563096195459366, 'timestamp': '2025-10-01 04:23:50.962573', 'step': 5115, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:51.001986', 'step': 5115, 'epoch': 3} {'type': 'loss', 'content': 0.0010577775537967682, 'timestamp': '2025-10-01 04:23:51.033835', 'step': 5116, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:51.070554', 'step': 5116, 'epoch': 3} {'type': 'loss', 'content': 0.0005867871223017573, 'timestamp': '2025-10-01 04:23:51.086691', 'step': 5117, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:51.141032', 'step': 5117, 'epoch': 3} {'type': 'loss', 'content': 0.01299739908427, 'timestamp': '2025-10-01 04:23:51.157228', 'step': 5118, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:23:51.196672', 'step': 5118, 'epoch': 3} {'type': 'loss', 'content': 0.006393247749656439, 'timestamp': '2025-10-01 04:23:51.202557', 'step': 5119, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:51.241919', 'step': 5119, 'epoch': 3} {'type': 'loss', 'content': 0.0021541279274970293, 'timestamp': '2025-10-01 04:23:51.280291', 'step': 5120, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:23:51.328961', 'step': 5120, 'epoch': 3} {'type': 'loss', 'content': 0.0009931335225701332, 'timestamp': '2025-10-01 04:23:51.343564', 'step': 5121, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:51.383147', 'step': 5121, 'epoch': 3} {'type': 'loss', 'content': 0.00022925101802684367, 'timestamp': '2025-10-01 04:23:51.397608', 'step': 5122, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:51.445809', 'step': 5122, 'epoch': 3} {'type': 'loss', 'content': 0.006586472503840923, 'timestamp': '2025-10-01 04:23:51.460436', 'step': 5123, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:51.509587', 'step': 5123, 'epoch': 3} {'type': 'loss', 'content': 0.0015639232005923986, 'timestamp': '2025-10-01 04:23:51.541443', 'step': 5124, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:51.591072', 'step': 5124, 'epoch': 3} {'type': 'loss', 'content': 0.000831471523270011, 'timestamp': '2025-10-01 04:23:51.606003', 'step': 5125, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:51.652355', 'step': 5125, 'epoch': 3} {'type': 'loss', 'content': 0.0010313845705240965, 'timestamp': '2025-10-01 04:23:51.663206', 'step': 5126, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:51.708679', 'step': 5126, 'epoch': 3} {'type': 'loss', 'content': 0.00020770773699041456, 'timestamp': '2025-10-01 04:23:51.720231', 'step': 5127, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:51.768342', 'step': 5127, 'epoch': 3} {'type': 'loss', 'content': 0.00015437672846019268, 'timestamp': '2025-10-01 04:23:51.802843', 'step': 5128, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:51.849456', 'step': 5128, 'epoch': 3} {'type': 'loss', 'content': 0.0009849119232967496, 'timestamp': '2025-10-01 04:23:51.862592', 'step': 5129, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:51.910800', 'step': 5129, 'epoch': 3} {'type': 'loss', 'content': 0.00026983063435181975, 'timestamp': '2025-10-01 04:23:51.925073', 'step': 5130, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:51.967500', 'step': 5130, 'epoch': 3} {'type': 'loss', 'content': 0.0034073006827384233, 'timestamp': '2025-10-01 04:23:51.982172', 'step': 5131, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:52.043684', 'step': 5131, 'epoch': 3} {'type': 'loss', 'content': 0.005407939665019512, 'timestamp': '2025-10-01 04:23:52.079321', 'step': 5132, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:52.125833', 'step': 5132, 'epoch': 3} {'type': 'loss', 'content': 0.001619270071387291, 'timestamp': '2025-10-01 04:23:52.136197', 'step': 5133, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:52.179989', 'step': 5133, 'epoch': 3} {'type': 'loss', 'content': 7.74919317336753e-05, 'timestamp': '2025-10-01 04:23:52.191295', 'step': 5134, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:52.246913', 'step': 5134, 'epoch': 3} {'type': 'loss', 'content': 0.0043108477257192135, 'timestamp': '2025-10-01 04:23:52.263268', 'step': 5135, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:52.320203', 'step': 5135, 'epoch': 3} {'type': 'loss', 'content': 0.0021865793969482183, 'timestamp': '2025-10-01 04:23:52.349408', 'step': 5136, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:52.399279', 'step': 5136, 'epoch': 3} {'type': 'loss', 'content': 0.001219769474118948, 'timestamp': '2025-10-01 04:23:52.408043', 'step': 5137, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:52.447193', 'step': 5137, 'epoch': 3} {'type': 'loss', 'content': 0.00024972748360596597, 'timestamp': '2025-10-01 04:23:52.461377', 'step': 5138, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:52.518585', 'step': 5138, 'epoch': 3} {'type': 'loss', 'content': 0.0007210442563518882, 'timestamp': '2025-10-01 04:23:52.532148', 'step': 5139, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:52.587023', 'step': 5139, 'epoch': 3} {'type': 'loss', 'content': 0.0009110862738452852, 'timestamp': '2025-10-01 04:23:52.615231', 'step': 5140, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:23:52.672696', 'step': 5140, 'epoch': 3} {'type': 'loss', 'content': 0.003852423746138811, 'timestamp': '2025-10-01 04:23:52.686581', 'step': 5141, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:52.729040', 'step': 5141, 'epoch': 3} {'type': 'loss', 'content': 0.0006261406815610826, 'timestamp': '2025-10-01 04:23:52.736382', 'step': 5142, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:52.786440', 'step': 5142, 'epoch': 3} {'type': 'loss', 'content': 0.0007366882055066526, 'timestamp': '2025-10-01 04:23:52.797304', 'step': 5143, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:52.854686', 'step': 5143, 'epoch': 3} {'type': 'loss', 'content': 0.0013349037617444992, 'timestamp': '2025-10-01 04:23:52.886505', 'step': 5144, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:52.931683', 'step': 5144, 'epoch': 3} {'type': 'loss', 'content': 0.0011835891054943204, 'timestamp': '2025-10-01 04:23:52.939920', 'step': 5145, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:52.987106', 'step': 5145, 'epoch': 3} {'type': 'loss', 'content': 0.0006446353509090841, 'timestamp': '2025-10-01 04:23:52.997472', 'step': 5146, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:23:53.045393', 'step': 5146, 'epoch': 3} {'type': 'loss', 'content': 0.0002758449409157038, 'timestamp': '2025-10-01 04:23:53.057154', 'step': 5147, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:53.105687', 'step': 5147, 'epoch': 3} {'type': 'loss', 'content': 8.617419371148571e-05, 'timestamp': '2025-10-01 04:23:53.134054', 'step': 5148, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:53.180433', 'step': 5148, 'epoch': 3} {'type': 'loss', 'content': 0.00507898535579443, 'timestamp': '2025-10-01 04:23:53.192366', 'step': 5149, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:53.232620', 'step': 5149, 'epoch': 3} {'type': 'loss', 'content': 0.0005215750425122678, 'timestamp': '2025-10-01 04:23:53.240930', 'step': 5150, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:53.297343', 'step': 5150, 'epoch': 3} {'type': 'loss', 'content': 0.00012013287778245285, 'timestamp': '2025-10-01 04:23:53.308984', 'step': 5151, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:53.353874', 'step': 5151, 'epoch': 3} {'type': 'loss', 'content': 0.003907691687345505, 'timestamp': '2025-10-01 04:23:53.383023', 'step': 5152, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:53.436042', 'step': 5152, 'epoch': 3} {'type': 'loss', 'content': 0.0005671399994753301, 'timestamp': '2025-10-01 04:23:53.446200', 'step': 5153, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:53.495212', 'step': 5153, 'epoch': 3} {'type': 'loss', 'content': 0.0001279809803236276, 'timestamp': '2025-10-01 04:23:53.503222', 'step': 5154, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:53.564228', 'step': 5154, 'epoch': 3} {'type': 'loss', 'content': 0.0004147828440181911, 'timestamp': '2025-10-01 04:23:53.577039', 'step': 5155, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:53.627622', 'step': 5155, 'epoch': 3} {'type': 'loss', 'content': 0.0014564908342435956, 'timestamp': '2025-10-01 04:23:53.661089', 'step': 5156, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:53.708864', 'step': 5156, 'epoch': 3} {'type': 'loss', 'content': 0.004434733651578426, 'timestamp': '2025-10-01 04:23:53.718560', 'step': 5157, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:53.755935', 'step': 5157, 'epoch': 3} {'type': 'loss', 'content': 0.014830363914370537, 'timestamp': '2025-10-01 04:23:53.767560', 'step': 5158, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:53.799908', 'step': 5158, 'epoch': 3} {'type': 'loss', 'content': 0.0001822435879148543, 'timestamp': '2025-10-01 04:23:53.808082', 'step': 5159, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:53.852548', 'step': 5159, 'epoch': 3} {'type': 'loss', 'content': 0.0002811910817399621, 'timestamp': '2025-10-01 04:23:53.880842', 'step': 5160, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:53.926816', 'step': 5160, 'epoch': 3} {'type': 'loss', 'content': 0.00014303943316917866, 'timestamp': '2025-10-01 04:23:53.935398', 'step': 5161, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:53.979003', 'step': 5161, 'epoch': 3} {'type': 'loss', 'content': 0.01098064985126257, 'timestamp': '2025-10-01 04:23:53.991048', 'step': 5162, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:54.033672', 'step': 5162, 'epoch': 3} {'type': 'loss', 'content': 9.934379340847954e-05, 'timestamp': '2025-10-01 04:23:54.044603', 'step': 5163, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:54.083454', 'step': 5163, 'epoch': 3} {'type': 'loss', 'content': 0.0011944645084440708, 'timestamp': '2025-10-01 04:23:54.116953', 'step': 5164, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:54.157811', 'step': 5164, 'epoch': 3} {'type': 'loss', 'content': 0.0002592955424915999, 'timestamp': '2025-10-01 04:23:54.166294', 'step': 5165, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:54.209606', 'step': 5165, 'epoch': 3} {'type': 'loss', 'content': 0.002389096887782216, 'timestamp': '2025-10-01 04:23:54.217701', 'step': 5166, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:54.263098', 'step': 5166, 'epoch': 3} {'type': 'loss', 'content': 0.0010438412427902222, 'timestamp': '2025-10-01 04:23:54.273162', 'step': 5167, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:54.319957', 'step': 5167, 'epoch': 3} {'type': 'loss', 'content': 0.0003599376359488815, 'timestamp': '2025-10-01 04:23:54.357279', 'step': 5168, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:54.399312', 'step': 5168, 'epoch': 3} {'type': 'loss', 'content': 0.021521445363759995, 'timestamp': '2025-10-01 04:23:54.411050', 'step': 5169, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:54.454121', 'step': 5169, 'epoch': 3} {'type': 'loss', 'content': 0.001003202167339623, 'timestamp': '2025-10-01 04:23:54.466699', 'step': 5170, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:54.511017', 'step': 5170, 'epoch': 3} {'type': 'loss', 'content': 2.6719124434748664e-05, 'timestamp': '2025-10-01 04:23:54.522489', 'step': 5171, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:54.579625', 'step': 5171, 'epoch': 3} {'type': 'loss', 'content': 0.0002129612403223291, 'timestamp': '2025-10-01 04:23:54.614592', 'step': 5172, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:54.663050', 'step': 5172, 'epoch': 3} {'type': 'loss', 'content': 0.0002457977971062064, 'timestamp': '2025-10-01 04:23:54.676430', 'step': 5173, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:54.723910', 'step': 5173, 'epoch': 3} {'type': 'loss', 'content': 0.0004685234453063458, 'timestamp': '2025-10-01 04:23:54.731456', 'step': 5174, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:54.770256', 'step': 5174, 'epoch': 3} {'type': 'loss', 'content': 0.007552194409072399, 'timestamp': '2025-10-01 04:23:54.782959', 'step': 5175, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:23:57.547828', 'step': 5175, 'epoch': 3} {'type': 'pplx', 'content': 5.912733583571683, 'timestamp': '2025-10-01 04:23:57.555220', 'step': 5175, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:23:57.600692', 'step': 5175, 'epoch': 3} {'type': 'loss', 'content': 0.0011198458960279822, 'timestamp': '2025-10-01 04:23:57.639483', 'step': 5176, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:57.683630', 'step': 5176, 'epoch': 3} {'type': 'loss', 'content': 0.0002884196292143315, 'timestamp': '2025-10-01 04:23:57.693885', 'step': 5177, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:57.737156', 'step': 5177, 'epoch': 3} {'type': 'loss', 'content': 0.00025422696489840746, 'timestamp': '2025-10-01 04:23:57.749607', 'step': 5178, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:57.786816', 'step': 5178, 'epoch': 3} {'type': 'loss', 'content': 8.524732402293012e-05, 'timestamp': '2025-10-01 04:23:57.797687', 'step': 5179, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:57.842120', 'step': 5179, 'epoch': 3} {'type': 'loss', 'content': 8.998156408779323e-05, 'timestamp': '2025-10-01 04:23:57.874015', 'step': 5180, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:57.910361', 'step': 5180, 'epoch': 3} {'type': 'loss', 'content': 0.007737918756902218, 'timestamp': '2025-10-01 04:23:57.916319', 'step': 5181, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:23:57.953556', 'step': 5181, 'epoch': 3} {'type': 'loss', 'content': 8.877650543581694e-05, 'timestamp': '2025-10-01 04:23:57.961078', 'step': 5182, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:58.005590', 'step': 5182, 'epoch': 3} {'type': 'loss', 'content': 0.0004170667380094528, 'timestamp': '2025-10-01 04:23:58.017219', 'step': 5183, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:58.056742', 'step': 5183, 'epoch': 3} {'type': 'loss', 'content': 0.0004078445490449667, 'timestamp': '2025-10-01 04:23:58.088441', 'step': 5184, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:58.126519', 'step': 5184, 'epoch': 3} {'type': 'loss', 'content': 0.004486409481614828, 'timestamp': '2025-10-01 04:23:58.139889', 'step': 5185, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:58.179240', 'step': 5185, 'epoch': 3} {'type': 'loss', 'content': 0.00041791272815316916, 'timestamp': '2025-10-01 04:23:58.187368', 'step': 5186, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:58.232053', 'step': 5186, 'epoch': 3} {'type': 'loss', 'content': 0.005641444120556116, 'timestamp': '2025-10-01 04:23:58.243758', 'step': 5187, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:58.288225', 'step': 5187, 'epoch': 3} {'type': 'loss', 'content': 0.00231829727999866, 'timestamp': '2025-10-01 04:23:58.320595', 'step': 5188, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:58.355313', 'step': 5188, 'epoch': 3} {'type': 'loss', 'content': 0.0009530174429528415, 'timestamp': '2025-10-01 04:23:58.364599', 'step': 5189, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:58.412886', 'step': 5189, 'epoch': 3} {'type': 'loss', 'content': 0.01101574208587408, 'timestamp': '2025-10-01 04:23:58.425591', 'step': 5190, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:58.475858', 'step': 5190, 'epoch': 3} {'type': 'loss', 'content': 0.00029922733665443957, 'timestamp': '2025-10-01 04:23:58.489343', 'step': 5191, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:58.526460', 'step': 5191, 'epoch': 3} {'type': 'loss', 'content': 0.004215736873447895, 'timestamp': '2025-10-01 04:23:58.558277', 'step': 5192, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:58.599613', 'step': 5192, 'epoch': 3} {'type': 'loss', 'content': 0.008943832479417324, 'timestamp': '2025-10-01 04:23:58.608896', 'step': 5193, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:58.649472', 'step': 5193, 'epoch': 3} {'type': 'loss', 'content': 0.0003346681478433311, 'timestamp': '2025-10-01 04:23:58.661014', 'step': 5194, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:58.706623', 'step': 5194, 'epoch': 3} {'type': 'loss', 'content': 0.0006003111484460533, 'timestamp': '2025-10-01 04:23:58.714087', 'step': 5195, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:58.747992', 'step': 5195, 'epoch': 3} {'type': 'loss', 'content': 0.0066320626065135, 'timestamp': '2025-10-01 04:23:58.777125', 'step': 5196, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:58.817939', 'step': 5196, 'epoch': 3} {'type': 'loss', 'content': 0.0006902923341840506, 'timestamp': '2025-10-01 04:23:58.831258', 'step': 5197, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:58.867596', 'step': 5197, 'epoch': 3} {'type': 'loss', 'content': 0.0016914616571739316, 'timestamp': '2025-10-01 04:23:58.879081', 'step': 5198, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:58.915312', 'step': 5198, 'epoch': 3} {'type': 'loss', 'content': 0.001000818214379251, 'timestamp': '2025-10-01 04:23:58.926883', 'step': 5199, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:58.963514', 'step': 5199, 'epoch': 3} {'type': 'loss', 'content': 0.00016892758139874786, 'timestamp': '2025-10-01 04:23:58.995347', 'step': 5200, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:59.033859', 'step': 5200, 'epoch': 3} {'type': 'loss', 'content': 0.0027020585257560015, 'timestamp': '2025-10-01 04:23:59.044144', 'step': 5201, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:59.082763', 'step': 5201, 'epoch': 3} {'type': 'loss', 'content': 0.0012553384294733405, 'timestamp': '2025-10-01 04:23:59.095286', 'step': 5202, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:59.137100', 'step': 5202, 'epoch': 3} {'type': 'loss', 'content': 0.005937356501817703, 'timestamp': '2025-10-01 04:23:59.145217', 'step': 5203, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:59.179611', 'step': 5203, 'epoch': 3} {'type': 'loss', 'content': 0.0011814793106168509, 'timestamp': '2025-10-01 04:23:59.212195', 'step': 5204, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:59.248611', 'step': 5204, 'epoch': 3} {'type': 'loss', 'content': 0.0018593987915664911, 'timestamp': '2025-10-01 04:23:59.257005', 'step': 5205, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:59.291795', 'step': 5205, 'epoch': 3} {'type': 'loss', 'content': 0.0004584112612064928, 'timestamp': '2025-10-01 04:23:59.299747', 'step': 5206, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:59.335372', 'step': 5206, 'epoch': 3} {'type': 'loss', 'content': 0.00023591094941366464, 'timestamp': '2025-10-01 04:23:59.342566', 'step': 5207, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:59.376570', 'step': 5207, 'epoch': 3} {'type': 'loss', 'content': 0.0012810240732505918, 'timestamp': '2025-10-01 04:23:59.405818', 'step': 5208, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:23:59.443376', 'step': 5208, 'epoch': 3} {'type': 'loss', 'content': 0.0015720960218459368, 'timestamp': '2025-10-01 04:23:59.448998', 'step': 5209, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:23:59.486389', 'step': 5209, 'epoch': 3} {'type': 'loss', 'content': 0.0006151549750939012, 'timestamp': '2025-10-01 04:23:59.498921', 'step': 5210, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:59.537320', 'step': 5210, 'epoch': 3} {'type': 'loss', 'content': 0.00035330854007042944, 'timestamp': '2025-10-01 04:23:59.550836', 'step': 5211, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:59.583886', 'step': 5211, 'epoch': 3} {'type': 'loss', 'content': 0.0014882946852594614, 'timestamp': '2025-10-01 04:23:59.614557', 'step': 5212, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:23:59.653298', 'step': 5212, 'epoch': 3} {'type': 'loss', 'content': 0.0015189426485449076, 'timestamp': '2025-10-01 04:23:59.658986', 'step': 5213, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:23:59.693506', 'step': 5213, 'epoch': 3} {'type': 'loss', 'content': 0.006504413206130266, 'timestamp': '2025-10-01 04:23:59.701180', 'step': 5214, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:59.737187', 'step': 5214, 'epoch': 3} {'type': 'loss', 'content': 0.0006223347154445946, 'timestamp': '2025-10-01 04:23:59.748522', 'step': 5215, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:23:59.786281', 'step': 5215, 'epoch': 3} {'type': 'loss', 'content': 0.0016773035749793053, 'timestamp': '2025-10-01 04:23:59.820794', 'step': 5216, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:23:59.856063', 'step': 5216, 'epoch': 3} {'type': 'loss', 'content': 0.0018889469793066382, 'timestamp': '2025-10-01 04:23:59.864398', 'step': 5217, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:23:59.898011', 'step': 5217, 'epoch': 3} {'type': 'loss', 'content': 0.000519624853041023, 'timestamp': '2025-10-01 04:23:59.909551', 'step': 5218, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:23:59.946344', 'step': 5218, 'epoch': 3} {'type': 'loss', 'content': 0.00022528384579345584, 'timestamp': '2025-10-01 04:23:59.960343', 'step': 5219, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:23:59.995458', 'step': 5219, 'epoch': 3} {'type': 'loss', 'content': 0.0005886299186386168, 'timestamp': '2025-10-01 04:24:00.029220', 'step': 5220, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:00.063220', 'step': 5220, 'epoch': 3} {'type': 'loss', 'content': 0.0008329026168212295, 'timestamp': '2025-10-01 04:24:00.072353', 'step': 5221, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:00.111399', 'step': 5221, 'epoch': 3} {'type': 'loss', 'content': 0.0006557941087521613, 'timestamp': '2025-10-01 04:24:00.124139', 'step': 5222, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:00.162235', 'step': 5222, 'epoch': 3} {'type': 'loss', 'content': 0.0033899678383022547, 'timestamp': '2025-10-01 04:24:00.173717', 'step': 5223, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:00.213102', 'step': 5223, 'epoch': 3} {'type': 'loss', 'content': 0.003538465825840831, 'timestamp': '2025-10-01 04:24:00.246887', 'step': 5224, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:00.284688', 'step': 5224, 'epoch': 3} {'type': 'loss', 'content': 0.004071923904120922, 'timestamp': '2025-10-01 04:24:00.297536', 'step': 5225, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:00.333217', 'step': 5225, 'epoch': 3} {'type': 'loss', 'content': 0.0005370064754970372, 'timestamp': '2025-10-01 04:24:00.345722', 'step': 5226, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:00.387981', 'step': 5226, 'epoch': 3} {'type': 'loss', 'content': 0.0010696175741031766, 'timestamp': '2025-10-01 04:24:00.400781', 'step': 5227, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:00.437087', 'step': 5227, 'epoch': 3} {'type': 'loss', 'content': 0.0021710938308387995, 'timestamp': '2025-10-01 04:24:00.470645', 'step': 5228, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:00.507510', 'step': 5228, 'epoch': 3} {'type': 'loss', 'content': 0.011838321574032307, 'timestamp': '2025-10-01 04:24:00.515608', 'step': 5229, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:00.555732', 'step': 5229, 'epoch': 3} {'type': 'loss', 'content': 0.0007072535227052867, 'timestamp': '2025-10-01 04:24:00.569818', 'step': 5230, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:00.603280', 'step': 5230, 'epoch': 3} {'type': 'loss', 'content': 0.002118093892931938, 'timestamp': '2025-10-01 04:24:00.615738', 'step': 5231, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:00.652508', 'step': 5231, 'epoch': 3} {'type': 'loss', 'content': 0.0009051556116901338, 'timestamp': '2025-10-01 04:24:00.686045', 'step': 5232, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:00.723398', 'step': 5232, 'epoch': 3} {'type': 'loss', 'content': 0.0005286425584927201, 'timestamp': '2025-10-01 04:24:00.733585', 'step': 5233, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:00.776710', 'step': 5233, 'epoch': 3} {'type': 'loss', 'content': 0.0004886645474471152, 'timestamp': '2025-10-01 04:24:00.790309', 'step': 5234, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:00.824807', 'step': 5234, 'epoch': 3} {'type': 'loss', 'content': 0.0014246432110667229, 'timestamp': '2025-10-01 04:24:00.837333', 'step': 5235, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:00.900183', 'step': 5235, 'epoch': 3} {'type': 'loss', 'content': 0.0006328502204269171, 'timestamp': '2025-10-01 04:24:00.933729', 'step': 5236, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:01.008349', 'step': 5236, 'epoch': 3} {'type': 'loss', 'content': 0.004736087750643492, 'timestamp': '2025-10-01 04:24:01.021212', 'step': 5237, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:01.060719', 'step': 5237, 'epoch': 3} {'type': 'loss', 'content': 0.0014451518654823303, 'timestamp': '2025-10-01 04:24:01.073501', 'step': 5238, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:01.146359', 'step': 5238, 'epoch': 3} {'type': 'loss', 'content': 0.001546329353004694, 'timestamp': '2025-10-01 04:24:01.157887', 'step': 5239, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:01.203349', 'step': 5239, 'epoch': 3} {'type': 'loss', 'content': 0.0005997165571898222, 'timestamp': '2025-10-01 04:24:01.237847', 'step': 5240, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:01.278639', 'step': 5240, 'epoch': 3} {'type': 'loss', 'content': 0.0008156466064974666, 'timestamp': '2025-10-01 04:24:01.289604', 'step': 5241, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:01.332066', 'step': 5241, 'epoch': 3} {'type': 'loss', 'content': 0.008085295557975769, 'timestamp': '2025-10-01 04:24:01.343496', 'step': 5242, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:01.403873', 'step': 5242, 'epoch': 3} {'type': 'loss', 'content': 0.0007799306185916066, 'timestamp': '2025-10-01 04:24:01.417465', 'step': 5243, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:01.460119', 'step': 5243, 'epoch': 3} {'type': 'loss', 'content': 0.0014460949460044503, 'timestamp': '2025-10-01 04:24:01.489484', 'step': 5244, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:01.538376', 'step': 5244, 'epoch': 3} {'type': 'loss', 'content': 0.00031405859044753015, 'timestamp': '2025-10-01 04:24:01.546811', 'step': 5245, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:01.590175', 'step': 5245, 'epoch': 3} {'type': 'loss', 'content': 0.007580052595585585, 'timestamp': '2025-10-01 04:24:01.601699', 'step': 5246, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:01.656392', 'step': 5246, 'epoch': 3} {'type': 'loss', 'content': 0.0014818100025877357, 'timestamp': '2025-10-01 04:24:01.670460', 'step': 5247, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:01.716309', 'step': 5247, 'epoch': 3} {'type': 'loss', 'content': 0.004300449974834919, 'timestamp': '2025-10-01 04:24:01.745617', 'step': 5248, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:01.791692', 'step': 5248, 'epoch': 3} {'type': 'loss', 'content': 0.004347276408225298, 'timestamp': '2025-10-01 04:24:01.800105', 'step': 5249, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:24:01.846839', 'step': 5249, 'epoch': 3} {'type': 'loss', 'content': 0.0001758811267791316, 'timestamp': '2025-10-01 04:24:01.861059', 'step': 5250, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:01.900729', 'step': 5250, 'epoch': 3} {'type': 'loss', 'content': 0.0014570214552804828, 'timestamp': '2025-10-01 04:24:01.912386', 'step': 5251, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:01.959460', 'step': 5251, 'epoch': 3} {'type': 'loss', 'content': 0.0035112828481942415, 'timestamp': '2025-10-01 04:24:01.991246', 'step': 5252, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:02.026761', 'step': 5252, 'epoch': 3} {'type': 'loss', 'content': 0.004013071767985821, 'timestamp': '2025-10-01 04:24:02.036062', 'step': 5253, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:02.083065', 'step': 5253, 'epoch': 3} {'type': 'loss', 'content': 0.0006790632614865899, 'timestamp': '2025-10-01 04:24:02.095568', 'step': 5254, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:02.139237', 'step': 5254, 'epoch': 3} {'type': 'loss', 'content': 0.0017236361745744944, 'timestamp': '2025-10-01 04:24:02.152753', 'step': 5255, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:02.191006', 'step': 5255, 'epoch': 3} {'type': 'loss', 'content': 0.0028082074131816626, 'timestamp': '2025-10-01 04:24:02.222993', 'step': 5256, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:02.272507', 'step': 5256, 'epoch': 3} {'type': 'loss', 'content': 0.009714898653328419, 'timestamp': '2025-10-01 04:24:02.280917', 'step': 5257, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:02.336116', 'step': 5257, 'epoch': 3} {'type': 'loss', 'content': 0.0015005202731117606, 'timestamp': '2025-10-01 04:24:02.348895', 'step': 5258, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:02.390680', 'step': 5258, 'epoch': 3} {'type': 'loss', 'content': 0.00042938473052345216, 'timestamp': '2025-10-01 04:24:02.402311', 'step': 5259, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:02.456596', 'step': 5259, 'epoch': 3} {'type': 'loss', 'content': 0.00030001570121385157, 'timestamp': '2025-10-01 04:24:02.490317', 'step': 5260, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:02.537616', 'step': 5260, 'epoch': 3} {'type': 'loss', 'content': 0.0017199026187881827, 'timestamp': '2025-10-01 04:24:02.548872', 'step': 5261, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:02.588471', 'step': 5261, 'epoch': 3} {'type': 'loss', 'content': 0.004420668818056583, 'timestamp': '2025-10-01 04:24:02.601047', 'step': 5262, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:02.646289', 'step': 5262, 'epoch': 3} {'type': 'loss', 'content': 0.0014536109520122409, 'timestamp': '2025-10-01 04:24:02.658783', 'step': 5263, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:02.702460', 'step': 5263, 'epoch': 3} {'type': 'loss', 'content': 0.0004483695956878364, 'timestamp': '2025-10-01 04:24:02.737496', 'step': 5264, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:02.783270', 'step': 5264, 'epoch': 3} {'type': 'loss', 'content': 0.0004402680788189173, 'timestamp': '2025-10-01 04:24:02.796144', 'step': 5265, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:24:02.851912', 'step': 5265, 'epoch': 3} {'type': 'loss', 'content': 0.0010234402725473046, 'timestamp': '2025-10-01 04:24:02.867962', 'step': 5266, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:24:02.920682', 'step': 5266, 'epoch': 3} {'type': 'loss', 'content': 0.0027563702315092087, 'timestamp': '2025-10-01 04:24:02.936528', 'step': 5267, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:02.980167', 'step': 5267, 'epoch': 3} {'type': 'loss', 'content': 6.759057578165084e-05, 'timestamp': '2025-10-01 04:24:03.015095', 'step': 5268, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:03.075446', 'step': 5268, 'epoch': 3} {'type': 'loss', 'content': 0.0008538203546777368, 'timestamp': '2025-10-01 04:24:03.084369', 'step': 5269, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:03.130524', 'step': 5269, 'epoch': 3} {'type': 'loss', 'content': 0.0018674160819500685, 'timestamp': '2025-10-01 04:24:03.142878', 'step': 5270, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:03.192835', 'step': 5270, 'epoch': 3} {'type': 'loss', 'content': 0.0001643676368985325, 'timestamp': '2025-10-01 04:24:03.203566', 'step': 5271, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:03.251216', 'step': 5271, 'epoch': 3} {'type': 'loss', 'content': 0.0036167949438095093, 'timestamp': '2025-10-01 04:24:03.284934', 'step': 5272, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:03.326384', 'step': 5272, 'epoch': 3} {'type': 'loss', 'content': 0.0005834789481014013, 'timestamp': '2025-10-01 04:24:03.334803', 'step': 5273, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:03.375111', 'step': 5273, 'epoch': 3} {'type': 'loss', 'content': 0.0011925448197871447, 'timestamp': '2025-10-01 04:24:03.387679', 'step': 5274, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:03.432081', 'step': 5274, 'epoch': 3} {'type': 'loss', 'content': 0.0002059528196696192, 'timestamp': '2025-10-01 04:24:03.442924', 'step': 5275, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:03.487169', 'step': 5275, 'epoch': 3} {'type': 'loss', 'content': 0.0004393178678583354, 'timestamp': '2025-10-01 04:24:03.520870', 'step': 5276, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:03.565694', 'step': 5276, 'epoch': 3} {'type': 'loss', 'content': 0.00014195848780218512, 'timestamp': '2025-10-01 04:24:03.578526', 'step': 5277, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:03.628379', 'step': 5277, 'epoch': 3} {'type': 'loss', 'content': 0.002116728574037552, 'timestamp': '2025-10-01 04:24:03.642925', 'step': 5278, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:24:03.703244', 'step': 5278, 'epoch': 3} {'type': 'loss', 'content': 0.0038463538512587547, 'timestamp': '2025-10-01 04:24:03.712089', 'step': 5279, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:03.760320', 'step': 5279, 'epoch': 3} {'type': 'loss', 'content': 0.0009655703906901181, 'timestamp': '2025-10-01 04:24:03.791978', 'step': 5280, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:03.829760', 'step': 5280, 'epoch': 3} {'type': 'loss', 'content': 0.0003095255815424025, 'timestamp': '2025-10-01 04:24:03.838806', 'step': 5281, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:03.888565', 'step': 5281, 'epoch': 3} {'type': 'loss', 'content': 0.00025388941867277026, 'timestamp': '2025-10-01 04:24:03.901219', 'step': 5282, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:03.949170', 'step': 5282, 'epoch': 3} {'type': 'loss', 'content': 0.0009973098058253527, 'timestamp': '2025-10-01 04:24:03.963109', 'step': 5283, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:04.001134', 'step': 5283, 'epoch': 3} {'type': 'loss', 'content': 0.00011832024756586179, 'timestamp': '2025-10-01 04:24:04.030341', 'step': 5284, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:04.071976', 'step': 5284, 'epoch': 3} {'type': 'loss', 'content': 0.00015144082135520875, 'timestamp': '2025-10-01 04:24:04.082229', 'step': 5285, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:04.132080', 'step': 5285, 'epoch': 3} {'type': 'loss', 'content': 0.0005536427488550544, 'timestamp': '2025-10-01 04:24:04.146120', 'step': 5286, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:04.193563', 'step': 5286, 'epoch': 3} {'type': 'loss', 'content': 0.00011141123832203448, 'timestamp': '2025-10-01 04:24:04.204484', 'step': 5287, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:04.248286', 'step': 5287, 'epoch': 3} {'type': 'loss', 'content': 0.0008165222825482488, 'timestamp': '2025-10-01 04:24:04.281707', 'step': 5288, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:24:04.326419', 'step': 5288, 'epoch': 3} {'type': 'loss', 'content': 0.0009943764889612794, 'timestamp': '2025-10-01 04:24:04.339979', 'step': 5289, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:04.389708', 'step': 5289, 'epoch': 3} {'type': 'loss', 'content': 9.459357534069568e-05, 'timestamp': '2025-10-01 04:24:04.402468', 'step': 5290, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:24:07.362133', 'step': 5290, 'epoch': 3} {'type': 'pplx', 'content': 6.084457663067471, 'timestamp': '2025-10-01 04:24:07.365063', 'step': 5290, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:07.401539', 'step': 5290, 'epoch': 3} {'type': 'loss', 'content': 4.793950938619673e-05, 'timestamp': '2025-10-01 04:24:07.415120', 'step': 5291, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:07.466352', 'step': 5291, 'epoch': 3} {'type': 'loss', 'content': 0.00017051075701601803, 'timestamp': '2025-10-01 04:24:07.501263', 'step': 5292, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:07.556920', 'step': 5292, 'epoch': 3} {'type': 'loss', 'content': 7.490772986784577e-05, 'timestamp': '2025-10-01 04:24:07.569018', 'step': 5293, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:07.627241', 'step': 5293, 'epoch': 3} {'type': 'loss', 'content': 0.009083515964448452, 'timestamp': '2025-10-01 04:24:07.641259', 'step': 5294, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:07.699172', 'step': 5294, 'epoch': 3} {'type': 'loss', 'content': 0.0001253063092008233, 'timestamp': '2025-10-01 04:24:07.712683', 'step': 5295, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:07.759080', 'step': 5295, 'epoch': 3} {'type': 'loss', 'content': 0.001645623124204576, 'timestamp': '2025-10-01 04:24:07.792571', 'step': 5296, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:07.847126', 'step': 5296, 'epoch': 3} {'type': 'loss', 'content': 0.0006043976754881442, 'timestamp': '2025-10-01 04:24:07.860110', 'step': 5297, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:07.909434', 'step': 5297, 'epoch': 3} {'type': 'loss', 'content': 0.0006356246303766966, 'timestamp': '2025-10-01 04:24:07.922641', 'step': 5298, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:07.971484', 'step': 5298, 'epoch': 3} {'type': 'loss', 'content': 3.1468192901229486e-05, 'timestamp': '2025-10-01 04:24:07.984232', 'step': 5299, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:24:08.037586', 'step': 5299, 'epoch': 3} {'type': 'loss', 'content': 0.002580089494585991, 'timestamp': '2025-10-01 04:24:08.074932', 'step': 5300, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:08.118110', 'step': 5300, 'epoch': 3} {'type': 'loss', 'content': 0.00015100999735295773, 'timestamp': '2025-10-01 04:24:08.130921', 'step': 5301, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:24:08.192350', 'step': 5301, 'epoch': 3} {'type': 'loss', 'content': 0.001726097078062594, 'timestamp': '2025-10-01 04:24:08.208211', 'step': 5302, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:08.251473', 'step': 5302, 'epoch': 3} {'type': 'loss', 'content': 0.0008789841085672379, 'timestamp': '2025-10-01 04:24:08.264059', 'step': 5303, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:08.311030', 'step': 5303, 'epoch': 3} {'type': 'loss', 'content': 0.006610561162233353, 'timestamp': '2025-10-01 04:24:08.344677', 'step': 5304, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:08.397993', 'step': 5304, 'epoch': 3} {'type': 'loss', 'content': 0.00011634389375103638, 'timestamp': '2025-10-01 04:24:08.408994', 'step': 5305, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:08.458077', 'step': 5305, 'epoch': 3} {'type': 'loss', 'content': 0.015003308653831482, 'timestamp': '2025-10-01 04:24:08.470664', 'step': 5306, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:08.527794', 'step': 5306, 'epoch': 3} {'type': 'loss', 'content': 7.708043267484754e-05, 'timestamp': '2025-10-01 04:24:08.541768', 'step': 5307, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:24:08.585163', 'step': 5307, 'epoch': 3} {'type': 'loss', 'content': 0.0006020899163559079, 'timestamp': '2025-10-01 04:24:08.610977', 'step': 5308, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:24:08.655181', 'step': 5308, 'epoch': 3} {'type': 'loss', 'content': 0.0011617501731961966, 'timestamp': '2025-10-01 04:24:08.670451', 'step': 5309, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-10-01 04:24:08.712876', 'step': 5309, 'epoch': 3} {'type': 'loss', 'content': 0.00012492388486862183, 'timestamp': '2025-10-01 04:24:08.717761', 'step': 5310, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:24:08.771138', 'step': 5310, 'epoch': 3} {'type': 'loss', 'content': 0.00015766298747621477, 'timestamp': '2025-10-01 04:24:08.787410', 'step': 5311, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:24:08.833012', 'step': 5311, 'epoch': 3} {'type': 'loss', 'content': 0.0027825813740491867, 'timestamp': '2025-10-01 04:24:08.867278', 'step': 5312, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:08.910143', 'step': 5312, 'epoch': 3} {'type': 'loss', 'content': 0.00010294300591340289, 'timestamp': '2025-10-01 04:24:08.919424', 'step': 5313, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:08.968444', 'step': 5313, 'epoch': 3} {'type': 'loss', 'content': 0.0007244438165798783, 'timestamp': '2025-10-01 04:24:08.979915', 'step': 5314, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:09.023286', 'step': 5314, 'epoch': 3} {'type': 'loss', 'content': 0.0006628523115068674, 'timestamp': '2025-10-01 04:24:09.034166', 'step': 5315, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:09.079837', 'step': 5315, 'epoch': 3} {'type': 'loss', 'content': 0.00012591638369485736, 'timestamp': '2025-10-01 04:24:09.113407', 'step': 5316, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:24:09.147498', 'step': 5316, 'epoch': 3} {'type': 'loss', 'content': 0.0033983965404331684, 'timestamp': '2025-10-01 04:24:09.160916', 'step': 5317, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:09.203271', 'step': 5317, 'epoch': 3} {'type': 'loss', 'content': 8.285330841317773e-05, 'timestamp': '2025-10-01 04:24:09.215824', 'step': 5318, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:09.263958', 'step': 5318, 'epoch': 3} {'type': 'loss', 'content': 0.000225978103117086, 'timestamp': '2025-10-01 04:24:09.275637', 'step': 5319, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:09.325892', 'step': 5319, 'epoch': 3} {'type': 'loss', 'content': 0.00035075191408395767, 'timestamp': '2025-10-01 04:24:09.354966', 'step': 5320, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:24:09.411420', 'step': 5320, 'epoch': 3} {'type': 'loss', 'content': 0.0013047618558630347, 'timestamp': '2025-10-01 04:24:09.415090', 'step': 5321, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:09.461439', 'step': 5321, 'epoch': 3} {'type': 'loss', 'content': 5.934026921750046e-05, 'timestamp': '2025-10-01 04:24:09.473999', 'step': 5322, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:24:09.535286', 'step': 5322, 'epoch': 3} {'type': 'loss', 'content': 0.00391207542270422, 'timestamp': '2025-10-01 04:24:09.551411', 'step': 5323, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:24:09.601202', 'step': 5323, 'epoch': 3} {'type': 'loss', 'content': 0.00011834234464913607, 'timestamp': '2025-10-01 04:24:09.632931', 'step': 5324, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:09.688789', 'step': 5324, 'epoch': 3} {'type': 'loss', 'content': 3.603342338465154e-05, 'timestamp': '2025-10-01 04:24:09.701428', 'step': 5325, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:09.746825', 'step': 5325, 'epoch': 3} {'type': 'loss', 'content': 0.0013527447590604424, 'timestamp': '2025-10-01 04:24:09.759386', 'step': 5326, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:09.799934', 'step': 5326, 'epoch': 3} {'type': 'loss', 'content': 0.00021965143969282508, 'timestamp': '2025-10-01 04:24:09.810698', 'step': 5327, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:09.850564', 'step': 5327, 'epoch': 3} {'type': 'loss', 'content': 0.00026375128072686493, 'timestamp': '2025-10-01 04:24:09.885515', 'step': 5328, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:09.932208', 'step': 5328, 'epoch': 3} {'type': 'loss', 'content': 0.0002767608384601772, 'timestamp': '2025-10-01 04:24:09.941451', 'step': 5329, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:10.008323', 'step': 5329, 'epoch': 3} {'type': 'loss', 'content': 0.0017824809765443206, 'timestamp': '2025-10-01 04:24:10.020907', 'step': 5330, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:10.080271', 'step': 5330, 'epoch': 3} {'type': 'loss', 'content': 0.002543339505791664, 'timestamp': '2025-10-01 04:24:10.093101', 'step': 5331, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:10.144871', 'step': 5331, 'epoch': 3} {'type': 'loss', 'content': 0.0077201565727591515, 'timestamp': '2025-10-01 04:24:10.178313', 'step': 5332, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:10.260236', 'step': 5332, 'epoch': 3} {'type': 'loss', 'content': 0.0004084617248736322, 'timestamp': '2025-10-01 04:24:10.273615', 'step': 5333, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:10.361869', 'step': 5333, 'epoch': 3} {'type': 'loss', 'content': 0.00017093571659643203, 'timestamp': '2025-10-01 04:24:10.375403', 'step': 5334, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:10.431677', 'step': 5334, 'epoch': 3} {'type': 'loss', 'content': 0.04364308714866638, 'timestamp': '2025-10-01 04:24:10.445252', 'step': 5335, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:10.485096', 'step': 5335, 'epoch': 3} {'type': 'loss', 'content': 0.0006633953307755291, 'timestamp': '2025-10-01 04:24:10.518615', 'step': 5336, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:10.562066', 'step': 5336, 'epoch': 3} {'type': 'loss', 'content': 0.01324351504445076, 'timestamp': '2025-10-01 04:24:10.572561', 'step': 5337, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:10.610574', 'step': 5337, 'epoch': 3} {'type': 'loss', 'content': 0.0018490488873794675, 'timestamp': '2025-10-01 04:24:10.621447', 'step': 5338, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:24:10.656098', 'step': 5338, 'epoch': 3} {'type': 'loss', 'content': 0.0003320764808449894, 'timestamp': '2025-10-01 04:24:10.664266', 'step': 5339, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:10.698767', 'step': 5339, 'epoch': 3} {'type': 'loss', 'content': 0.0014769969275221229, 'timestamp': '2025-10-01 04:24:10.730519', 'step': 5340, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:10.767442', 'step': 5340, 'epoch': 3} {'type': 'loss', 'content': 0.0009617318282835186, 'timestamp': '2025-10-01 04:24:10.780829', 'step': 5341, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:24:10.815523', 'step': 5341, 'epoch': 3} {'type': 'loss', 'content': 0.00041798781603574753, 'timestamp': '2025-10-01 04:24:10.823116', 'step': 5342, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:10.863156', 'step': 5342, 'epoch': 3} {'type': 'loss', 'content': 0.001964788418263197, 'timestamp': '2025-10-01 04:24:10.877227', 'step': 5343, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:10.910165', 'step': 5343, 'epoch': 3} {'type': 'loss', 'content': 0.00043101137271150947, 'timestamp': '2025-10-01 04:24:10.939335', 'step': 5344, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:10.974199', 'step': 5344, 'epoch': 3} {'type': 'loss', 'content': 0.009940596297383308, 'timestamp': '2025-10-01 04:24:10.982517', 'step': 5345, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:11.016866', 'step': 5345, 'epoch': 3} {'type': 'loss', 'content': 0.0008398623904213309, 'timestamp': '2025-10-01 04:24:11.029609', 'step': 5346, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:24:11.064979', 'step': 5346, 'epoch': 3} {'type': 'loss', 'content': 0.0037006381899118423, 'timestamp': '2025-10-01 04:24:11.072882', 'step': 5347, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:11.117617', 'step': 5347, 'epoch': 3} {'type': 'loss', 'content': 0.00033058816916309297, 'timestamp': '2025-10-01 04:24:11.152103', 'step': 5348, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:11.194803', 'step': 5348, 'epoch': 3} {'type': 'loss', 'content': 0.021475061774253845, 'timestamp': '2025-10-01 04:24:11.203936', 'step': 5349, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:11.240961', 'step': 5349, 'epoch': 3} {'type': 'loss', 'content': 0.008198228664696217, 'timestamp': '2025-10-01 04:24:11.250019', 'step': 5350, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:11.285430', 'step': 5350, 'epoch': 3} {'type': 'loss', 'content': 0.0022210590541362762, 'timestamp': '2025-10-01 04:24:11.296070', 'step': 5351, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:11.337679', 'step': 5351, 'epoch': 3} {'type': 'loss', 'content': 0.040309492498636246, 'timestamp': '2025-10-01 04:24:11.368975', 'step': 5352, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:11.411637', 'step': 5352, 'epoch': 3} {'type': 'loss', 'content': 0.0004474174347706139, 'timestamp': '2025-10-01 04:24:11.422679', 'step': 5353, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:11.479841', 'step': 5353, 'epoch': 3} {'type': 'loss', 'content': 0.000979223637841642, 'timestamp': '2025-10-01 04:24:11.493483', 'step': 5354, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:11.537604', 'step': 5354, 'epoch': 3} {'type': 'loss', 'content': 0.002622871659696102, 'timestamp': '2025-10-01 04:24:11.551563', 'step': 5355, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:24:11.608574', 'step': 5355, 'epoch': 3} {'type': 'loss', 'content': 0.0017199957510456443, 'timestamp': '2025-10-01 04:24:11.645905', 'step': 5356, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:11.685720', 'step': 5356, 'epoch': 3} {'type': 'loss', 'content': 0.015504877083003521, 'timestamp': '2025-10-01 04:24:11.697903', 'step': 5357, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:11.737228', 'step': 5357, 'epoch': 3} {'type': 'loss', 'content': 0.006553714629262686, 'timestamp': '2025-10-01 04:24:11.750007', 'step': 5358, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:11.787606', 'step': 5358, 'epoch': 3} {'type': 'loss', 'content': 0.005416512489318848, 'timestamp': '2025-10-01 04:24:11.798772', 'step': 5359, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:11.849522', 'step': 5359, 'epoch': 3} {'type': 'loss', 'content': 0.0074325925670564175, 'timestamp': '2025-10-01 04:24:11.884504', 'step': 5360, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:24:11.934798', 'step': 5360, 'epoch': 3} {'type': 'loss', 'content': 0.007007421460002661, 'timestamp': '2025-10-01 04:24:11.948313', 'step': 5361, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:11.991496', 'step': 5361, 'epoch': 3} {'type': 'loss', 'content': 0.002924438565969467, 'timestamp': '2025-10-01 04:24:12.000474', 'step': 5362, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:12.046985', 'step': 5362, 'epoch': 3} {'type': 'loss', 'content': 0.002014034427702427, 'timestamp': '2025-10-01 04:24:12.059718', 'step': 5363, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:12.096235', 'step': 5363, 'epoch': 3} {'type': 'loss', 'content': 0.011110998690128326, 'timestamp': '2025-10-01 04:24:12.125369', 'step': 5364, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-10-01 04:24:12.157584', 'step': 5364, 'epoch': 3} {'type': 'loss', 'content': 0.00929843820631504, 'timestamp': '2025-10-01 04:24:12.162713', 'step': 5365, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:12.194470', 'step': 5365, 'epoch': 3} {'type': 'loss', 'content': 0.01261622179299593, 'timestamp': '2025-10-01 04:24:12.202816', 'step': 5366, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:12.235341', 'step': 5366, 'epoch': 3} {'type': 'loss', 'content': 0.006301419343799353, 'timestamp': '2025-10-01 04:24:12.246225', 'step': 5367, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:24:12.279347', 'step': 5367, 'epoch': 3} {'type': 'loss', 'content': 0.00391644611954689, 'timestamp': '2025-10-01 04:24:12.308160', 'step': 5368, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:12.341117', 'step': 5368, 'epoch': 3} {'type': 'loss', 'content': 0.004323533270508051, 'timestamp': '2025-10-01 04:24:12.351273', 'step': 5369, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 13763770790336}, 'timestamp': '2025-10-01 04:24:12.398523', 'step': 5369, 'epoch': 3} {'type': 'loss', 'content': 0.0075971828773617744, 'timestamp': '2025-10-01 04:24:12.415870', 'step': 5370, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:24:12.460627', 'step': 5370, 'epoch': 3} {'type': 'loss', 'content': 0.020333992317318916, 'timestamp': '2025-10-01 04:24:12.476930', 'step': 5371, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:12.516255', 'step': 5371, 'epoch': 3} {'type': 'loss', 'content': 0.009756685234606266, 'timestamp': '2025-10-01 04:24:12.550757', 'step': 5372, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:12.588213', 'step': 5372, 'epoch': 3} {'type': 'loss', 'content': 0.0005899005336686969, 'timestamp': '2025-10-01 04:24:12.598599', 'step': 5373, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:12.676691', 'step': 5373, 'epoch': 3} {'type': 'loss', 'content': 0.003595922840759158, 'timestamp': '2025-10-01 04:24:12.690231', 'step': 5374, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:12.752623', 'step': 5374, 'epoch': 3} {'type': 'loss', 'content': 0.009392709471285343, 'timestamp': '2025-10-01 04:24:12.769261', 'step': 5375, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:12.858643', 'step': 5375, 'epoch': 3} {'type': 'loss', 'content': 0.008601628243923187, 'timestamp': '2025-10-01 04:24:12.893178', 'step': 5376, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:12.967955', 'step': 5376, 'epoch': 3} {'type': 'loss', 'content': 0.00537619274109602, 'timestamp': '2025-10-01 04:24:12.981332', 'step': 5377, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:13.042586', 'step': 5377, 'epoch': 3} {'type': 'loss', 'content': 0.01148083247244358, 'timestamp': '2025-10-01 04:24:13.053451', 'step': 5378, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:13.125094', 'step': 5378, 'epoch': 3} {'type': 'loss', 'content': 0.00954387802630663, 'timestamp': '2025-10-01 04:24:13.139146', 'step': 5379, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:13.189665', 'step': 5379, 'epoch': 3} {'type': 'loss', 'content': 0.008286572061479092, 'timestamp': '2025-10-01 04:24:13.223178', 'step': 5380, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:13.276348', 'step': 5380, 'epoch': 3} {'type': 'loss', 'content': 0.0013868106761947274, 'timestamp': '2025-10-01 04:24:13.289722', 'step': 5381, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:13.352691', 'step': 5381, 'epoch': 3} {'type': 'loss', 'content': 0.002302577719092369, 'timestamp': '2025-10-01 04:24:13.365519', 'step': 5382, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:13.426248', 'step': 5382, 'epoch': 3} {'type': 'loss', 'content': 0.0018614489817991853, 'timestamp': '2025-10-01 04:24:13.434639', 'step': 5383, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:13.491338', 'step': 5383, 'epoch': 3} {'type': 'loss', 'content': 0.003347143530845642, 'timestamp': '2025-10-01 04:24:13.525904', 'step': 5384, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:24:13.589460', 'step': 5384, 'epoch': 3} {'type': 'loss', 'content': 0.000540277105756104, 'timestamp': '2025-10-01 04:24:13.604877', 'step': 5385, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:13.662038', 'step': 5385, 'epoch': 3} {'type': 'loss', 'content': 0.005502732936292887, 'timestamp': '2025-10-01 04:24:13.674593', 'step': 5386, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:13.731139', 'step': 5386, 'epoch': 3} {'type': 'loss', 'content': 0.005238107871264219, 'timestamp': '2025-10-01 04:24:13.743926', 'step': 5387, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:13.780194', 'step': 5387, 'epoch': 3} {'type': 'loss', 'content': 0.0015437916154041886, 'timestamp': '2025-10-01 04:24:13.813712', 'step': 5388, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:13.866010', 'step': 5388, 'epoch': 3} {'type': 'loss', 'content': 0.0011621613521128893, 'timestamp': '2025-10-01 04:24:13.876324', 'step': 5389, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:13.930956', 'step': 5389, 'epoch': 3} {'type': 'loss', 'content': 0.01002065371721983, 'timestamp': '2025-10-01 04:24:13.942459', 'step': 5390, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:14.005763', 'step': 5390, 'epoch': 3} {'type': 'loss', 'content': 0.006555043626576662, 'timestamp': '2025-10-01 04:24:14.016456', 'step': 5391, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:14.063529', 'step': 5391, 'epoch': 3} {'type': 'loss', 'content': 0.0013517828192561865, 'timestamp': '2025-10-01 04:24:14.095141', 'step': 5392, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:14.155152', 'step': 5392, 'epoch': 3} {'type': 'loss', 'content': 0.0031422749161720276, 'timestamp': '2025-10-01 04:24:14.164445', 'step': 5393, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:14.231095', 'step': 5393, 'epoch': 3} {'type': 'loss', 'content': 0.005673611536622047, 'timestamp': '2025-10-01 04:24:14.242792', 'step': 5394, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:24:14.289396', 'step': 5394, 'epoch': 3} {'type': 'loss', 'content': 0.002438697963953018, 'timestamp': '2025-10-01 04:24:14.301935', 'step': 5395, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:14.368157', 'step': 5395, 'epoch': 3} {'type': 'loss', 'content': 0.003369108308106661, 'timestamp': '2025-10-01 04:24:14.403153', 'step': 5396, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:14.459576', 'step': 5396, 'epoch': 3} {'type': 'loss', 'content': 0.023969609290361404, 'timestamp': '2025-10-01 04:24:14.473026', 'step': 5397, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:14.532915', 'step': 5397, 'epoch': 3} {'type': 'loss', 'content': 0.030428415164351463, 'timestamp': '2025-10-01 04:24:14.545685', 'step': 5398, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:14.604335', 'step': 5398, 'epoch': 3} {'type': 'loss', 'content': 0.01132383942604065, 'timestamp': '2025-10-01 04:24:14.615106', 'step': 5399, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:14.673566', 'step': 5399, 'epoch': 3} {'type': 'loss', 'content': 0.024569733068346977, 'timestamp': '2025-10-01 04:24:14.706190', 'step': 5400, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:14.765712', 'step': 5400, 'epoch': 3} {'type': 'loss', 'content': 0.0020378141198307276, 'timestamp': '2025-10-01 04:24:14.779076', 'step': 5401, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:14.840110', 'step': 5401, 'epoch': 3} {'type': 'loss', 'content': 0.0033752743620425463, 'timestamp': '2025-10-01 04:24:14.854162', 'step': 5402, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 13289167064320}, 'timestamp': '2025-10-01 04:24:14.932466', 'step': 5402, 'epoch': 3} {'type': 'loss', 'content': 0.004570630844682455, 'timestamp': '2025-10-01 04:24:14.949007', 'step': 5403, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:15.006996', 'step': 5403, 'epoch': 3} {'type': 'loss', 'content': 0.005151030607521534, 'timestamp': '2025-10-01 04:24:15.039529', 'step': 5404, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:15.095811', 'step': 5404, 'epoch': 3} {'type': 'loss', 'content': 0.0007375701097771525, 'timestamp': '2025-10-01 04:24:15.106152', 'step': 5405, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:24:18.687922', 'step': 5405, 'epoch': 3} {'type': 'pplx', 'content': 6.193329987624794, 'timestamp': '2025-10-01 04:24:18.693876', 'step': 5405, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:18.754835', 'step': 5405, 'epoch': 3} {'type': 'loss', 'content': 0.005084317177534103, 'timestamp': '2025-10-01 04:24:18.766843', 'step': 5406, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:18.810046', 'step': 5406, 'epoch': 3} {'type': 'loss', 'content': 0.0038444260135293007, 'timestamp': '2025-10-01 04:24:18.820150', 'step': 5407, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:18.879027', 'step': 5407, 'epoch': 3} {'type': 'loss', 'content': 0.00546163460239768, 'timestamp': '2025-10-01 04:24:18.912337', 'step': 5408, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:18.970955', 'step': 5408, 'epoch': 3} {'type': 'loss', 'content': 0.004424616694450378, 'timestamp': '2025-10-01 04:24:18.978548', 'step': 5409, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:19.050196', 'step': 5409, 'epoch': 3} {'type': 'loss', 'content': 0.005965071264654398, 'timestamp': '2025-10-01 04:24:19.060056', 'step': 5410, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:19.118228', 'step': 5410, 'epoch': 3} {'type': 'loss', 'content': 0.003437751904129982, 'timestamp': '2025-10-01 04:24:19.129915', 'step': 5411, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:19.196552', 'step': 5411, 'epoch': 3} {'type': 'loss', 'content': 0.0541568249464035, 'timestamp': '2025-10-01 04:24:19.228295', 'step': 5412, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:19.293738', 'step': 5412, 'epoch': 3} {'type': 'loss', 'content': 0.037522610276937485, 'timestamp': '2025-10-01 04:24:19.299029', 'step': 5413, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:19.352755', 'step': 5413, 'epoch': 3} {'type': 'loss', 'content': 0.007879835553467274, 'timestamp': '2025-10-01 04:24:19.366360', 'step': 5414, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:19.412076', 'step': 5414, 'epoch': 3} {'type': 'loss', 'content': 0.007575256749987602, 'timestamp': '2025-10-01 04:24:19.421801', 'step': 5415, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:19.467541', 'step': 5415, 'epoch': 3} {'type': 'loss', 'content': 0.007213911507278681, 'timestamp': '2025-10-01 04:24:19.502474', 'step': 5416, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:19.553305', 'step': 5416, 'epoch': 3} {'type': 'loss', 'content': 0.006511066574603319, 'timestamp': '2025-10-01 04:24:19.561790', 'step': 5417, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:24:19.610848', 'step': 5417, 'epoch': 3} {'type': 'loss', 'content': 0.0033236127346754074, 'timestamp': '2025-10-01 04:24:19.618772', 'step': 5418, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:19.669120', 'step': 5418, 'epoch': 3} {'type': 'loss', 'content': 0.002101871417835355, 'timestamp': '2025-10-01 04:24:19.680694', 'step': 5419, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:19.719385', 'step': 5419, 'epoch': 3} {'type': 'loss', 'content': 0.007651519495993853, 'timestamp': '2025-10-01 04:24:19.751211', 'step': 5420, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:19.801205', 'step': 5420, 'epoch': 3} {'type': 'loss', 'content': 0.0020623819436877966, 'timestamp': '2025-10-01 04:24:19.813848', 'step': 5421, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:19.874858', 'step': 5421, 'epoch': 3} {'type': 'loss', 'content': 0.003336213529109955, 'timestamp': '2025-10-01 04:24:19.887466', 'step': 5422, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:19.946653', 'step': 5422, 'epoch': 3} {'type': 'loss', 'content': 0.003660374553874135, 'timestamp': '2025-10-01 04:24:19.959209', 'step': 5423, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:20.018074', 'step': 5423, 'epoch': 3} {'type': 'loss', 'content': 0.00549287861213088, 'timestamp': '2025-10-01 04:24:20.057561', 'step': 5424, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:20.114700', 'step': 5424, 'epoch': 3} {'type': 'loss', 'content': 0.05047179386019707, 'timestamp': '2025-10-01 04:24:20.128220', 'step': 5425, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:24:20.182998', 'step': 5425, 'epoch': 3} {'type': 'loss', 'content': 0.007800444960594177, 'timestamp': '2025-10-01 04:24:20.190400', 'step': 5426, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:24:20.240958', 'step': 5426, 'epoch': 3} {'type': 'loss', 'content': 0.10627952218055725, 'timestamp': '2025-10-01 04:24:20.245535', 'step': 5427, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:24:20.297049', 'step': 5427, 'epoch': 3} {'type': 'loss', 'content': 0.0730108842253685, 'timestamp': '2025-10-01 04:24:20.326232', 'step': 5428, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:20.380895', 'step': 5428, 'epoch': 3} {'type': 'loss', 'content': 0.0056351423263549805, 'timestamp': '2025-10-01 04:24:20.390157', 'step': 5429, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:20.442849', 'step': 5429, 'epoch': 3} {'type': 'loss', 'content': 0.0048440201207995415, 'timestamp': '2025-10-01 04:24:20.452730', 'step': 5430, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:20.502084', 'step': 5430, 'epoch': 3} {'type': 'loss', 'content': 0.005652627442032099, 'timestamp': '2025-10-01 04:24:20.511580', 'step': 5431, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:24:20.555959', 'step': 5431, 'epoch': 3} {'type': 'loss', 'content': 0.006298520602285862, 'timestamp': '2025-10-01 04:24:20.586462', 'step': 5432, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:20.644703', 'step': 5432, 'epoch': 3} {'type': 'loss', 'content': 0.005774347111582756, 'timestamp': '2025-10-01 04:24:20.654194', 'step': 5433, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:20.712252', 'step': 5433, 'epoch': 3} {'type': 'loss', 'content': 0.016085920855402946, 'timestamp': '2025-10-01 04:24:20.720802', 'step': 5434, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:20.790356', 'step': 5434, 'epoch': 3} {'type': 'loss', 'content': 0.006684123072773218, 'timestamp': '2025-10-01 04:24:20.802313', 'step': 5435, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:20.857994', 'step': 5435, 'epoch': 3} {'type': 'loss', 'content': 0.005314816255122423, 'timestamp': '2025-10-01 04:24:20.891143', 'step': 5436, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:20.952615', 'step': 5436, 'epoch': 3} {'type': 'loss', 'content': 0.03055710718035698, 'timestamp': '2025-10-01 04:24:20.962249', 'step': 5437, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:21.028890', 'step': 5437, 'epoch': 3} {'type': 'loss', 'content': 0.008745490573346615, 'timestamp': '2025-10-01 04:24:21.042420', 'step': 5438, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:21.123505', 'step': 5438, 'epoch': 3} {'type': 'loss', 'content': 0.011318737640976906, 'timestamp': '2025-10-01 04:24:21.130314', 'step': 5439, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:21.209964', 'step': 5439, 'epoch': 3} {'type': 'loss', 'content': 0.005292117595672607, 'timestamp': '2025-10-01 04:24:21.239943', 'step': 5440, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:24:21.332420', 'step': 5440, 'epoch': 3} {'type': 'loss', 'content': 0.00215301220305264, 'timestamp': '2025-10-01 04:24:21.336768', 'step': 5441, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:21.388908', 'step': 5441, 'epoch': 3} {'type': 'loss', 'content': 0.01711714081466198, 'timestamp': '2025-10-01 04:24:21.397006', 'step': 5442, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:21.456843', 'step': 5442, 'epoch': 3} {'type': 'loss', 'content': 0.01182369701564312, 'timestamp': '2025-10-01 04:24:21.470405', 'step': 5443, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:21.520378', 'step': 5443, 'epoch': 3} {'type': 'loss', 'content': 0.009899038821458817, 'timestamp': '2025-10-01 04:24:21.552752', 'step': 5444, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:21.598562', 'step': 5444, 'epoch': 3} {'type': 'loss', 'content': 0.004453026223927736, 'timestamp': '2025-10-01 04:24:21.605469', 'step': 5445, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:21.647077', 'step': 5445, 'epoch': 3} {'type': 'loss', 'content': 0.014642270281910896, 'timestamp': '2025-10-01 04:24:21.660707', 'step': 5446, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 14712978242368}, 'timestamp': '2025-10-01 04:24:21.710357', 'step': 5446, 'epoch': 3} {'type': 'loss', 'content': 0.007288447115570307, 'timestamp': '2025-10-01 04:24:21.728137', 'step': 5447, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:21.772523', 'step': 5447, 'epoch': 3} {'type': 'loss', 'content': 0.015585031360387802, 'timestamp': '2025-10-01 04:24:21.807544', 'step': 5448, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 400], 'flops': 11865355886272}, 'timestamp': '2025-10-01 04:24:21.850304', 'step': 5448, 'epoch': 3} {'type': 'loss', 'content': 0.014378228224813938, 'timestamp': '2025-10-01 04:24:21.865693', 'step': 5449, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:21.902329', 'step': 5449, 'epoch': 3} {'type': 'loss', 'content': 0.00515323132276535, 'timestamp': '2025-10-01 04:24:21.915071', 'step': 5450, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:21.961449', 'step': 5450, 'epoch': 3} {'type': 'loss', 'content': 0.016072405502200127, 'timestamp': '2025-10-01 04:24:21.974229', 'step': 5451, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:22.032735', 'step': 5451, 'epoch': 3} {'type': 'loss', 'content': 0.017450502142310143, 'timestamp': '2025-10-01 04:24:22.068586', 'step': 5452, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:22.109138', 'step': 5452, 'epoch': 3} {'type': 'loss', 'content': 0.0013844125205650926, 'timestamp': '2025-10-01 04:24:22.120253', 'step': 5453, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:22.173021', 'step': 5453, 'epoch': 3} {'type': 'loss', 'content': 0.008425706997513771, 'timestamp': '2025-10-01 04:24:22.185625', 'step': 5454, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:22.261926', 'step': 5454, 'epoch': 3} {'type': 'loss', 'content': 0.004553755279630423, 'timestamp': '2025-10-01 04:24:22.276052', 'step': 5455, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:22.330346', 'step': 5455, 'epoch': 3} {'type': 'loss', 'content': 0.0032538871746510267, 'timestamp': '2025-10-01 04:24:22.362372', 'step': 5456, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:22.425604', 'step': 5456, 'epoch': 3} {'type': 'loss', 'content': 0.00573766091838479, 'timestamp': '2025-10-01 04:24:22.436201', 'step': 5457, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:22.512556', 'step': 5457, 'epoch': 3} {'type': 'loss', 'content': 0.008339236490428448, 'timestamp': '2025-10-01 04:24:22.526576', 'step': 5458, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:22.598516', 'step': 5458, 'epoch': 3} {'type': 'loss', 'content': 0.004803844261914492, 'timestamp': '2025-10-01 04:24:22.611299', 'step': 5459, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:22.655142', 'step': 5459, 'epoch': 3} {'type': 'loss', 'content': 0.004606284666806459, 'timestamp': '2025-10-01 04:24:22.687035', 'step': 5460, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:22.731113', 'step': 5460, 'epoch': 3} {'type': 'loss', 'content': 0.004080477636307478, 'timestamp': '2025-10-01 04:24:22.740217', 'step': 5461, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:22.774815', 'step': 5461, 'epoch': 3} {'type': 'loss', 'content': 0.0018637393368408084, 'timestamp': '2025-10-01 04:24:22.786436', 'step': 5462, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:22.824770', 'step': 5462, 'epoch': 3} {'type': 'loss', 'content': 0.0020233020186424255, 'timestamp': '2025-10-01 04:24:22.833190', 'step': 5463, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:22.875681', 'step': 5463, 'epoch': 3} {'type': 'loss', 'content': 0.031122850254178047, 'timestamp': '2025-10-01 04:24:22.910634', 'step': 5464, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:24:22.954903', 'step': 5464, 'epoch': 3} {'type': 'loss', 'content': 0.003831378649920225, 'timestamp': '2025-10-01 04:24:22.970503', 'step': 5465, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:23.009424', 'step': 5465, 'epoch': 3} {'type': 'loss', 'content': 0.0026969502214342356, 'timestamp': '2025-10-01 04:24:23.023475', 'step': 5466, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:23.063847', 'step': 5466, 'epoch': 3} {'type': 'loss', 'content': 0.00751885399222374, 'timestamp': '2025-10-01 04:24:23.077966', 'step': 5467, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:23.111252', 'step': 5467, 'epoch': 3} {'type': 'loss', 'content': 0.0023064289707690477, 'timestamp': '2025-10-01 04:24:23.144980', 'step': 5468, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:23.185957', 'step': 5468, 'epoch': 3} {'type': 'loss', 'content': 0.0005153215606696904, 'timestamp': '2025-10-01 04:24:23.196545', 'step': 5469, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:23.230597', 'step': 5469, 'epoch': 3} {'type': 'loss', 'content': 0.00027331229648552835, 'timestamp': '2025-10-01 04:24:23.243357', 'step': 5470, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:24:23.291175', 'step': 5470, 'epoch': 3} {'type': 'loss', 'content': 0.0014309536200016737, 'timestamp': '2025-10-01 04:24:23.307322', 'step': 5471, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:24:23.352272', 'step': 5471, 'epoch': 3} {'type': 'loss', 'content': 0.004918236751109362, 'timestamp': '2025-10-01 04:24:23.381506', 'step': 5472, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:24:23.435845', 'step': 5472, 'epoch': 3} {'type': 'loss', 'content': 0.008230761624872684, 'timestamp': '2025-10-01 04:24:23.451721', 'step': 5473, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:23.508286', 'step': 5473, 'epoch': 3} {'type': 'loss', 'content': 0.0023156101815402508, 'timestamp': '2025-10-01 04:24:23.520161', 'step': 5474, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:23.577511', 'step': 5474, 'epoch': 3} {'type': 'loss', 'content': 0.007585729938000441, 'timestamp': '2025-10-01 04:24:23.589241', 'step': 5475, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:23.635303', 'step': 5475, 'epoch': 3} {'type': 'loss', 'content': 0.009170310571789742, 'timestamp': '2025-10-01 04:24:23.667159', 'step': 5476, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 12814563338304}, 'timestamp': '2025-10-01 04:24:23.743468', 'step': 5476, 'epoch': 3} {'type': 'loss', 'content': 0.0024224722292274237, 'timestamp': '2025-10-01 04:24:23.759385', 'step': 5477, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:23.816042', 'step': 5477, 'epoch': 3} {'type': 'loss', 'content': 0.002916931640356779, 'timestamp': '2025-10-01 04:24:23.830123', 'step': 5478, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:23.895059', 'step': 5478, 'epoch': 3} {'type': 'loss', 'content': 0.006915598642081022, 'timestamp': '2025-10-01 04:24:23.909134', 'step': 5479, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 14238374516352}, 'timestamp': '2025-10-01 04:24:23.970716', 'step': 5479, 'epoch': 3} {'type': 'loss', 'content': 0.0044959308579564095, 'timestamp': '2025-10-01 04:24:24.009214', 'step': 5480, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:24.056537', 'step': 5480, 'epoch': 3} {'type': 'loss', 'content': 0.0054198033176362514, 'timestamp': '2025-10-01 04:24:24.065911', 'step': 5481, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:24.119758', 'step': 5481, 'epoch': 3} {'type': 'loss', 'content': 0.004357680678367615, 'timestamp': '2025-10-01 04:24:24.132349', 'step': 5482, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:24.181201', 'step': 5482, 'epoch': 3} {'type': 'loss', 'content': 0.011778258718550205, 'timestamp': '2025-10-01 04:24:24.195242', 'step': 5483, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:24:24.250777', 'step': 5483, 'epoch': 3} {'type': 'loss', 'content': 0.0042909434996545315, 'timestamp': '2025-10-01 04:24:24.285927', 'step': 5484, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:24.337307', 'step': 5484, 'epoch': 3} {'type': 'loss', 'content': 0.004506285302340984, 'timestamp': '2025-10-01 04:24:24.347036', 'step': 5485, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:24.391892', 'step': 5485, 'epoch': 3} {'type': 'loss', 'content': 0.0039129238575696945, 'timestamp': '2025-10-01 04:24:24.405491', 'step': 5486, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:24.458476', 'step': 5486, 'epoch': 3} {'type': 'loss', 'content': 0.0028871966060250998, 'timestamp': '2025-10-01 04:24:24.472524', 'step': 5487, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:24.518757', 'step': 5487, 'epoch': 3} {'type': 'loss', 'content': 0.0012523906771093607, 'timestamp': '2025-10-01 04:24:24.552236', 'step': 5488, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:24.606593', 'step': 5488, 'epoch': 3} {'type': 'loss', 'content': 0.0040623885579407215, 'timestamp': '2025-10-01 04:24:24.616871', 'step': 5489, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:24.667101', 'step': 5489, 'epoch': 3} {'type': 'loss', 'content': 0.018081141635775566, 'timestamp': '2025-10-01 04:24:24.678663', 'step': 5490, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:24.729139', 'step': 5490, 'epoch': 3} {'type': 'loss', 'content': 0.002489799167960882, 'timestamp': '2025-10-01 04:24:24.739844', 'step': 5491, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:24.797232', 'step': 5491, 'epoch': 3} {'type': 'loss', 'content': 0.012280495837330818, 'timestamp': '2025-10-01 04:24:24.829794', 'step': 5492, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-10-01 04:24:24.868349', 'step': 5492, 'epoch': 3} {'type': 'loss', 'content': 0.0032575256191194057, 'timestamp': '2025-10-01 04:24:24.880564', 'step': 5493, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:24:24.933246', 'step': 5493, 'epoch': 3} {'type': 'loss', 'content': 0.008565392345190048, 'timestamp': '2025-10-01 04:24:24.940643', 'step': 5494, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:24.998325', 'step': 5494, 'epoch': 3} {'type': 'loss', 'content': 0.0019903990905731916, 'timestamp': '2025-10-01 04:24:25.011834', 'step': 5495, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:25.064621', 'step': 5495, 'epoch': 3} {'type': 'loss', 'content': 0.001956341555342078, 'timestamp': '2025-10-01 04:24:25.098311', 'step': 5496, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:25.147366', 'step': 5496, 'epoch': 3} {'type': 'loss', 'content': 0.0070889657363295555, 'timestamp': '2025-10-01 04:24:25.157844', 'step': 5497, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:25.209387', 'step': 5497, 'epoch': 3} {'type': 'loss', 'content': 0.0019742052536457777, 'timestamp': '2025-10-01 04:24:25.222205', 'step': 5498, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:25.278173', 'step': 5498, 'epoch': 3} {'type': 'loss', 'content': 0.0010288851335644722, 'timestamp': '2025-10-01 04:24:25.291749', 'step': 5499, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:25.340682', 'step': 5499, 'epoch': 3} {'type': 'loss', 'content': 0.0029652214143425226, 'timestamp': '2025-10-01 04:24:25.374202', 'step': 5500, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 5500', 'timestamp': '2025-10-01 04:24:30.719115', 'step': 5500, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:24:30.772383', 'step': 5500, 'epoch': 3} {'type': 'loss', 'content': 0.0005860340315848589, 'timestamp': '2025-10-01 04:24:30.785853', 'step': 5501, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:30.837332', 'step': 5501, 'epoch': 3} {'type': 'loss', 'content': 0.003824775805696845, 'timestamp': '2025-10-01 04:24:30.849279', 'step': 5502, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 17560600598464}, 'timestamp': '2025-10-01 04:24:30.923945', 'step': 5502, 'epoch': 3} {'type': 'loss', 'content': 0.0023927565198391676, 'timestamp': '2025-10-01 04:24:30.945175', 'step': 5503, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:30.993742', 'step': 5503, 'epoch': 3} {'type': 'loss', 'content': 0.0073613799177110195, 'timestamp': '2025-10-01 04:24:31.028268', 'step': 5504, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-10-01 04:24:31.075092', 'step': 5504, 'epoch': 3} {'type': 'loss', 'content': 0.002768108155578375, 'timestamp': '2025-10-01 04:24:31.084415', 'step': 5505, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:31.124703', 'step': 5505, 'epoch': 3} {'type': 'loss', 'content': 0.0001357414002995938, 'timestamp': '2025-10-01 04:24:31.137270', 'step': 5506, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:31.185400', 'step': 5506, 'epoch': 3} {'type': 'loss', 'content': 0.0009535987628623843, 'timestamp': '2025-10-01 04:24:31.198921', 'step': 5507, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:31.242708', 'step': 5507, 'epoch': 3} {'type': 'loss', 'content': 0.0029519288800656796, 'timestamp': '2025-10-01 04:24:31.276434', 'step': 5508, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:31.322336', 'step': 5508, 'epoch': 3} {'type': 'loss', 'content': 0.004851772449910641, 'timestamp': '2025-10-01 04:24:31.335226', 'step': 5509, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:31.378989', 'step': 5509, 'epoch': 3} {'type': 'loss', 'content': 0.007886880077421665, 'timestamp': '2025-10-01 04:24:31.390387', 'step': 5510, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:31.433022', 'step': 5510, 'epoch': 3} {'type': 'loss', 'content': 0.012297947891056538, 'timestamp': '2025-10-01 04:24:31.445565', 'step': 5511, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:31.493209', 'step': 5511, 'epoch': 3} {'type': 'loss', 'content': 0.01271713338792324, 'timestamp': '2025-10-01 04:24:31.528185', 'step': 5512, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:31.597699', 'step': 5512, 'epoch': 3} {'type': 'loss', 'content': 0.004880187567323446, 'timestamp': '2025-10-01 04:24:31.610544', 'step': 5513, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:31.661516', 'step': 5513, 'epoch': 3} {'type': 'loss', 'content': 0.00449975673109293, 'timestamp': '2025-10-01 04:24:31.674318', 'step': 5514, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:31.727191', 'step': 5514, 'epoch': 3} {'type': 'loss', 'content': 0.005049306899309158, 'timestamp': '2025-10-01 04:24:31.741203', 'step': 5515, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:31.782592', 'step': 5515, 'epoch': 3} {'type': 'loss', 'content': 0.004584277980029583, 'timestamp': '2025-10-01 04:24:31.816831', 'step': 5516, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:31.851916', 'step': 5516, 'epoch': 3} {'type': 'loss', 'content': 0.044918738305568695, 'timestamp': '2025-10-01 04:24:31.862330', 'step': 5517, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-10-01 04:24:31.903745', 'step': 5517, 'epoch': 3} {'type': 'loss', 'content': 0.01198052242398262, 'timestamp': '2025-10-01 04:24:31.915264', 'step': 5518, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-10-01 04:24:31.967300', 'step': 5518, 'epoch': 3} {'type': 'loss', 'content': 0.030341144651174545, 'timestamp': '2025-10-01 04:24:31.978096', 'step': 5519, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 11390752160256}, 'timestamp': '2025-10-01 04:24:32.021895', 'step': 5519, 'epoch': 3} {'type': 'loss', 'content': 0.0008790760766714811, 'timestamp': '2025-10-01 04:24:32.057001', 'step': 5520, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:24:35.410432', 'step': 5520, 'epoch': 3} {'type': 'pplx', 'content': 5.962782005038004, 'timestamp': '2025-10-01 04:24:35.414564', 'step': 5520, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:35.462983', 'step': 5520, 'epoch': 3} {'type': 'loss', 'content': 0.013343345373868942, 'timestamp': '2025-10-01 04:24:35.476405', 'step': 5521, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 15187581968384}, 'timestamp': '2025-10-01 04:24:35.560617', 'step': 5521, 'epoch': 3} {'type': 'loss', 'content': 0.008724113926291466, 'timestamp': '2025-10-01 04:24:35.578559', 'step': 5522, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 12339959612288}, 'timestamp': '2025-10-01 04:24:35.630677', 'step': 5522, 'epoch': 3} {'type': 'loss', 'content': 0.00465447548776865, 'timestamp': '2025-10-01 04:24:35.646826', 'step': 5523, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:35.714309', 'step': 5523, 'epoch': 3} {'type': 'loss', 'content': 0.0010397362057119608, 'timestamp': '2025-10-01 04:24:35.748886', 'step': 5524, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-10-01 04:24:35.811598', 'step': 5524, 'epoch': 3} {'type': 'loss', 'content': 0.003975869156420231, 'timestamp': '2025-10-01 04:24:35.822180', 'step': 5525, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:35.875300', 'step': 5525, 'epoch': 3} {'type': 'loss', 'content': 0.004421168472617865, 'timestamp': '2025-10-01 04:24:35.888855', 'step': 5526, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:35.929945', 'step': 5526, 'epoch': 3} {'type': 'loss', 'content': 0.005012675188481808, 'timestamp': '2025-10-01 04:24:35.943522', 'step': 5527, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 352], 'flops': 10441544708224}, 'timestamp': '2025-10-01 04:24:36.000053', 'step': 5527, 'epoch': 3} {'type': 'loss', 'content': 0.0019066582899540663, 'timestamp': '2025-10-01 04:24:36.035078', 'step': 5528, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-10-01 04:24:36.087915', 'step': 5528, 'epoch': 3} {'type': 'loss', 'content': 0.0024140230379998684, 'timestamp': '2025-10-01 04:24:36.100725', 'step': 5529, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 320], 'flops': 9492337256192}, 'timestamp': '2025-10-01 04:24:36.146678', 'step': 5529, 'epoch': 3} {'type': 'loss', 'content': 0.006315488833934069, 'timestamp': '2025-10-01 04:24:36.159399', 'step': 5530, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 368], 'flops': 10916148434240}, 'timestamp': '2025-10-01 04:24:36.211368', 'step': 5530, 'epoch': 3} {'type': 'loss', 'content': 0.007864706218242645, 'timestamp': '2025-10-01 04:24:36.225375', 'step': 5531, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-10-01 04:24:36.276172', 'step': 5531, 'epoch': 3} {'type': 'loss', 'content': 0.015821628272533417, 'timestamp': '2025-10-01 04:24:36.305268', 'step': 5532, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 464], 'batch_size': 8, 'flops': 9175621452416}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 336], 'batch_size': 8, 'flops': 6644415553152}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 400], 'batch_size': 8, 'flops': 7910018502784}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 416], 'batch_size': 8, 'flops': 8226419240192}, {'type': 'perplexity', 'in_batch_dim': [8, 384], 'batch_size': 8, 'flops': 7593617765376}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 368], 'batch_size': 8, 'flops': 7277217027968}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 480], 'batch_size': 8, 'flops': 9492022189824}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 240], 'batch_size': 8, 'flops': 4746011128704}, {'type': 'perplexity', 'in_batch_dim': [8, 272], 'batch_size': 8, 'flops': 5378812603520}, {'type': 'perplexity', 'in_batch_dim': [8, 288], 'batch_size': 8, 'flops': 5695213340928}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 352], 'batch_size': 8, 'flops': 6960816290560}, {'type': 'perplexity', 'in_batch_dim': [8, 304], 'batch_size': 8, 'flops': 6011614078336}, {'type': 'perplexity', 'in_batch_dim': [8, 320], 'batch_size': 8, 'flops': 6328014815744}, {'type': 'perplexity', 'in_batch_dim': [8, 496], 'batch_size': 8, 'flops': 9808422927232}, {'type': 'perplexity', 'in_batch_dim': [3, 240], 'batch_size': 8, 'flops': 4746011128704}], 'timestamp': '2025-10-01 04:24:39.213512', 'step': 5532, 'epoch': 3} {'type': 'pplx', 'content': 5.987665207809609, 'timestamp': '2025-10-01 04:24:39.217090', 'step': 5532, 'epoch': 3} {'type': 'best_pplx', 'content': 5.332488542738945, 'timestamp': '2025-10-01 04:24:39.221353', 'step': 5532, 'epoch': 3} {'type': 'best_step', 'content': 1380, 'timestamp': '2025-10-01 04:24:39.225724', 'step': 5532, 'epoch': 3} {'type': 'total_pplx_flops', 'content': 24378677094380800, 'timestamp': '2025-10-01 04:24:39.228971', 'step': 5532, 'epoch': 3} {'type': 'total_train_flops', 'content': 49118667663965760, 'timestamp': '2025-10-01 04:24:39.232893', 'step': 5532, 'epoch': 3}