| {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}], 'timestamp': '2025-09-02 14:21:27.218893', 'step': 0, 'epoch': 0} |
| {'type': 'pplx', 'content': 783.2801604021665, 'timestamp': '2025-09-02 14:21:27.221018', 'step': 0, 'epoch': 0} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:27.319113', 'step': 0, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5385962724685669, 'timestamp': '2025-09-02 14:21:27.323611', 'step': 1, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:27.486133', 'step': 1, 'epoch': 1} |
| {'type': 'loss', 'content': 0.6766868829727173, 'timestamp': '2025-09-02 14:21:27.488392', 'step': 2, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:27.543411', 'step': 2, 'epoch': 1} |
| {'type': 'loss', 'content': 0.581355094909668, 'timestamp': '2025-09-02 14:21:27.545223', 'step': 3, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:27.598898', 'step': 3, 'epoch': 1} |
| {'type': 'loss', 'content': 0.4923665225505829, 'timestamp': '2025-09-02 14:21:27.641063', 'step': 4, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:27.717530', 'step': 4, 'epoch': 1} |
| {'type': 'loss', 'content': 0.47263336181640625, 'timestamp': '2025-09-02 14:21:27.720862', 'step': 5, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:27.891709', 'step': 5, 'epoch': 1} |
| {'type': 'loss', 'content': 0.4729557931423187, 'timestamp': '2025-09-02 14:21:27.893739', 'step': 6, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:28.110405', 'step': 6, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5970560312271118, 'timestamp': '2025-09-02 14:21:28.112760', 'step': 7, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:28.167553', 'step': 7, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5336737632751465, 'timestamp': '2025-09-02 14:21:28.173811', 'step': 8, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:28.228855', 'step': 8, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5517171621322632, 'timestamp': '2025-09-02 14:21:28.231165', 'step': 9, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:28.285731', 'step': 9, 'epoch': 1} |
| {'type': 'loss', 'content': 0.560826301574707, 'timestamp': '2025-09-02 14:21:28.288103', 'step': 10, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:28.355143', 'step': 10, 'epoch': 1} |
| {'type': 'loss', 'content': 0.4922583997249603, 'timestamp': '2025-09-02 14:21:28.357371', 'step': 11, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:28.412119', 'step': 11, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5552511215209961, 'timestamp': '2025-09-02 14:21:28.418386', 'step': 12, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:28.472140', 'step': 12, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5018913745880127, 'timestamp': '2025-09-02 14:21:28.474202', 'step': 13, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:28.528455', 'step': 13, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5736156702041626, 'timestamp': '2025-09-02 14:21:28.531852', 'step': 14, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:28.586217', 'step': 14, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5742080211639404, 'timestamp': '2025-09-02 14:21:28.588333', 'step': 15, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:28.642064', 'step': 15, 'epoch': 1} |
| {'type': 'loss', 'content': 0.6092644333839417, 'timestamp': '2025-09-02 14:21:28.647970', 'step': 16, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:28.700927', 'step': 16, 'epoch': 1} |
| {'type': 'loss', 'content': 0.3531661331653595, 'timestamp': '2025-09-02 14:21:28.703190', 'step': 17, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:28.756482', 'step': 17, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5819375514984131, 'timestamp': '2025-09-02 14:21:28.758973', 'step': 18, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:28.812931', 'step': 18, 'epoch': 1} |
| {'type': 'loss', 'content': 0.4801793396472931, 'timestamp': '2025-09-02 14:21:28.815217', 'step': 19, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:28.868817', 'step': 19, 'epoch': 1} |
| {'type': 'loss', 'content': 0.455944687128067, 'timestamp': '2025-09-02 14:21:28.874893', 'step': 20, 'epoch': 1} |
| {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}], 'timestamp': '2025-09-02 14:21:28.965740', 'step': 20, 'epoch': 1} |
| {'type': 'pplx', 'content': 696.4661659481613, 'timestamp': '2025-09-02 14:21:28.967294', 'step': 20, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:29.018472', 'step': 20, 'epoch': 1} |
| {'type': 'loss', 'content': 0.24104715883731842, 'timestamp': '2025-09-02 14:21:29.020355', 'step': 21, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:29.073203', 'step': 21, 'epoch': 1} |
| {'type': 'loss', 'content': 0.37299883365631104, 'timestamp': '2025-09-02 14:21:29.075133', 'step': 22, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:29.128468', 'step': 22, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5388383865356445, 'timestamp': '2025-09-02 14:21:29.130426', 'step': 23, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:29.184124', 'step': 23, 'epoch': 1} |
| {'type': 'loss', 'content': 0.43882712721824646, 'timestamp': '2025-09-02 14:21:29.189601', 'step': 24, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-02 14:21:29.248067', 'step': 24, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5819587111473083, 'timestamp': '2025-09-02 14:21:29.250596', 'step': 25, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-02 14:21:29.319504', 'step': 25, 'epoch': 1} |
| {'type': 'loss', 'content': 0.512592613697052, 'timestamp': '2025-09-02 14:21:29.322077', 'step': 26, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:29.374850', 'step': 26, 'epoch': 1} |
| {'type': 'loss', 'content': 0.2706955373287201, 'timestamp': '2025-09-02 14:21:29.377126', 'step': 27, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:29.430306', 'step': 27, 'epoch': 1} |
| {'type': 'loss', 'content': 0.6439404487609863, 'timestamp': '2025-09-02 14:21:29.436441', 'step': 28, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:29.489491', 'step': 28, 'epoch': 1} |
| {'type': 'loss', 'content': 0.2643767297267914, 'timestamp': '2025-09-02 14:21:29.491360', 'step': 29, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:29.544178', 'step': 29, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5361843705177307, 'timestamp': '2025-09-02 14:21:29.546344', 'step': 30, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:29.599006', 'step': 30, 'epoch': 1} |
| {'type': 'loss', 'content': 0.28547146916389465, 'timestamp': '2025-09-02 14:21:29.601547', 'step': 31, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-02 14:21:29.654959', 'step': 31, 'epoch': 1} |
| {'type': 'loss', 'content': 0.4324412941932678, 'timestamp': '2025-09-02 14:21:29.660354', 'step': 32, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:29.714266', 'step': 32, 'epoch': 1} |
| {'type': 'loss', 'content': 0.5039660334587097, 'timestamp': '2025-09-02 14:21:29.716507', 'step': 33, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:29.769526', 'step': 33, 'epoch': 1} |
| {'type': 'loss', 'content': 0.14017771184444427, 'timestamp': '2025-09-02 14:21:29.772206', 'step': 34, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:29.825620', 'step': 34, 'epoch': 1} |
| {'type': 'loss', 'content': 0.49862122535705566, 'timestamp': '2025-09-02 14:21:29.827919', 'step': 35, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-02 14:21:29.881779', 'step': 35, 'epoch': 1} |
| {'type': 'loss', 'content': 0.34906959533691406, 'timestamp': '2025-09-02 14:21:29.887914', 'step': 36, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:29.940574', 'step': 36, 'epoch': 1} |
| {'type': 'loss', 'content': 0.44707420468330383, 'timestamp': '2025-09-02 14:21:29.943230', 'step': 37, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:29.997237', 'step': 37, 'epoch': 1} |
| {'type': 'loss', 'content': 0.3127749264240265, 'timestamp': '2025-09-02 14:21:29.999320', 'step': 38, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:30.052231', 'step': 38, 'epoch': 1} |
| {'type': 'loss', 'content': 0.09935981035232544, 'timestamp': '2025-09-02 14:21:30.054156', 'step': 39, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:30.107743', 'step': 39, 'epoch': 1} |
| {'type': 'loss', 'content': 0.2434508055448532, 'timestamp': '2025-09-02 14:21:30.113846', 'step': 40, 'epoch': 1} |
| {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}], 'timestamp': '2025-09-02 14:21:30.195551', 'step': 40, 'epoch': 1} |
| {'type': 'pplx', 'content': 579.2292569502187, 'timestamp': '2025-09-02 14:21:30.197522', 'step': 40, 'epoch': 1} |
| {'type': 'info', 'content': 'Checkpoint saved at step 40', 'timestamp': '2025-09-02 14:21:30.710346', 'step': 40, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:30.765378', 'step': 40, 'epoch': 1} |
| {'type': 'loss', 'content': 0.3171207308769226, 'timestamp': '2025-09-02 14:21:30.767879', 'step': 41, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:30.822271', 'step': 41, 'epoch': 1} |
| {'type': 'loss', 'content': 0.19817563891410828, 'timestamp': '2025-09-02 14:21:30.824183', 'step': 42, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:30.877644', 'step': 42, 'epoch': 1} |
| {'type': 'loss', 'content': 0.2943657636642456, 'timestamp': '2025-09-02 14:21:30.879601', 'step': 43, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:30.932748', 'step': 43, 'epoch': 1} |
| {'type': 'loss', 'content': 0.18553060293197632, 'timestamp': '2025-09-02 14:21:30.938789', 'step': 44, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:30.991444', 'step': 44, 'epoch': 1} |
| {'type': 'loss', 'content': 0.08597612380981445, 'timestamp': '2025-09-02 14:21:30.993748', 'step': 45, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:31.046742', 'step': 45, 'epoch': 1} |
| {'type': 'loss', 'content': 0.409289687871933, 'timestamp': '2025-09-02 14:21:31.049401', 'step': 46, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:31.102957', 'step': 46, 'epoch': 1} |
| {'type': 'loss', 'content': 0.17468379437923431, 'timestamp': '2025-09-02 14:21:31.105373', 'step': 47, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:31.159384', 'step': 47, 'epoch': 1} |
| {'type': 'loss', 'content': 0.3233559727668762, 'timestamp': '2025-09-02 14:21:31.165479', 'step': 48, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [3, 96], 'flops': 1440008813856.0}, 'timestamp': '2025-09-02 14:21:31.240994', 'step': 48, 'epoch': 1} |
| {'type': 'loss', 'content': 0.06934315711259842, 'timestamp': '2025-09-02 14:21:31.243148', 'step': 49, 'epoch': 1} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:31.299325', 'step': 49, 'epoch': 2} |
| {'type': 'loss', 'content': 0.06656854599714279, 'timestamp': '2025-09-02 14:21:31.301741', 'step': 50, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:31.354292', 'step': 50, 'epoch': 2} |
| {'type': 'loss', 'content': 0.2959572374820709, 'timestamp': '2025-09-02 14:21:31.356574', 'step': 51, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:31.409997', 'step': 51, 'epoch': 2} |
| {'type': 'loss', 'content': 0.22123625874519348, 'timestamp': '2025-09-02 14:21:31.415918', 'step': 52, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:31.467670', 'step': 52, 'epoch': 2} |
| {'type': 'loss', 'content': 0.10318443924188614, 'timestamp': '2025-09-02 14:21:31.469825', 'step': 53, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:31.522936', 'step': 53, 'epoch': 2} |
| {'type': 'loss', 'content': 0.1703936606645584, 'timestamp': '2025-09-02 14:21:31.525153', 'step': 54, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:31.576924', 'step': 54, 'epoch': 2} |
| {'type': 'loss', 'content': 0.05911420285701752, 'timestamp': '2025-09-02 14:21:31.578891', 'step': 55, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:31.630999', 'step': 55, 'epoch': 2} |
| {'type': 'loss', 'content': 0.2049095332622528, 'timestamp': '2025-09-02 14:21:31.636767', 'step': 56, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:31.688731', 'step': 56, 'epoch': 2} |
| {'type': 'loss', 'content': 0.20546555519104004, 'timestamp': '2025-09-02 14:21:31.690842', 'step': 57, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:31.743309', 'step': 57, 'epoch': 2} |
| {'type': 'loss', 'content': 0.22024767100811005, 'timestamp': '2025-09-02 14:21:31.745384', 'step': 58, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:31.797821', 'step': 58, 'epoch': 2} |
| {'type': 'loss', 'content': 0.17062577605247498, 'timestamp': '2025-09-02 14:21:31.799783', 'step': 59, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:31.852630', 'step': 59, 'epoch': 2} |
| {'type': 'loss', 'content': 0.3129100203514099, 'timestamp': '2025-09-02 14:21:31.858379', 'step': 60, 'epoch': 2} |
| {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}], 'timestamp': '2025-09-02 14:21:31.937294', 'step': 60, 'epoch': 2} |
| {'type': 'pplx', 'content': 477.4788498900263, 'timestamp': '2025-09-02 14:21:31.939127', 'step': 60, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:31.989776', 'step': 60, 'epoch': 2} |
| {'type': 'loss', 'content': 0.15515823662281036, 'timestamp': '2025-09-02 14:21:31.991902', 'step': 61, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:32.044498', 'step': 61, 'epoch': 2} |
| {'type': 'loss', 'content': 0.24828961491584778, 'timestamp': '2025-09-02 14:21:32.046706', 'step': 62, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:32.100661', 'step': 62, 'epoch': 2} |
| {'type': 'loss', 'content': 0.25642669200897217, 'timestamp': '2025-09-02 14:21:32.102726', 'step': 63, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:32.155571', 'step': 63, 'epoch': 2} |
| {'type': 'loss', 'content': 0.3674210011959076, 'timestamp': '2025-09-02 14:21:32.161762', 'step': 64, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:32.214541', 'step': 64, 'epoch': 2} |
| {'type': 'loss', 'content': 0.2649133801460266, 'timestamp': '2025-09-02 14:21:32.216729', 'step': 65, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:32.269613', 'step': 65, 'epoch': 2} |
| {'type': 'loss', 'content': 0.12124574929475784, 'timestamp': '2025-09-02 14:21:32.272059', 'step': 66, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:32.326226', 'step': 66, 'epoch': 2} |
| {'type': 'loss', 'content': 0.30608734488487244, 'timestamp': '2025-09-02 14:21:32.328508', 'step': 67, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:32.382321', 'step': 67, 'epoch': 2} |
| {'type': 'loss', 'content': 0.2580068111419678, 'timestamp': '2025-09-02 14:21:32.388153', 'step': 68, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:32.439926', 'step': 68, 'epoch': 2} |
| {'type': 'loss', 'content': 0.19641490280628204, 'timestamp': '2025-09-02 14:21:32.442029', 'step': 69, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:32.494756', 'step': 69, 'epoch': 2} |
| {'type': 'loss', 'content': 0.045092999935150146, 'timestamp': '2025-09-02 14:21:32.497259', 'step': 70, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:32.550489', 'step': 70, 'epoch': 2} |
| {'type': 'loss', 'content': 0.1465393602848053, 'timestamp': '2025-09-02 14:21:32.552692', 'step': 71, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:32.605194', 'step': 71, 'epoch': 2} |
| {'type': 'loss', 'content': 0.32573023438453674, 'timestamp': '2025-09-02 14:21:32.610975', 'step': 72, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:32.663627', 'step': 72, 'epoch': 2} |
| {'type': 'loss', 'content': 0.1642458736896515, 'timestamp': '2025-09-02 14:21:32.666105', 'step': 73, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-02 14:21:32.719092', 'step': 73, 'epoch': 2} |
| {'type': 'loss', 'content': 0.38462355732917786, 'timestamp': '2025-09-02 14:21:32.721260', 'step': 74, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-02 14:21:32.774061', 'step': 74, 'epoch': 2} |
| {'type': 'loss', 'content': 0.33901724219322205, 'timestamp': '2025-09-02 14:21:32.776279', 'step': 75, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:32.831369', 'step': 75, 'epoch': 2} |
| {'type': 'loss', 'content': 0.10003411769866943, 'timestamp': '2025-09-02 14:21:32.837153', 'step': 76, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:32.889153', 'step': 76, 'epoch': 2} |
| {'type': 'loss', 'content': 0.4535551965236664, 'timestamp': '2025-09-02 14:21:32.892094', 'step': 77, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:32.945640', 'step': 77, 'epoch': 2} |
| {'type': 'loss', 'content': 0.10993928462266922, 'timestamp': '2025-09-02 14:21:32.947926', 'step': 78, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:33.000424', 'step': 78, 'epoch': 2} |
| {'type': 'loss', 'content': 0.36250942945480347, 'timestamp': '2025-09-02 14:21:33.002801', 'step': 79, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:33.055474', 'step': 79, 'epoch': 2} |
| {'type': 'loss', 'content': 0.19292058050632477, 'timestamp': '2025-09-02 14:21:33.061308', 'step': 80, 'epoch': 2} |
| {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}], 'timestamp': '2025-09-02 14:21:33.140533', 'step': 80, 'epoch': 2} |
| {'type': 'pplx', 'content': 422.44428303088716, 'timestamp': '2025-09-02 14:21:33.142501', 'step': 80, 'epoch': 2} |
| {'type': 'info', 'content': 'Checkpoint saved at step 80', 'timestamp': '2025-09-02 14:21:33.481597', 'step': 80, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-02 14:21:33.533557', 'step': 80, 'epoch': 2} |
| {'type': 'loss', 'content': 0.30610671639442444, 'timestamp': '2025-09-02 14:21:33.535636', 'step': 81, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:33.588420', 'step': 81, 'epoch': 2} |
| {'type': 'loss', 'content': 0.357920378446579, 'timestamp': '2025-09-02 14:21:33.590531', 'step': 82, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:33.644496', 'step': 82, 'epoch': 2} |
| {'type': 'loss', 'content': 0.054498761892318726, 'timestamp': '2025-09-02 14:21:33.646639', 'step': 83, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:33.699338', 'step': 83, 'epoch': 2} |
| {'type': 'loss', 'content': 0.33806470036506653, 'timestamp': '2025-09-02 14:21:33.706365', 'step': 84, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-02 14:21:33.762674', 'step': 84, 'epoch': 2} |
| {'type': 'loss', 'content': 0.2212488353252411, 'timestamp': '2025-09-02 14:21:33.765269', 'step': 85, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:33.819856', 'step': 85, 'epoch': 2} |
| {'type': 'loss', 'content': 0.30659064650535583, 'timestamp': '2025-09-02 14:21:33.821908', 'step': 86, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:33.874898', 'step': 86, 'epoch': 2} |
| {'type': 'loss', 'content': 0.20278169214725494, 'timestamp': '2025-09-02 14:21:33.877278', 'step': 87, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:33.931080', 'step': 87, 'epoch': 2} |
| {'type': 'loss', 'content': 0.052840761840343475, 'timestamp': '2025-09-02 14:21:33.938516', 'step': 88, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:33.990276', 'step': 88, 'epoch': 2} |
| {'type': 'loss', 'content': 0.14356793463230133, 'timestamp': '2025-09-02 14:21:33.992558', 'step': 89, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:34.046990', 'step': 89, 'epoch': 2} |
| {'type': 'loss', 'content': 0.263113796710968, 'timestamp': '2025-09-02 14:21:34.049034', 'step': 90, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:34.101853', 'step': 90, 'epoch': 2} |
| {'type': 'loss', 'content': 0.09962937980890274, 'timestamp': '2025-09-02 14:21:34.103913', 'step': 91, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:34.156805', 'step': 91, 'epoch': 2} |
| {'type': 'loss', 'content': 0.22076894342899323, 'timestamp': '2025-09-02 14:21:34.162841', 'step': 92, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:34.214512', 'step': 92, 'epoch': 2} |
| {'type': 'loss', 'content': 0.11865776777267456, 'timestamp': '2025-09-02 14:21:34.216650', 'step': 93, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:34.269983', 'step': 93, 'epoch': 2} |
| {'type': 'loss', 'content': 0.04955621063709259, 'timestamp': '2025-09-02 14:21:34.272362', 'step': 94, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:34.325387', 'step': 94, 'epoch': 2} |
| {'type': 'loss', 'content': 0.2833959758281708, 'timestamp': '2025-09-02 14:21:34.327637', 'step': 95, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:34.380314', 'step': 95, 'epoch': 2} |
| {'type': 'loss', 'content': 0.14075198769569397, 'timestamp': '2025-09-02 14:21:34.386188', 'step': 96, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:34.438158', 'step': 96, 'epoch': 2} |
| {'type': 'loss', 'content': 0.2728186249732971, 'timestamp': '2025-09-02 14:21:34.440488', 'step': 97, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [3, 96], 'flops': 1440008813856.0}, 'timestamp': '2025-09-02 14:21:34.492694', 'step': 97, 'epoch': 2} |
| {'type': 'loss', 'content': 0.05829225108027458, 'timestamp': '2025-09-02 14:21:34.495023', 'step': 98, 'epoch': 2} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:34.552002', 'step': 98, 'epoch': 3} |
| {'type': 'loss', 'content': 0.04649998992681503, 'timestamp': '2025-09-02 14:21:34.554111', 'step': 99, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:34.606479', 'step': 99, 'epoch': 3} |
| {'type': 'loss', 'content': 0.2196275293827057, 'timestamp': '2025-09-02 14:21:34.612537', 'step': 100, 'epoch': 3} |
| {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}], 'timestamp': '2025-09-02 14:21:34.691567', 'step': 100, 'epoch': 3} |
| {'type': 'pplx', 'content': 385.67704823453323, 'timestamp': '2025-09-02 14:21:34.693239', 'step': 100, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:34.743431', 'step': 100, 'epoch': 3} |
| {'type': 'loss', 'content': 0.16100770235061646, 'timestamp': '2025-09-02 14:21:34.745756', 'step': 101, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:34.798165', 'step': 101, 'epoch': 3} |
| {'type': 'loss', 'content': 0.057087093591690063, 'timestamp': '2025-09-02 14:21:34.800354', 'step': 102, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:34.852971', 'step': 102, 'epoch': 3} |
| {'type': 'loss', 'content': 0.1204967051744461, 'timestamp': '2025-09-02 14:21:34.855298', 'step': 103, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:34.907540', 'step': 103, 'epoch': 3} |
| {'type': 'loss', 'content': 0.025741925463080406, 'timestamp': '2025-09-02 14:21:34.916978', 'step': 104, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:34.974538', 'step': 104, 'epoch': 3} |
| {'type': 'loss', 'content': 0.15131041407585144, 'timestamp': '2025-09-02 14:21:34.976999', 'step': 105, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:35.030347', 'step': 105, 'epoch': 3} |
| {'type': 'loss', 'content': 0.175421342253685, 'timestamp': '2025-09-02 14:21:35.032600', 'step': 106, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:35.085546', 'step': 106, 'epoch': 3} |
| {'type': 'loss', 'content': 0.1869324892759323, 'timestamp': '2025-09-02 14:21:35.087937', 'step': 107, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:35.140523', 'step': 107, 'epoch': 3} |
| {'type': 'loss', 'content': 0.10756679624319077, 'timestamp': '2025-09-02 14:21:35.146622', 'step': 108, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:35.198780', 'step': 108, 'epoch': 3} |
| {'type': 'loss', 'content': 0.2679283916950226, 'timestamp': '2025-09-02 14:21:35.201213', 'step': 109, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:35.253617', 'step': 109, 'epoch': 3} |
| {'type': 'loss', 'content': 0.10408151149749756, 'timestamp': '2025-09-02 14:21:35.255677', 'step': 110, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:35.308261', 'step': 110, 'epoch': 3} |
| {'type': 'loss', 'content': 0.20936067402362823, 'timestamp': '2025-09-02 14:21:35.310436', 'step': 111, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:35.363982', 'step': 111, 'epoch': 3} |
| {'type': 'loss', 'content': 0.19980737566947937, 'timestamp': '2025-09-02 14:21:35.369585', 'step': 112, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:35.421116', 'step': 112, 'epoch': 3} |
| {'type': 'loss', 'content': 0.35190096497535706, 'timestamp': '2025-09-02 14:21:35.423358', 'step': 113, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:35.475617', 'step': 113, 'epoch': 3} |
| {'type': 'loss', 'content': 0.21982204914093018, 'timestamp': '2025-09-02 14:21:35.478010', 'step': 114, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:35.537447', 'step': 114, 'epoch': 3} |
| {'type': 'loss', 'content': 0.1011718213558197, 'timestamp': '2025-09-02 14:21:35.540057', 'step': 115, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:35.593259', 'step': 115, 'epoch': 3} |
| {'type': 'loss', 'content': 0.2807498276233673, 'timestamp': '2025-09-02 14:21:35.598954', 'step': 116, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:35.652237', 'step': 116, 'epoch': 3} |
| {'type': 'loss', 'content': 0.23568834364414215, 'timestamp': '2025-09-02 14:21:35.655230', 'step': 117, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:35.708534', 'step': 117, 'epoch': 3} |
| {'type': 'loss', 'content': 0.1713218241930008, 'timestamp': '2025-09-02 14:21:35.710990', 'step': 118, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:35.764188', 'step': 118, 'epoch': 3} |
| {'type': 'loss', 'content': 0.02880828082561493, 'timestamp': '2025-09-02 14:21:35.766843', 'step': 119, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:35.820109', 'step': 119, 'epoch': 3} |
| {'type': 'loss', 'content': 0.14222285151481628, 'timestamp': '2025-09-02 14:21:35.827311', 'step': 120, 'epoch': 3} |
| {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}], 'timestamp': '2025-09-02 14:21:35.907846', 'step': 120, 'epoch': 3} |
| {'type': 'pplx', 'content': 369.52687483597305, 'timestamp': '2025-09-02 14:21:35.909688', 'step': 120, 'epoch': 3} |
| {'type': 'info', 'content': 'Checkpoint saved at step 120', 'timestamp': '2025-09-02 14:21:36.264156', 'step': 120, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:36.318359', 'step': 120, 'epoch': 3} |
| {'type': 'loss', 'content': 0.2532464563846588, 'timestamp': '2025-09-02 14:21:36.320998', 'step': 121, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:36.376072', 'step': 121, 'epoch': 3} |
| {'type': 'loss', 'content': 0.12490029633045197, 'timestamp': '2025-09-02 14:21:36.378530', 'step': 122, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-02 14:21:36.432983', 'step': 122, 'epoch': 3} |
| {'type': 'loss', 'content': 0.3516947031021118, 'timestamp': '2025-09-02 14:21:36.435316', 'step': 123, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-02 14:21:36.489397', 'step': 123, 'epoch': 3} |
| {'type': 'loss', 'content': 0.30067041516304016, 'timestamp': '2025-09-02 14:21:36.495016', 'step': 124, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:36.546911', 'step': 124, 'epoch': 3} |
| {'type': 'loss', 'content': 0.07397190481424332, 'timestamp': '2025-09-02 14:21:36.549560', 'step': 125, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:36.602824', 'step': 125, 'epoch': 3} |
| {'type': 'loss', 'content': 0.4031347632408142, 'timestamp': '2025-09-02 14:21:36.605164', 'step': 126, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:36.658461', 'step': 126, 'epoch': 3} |
| {'type': 'loss', 'content': 0.09089026600122452, 'timestamp': '2025-09-02 14:21:36.661112', 'step': 127, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:36.713866', 'step': 127, 'epoch': 3} |
| {'type': 'loss', 'content': 0.30912989377975464, 'timestamp': '2025-09-02 14:21:36.719655', 'step': 128, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:36.771777', 'step': 128, 'epoch': 3} |
| {'type': 'loss', 'content': 0.1636476218700409, 'timestamp': '2025-09-02 14:21:36.774375', 'step': 129, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-02 14:21:36.828868', 'step': 129, 'epoch': 3} |
| {'type': 'loss', 'content': 0.28050050139427185, 'timestamp': '2025-09-02 14:21:36.831522', 'step': 130, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:36.886307', 'step': 130, 'epoch': 3} |
| {'type': 'loss', 'content': 0.3338044583797455, 'timestamp': '2025-09-02 14:21:36.889198', 'step': 131, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:36.942986', 'step': 131, 'epoch': 3} |
| {'type': 'loss', 'content': 0.02723563462495804, 'timestamp': '2025-09-02 14:21:36.949242', 'step': 132, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:37.001954', 'step': 132, 'epoch': 3} |
| {'type': 'loss', 'content': 0.29408735036849976, 'timestamp': '2025-09-02 14:21:37.004398', 'step': 133, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-02 14:21:37.057272', 'step': 133, 'epoch': 3} |
| {'type': 'loss', 'content': 0.17805826663970947, 'timestamp': '2025-09-02 14:21:37.059576', 'step': 134, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:37.112542', 'step': 134, 'epoch': 3} |
| {'type': 'loss', 'content': 0.2853803336620331, 'timestamp': '2025-09-02 14:21:37.114810', 'step': 135, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:37.168159', 'step': 135, 'epoch': 3} |
| {'type': 'loss', 'content': 0.17095552384853363, 'timestamp': '2025-09-02 14:21:37.174048', 'step': 136, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:37.226360', 'step': 136, 'epoch': 3} |
| {'type': 'loss', 'content': 0.042801082134246826, 'timestamp': '2025-09-02 14:21:37.229034', 'step': 137, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:37.281519', 'step': 137, 'epoch': 3} |
| {'type': 'loss', 'content': 0.11929337680339813, 'timestamp': '2025-09-02 14:21:37.283765', 'step': 138, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:37.337123', 'step': 138, 'epoch': 3} |
| {'type': 'loss', 'content': 0.25038737058639526, 'timestamp': '2025-09-02 14:21:37.339700', 'step': 139, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 1600009780544.0}, 'timestamp': '2025-09-02 14:21:37.393955', 'step': 139, 'epoch': 3} |
| {'type': 'loss', 'content': 0.07605995982885361, 'timestamp': '2025-09-02 14:21:37.399851', 'step': 140, 'epoch': 3} |
| {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}], 'timestamp': '2025-09-02 14:21:37.480347', 'step': 140, 'epoch': 3} |
| {'type': 'pplx', 'content': 350.91691874873715, 'timestamp': '2025-09-02 14:21:37.482176', 'step': 140, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:37.533333', 'step': 140, 'epoch': 3} |
| {'type': 'loss', 'content': 0.1686297506093979, 'timestamp': '2025-09-02 14:21:37.536435', 'step': 141, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:37.589719', 'step': 141, 'epoch': 3} |
| {'type': 'loss', 'content': 0.09982181340456009, 'timestamp': '2025-09-02 14:21:37.591858', 'step': 142, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:37.645067', 'step': 142, 'epoch': 3} |
| {'type': 'loss', 'content': 0.03836599364876747, 'timestamp': '2025-09-02 14:21:37.648799', 'step': 143, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:37.701720', 'step': 143, 'epoch': 3} |
| {'type': 'loss', 'content': 0.24430139362812042, 'timestamp': '2025-09-02 14:21:37.708012', 'step': 144, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-02 14:21:37.760013', 'step': 144, 'epoch': 3} |
| {'type': 'loss', 'content': 0.11822221428155899, 'timestamp': '2025-09-02 14:21:37.762496', 'step': 145, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 1920011723136.0}, 'timestamp': '2025-09-02 14:21:37.816249', 'step': 145, 'epoch': 3} |
| {'type': 'loss', 'content': 0.2439824491739273, 'timestamp': '2025-09-02 14:21:37.818502', 'step': 146, 'epoch': 3} |
| {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [3, 96], 'flops': 1440008813856.0}, 'timestamp': '2025-09-02 14:21:37.871310', 'step': 146, 'epoch': 3} |
| {'type': 'loss', 'content': 0.046064943075180054, 'timestamp': '2025-09-02 14:21:37.874122', 'step': 147, 'epoch': 3} |
| {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1277531617792}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}], 'timestamp': '2025-09-02 14:21:37.966063', 'step': 147, 'epoch': 3} |
| {'type': 'pplx', 'content': 346.3451241147277, 'timestamp': '2025-09-02 14:21:37.967824', 'step': 147, 'epoch': 3} |
| {'type': 'best_pplx', 'content': 346.3451241147277, 'timestamp': '2025-09-02 14:21:37.969722', 'step': 147, 'epoch': 3} |
| {'type': 'best_step', 'content': 147, 'timestamp': '2025-09-02 14:21:37.971378', 'step': 147, 'epoch': 3} |
| {'type': 'total_pplx_flops', 'content': 37367799668352, 'timestamp': '2025-09-02 14:21:37.973196', 'step': 147, 'epoch': 3} |
| {'type': 'total_train_flops', 'content': 292321784506464.0, 'timestamp': '2025-09-02 14:21:37.975557', 'step': 147, 'epoch': 3} |
|
|