diff --git "a/logs.jsonl" "b/logs.jsonl" new file mode 100644--- /dev/null +++ "b/logs.jsonl" @@ -0,0 +1,5611 @@ +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:09:45.720522', 'step': 0, 'epoch': 0} +{'type': 'pplx', 'content': 226674977.87649825, 'timestamp': '2025-09-30 22:09:45.725910', 'step': 0, 'epoch': 0} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:45.829654', 'step': 0, 'epoch': 1} +{'type': 'loss', 'content': 0.7057779431343079, 'timestamp': '2025-09-30 22:09:45.833808', 'step': 1, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:45.927323', 'step': 1, 'epoch': 1} +{'type': 'loss', 'content': 0.6982383131980896, 'timestamp': '2025-09-30 22:09:45.931697', 'step': 2, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:45.989452', 'step': 2, 'epoch': 1} +{'type': 'loss', 'content': 0.7418850064277649, 'timestamp': '2025-09-30 22:09:46.001612', 'step': 3, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:46.069644', 'step': 3, 'epoch': 1} +{'type': 'loss', 'content': 0.7169809341430664, 'timestamp': '2025-09-30 22:09:46.122890', 'step': 4, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:46.194754', 'step': 4, 'epoch': 1} +{'type': 'loss', 'content': 0.5053693652153015, 'timestamp': '2025-09-30 22:09:46.207488', 'step': 5, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:46.291138', 'step': 5, 'epoch': 1} +{'type': 'loss', 'content': 0.5166937708854675, 'timestamp': '2025-09-30 22:09:46.303245', 'step': 6, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:46.381366', 'step': 6, 'epoch': 1} +{'type': 'loss', 'content': 0.5014775991439819, 'timestamp': '2025-09-30 22:09:46.385931', 'step': 7, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:46.454635', 'step': 7, 'epoch': 1} +{'type': 'loss', 'content': 0.514745831489563, 'timestamp': '2025-09-30 22:09:46.469695', 'step': 8, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:46.539001', 'step': 8, 'epoch': 1} +{'type': 'loss', 'content': 0.3292957842350006, 'timestamp': '2025-09-30 22:09:46.546966', 'step': 9, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:46.612140', 'step': 9, 'epoch': 1} +{'type': 'loss', 'content': 0.3126233220100403, 'timestamp': '2025-09-30 22:09:46.617552', 'step': 10, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:46.675520', 'step': 10, 'epoch': 1} +{'type': 'loss', 'content': 0.3234768509864807, 'timestamp': '2025-09-30 22:09:46.690861', 'step': 11, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:46.756152', 'step': 11, 'epoch': 1} +{'type': 'loss', 'content': 0.3332277834415436, 'timestamp': '2025-09-30 22:09:46.765624', 'step': 12, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:46.823098', 'step': 12, 'epoch': 1} +{'type': 'loss', 'content': 0.16887438297271729, 'timestamp': '2025-09-30 22:09:46.828694', 'step': 13, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:46.891179', 'step': 13, 'epoch': 1} +{'type': 'loss', 'content': 0.15498413145542145, 'timestamp': '2025-09-30 22:09:46.894434', 'step': 14, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:46.961785', 'step': 14, 'epoch': 1} +{'type': 'loss', 'content': 0.15807364881038666, 'timestamp': '2025-09-30 22:09:46.965388', 'step': 15, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:47.021211', 'step': 15, 'epoch': 1} +{'type': 'loss', 'content': 0.16345378756523132, 'timestamp': '2025-09-30 22:09:47.037888', 'step': 16, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:47.092321', 'step': 16, 'epoch': 1} +{'type': 'loss', 'content': 0.06315434724092484, 'timestamp': '2025-09-30 22:09:47.095399', 'step': 17, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:47.155095', 'step': 17, 'epoch': 1} +{'type': 'loss', 'content': 0.07140235602855682, 'timestamp': '2025-09-30 22:09:47.159326', 'step': 18, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:47.217343', 'step': 18, 'epoch': 1} +{'type': 'loss', 'content': 0.07542860507965088, 'timestamp': '2025-09-30 22:09:47.231838', 'step': 19, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:47.297426', 'step': 19, 'epoch': 1} +{'type': 'loss', 'content': 0.07850368320941925, 'timestamp': '2025-09-30 22:09:47.314640', 'step': 20, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:47.380582', 'step': 20, 'epoch': 1} +{'type': 'loss', 'content': 0.05402212589979172, 'timestamp': '2025-09-30 22:09:47.393830', 'step': 21, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:47.463154', 'step': 21, 'epoch': 1} +{'type': 'loss', 'content': 0.04844330623745918, 'timestamp': '2025-09-30 22:09:47.477962', 'step': 22, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:47.538665', 'step': 22, 'epoch': 1} +{'type': 'loss', 'content': 0.04047473892569542, 'timestamp': '2025-09-30 22:09:47.542223', 'step': 23, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:47.599055', 'step': 23, 'epoch': 1} +{'type': 'loss', 'content': 0.03956778720021248, 'timestamp': '2025-09-30 22:09:47.605916', 'step': 24, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:47.666928', 'step': 24, 'epoch': 1} +{'type': 'loss', 'content': 0.03282862529158592, 'timestamp': '2025-09-30 22:09:47.680006', 'step': 25, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:47.747238', 'step': 25, 'epoch': 1} +{'type': 'loss', 'content': 0.047466158866882324, 'timestamp': '2025-09-30 22:09:47.751128', 'step': 26, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:47.809803', 'step': 26, 'epoch': 1} +{'type': 'loss', 'content': 0.02898944355547428, 'timestamp': '2025-09-30 22:09:47.813747', 'step': 27, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:47.883244', 'step': 27, 'epoch': 1} +{'type': 'loss', 'content': 0.02849601022899151, 'timestamp': '2025-09-30 22:09:47.892740', 'step': 28, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:47.948731', 'step': 28, 'epoch': 1} +{'type': 'loss', 'content': 0.04645688086748123, 'timestamp': '2025-09-30 22:09:47.957611', 'step': 29, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:48.014118', 'step': 29, 'epoch': 1} +{'type': 'loss', 'content': 0.02362995222210884, 'timestamp': '2025-09-30 22:09:48.017619', 'step': 30, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:48.092874', 'step': 30, 'epoch': 1} +{'type': 'loss', 'content': 0.02474798448383808, 'timestamp': '2025-09-30 22:09:48.098478', 'step': 31, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:48.158112', 'step': 31, 'epoch': 1} +{'type': 'loss', 'content': 0.025140417739748955, 'timestamp': '2025-09-30 22:09:48.165508', 'step': 32, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:48.235509', 'step': 32, 'epoch': 1} +{'type': 'loss', 'content': 0.004965892527252436, 'timestamp': '2025-09-30 22:09:48.249272', 'step': 33, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:48.306709', 'step': 33, 'epoch': 1} +{'type': 'loss', 'content': 0.03571665287017822, 'timestamp': '2025-09-30 22:09:48.321337', 'step': 34, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:48.396615', 'step': 34, 'epoch': 1} +{'type': 'loss', 'content': 0.005427081603556871, 'timestamp': '2025-09-30 22:09:48.400758', 'step': 35, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:48.463184', 'step': 35, 'epoch': 1} +{'type': 'loss', 'content': 0.022603018209338188, 'timestamp': '2025-09-30 22:09:48.470249', 'step': 36, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:48.526578', 'step': 36, 'epoch': 1} +{'type': 'loss', 'content': 0.017443154007196426, 'timestamp': '2025-09-30 22:09:48.530394', 'step': 37, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:48.586995', 'step': 37, 'epoch': 1} +{'type': 'loss', 'content': 0.023570295423269272, 'timestamp': '2025-09-30 22:09:48.600109', 'step': 38, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:48.662782', 'step': 38, 'epoch': 1} +{'type': 'loss', 'content': 0.008299198001623154, 'timestamp': '2025-09-30 22:09:48.666655', 'step': 39, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:48.724407', 'step': 39, 'epoch': 1} +{'type': 'loss', 'content': 0.023102333769202232, 'timestamp': '2025-09-30 22:09:48.740457', 'step': 40, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:48.806911', 'step': 40, 'epoch': 1} +{'type': 'loss', 'content': 0.006827209610491991, 'timestamp': '2025-09-30 22:09:48.810978', 'step': 41, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:48.869752', 'step': 41, 'epoch': 1} +{'type': 'loss', 'content': 0.02204090915620327, 'timestamp': '2025-09-30 22:09:48.876097', 'step': 42, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:48.935298', 'step': 42, 'epoch': 1} +{'type': 'loss', 'content': 0.036014724522829056, 'timestamp': '2025-09-30 22:09:48.938917', 'step': 43, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:09:49.019588', 'step': 43, 'epoch': 1} +{'type': 'loss', 'content': 0.005299022886902094, 'timestamp': '2025-09-30 22:09:49.030945', 'step': 44, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:49.092096', 'step': 44, 'epoch': 1} +{'type': 'loss', 'content': 0.007433965802192688, 'timestamp': '2025-09-30 22:09:49.095682', 'step': 45, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:49.153079', 'step': 45, 'epoch': 1} +{'type': 'loss', 'content': 0.007419214118272066, 'timestamp': '2025-09-30 22:09:49.156971', 'step': 46, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:49.226418', 'step': 46, 'epoch': 1} +{'type': 'loss', 'content': 0.03738881275057793, 'timestamp': '2025-09-30 22:09:49.231831', 'step': 47, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:49.296369', 'step': 47, 'epoch': 1} +{'type': 'loss', 'content': 0.021278386935591698, 'timestamp': '2025-09-30 22:09:49.308245', 'step': 48, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:49.369779', 'step': 48, 'epoch': 1} +{'type': 'loss', 'content': 0.008051230572164059, 'timestamp': '2025-09-30 22:09:49.382874', 'step': 49, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:49.448881', 'step': 49, 'epoch': 1} +{'type': 'loss', 'content': 0.03269955888390541, 'timestamp': '2025-09-30 22:09:49.455582', 'step': 50, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:49.521657', 'step': 50, 'epoch': 1} +{'type': 'loss', 'content': 0.009572034701704979, 'timestamp': '2025-09-30 22:09:49.535895', 'step': 51, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:49.607418', 'step': 51, 'epoch': 1} +{'type': 'loss', 'content': 0.03262675181031227, 'timestamp': '2025-09-30 22:09:49.618732', 'step': 52, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:49.698096', 'step': 52, 'epoch': 1} +{'type': 'loss', 'content': 0.031087083742022514, 'timestamp': '2025-09-30 22:09:49.702960', 'step': 53, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:09:49.774471', 'step': 53, 'epoch': 1} +{'type': 'loss', 'content': 0.028552474454045296, 'timestamp': '2025-09-30 22:09:49.790307', 'step': 54, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:49.858357', 'step': 54, 'epoch': 1} +{'type': 'loss', 'content': 0.021799175068736076, 'timestamp': '2025-09-30 22:09:49.866314', 'step': 55, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:49.946732', 'step': 55, 'epoch': 1} +{'type': 'loss', 'content': 0.03151797503232956, 'timestamp': '2025-09-30 22:09:49.956360', 'step': 56, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:50.025091', 'step': 56, 'epoch': 1} +{'type': 'loss', 'content': 0.02139838971197605, 'timestamp': '2025-09-30 22:09:50.036752', 'step': 57, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:09:51.671705', 'step': 57, 'epoch': 1} +{'type': 'pplx', 'content': 33459661.644647755, 'timestamp': '2025-09-30 22:09:51.677492', 'step': 57, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:51.732715', 'step': 57, 'epoch': 1} +{'type': 'loss', 'content': 0.020825445652008057, 'timestamp': '2025-09-30 22:09:51.736354', 'step': 58, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:51.799744', 'step': 58, 'epoch': 1} +{'type': 'loss', 'content': 0.019294634461402893, 'timestamp': '2025-09-30 22:09:51.804206', 'step': 59, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:51.863673', 'step': 59, 'epoch': 1} +{'type': 'loss', 'content': 0.020074540749192238, 'timestamp': '2025-09-30 22:09:51.879375', 'step': 60, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:51.942712', 'step': 60, 'epoch': 1} +{'type': 'loss', 'content': 0.019697507843375206, 'timestamp': '2025-09-30 22:09:51.946783', 'step': 61, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:52.013178', 'step': 61, 'epoch': 1} +{'type': 'loss', 'content': 0.02132870815694332, 'timestamp': '2025-09-30 22:09:52.017164', 'step': 62, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:52.092840', 'step': 62, 'epoch': 1} +{'type': 'loss', 'content': 0.01894804835319519, 'timestamp': '2025-09-30 22:09:52.095762', 'step': 63, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:52.174530', 'step': 63, 'epoch': 1} +{'type': 'loss', 'content': 0.016838889569044113, 'timestamp': '2025-09-30 22:09:52.182844', 'step': 64, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:52.261938', 'step': 64, 'epoch': 1} +{'type': 'loss', 'content': 0.01592377945780754, 'timestamp': '2025-09-30 22:09:52.271881', 'step': 65, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:52.339697', 'step': 65, 'epoch': 1} +{'type': 'loss', 'content': 0.02463250793516636, 'timestamp': '2025-09-30 22:09:52.343673', 'step': 66, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:52.413263', 'step': 66, 'epoch': 1} +{'type': 'loss', 'content': 0.018470000475645065, 'timestamp': '2025-09-30 22:09:52.424819', 'step': 67, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:52.493307', 'step': 67, 'epoch': 1} +{'type': 'loss', 'content': 0.022894982248544693, 'timestamp': '2025-09-30 22:09:52.507145', 'step': 68, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:52.577240', 'step': 68, 'epoch': 1} +{'type': 'loss', 'content': 0.029143383726477623, 'timestamp': '2025-09-30 22:09:52.581726', 'step': 69, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:52.649330', 'step': 69, 'epoch': 1} +{'type': 'loss', 'content': 0.020210010930895805, 'timestamp': '2025-09-30 22:09:52.652808', 'step': 70, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:52.721839', 'step': 70, 'epoch': 1} +{'type': 'loss', 'content': 0.011494003236293793, 'timestamp': '2025-09-30 22:09:52.726534', 'step': 71, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:52.789538', 'step': 71, 'epoch': 1} +{'type': 'loss', 'content': 0.028703901916742325, 'timestamp': '2025-09-30 22:09:52.796646', 'step': 72, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:52.854779', 'step': 72, 'epoch': 1} +{'type': 'loss', 'content': 0.018709659576416016, 'timestamp': '2025-09-30 22:09:52.857531', 'step': 73, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:52.929717', 'step': 73, 'epoch': 1} +{'type': 'loss', 'content': 0.029944155365228653, 'timestamp': '2025-09-30 22:09:52.933498', 'step': 74, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:52.993304', 'step': 74, 'epoch': 1} +{'type': 'loss', 'content': 0.020325303077697754, 'timestamp': '2025-09-30 22:09:52.997865', 'step': 75, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:09:53.069769', 'step': 75, 'epoch': 1} +{'type': 'loss', 'content': 0.02151350863277912, 'timestamp': '2025-09-30 22:09:53.078068', 'step': 76, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:53.137966', 'step': 76, 'epoch': 1} +{'type': 'loss', 'content': 0.0210565198212862, 'timestamp': '2025-09-30 22:09:53.142397', 'step': 77, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:53.203236', 'step': 77, 'epoch': 1} +{'type': 'loss', 'content': 0.03604406490921974, 'timestamp': '2025-09-30 22:09:53.217868', 'step': 78, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:53.277099', 'step': 78, 'epoch': 1} +{'type': 'loss', 'content': 0.036332327872514725, 'timestamp': '2025-09-30 22:09:53.281235', 'step': 79, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:53.342746', 'step': 79, 'epoch': 1} +{'type': 'loss', 'content': 0.028764527291059494, 'timestamp': '2025-09-30 22:09:53.351222', 'step': 80, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:53.420316', 'step': 80, 'epoch': 1} +{'type': 'loss', 'content': 0.019019123166799545, 'timestamp': '2025-09-30 22:09:53.424641', 'step': 81, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:53.480453', 'step': 81, 'epoch': 1} +{'type': 'loss', 'content': 0.02148059569299221, 'timestamp': '2025-09-30 22:09:53.486176', 'step': 82, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:53.545094', 'step': 82, 'epoch': 1} +{'type': 'loss', 'content': 0.04821930453181267, 'timestamp': '2025-09-30 22:09:53.549757', 'step': 83, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:53.621024', 'step': 83, 'epoch': 1} +{'type': 'loss', 'content': 0.02800285816192627, 'timestamp': '2025-09-30 22:09:53.628236', 'step': 84, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:53.694859', 'step': 84, 'epoch': 1} +{'type': 'loss', 'content': 0.0229884572327137, 'timestamp': '2025-09-30 22:09:53.702009', 'step': 85, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:53.769420', 'step': 85, 'epoch': 1} +{'type': 'loss', 'content': 0.014066072180867195, 'timestamp': '2025-09-30 22:09:53.773399', 'step': 86, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:53.840322', 'step': 86, 'epoch': 1} +{'type': 'loss', 'content': 0.017103631049394608, 'timestamp': '2025-09-30 22:09:53.843455', 'step': 87, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:53.903605', 'step': 87, 'epoch': 1} +{'type': 'loss', 'content': 0.020594418048858643, 'timestamp': '2025-09-30 22:09:53.919037', 'step': 88, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:53.986826', 'step': 88, 'epoch': 1} +{'type': 'loss', 'content': 0.020471200346946716, 'timestamp': '2025-09-30 22:09:53.991020', 'step': 89, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:54.054065', 'step': 89, 'epoch': 1} +{'type': 'loss', 'content': 0.023065567016601562, 'timestamp': '2025-09-30 22:09:54.059760', 'step': 90, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:54.134584', 'step': 90, 'epoch': 1} +{'type': 'loss', 'content': 0.026240045204758644, 'timestamp': '2025-09-30 22:09:54.147631', 'step': 91, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:09:54.215981', 'step': 91, 'epoch': 1} +{'type': 'loss', 'content': 0.024411886930465698, 'timestamp': '2025-09-30 22:09:54.233867', 'step': 92, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:54.308215', 'step': 92, 'epoch': 1} +{'type': 'loss', 'content': 0.022564176470041275, 'timestamp': '2025-09-30 22:09:54.313481', 'step': 93, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:54.376766', 'step': 93, 'epoch': 1} +{'type': 'loss', 'content': 0.02625674568116665, 'timestamp': '2025-09-30 22:09:54.381908', 'step': 94, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:54.442608', 'step': 94, 'epoch': 1} +{'type': 'loss', 'content': 0.026146892458200455, 'timestamp': '2025-09-30 22:09:54.448685', 'step': 95, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:54.517320', 'step': 95, 'epoch': 1} +{'type': 'loss', 'content': 0.023254042491316795, 'timestamp': '2025-09-30 22:09:54.525273', 'step': 96, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:54.582927', 'step': 96, 'epoch': 1} +{'type': 'loss', 'content': 0.021697642281651497, 'timestamp': '2025-09-30 22:09:54.587639', 'step': 97, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:54.648320', 'step': 97, 'epoch': 1} +{'type': 'loss', 'content': 0.020044708624482155, 'timestamp': '2025-09-30 22:09:54.660964', 'step': 98, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:54.731315', 'step': 98, 'epoch': 1} +{'type': 'loss', 'content': 0.019530335441231728, 'timestamp': '2025-09-30 22:09:54.734833', 'step': 99, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:54.794372', 'step': 99, 'epoch': 1} +{'type': 'loss', 'content': 0.02076215110719204, 'timestamp': '2025-09-30 22:09:54.803435', 'step': 100, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:54.864805', 'step': 100, 'epoch': 1} +{'type': 'loss', 'content': 0.018885493278503418, 'timestamp': '2025-09-30 22:09:54.869711', 'step': 101, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:54.928484', 'step': 101, 'epoch': 1} +{'type': 'loss', 'content': 0.015139119699597359, 'timestamp': '2025-09-30 22:09:54.932241', 'step': 102, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:54.991083', 'step': 102, 'epoch': 1} +{'type': 'loss', 'content': 0.020732766017317772, 'timestamp': '2025-09-30 22:09:54.994243', 'step': 103, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:55.058758', 'step': 103, 'epoch': 1} +{'type': 'loss', 'content': 0.031727004796266556, 'timestamp': '2025-09-30 22:09:55.066276', 'step': 104, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:55.123302', 'step': 104, 'epoch': 1} +{'type': 'loss', 'content': 0.03581748530268669, 'timestamp': '2025-09-30 22:09:55.132399', 'step': 105, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:55.190214', 'step': 105, 'epoch': 1} +{'type': 'loss', 'content': 0.0061849248595535755, 'timestamp': '2025-09-30 22:09:55.193819', 'step': 106, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:55.257208', 'step': 106, 'epoch': 1} +{'type': 'loss', 'content': 0.02108680084347725, 'timestamp': '2025-09-30 22:09:55.263242', 'step': 107, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:55.333196', 'step': 107, 'epoch': 1} +{'type': 'loss', 'content': 0.054573871195316315, 'timestamp': '2025-09-30 22:09:55.352127', 'step': 108, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:55.419704', 'step': 108, 'epoch': 1} +{'type': 'loss', 'content': 0.056397486478090286, 'timestamp': '2025-09-30 22:09:55.423709', 'step': 109, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:09:55.503381', 'step': 109, 'epoch': 1} +{'type': 'loss', 'content': 0.045181386172771454, 'timestamp': '2025-09-30 22:09:55.507646', 'step': 110, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:55.565463', 'step': 110, 'epoch': 1} +{'type': 'loss', 'content': 0.006864451337605715, 'timestamp': '2025-09-30 22:09:55.570595', 'step': 111, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:55.629518', 'step': 111, 'epoch': 1} +{'type': 'loss', 'content': 0.035181932151317596, 'timestamp': '2025-09-30 22:09:55.638791', 'step': 112, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:55.698419', 'step': 112, 'epoch': 1} +{'type': 'loss', 'content': 0.02644294500350952, 'timestamp': '2025-09-30 22:09:55.702122', 'step': 113, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:55.762278', 'step': 113, 'epoch': 1} +{'type': 'loss', 'content': 0.024839377030730247, 'timestamp': '2025-09-30 22:09:55.767722', 'step': 114, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:09:57.404156', 'step': 114, 'epoch': 1} +{'type': 'pplx', 'content': 34367165.57846854, 'timestamp': '2025-09-30 22:09:57.417807', 'step': 114, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:57.476760', 'step': 114, 'epoch': 1} +{'type': 'loss', 'content': 0.009317861869931221, 'timestamp': '2025-09-30 22:09:57.481123', 'step': 115, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:09:57.549026', 'step': 115, 'epoch': 1} +{'type': 'loss', 'content': 0.03304336592555046, 'timestamp': '2025-09-30 22:09:57.556362', 'step': 116, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:57.621741', 'step': 116, 'epoch': 1} +{'type': 'loss', 'content': 0.02041228488087654, 'timestamp': '2025-09-30 22:09:57.625489', 'step': 117, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:57.707532', 'step': 117, 'epoch': 1} +{'type': 'loss', 'content': 0.020021427422761917, 'timestamp': '2025-09-30 22:09:57.711221', 'step': 118, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:57.770352', 'step': 118, 'epoch': 1} +{'type': 'loss', 'content': 0.01669391058385372, 'timestamp': '2025-09-30 22:09:57.774354', 'step': 119, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:57.833648', 'step': 119, 'epoch': 1} +{'type': 'loss', 'content': 0.019399311393499374, 'timestamp': '2025-09-30 22:09:57.840377', 'step': 120, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:57.902145', 'step': 120, 'epoch': 1} +{'type': 'loss', 'content': 0.027869191020727158, 'timestamp': '2025-09-30 22:09:57.905157', 'step': 121, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:09:57.963613', 'step': 121, 'epoch': 1} +{'type': 'loss', 'content': 0.026405667886137962, 'timestamp': '2025-09-30 22:09:57.967225', 'step': 122, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:09:58.036235', 'step': 122, 'epoch': 1} +{'type': 'loss', 'content': 0.025716735050082207, 'timestamp': '2025-09-30 22:09:58.040067', 'step': 123, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:58.128759', 'step': 123, 'epoch': 1} +{'type': 'loss', 'content': 0.021928545087575912, 'timestamp': '2025-09-30 22:09:58.136284', 'step': 124, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:58.194040', 'step': 124, 'epoch': 1} +{'type': 'loss', 'content': 0.024018412455916405, 'timestamp': '2025-09-30 22:09:58.203869', 'step': 125, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:58.275576', 'step': 125, 'epoch': 1} +{'type': 'loss', 'content': 0.024753393605351448, 'timestamp': '2025-09-30 22:09:58.279054', 'step': 126, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:09:58.335015', 'step': 126, 'epoch': 1} +{'type': 'loss', 'content': 0.025028575211763382, 'timestamp': '2025-09-30 22:09:58.339229', 'step': 127, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:58.409024', 'step': 127, 'epoch': 1} +{'type': 'loss', 'content': 0.022491442039608955, 'timestamp': '2025-09-30 22:09:58.415306', 'step': 128, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:58.472633', 'step': 128, 'epoch': 1} +{'type': 'loss', 'content': 0.021387087181210518, 'timestamp': '2025-09-30 22:09:58.475769', 'step': 129, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:58.544050', 'step': 129, 'epoch': 1} +{'type': 'loss', 'content': 0.030490174889564514, 'timestamp': '2025-09-30 22:09:58.547113', 'step': 130, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:58.608056', 'step': 130, 'epoch': 1} +{'type': 'loss', 'content': 0.021837439388036728, 'timestamp': '2025-09-30 22:09:58.611804', 'step': 131, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:58.672124', 'step': 131, 'epoch': 1} +{'type': 'loss', 'content': 0.022853758186101913, 'timestamp': '2025-09-30 22:09:58.679715', 'step': 132, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:58.743010', 'step': 132, 'epoch': 1} +{'type': 'loss', 'content': 0.02393500693142414, 'timestamp': '2025-09-30 22:09:58.747907', 'step': 133, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:58.812859', 'step': 133, 'epoch': 1} +{'type': 'loss', 'content': 0.024709191173315048, 'timestamp': '2025-09-30 22:09:58.817248', 'step': 134, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:09:58.874630', 'step': 134, 'epoch': 1} +{'type': 'loss', 'content': 0.02340940572321415, 'timestamp': '2025-09-30 22:09:58.878363', 'step': 135, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:58.955588', 'step': 135, 'epoch': 1} +{'type': 'loss', 'content': 0.021506870165467262, 'timestamp': '2025-09-30 22:09:58.971620', 'step': 136, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:09:59.036619', 'step': 136, 'epoch': 1} +{'type': 'loss', 'content': 0.01850222609937191, 'timestamp': '2025-09-30 22:09:59.041465', 'step': 137, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:59.114440', 'step': 137, 'epoch': 1} +{'type': 'loss', 'content': 0.025246715173125267, 'timestamp': '2025-09-30 22:09:59.119401', 'step': 138, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:59.185394', 'step': 138, 'epoch': 1} +{'type': 'loss', 'content': 0.019027644768357277, 'timestamp': '2025-09-30 22:09:59.188937', 'step': 139, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:59.254993', 'step': 139, 'epoch': 1} +{'type': 'loss', 'content': 0.023021994158625603, 'timestamp': '2025-09-30 22:09:59.268285', 'step': 140, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:59.333791', 'step': 140, 'epoch': 1} +{'type': 'loss', 'content': 0.018915778025984764, 'timestamp': '2025-09-30 22:09:59.338389', 'step': 141, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:59.397001', 'step': 141, 'epoch': 1} +{'type': 'loss', 'content': 0.02081509307026863, 'timestamp': '2025-09-30 22:09:59.400430', 'step': 142, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:59.475598', 'step': 142, 'epoch': 1} +{'type': 'loss', 'content': 0.02820817194879055, 'timestamp': '2025-09-30 22:09:59.478597', 'step': 143, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:59.539292', 'step': 143, 'epoch': 1} +{'type': 'loss', 'content': 0.01714860461652279, 'timestamp': '2025-09-30 22:09:59.545917', 'step': 144, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:59.613640', 'step': 144, 'epoch': 1} +{'type': 'loss', 'content': 0.01967170275747776, 'timestamp': '2025-09-30 22:09:59.626738', 'step': 145, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:59.695249', 'step': 145, 'epoch': 1} +{'type': 'loss', 'content': 0.015452763997018337, 'timestamp': '2025-09-30 22:09:59.698910', 'step': 146, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:59.758042', 'step': 146, 'epoch': 1} +{'type': 'loss', 'content': 0.031086495146155357, 'timestamp': '2025-09-30 22:09:59.761287', 'step': 147, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:59.832230', 'step': 147, 'epoch': 1} +{'type': 'loss', 'content': 0.035771798342466354, 'timestamp': '2025-09-30 22:09:59.838403', 'step': 148, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:09:59.896383', 'step': 148, 'epoch': 1} +{'type': 'loss', 'content': 0.052823007106781006, 'timestamp': '2025-09-30 22:09:59.904646', 'step': 149, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:09:59.974528', 'step': 149, 'epoch': 1} +{'type': 'loss', 'content': 0.0519816055893898, 'timestamp': '2025-09-30 22:09:59.979756', 'step': 150, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:00.039600', 'step': 150, 'epoch': 1} +{'type': 'loss', 'content': 0.020907681435346603, 'timestamp': '2025-09-30 22:10:00.044125', 'step': 151, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:00.110151', 'step': 151, 'epoch': 1} +{'type': 'loss', 'content': 0.005571091081947088, 'timestamp': '2025-09-30 22:10:00.122291', 'step': 152, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:00.190990', 'step': 152, 'epoch': 1} +{'type': 'loss', 'content': 0.006688456982374191, 'timestamp': '2025-09-30 22:10:00.193396', 'step': 153, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:00.257532', 'step': 153, 'epoch': 1} +{'type': 'loss', 'content': 0.02993638627231121, 'timestamp': '2025-09-30 22:10:00.260041', 'step': 154, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:00.326910', 'step': 154, 'epoch': 1} +{'type': 'loss', 'content': 0.03653806075453758, 'timestamp': '2025-09-30 22:10:00.336903', 'step': 155, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:00.399218', 'step': 155, 'epoch': 1} +{'type': 'loss', 'content': 0.02178972400724888, 'timestamp': '2025-09-30 22:10:00.406459', 'step': 156, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:00.471221', 'step': 156, 'epoch': 1} +{'type': 'loss', 'content': 0.03773200884461403, 'timestamp': '2025-09-30 22:10:00.481225', 'step': 157, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:00.544734', 'step': 157, 'epoch': 1} +{'type': 'loss', 'content': 0.06024942919611931, 'timestamp': '2025-09-30 22:10:00.555024', 'step': 158, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:00.626258', 'step': 158, 'epoch': 1} +{'type': 'loss', 'content': 0.051464516669511795, 'timestamp': '2025-09-30 22:10:00.629304', 'step': 159, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:00.695658', 'step': 159, 'epoch': 1} +{'type': 'loss', 'content': 0.03535311296582222, 'timestamp': '2025-09-30 22:10:00.705942', 'step': 160, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:00.770799', 'step': 160, 'epoch': 1} +{'type': 'loss', 'content': 0.021262094378471375, 'timestamp': '2025-09-30 22:10:00.773120', 'step': 161, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:00.836550', 'step': 161, 'epoch': 1} +{'type': 'loss', 'content': 0.02948129177093506, 'timestamp': '2025-09-30 22:10:00.840114', 'step': 162, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:00.903882', 'step': 162, 'epoch': 1} +{'type': 'loss', 'content': 0.020708870142698288, 'timestamp': '2025-09-30 22:10:00.918926', 'step': 163, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:00.987530', 'step': 163, 'epoch': 1} +{'type': 'loss', 'content': 0.01856974884867668, 'timestamp': '2025-09-30 22:10:01.003210', 'step': 164, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:01.086270', 'step': 164, 'epoch': 1} +{'type': 'loss', 'content': 0.02260742522776127, 'timestamp': '2025-09-30 22:10:01.089350', 'step': 165, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:01.159681', 'step': 165, 'epoch': 1} +{'type': 'loss', 'content': 0.024135034531354904, 'timestamp': '2025-09-30 22:10:01.167552', 'step': 166, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:01.253829', 'step': 166, 'epoch': 1} +{'type': 'loss', 'content': 0.024836549535393715, 'timestamp': '2025-09-30 22:10:01.257725', 'step': 167, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:01.317725', 'step': 167, 'epoch': 1} +{'type': 'loss', 'content': 0.021925508975982666, 'timestamp': '2025-09-30 22:10:01.324948', 'step': 168, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:01.383629', 'step': 168, 'epoch': 1} +{'type': 'loss', 'content': 0.026183176785707474, 'timestamp': '2025-09-30 22:10:01.386896', 'step': 169, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:01.445667', 'step': 169, 'epoch': 1} +{'type': 'loss', 'content': 0.027825552970170975, 'timestamp': '2025-09-30 22:10:01.448737', 'step': 170, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:01.514985', 'step': 170, 'epoch': 1} +{'type': 'loss', 'content': 0.02336815930902958, 'timestamp': '2025-09-30 22:10:01.524465', 'step': 171, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:10:03.122432', 'step': 171, 'epoch': 1} +{'type': 'pplx', 'content': 30444627.67679443, 'timestamp': '2025-09-30 22:10:03.130412', 'step': 171, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:03.189713', 'step': 171, 'epoch': 1} +{'type': 'loss', 'content': 0.025213051587343216, 'timestamp': '2025-09-30 22:10:03.207338', 'step': 172, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:03.270568', 'step': 172, 'epoch': 1} +{'type': 'loss', 'content': 0.026924652978777885, 'timestamp': '2025-09-30 22:10:03.289893', 'step': 173, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:03.358458', 'step': 173, 'epoch': 1} +{'type': 'loss', 'content': 0.019906342029571533, 'timestamp': '2025-09-30 22:10:03.367731', 'step': 174, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:03.431420', 'step': 174, 'epoch': 1} +{'type': 'loss', 'content': 0.02854965254664421, 'timestamp': '2025-09-30 22:10:03.435169', 'step': 175, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:03.502168', 'step': 175, 'epoch': 1} +{'type': 'loss', 'content': 0.029346134513616562, 'timestamp': '2025-09-30 22:10:03.513567', 'step': 176, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:03.569994', 'step': 176, 'epoch': 1} +{'type': 'loss', 'content': 0.027899976819753647, 'timestamp': '2025-09-30 22:10:03.595350', 'step': 177, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:03.660257', 'step': 177, 'epoch': 1} +{'type': 'loss', 'content': 0.031159192323684692, 'timestamp': '2025-09-30 22:10:03.664061', 'step': 178, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:03.758193', 'step': 178, 'epoch': 1} +{'type': 'loss', 'content': 0.025665393099188805, 'timestamp': '2025-09-30 22:10:03.762946', 'step': 179, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:03.828469', 'step': 179, 'epoch': 1} +{'type': 'loss', 'content': 0.02343871258199215, 'timestamp': '2025-09-30 22:10:03.862207', 'step': 180, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:03.934534', 'step': 180, 'epoch': 1} +{'type': 'loss', 'content': 0.023453569039702415, 'timestamp': '2025-09-30 22:10:03.941948', 'step': 181, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:04.015301', 'step': 181, 'epoch': 1} +{'type': 'loss', 'content': 0.025829827412962914, 'timestamp': '2025-09-30 22:10:04.020573', 'step': 182, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:04.091297', 'step': 182, 'epoch': 1} +{'type': 'loss', 'content': 0.029110519215464592, 'timestamp': '2025-09-30 22:10:04.102375', 'step': 183, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:04.171598', 'step': 183, 'epoch': 1} +{'type': 'loss', 'content': 0.02779129333794117, 'timestamp': '2025-09-30 22:10:04.182335', 'step': 184, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:04.244865', 'step': 184, 'epoch': 1} +{'type': 'loss', 'content': 0.026309384033083916, 'timestamp': '2025-09-30 22:10:04.248569', 'step': 185, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:04.317008', 'step': 185, 'epoch': 1} +{'type': 'loss', 'content': 0.020971303805708885, 'timestamp': '2025-09-30 22:10:04.319635', 'step': 186, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:04.389723', 'step': 186, 'epoch': 1} +{'type': 'loss', 'content': 0.026080194860696793, 'timestamp': '2025-09-30 22:10:04.398670', 'step': 187, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:04.462204', 'step': 187, 'epoch': 1} +{'type': 'loss', 'content': 0.022094666957855225, 'timestamp': '2025-09-30 22:10:04.474646', 'step': 188, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:04.546212', 'step': 188, 'epoch': 1} +{'type': 'loss', 'content': 0.019831160083413124, 'timestamp': '2025-09-30 22:10:04.548835', 'step': 189, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:04.611573', 'step': 189, 'epoch': 1} +{'type': 'loss', 'content': 0.02252844348549843, 'timestamp': '2025-09-30 22:10:04.615230', 'step': 190, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:04.698424', 'step': 190, 'epoch': 1} +{'type': 'loss', 'content': 0.021682869642972946, 'timestamp': '2025-09-30 22:10:04.700932', 'step': 191, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:04.769139', 'step': 191, 'epoch': 1} +{'type': 'loss', 'content': 0.02150973677635193, 'timestamp': '2025-09-30 22:10:04.775344', 'step': 192, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:04.834112', 'step': 192, 'epoch': 1} +{'type': 'loss', 'content': 0.014492852613329887, 'timestamp': '2025-09-30 22:10:04.843621', 'step': 193, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:04.918930', 'step': 193, 'epoch': 1} +{'type': 'loss', 'content': 0.015967510640621185, 'timestamp': '2025-09-30 22:10:04.923925', 'step': 194, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:04.980759', 'step': 194, 'epoch': 1} +{'type': 'loss', 'content': 0.01770109124481678, 'timestamp': '2025-09-30 22:10:04.994927', 'step': 195, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:05.070096', 'step': 195, 'epoch': 1} +{'type': 'loss', 'content': 0.02010829746723175, 'timestamp': '2025-09-30 22:10:05.080898', 'step': 196, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:05.151418', 'step': 196, 'epoch': 1} +{'type': 'loss', 'content': 0.010940426029264927, 'timestamp': '2025-09-30 22:10:05.165479', 'step': 197, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:05.227588', 'step': 197, 'epoch': 1} +{'type': 'loss', 'content': 0.02102290280163288, 'timestamp': '2025-09-30 22:10:05.233173', 'step': 198, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:05.295898', 'step': 198, 'epoch': 1} +{'type': 'loss', 'content': 0.038564298301935196, 'timestamp': '2025-09-30 22:10:05.299015', 'step': 199, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:05.358240', 'step': 199, 'epoch': 1} +{'type': 'loss', 'content': 0.030264217406511307, 'timestamp': '2025-09-30 22:10:05.372402', 'step': 200, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:05.436584', 'step': 200, 'epoch': 1} +{'type': 'loss', 'content': 0.019798846915364265, 'timestamp': '2025-09-30 22:10:05.440647', 'step': 201, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:05.511249', 'step': 201, 'epoch': 1} +{'type': 'loss', 'content': 0.02034393884241581, 'timestamp': '2025-09-30 22:10:05.522972', 'step': 202, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:05.588730', 'step': 202, 'epoch': 1} +{'type': 'loss', 'content': 0.019814888015389442, 'timestamp': '2025-09-30 22:10:05.599717', 'step': 203, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:05.665164', 'step': 203, 'epoch': 1} +{'type': 'loss', 'content': 0.01977935992181301, 'timestamp': '2025-09-30 22:10:05.671120', 'step': 204, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:05.733763', 'step': 204, 'epoch': 1} +{'type': 'loss', 'content': 0.029726387932896614, 'timestamp': '2025-09-30 22:10:05.736828', 'step': 205, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:05.793001', 'step': 205, 'epoch': 1} +{'type': 'loss', 'content': 0.008051145821809769, 'timestamp': '2025-09-30 22:10:05.796198', 'step': 206, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:05.855235', 'step': 206, 'epoch': 1} +{'type': 'loss', 'content': 0.03421095758676529, 'timestamp': '2025-09-30 22:10:05.859122', 'step': 207, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:05.918769', 'step': 207, 'epoch': 1} +{'type': 'loss', 'content': 0.01968538574874401, 'timestamp': '2025-09-30 22:10:05.925875', 'step': 208, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:05.982794', 'step': 208, 'epoch': 1} +{'type': 'loss', 'content': 0.03160746023058891, 'timestamp': '2025-09-30 22:10:05.988337', 'step': 209, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:06.054850', 'step': 209, 'epoch': 1} +{'type': 'loss', 'content': 0.04332786053419113, 'timestamp': '2025-09-30 22:10:06.062819', 'step': 210, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:06.139111', 'step': 210, 'epoch': 1} +{'type': 'loss', 'content': 0.020400291308760643, 'timestamp': '2025-09-30 22:10:06.146696', 'step': 211, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:06.214028', 'step': 211, 'epoch': 1} +{'type': 'loss', 'content': 0.05565622076392174, 'timestamp': '2025-09-30 22:10:06.223281', 'step': 212, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:06.280632', 'step': 212, 'epoch': 1} +{'type': 'loss', 'content': 0.03217899426817894, 'timestamp': '2025-09-30 22:10:06.287053', 'step': 213, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:06.350273', 'step': 213, 'epoch': 1} +{'type': 'loss', 'content': 0.01874413527548313, 'timestamp': '2025-09-30 22:10:06.356717', 'step': 214, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:06.422063', 'step': 214, 'epoch': 1} +{'type': 'loss', 'content': 0.037103354930877686, 'timestamp': '2025-09-30 22:10:06.429637', 'step': 215, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:06.493219', 'step': 215, 'epoch': 1} +{'type': 'loss', 'content': 0.030940750613808632, 'timestamp': '2025-09-30 22:10:06.505199', 'step': 216, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:06.578758', 'step': 216, 'epoch': 1} +{'type': 'loss', 'content': 0.0197443924844265, 'timestamp': '2025-09-30 22:10:06.585913', 'step': 217, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:06.658187', 'step': 217, 'epoch': 1} +{'type': 'loss', 'content': 0.020792236551642418, 'timestamp': '2025-09-30 22:10:06.660699', 'step': 218, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:06.717629', 'step': 218, 'epoch': 1} +{'type': 'loss', 'content': 0.02086322009563446, 'timestamp': '2025-09-30 22:10:06.720244', 'step': 219, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:06.795016', 'step': 219, 'epoch': 1} +{'type': 'loss', 'content': 0.01735932007431984, 'timestamp': '2025-09-30 22:10:06.801454', 'step': 220, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:06.859512', 'step': 220, 'epoch': 1} +{'type': 'loss', 'content': 0.023645946756005287, 'timestamp': '2025-09-30 22:10:06.862985', 'step': 221, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:06.923013', 'step': 221, 'epoch': 1} +{'type': 'loss', 'content': 0.026022594422101974, 'timestamp': '2025-09-30 22:10:06.926308', 'step': 222, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:06.984919', 'step': 222, 'epoch': 1} +{'type': 'loss', 'content': 0.02121904492378235, 'timestamp': '2025-09-30 22:10:06.991504', 'step': 223, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:07.066529', 'step': 223, 'epoch': 1} +{'type': 'loss', 'content': 0.027569929137825966, 'timestamp': '2025-09-30 22:10:07.073649', 'step': 224, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:07.131861', 'step': 224, 'epoch': 1} +{'type': 'loss', 'content': 0.022572649642825127, 'timestamp': '2025-09-30 22:10:07.139150', 'step': 225, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:07.196403', 'step': 225, 'epoch': 1} +{'type': 'loss', 'content': 0.02417564019560814, 'timestamp': '2025-09-30 22:10:07.198523', 'step': 226, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:07.255106', 'step': 226, 'epoch': 1} +{'type': 'loss', 'content': 0.027327995747327805, 'timestamp': '2025-09-30 22:10:07.260826', 'step': 227, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:07.335925', 'step': 227, 'epoch': 1} +{'type': 'loss', 'content': 0.018605349585413933, 'timestamp': '2025-09-30 22:10:07.346966', 'step': 228, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:10:08.896406', 'step': 228, 'epoch': 1} +{'type': 'pplx', 'content': 30884345.685848907, 'timestamp': '2025-09-30 22:10:08.897993', 'step': 228, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:08.949580', 'step': 228, 'epoch': 1} +{'type': 'loss', 'content': 0.021486392244696617, 'timestamp': '2025-09-30 22:10:08.952193', 'step': 229, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:09.007817', 'step': 229, 'epoch': 1} +{'type': 'loss', 'content': 0.02346794866025448, 'timestamp': '2025-09-30 22:10:09.010228', 'step': 230, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:09.066408', 'step': 230, 'epoch': 1} +{'type': 'loss', 'content': 0.022495094686746597, 'timestamp': '2025-09-30 22:10:09.068495', 'step': 231, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:09.130931', 'step': 231, 'epoch': 1} +{'type': 'loss', 'content': 0.025627722963690758, 'timestamp': '2025-09-30 22:10:09.136540', 'step': 232, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:09.192596', 'step': 232, 'epoch': 1} +{'type': 'loss', 'content': 0.025128453969955444, 'timestamp': '2025-09-30 22:10:09.194667', 'step': 233, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:09.257595', 'step': 233, 'epoch': 1} +{'type': 'loss', 'content': 0.021439703181385994, 'timestamp': '2025-09-30 22:10:09.262908', 'step': 234, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:09.320130', 'step': 234, 'epoch': 1} +{'type': 'loss', 'content': 0.025162285193800926, 'timestamp': '2025-09-30 22:10:09.322981', 'step': 235, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:09.379439', 'step': 235, 'epoch': 1} +{'type': 'loss', 'content': 0.026900988072156906, 'timestamp': '2025-09-30 22:10:09.386142', 'step': 236, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:09.441059', 'step': 236, 'epoch': 1} +{'type': 'loss', 'content': 0.024046484380960464, 'timestamp': '2025-09-30 22:10:09.445390', 'step': 237, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:09.516816', 'step': 237, 'epoch': 1} +{'type': 'loss', 'content': 0.023616518825292587, 'timestamp': '2025-09-30 22:10:09.520114', 'step': 238, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:09.578308', 'step': 238, 'epoch': 1} +{'type': 'loss', 'content': 0.025127580389380455, 'timestamp': '2025-09-30 22:10:09.581575', 'step': 239, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:09.651946', 'step': 239, 'epoch': 1} +{'type': 'loss', 'content': 0.02287052944302559, 'timestamp': '2025-09-30 22:10:09.659087', 'step': 240, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:09.721058', 'step': 240, 'epoch': 1} +{'type': 'loss', 'content': 0.022945618256926537, 'timestamp': '2025-09-30 22:10:09.723823', 'step': 241, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:09.785061', 'step': 241, 'epoch': 1} +{'type': 'loss', 'content': 0.023390578106045723, 'timestamp': '2025-09-30 22:10:09.791062', 'step': 242, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:09.854408', 'step': 242, 'epoch': 1} +{'type': 'loss', 'content': 0.022397786378860474, 'timestamp': '2025-09-30 22:10:09.858275', 'step': 243, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:09.915704', 'step': 243, 'epoch': 1} +{'type': 'loss', 'content': 0.021386751905083656, 'timestamp': '2025-09-30 22:10:09.923545', 'step': 244, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:10.001700', 'step': 244, 'epoch': 1} +{'type': 'loss', 'content': 0.02710045874118805, 'timestamp': '2025-09-30 22:10:10.004521', 'step': 245, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:10.076145', 'step': 245, 'epoch': 1} +{'type': 'loss', 'content': 0.026470202952623367, 'timestamp': '2025-09-30 22:10:10.083954', 'step': 246, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:10.146486', 'step': 246, 'epoch': 1} +{'type': 'loss', 'content': 0.025309713557362556, 'timestamp': '2025-09-30 22:10:10.149331', 'step': 247, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:10.207779', 'step': 247, 'epoch': 1} +{'type': 'loss', 'content': 0.023809578269720078, 'timestamp': '2025-09-30 22:10:10.218339', 'step': 248, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:10.277901', 'step': 248, 'epoch': 1} +{'type': 'loss', 'content': 0.028678497299551964, 'timestamp': '2025-09-30 22:10:10.286335', 'step': 249, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:10.367509', 'step': 249, 'epoch': 1} +{'type': 'loss', 'content': 0.019650932401418686, 'timestamp': '2025-09-30 22:10:10.370450', 'step': 250, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:10.429746', 'step': 250, 'epoch': 1} +{'type': 'loss', 'content': 0.025458762422204018, 'timestamp': '2025-09-30 22:10:10.438017', 'step': 251, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:10.495845', 'step': 251, 'epoch': 1} +{'type': 'loss', 'content': 0.017796490341424942, 'timestamp': '2025-09-30 22:10:10.505651', 'step': 252, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:10.567499', 'step': 252, 'epoch': 1} +{'type': 'loss', 'content': 0.015613814815878868, 'timestamp': '2025-09-30 22:10:10.575269', 'step': 253, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:10.640852', 'step': 253, 'epoch': 1} +{'type': 'loss', 'content': 0.026363762095570564, 'timestamp': '2025-09-30 22:10:10.648408', 'step': 254, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:10.708360', 'step': 254, 'epoch': 1} +{'type': 'loss', 'content': 0.018938321620225906, 'timestamp': '2025-09-30 22:10:10.719034', 'step': 255, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:10:10.779213', 'step': 255, 'epoch': 1} +{'type': 'loss', 'content': 0.019102880731225014, 'timestamp': '2025-09-30 22:10:10.792170', 'step': 256, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:10.853523', 'step': 256, 'epoch': 1} +{'type': 'loss', 'content': 0.015843430534005165, 'timestamp': '2025-09-30 22:10:10.861130', 'step': 257, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:10.919829', 'step': 257, 'epoch': 1} +{'type': 'loss', 'content': 0.033805202692747116, 'timestamp': '2025-09-30 22:10:10.930292', 'step': 258, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:10.995113', 'step': 258, 'epoch': 1} +{'type': 'loss', 'content': 0.024396615102887154, 'timestamp': '2025-09-30 22:10:10.998228', 'step': 259, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:11.058884', 'step': 259, 'epoch': 1} +{'type': 'loss', 'content': 0.020021233707666397, 'timestamp': '2025-09-30 22:10:11.070816', 'step': 260, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:11.140519', 'step': 260, 'epoch': 1} +{'type': 'loss', 'content': 0.02062203176319599, 'timestamp': '2025-09-30 22:10:11.146244', 'step': 261, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:11.216345', 'step': 261, 'epoch': 1} +{'type': 'loss', 'content': 0.021955931559205055, 'timestamp': '2025-09-30 22:10:11.218795', 'step': 262, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:11.277173', 'step': 262, 'epoch': 1} +{'type': 'loss', 'content': 0.021168585866689682, 'timestamp': '2025-09-30 22:10:11.280607', 'step': 263, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:11.337964', 'step': 263, 'epoch': 1} +{'type': 'loss', 'content': 0.014190520159900188, 'timestamp': '2025-09-30 22:10:11.344624', 'step': 264, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:11.404989', 'step': 264, 'epoch': 1} +{'type': 'loss', 'content': 0.021121378988027573, 'timestamp': '2025-09-30 22:10:11.408576', 'step': 265, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:11.466890', 'step': 265, 'epoch': 1} +{'type': 'loss', 'content': 0.019114112481474876, 'timestamp': '2025-09-30 22:10:11.470548', 'step': 266, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:11.535659', 'step': 266, 'epoch': 1} +{'type': 'loss', 'content': 0.025738313794136047, 'timestamp': '2025-09-30 22:10:11.539012', 'step': 267, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:11.606169', 'step': 267, 'epoch': 1} +{'type': 'loss', 'content': 0.03802191838622093, 'timestamp': '2025-09-30 22:10:11.612777', 'step': 268, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:11.672111', 'step': 268, 'epoch': 1} +{'type': 'loss', 'content': 0.018177002668380737, 'timestamp': '2025-09-30 22:10:11.674706', 'step': 269, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:11.747999', 'step': 269, 'epoch': 1} +{'type': 'loss', 'content': 0.0222756527364254, 'timestamp': '2025-09-30 22:10:11.752497', 'step': 270, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:11.820640', 'step': 270, 'epoch': 1} +{'type': 'loss', 'content': 0.02620212733745575, 'timestamp': '2025-09-30 22:10:11.827843', 'step': 271, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:11.890053', 'step': 271, 'epoch': 1} +{'type': 'loss', 'content': 0.019811127334833145, 'timestamp': '2025-09-30 22:10:11.902278', 'step': 272, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:11.969423', 'step': 272, 'epoch': 1} +{'type': 'loss', 'content': 0.03354780375957489, 'timestamp': '2025-09-30 22:10:11.977684', 'step': 273, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:12.043117', 'step': 273, 'epoch': 1} +{'type': 'loss', 'content': 0.011064850725233555, 'timestamp': '2025-09-30 22:10:12.052548', 'step': 274, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:12.123721', 'step': 274, 'epoch': 1} +{'type': 'loss', 'content': 0.02540605701506138, 'timestamp': '2025-09-30 22:10:12.131500', 'step': 275, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:12.199338', 'step': 275, 'epoch': 1} +{'type': 'loss', 'content': 0.022669294849038124, 'timestamp': '2025-09-30 22:10:12.206727', 'step': 276, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:12.268274', 'step': 276, 'epoch': 1} +{'type': 'loss', 'content': 0.010532691143453121, 'timestamp': '2025-09-30 22:10:12.271962', 'step': 277, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:12.339414', 'step': 277, 'epoch': 1} +{'type': 'loss', 'content': 0.03192717209458351, 'timestamp': '2025-09-30 22:10:12.346985', 'step': 278, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:12.407191', 'step': 278, 'epoch': 1} +{'type': 'loss', 'content': 0.027782125398516655, 'timestamp': '2025-09-30 22:10:12.413865', 'step': 279, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:12.469832', 'step': 279, 'epoch': 1} +{'type': 'loss', 'content': 0.04358547180891037, 'timestamp': '2025-09-30 22:10:12.484214', 'step': 280, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:12.545511', 'step': 280, 'epoch': 1} +{'type': 'loss', 'content': 0.018636440858244896, 'timestamp': '2025-09-30 22:10:12.556165', 'step': 281, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:12.620115', 'step': 281, 'epoch': 1} +{'type': 'loss', 'content': 0.01732656918466091, 'timestamp': '2025-09-30 22:10:12.623301', 'step': 282, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:12.680350', 'step': 282, 'epoch': 1} +{'type': 'loss', 'content': 0.021487489342689514, 'timestamp': '2025-09-30 22:10:12.689464', 'step': 283, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:12.750692', 'step': 283, 'epoch': 1} +{'type': 'loss', 'content': 0.031494829803705215, 'timestamp': '2025-09-30 22:10:12.757457', 'step': 284, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:12.818583', 'step': 284, 'epoch': 1} +{'type': 'loss', 'content': 0.02197718620300293, 'timestamp': '2025-09-30 22:10:12.820809', 'step': 285, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:10:14.137228', 'step': 285, 'epoch': 1} +{'type': 'pplx', 'content': 32380318.740329083, 'timestamp': '2025-09-30 22:10:14.140588', 'step': 285, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:14.192593', 'step': 285, 'epoch': 1} +{'type': 'loss', 'content': 0.012143162079155445, 'timestamp': '2025-09-30 22:10:14.195045', 'step': 286, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:10:14.249538', 'step': 286, 'epoch': 1} +{'type': 'loss', 'content': 0.018916593864560127, 'timestamp': '2025-09-30 22:10:14.252578', 'step': 287, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:14.309892', 'step': 287, 'epoch': 1} +{'type': 'loss', 'content': 0.03274720534682274, 'timestamp': '2025-09-30 22:10:14.316172', 'step': 288, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:14.369291', 'step': 288, 'epoch': 1} +{'type': 'loss', 'content': 0.02828342653810978, 'timestamp': '2025-09-30 22:10:14.373423', 'step': 289, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:14.429706', 'step': 289, 'epoch': 1} +{'type': 'loss', 'content': 0.013313318602740765, 'timestamp': '2025-09-30 22:10:14.434084', 'step': 290, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:14.497217', 'step': 290, 'epoch': 1} +{'type': 'loss', 'content': 0.022141721099615097, 'timestamp': '2025-09-30 22:10:14.499249', 'step': 291, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:14.557870', 'step': 291, 'epoch': 1} +{'type': 'loss', 'content': 0.026357505470514297, 'timestamp': '2025-09-30 22:10:14.566973', 'step': 292, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:14.621517', 'step': 292, 'epoch': 1} +{'type': 'loss', 'content': 0.01601177267730236, 'timestamp': '2025-09-30 22:10:14.631578', 'step': 293, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:14.685483', 'step': 293, 'epoch': 1} +{'type': 'loss', 'content': 0.028203275054693222, 'timestamp': '2025-09-30 22:10:14.690289', 'step': 294, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:14.748068', 'step': 294, 'epoch': 1} +{'type': 'loss', 'content': 0.019105268642306328, 'timestamp': '2025-09-30 22:10:14.751600', 'step': 295, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:14.806342', 'step': 295, 'epoch': 1} +{'type': 'loss', 'content': 0.016064850613474846, 'timestamp': '2025-09-30 22:10:14.814490', 'step': 296, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:14.870963', 'step': 296, 'epoch': 1} +{'type': 'loss', 'content': 0.020204851403832436, 'timestamp': '2025-09-30 22:10:14.872966', 'step': 297, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:14.926632', 'step': 297, 'epoch': 1} +{'type': 'loss', 'content': 0.026143083348870277, 'timestamp': '2025-09-30 22:10:14.932947', 'step': 298, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:14.991347', 'step': 298, 'epoch': 1} +{'type': 'loss', 'content': 0.015985313802957535, 'timestamp': '2025-09-30 22:10:14.997778', 'step': 299, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:15.061141', 'step': 299, 'epoch': 1} +{'type': 'loss', 'content': 0.01684342697262764, 'timestamp': '2025-09-30 22:10:15.068483', 'step': 300, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.124062', 'step': 300, 'epoch': 1} +{'type': 'loss', 'content': 0.02533099241554737, 'timestamp': '2025-09-30 22:10:15.128923', 'step': 301, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:15.184753', 'step': 301, 'epoch': 1} +{'type': 'loss', 'content': 0.015226058661937714, 'timestamp': '2025-09-30 22:10:15.191351', 'step': 302, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.249276', 'step': 302, 'epoch': 1} +{'type': 'loss', 'content': 0.013342132791876793, 'timestamp': '2025-09-30 22:10:15.253023', 'step': 303, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.308744', 'step': 303, 'epoch': 1} +{'type': 'loss', 'content': 0.01799563132226467, 'timestamp': '2025-09-30 22:10:15.315259', 'step': 304, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.371443', 'step': 304, 'epoch': 1} +{'type': 'loss', 'content': 0.02207523211836815, 'timestamp': '2025-09-30 22:10:15.374857', 'step': 305, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.433040', 'step': 305, 'epoch': 1} +{'type': 'loss', 'content': 0.026255745440721512, 'timestamp': '2025-09-30 22:10:15.435725', 'step': 306, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.497514', 'step': 306, 'epoch': 1} +{'type': 'loss', 'content': 0.02717163972556591, 'timestamp': '2025-09-30 22:10:15.501327', 'step': 307, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.556699', 'step': 307, 'epoch': 1} +{'type': 'loss', 'content': 0.018035126850008965, 'timestamp': '2025-09-30 22:10:15.564100', 'step': 308, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.619353', 'step': 308, 'epoch': 1} +{'type': 'loss', 'content': 0.009531312622129917, 'timestamp': '2025-09-30 22:10:15.622592', 'step': 309, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.680727', 'step': 309, 'epoch': 1} +{'type': 'loss', 'content': 0.030687615275382996, 'timestamp': '2025-09-30 22:10:15.692879', 'step': 310, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.752017', 'step': 310, 'epoch': 1} +{'type': 'loss', 'content': 0.029424799606204033, 'timestamp': '2025-09-30 22:10:15.754868', 'step': 311, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.819561', 'step': 311, 'epoch': 1} +{'type': 'loss', 'content': 0.019238030537962914, 'timestamp': '2025-09-30 22:10:15.826004', 'step': 312, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.886926', 'step': 312, 'epoch': 1} +{'type': 'loss', 'content': 0.024589749053120613, 'timestamp': '2025-09-30 22:10:15.894310', 'step': 313, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:15.957840', 'step': 313, 'epoch': 1} +{'type': 'loss', 'content': 0.017089612782001495, 'timestamp': '2025-09-30 22:10:15.961295', 'step': 314, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:16.027620', 'step': 314, 'epoch': 1} +{'type': 'loss', 'content': 0.029858587309718132, 'timestamp': '2025-09-30 22:10:16.030633', 'step': 315, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:16.090178', 'step': 315, 'epoch': 1} +{'type': 'loss', 'content': 0.008975454606115818, 'timestamp': '2025-09-30 22:10:16.099786', 'step': 316, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:16.157823', 'step': 316, 'epoch': 1} +{'type': 'loss', 'content': 0.01512650866061449, 'timestamp': '2025-09-30 22:10:16.160346', 'step': 317, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:16.215635', 'step': 317, 'epoch': 1} +{'type': 'loss', 'content': 0.027577145025134087, 'timestamp': '2025-09-30 22:10:16.219071', 'step': 318, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:16.274323', 'step': 318, 'epoch': 1} +{'type': 'loss', 'content': 0.017553279176354408, 'timestamp': '2025-09-30 22:10:16.277723', 'step': 319, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:16.336932', 'step': 319, 'epoch': 1} +{'type': 'loss', 'content': 0.019269872456789017, 'timestamp': '2025-09-30 22:10:16.343333', 'step': 320, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:16.397626', 'step': 320, 'epoch': 1} +{'type': 'loss', 'content': 0.019443640485405922, 'timestamp': '2025-09-30 22:10:16.405909', 'step': 321, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:16.465354', 'step': 321, 'epoch': 1} +{'type': 'loss', 'content': 0.03317169472575188, 'timestamp': '2025-09-30 22:10:16.467887', 'step': 322, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:16.528175', 'step': 322, 'epoch': 1} +{'type': 'loss', 'content': 0.029915081337094307, 'timestamp': '2025-09-30 22:10:16.532803', 'step': 323, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:16.587604', 'step': 323, 'epoch': 1} +{'type': 'loss', 'content': 0.02484137937426567, 'timestamp': '2025-09-30 22:10:16.596854', 'step': 324, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:16.662203', 'step': 324, 'epoch': 1} +{'type': 'loss', 'content': 0.022063011303544044, 'timestamp': '2025-09-30 22:10:16.665242', 'step': 325, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:16.720220', 'step': 325, 'epoch': 1} +{'type': 'loss', 'content': 0.0251996461302042, 'timestamp': '2025-09-30 22:10:16.723926', 'step': 326, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:16.778169', 'step': 326, 'epoch': 1} +{'type': 'loss', 'content': 0.027159083634614944, 'timestamp': '2025-09-30 22:10:16.781078', 'step': 327, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:10:16.837381', 'step': 327, 'epoch': 1} +{'type': 'loss', 'content': 0.02359570376574993, 'timestamp': '2025-09-30 22:10:16.843453', 'step': 328, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:16.897805', 'step': 328, 'epoch': 1} +{'type': 'loss', 'content': 0.028713062405586243, 'timestamp': '2025-09-30 22:10:16.900001', 'step': 329, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:16.958749', 'step': 329, 'epoch': 1} +{'type': 'loss', 'content': 0.028622686862945557, 'timestamp': '2025-09-30 22:10:16.962010', 'step': 330, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:17.019372', 'step': 330, 'epoch': 1} +{'type': 'loss', 'content': 0.01968633010983467, 'timestamp': '2025-09-30 22:10:17.022951', 'step': 331, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:17.077163', 'step': 331, 'epoch': 1} +{'type': 'loss', 'content': 0.01846577599644661, 'timestamp': '2025-09-30 22:10:17.082746', 'step': 332, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:17.136997', 'step': 332, 'epoch': 1} +{'type': 'loss', 'content': 0.026781436055898666, 'timestamp': '2025-09-30 22:10:17.139637', 'step': 333, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:17.194116', 'step': 333, 'epoch': 1} +{'type': 'loss', 'content': 0.021621640771627426, 'timestamp': '2025-09-30 22:10:17.196713', 'step': 334, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:17.249465', 'step': 334, 'epoch': 1} +{'type': 'loss', 'content': 0.0181210245937109, 'timestamp': '2025-09-30 22:10:17.255248', 'step': 335, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:10:17.318124', 'step': 335, 'epoch': 1} +{'type': 'loss', 'content': 0.019643476232886314, 'timestamp': '2025-09-30 22:10:17.324116', 'step': 336, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:17.377700', 'step': 336, 'epoch': 1} +{'type': 'loss', 'content': 0.023817529901862144, 'timestamp': '2025-09-30 22:10:17.380338', 'step': 337, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:17.434506', 'step': 337, 'epoch': 1} +{'type': 'loss', 'content': 0.017907124012708664, 'timestamp': '2025-09-30 22:10:17.437861', 'step': 338, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:17.490611', 'step': 338, 'epoch': 1} +{'type': 'loss', 'content': 0.025304758921265602, 'timestamp': '2025-09-30 22:10:17.494601', 'step': 339, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:17.549116', 'step': 339, 'epoch': 1} +{'type': 'loss', 'content': 0.01915588602423668, 'timestamp': '2025-09-30 22:10:17.555595', 'step': 340, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:17.609286', 'step': 340, 'epoch': 1} +{'type': 'loss', 'content': 0.019800949841737747, 'timestamp': '2025-09-30 22:10:17.616037', 'step': 341, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:17.671375', 'step': 341, 'epoch': 1} +{'type': 'loss', 'content': 0.027113808318972588, 'timestamp': '2025-09-30 22:10:17.677090', 'step': 342, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:10:18.928133', 'step': 342, 'epoch': 1} +{'type': 'pplx', 'content': 33118764.612160176, 'timestamp': '2025-09-30 22:10:18.930856', 'step': 342, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:18.982725', 'step': 342, 'epoch': 1} +{'type': 'loss', 'content': 0.021984072402119637, 'timestamp': '2025-09-30 22:10:18.987387', 'step': 343, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:19.044229', 'step': 343, 'epoch': 1} +{'type': 'loss', 'content': 0.03714148327708244, 'timestamp': '2025-09-30 22:10:19.050010', 'step': 344, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:19.103528', 'step': 344, 'epoch': 1} +{'type': 'loss', 'content': 0.02487654611468315, 'timestamp': '2025-09-30 22:10:19.106751', 'step': 345, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:19.161194', 'step': 345, 'epoch': 1} +{'type': 'loss', 'content': 0.027115579694509506, 'timestamp': '2025-09-30 22:10:19.164115', 'step': 346, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:19.218126', 'step': 346, 'epoch': 1} +{'type': 'loss', 'content': 0.021236712113022804, 'timestamp': '2025-09-30 22:10:19.220655', 'step': 347, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:19.273985', 'step': 347, 'epoch': 1} +{'type': 'loss', 'content': 0.02278432808816433, 'timestamp': '2025-09-30 22:10:19.279822', 'step': 348, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:19.332898', 'step': 348, 'epoch': 1} +{'type': 'loss', 'content': 0.018770869821310043, 'timestamp': '2025-09-30 22:10:19.335724', 'step': 349, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:19.390109', 'step': 349, 'epoch': 1} +{'type': 'loss', 'content': 0.02387206256389618, 'timestamp': '2025-09-30 22:10:19.393380', 'step': 350, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:19.449148', 'step': 350, 'epoch': 1} +{'type': 'loss', 'content': 0.029281629249453545, 'timestamp': '2025-09-30 22:10:19.452340', 'step': 351, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:19.505888', 'step': 351, 'epoch': 1} +{'type': 'loss', 'content': 0.023951295763254166, 'timestamp': '2025-09-30 22:10:19.512866', 'step': 352, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:19.572066', 'step': 352, 'epoch': 1} +{'type': 'loss', 'content': 0.033801205456256866, 'timestamp': '2025-09-30 22:10:19.574004', 'step': 353, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:19.628517', 'step': 353, 'epoch': 1} +{'type': 'loss', 'content': 0.02057676389813423, 'timestamp': '2025-09-30 22:10:19.630640', 'step': 354, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:19.684004', 'step': 354, 'epoch': 1} +{'type': 'loss', 'content': 0.024962907657027245, 'timestamp': '2025-09-30 22:10:19.686449', 'step': 355, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:19.739625', 'step': 355, 'epoch': 1} +{'type': 'loss', 'content': 0.015609494410455227, 'timestamp': '2025-09-30 22:10:19.745689', 'step': 356, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:19.801726', 'step': 356, 'epoch': 1} +{'type': 'loss', 'content': 0.02138134464621544, 'timestamp': '2025-09-30 22:10:19.804873', 'step': 357, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:19.857927', 'step': 357, 'epoch': 1} +{'type': 'loss', 'content': 0.01888282783329487, 'timestamp': '2025-09-30 22:10:19.860392', 'step': 358, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:19.913884', 'step': 358, 'epoch': 1} +{'type': 'loss', 'content': 0.01716557890176773, 'timestamp': '2025-09-30 22:10:19.916491', 'step': 359, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:19.970266', 'step': 359, 'epoch': 1} +{'type': 'loss', 'content': 0.01593884639441967, 'timestamp': '2025-09-30 22:10:19.976418', 'step': 360, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:20.035102', 'step': 360, 'epoch': 1} +{'type': 'loss', 'content': 0.024571670219302177, 'timestamp': '2025-09-30 22:10:20.038125', 'step': 361, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:20.092636', 'step': 361, 'epoch': 1} +{'type': 'loss', 'content': 0.02865268848836422, 'timestamp': '2025-09-30 22:10:20.094879', 'step': 362, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:20.149660', 'step': 362, 'epoch': 1} +{'type': 'loss', 'content': 0.020879996940493584, 'timestamp': '2025-09-30 22:10:20.152442', 'step': 363, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:20.207160', 'step': 363, 'epoch': 1} +{'type': 'loss', 'content': 0.015153266489505768, 'timestamp': '2025-09-30 22:10:20.213071', 'step': 364, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:20.265866', 'step': 364, 'epoch': 1} +{'type': 'loss', 'content': 0.026744915172457695, 'timestamp': '2025-09-30 22:10:20.269002', 'step': 365, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:20.323260', 'step': 365, 'epoch': 1} +{'type': 'loss', 'content': 0.014960569329559803, 'timestamp': '2025-09-30 22:10:20.326362', 'step': 366, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:10:20.380718', 'step': 366, 'epoch': 1} +{'type': 'loss', 'content': 0.019018033519387245, 'timestamp': '2025-09-30 22:10:20.383960', 'step': 367, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:20.438786', 'step': 367, 'epoch': 1} +{'type': 'loss', 'content': 0.027187785133719444, 'timestamp': '2025-09-30 22:10:20.444869', 'step': 368, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:20.499318', 'step': 368, 'epoch': 1} +{'type': 'loss', 'content': 0.02450934611260891, 'timestamp': '2025-09-30 22:10:20.501389', 'step': 369, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:20.561671', 'step': 369, 'epoch': 1} +{'type': 'loss', 'content': 0.02450348250567913, 'timestamp': '2025-09-30 22:10:20.564282', 'step': 370, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:20.618016', 'step': 370, 'epoch': 1} +{'type': 'loss', 'content': 0.013314715586602688, 'timestamp': '2025-09-30 22:10:20.620695', 'step': 371, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:20.681024', 'step': 371, 'epoch': 1} +{'type': 'loss', 'content': 0.03389272093772888, 'timestamp': '2025-09-30 22:10:20.686886', 'step': 372, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:20.743579', 'step': 372, 'epoch': 1} +{'type': 'loss', 'content': 0.023728221654891968, 'timestamp': '2025-09-30 22:10:20.746212', 'step': 373, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:20.805459', 'step': 373, 'epoch': 1} +{'type': 'loss', 'content': 0.02945546805858612, 'timestamp': '2025-09-30 22:10:20.808132', 'step': 374, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:20.862539', 'step': 374, 'epoch': 1} +{'type': 'loss', 'content': 0.016989445313811302, 'timestamp': '2025-09-30 22:10:20.865660', 'step': 375, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:20.919975', 'step': 375, 'epoch': 1} +{'type': 'loss', 'content': 0.016743594780564308, 'timestamp': '2025-09-30 22:10:20.925919', 'step': 376, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:20.983291', 'step': 376, 'epoch': 1} +{'type': 'loss', 'content': 0.012260585092008114, 'timestamp': '2025-09-30 22:10:20.985700', 'step': 377, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:21.041935', 'step': 377, 'epoch': 1} +{'type': 'loss', 'content': 0.018505526706576347, 'timestamp': '2025-09-30 22:10:21.044221', 'step': 378, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:21.099361', 'step': 378, 'epoch': 1} +{'type': 'loss', 'content': 0.02107120119035244, 'timestamp': '2025-09-30 22:10:21.101514', 'step': 379, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:21.159867', 'step': 379, 'epoch': 1} +{'type': 'loss', 'content': 0.02227369323372841, 'timestamp': '2025-09-30 22:10:21.165467', 'step': 380, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:21.218242', 'step': 380, 'epoch': 1} +{'type': 'loss', 'content': 0.025716906413435936, 'timestamp': '2025-09-30 22:10:21.220708', 'step': 381, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:21.278757', 'step': 381, 'epoch': 1} +{'type': 'loss', 'content': 0.019525570794939995, 'timestamp': '2025-09-30 22:10:21.280994', 'step': 382, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:21.342716', 'step': 382, 'epoch': 1} +{'type': 'loss', 'content': 0.012794404290616512, 'timestamp': '2025-09-30 22:10:21.345127', 'step': 383, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:21.401362', 'step': 383, 'epoch': 1} +{'type': 'loss', 'content': 0.02354384958744049, 'timestamp': '2025-09-30 22:10:21.406613', 'step': 384, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:21.464824', 'step': 384, 'epoch': 1} +{'type': 'loss', 'content': 0.010914976708590984, 'timestamp': '2025-09-30 22:10:21.468655', 'step': 385, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:21.522272', 'step': 385, 'epoch': 1} +{'type': 'loss', 'content': 0.017349114641547203, 'timestamp': '2025-09-30 22:10:21.524393', 'step': 386, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:21.583430', 'step': 386, 'epoch': 1} +{'type': 'loss', 'content': 0.018263814970850945, 'timestamp': '2025-09-30 22:10:21.586006', 'step': 387, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:21.649324', 'step': 387, 'epoch': 1} +{'type': 'loss', 'content': 0.019998768344521523, 'timestamp': '2025-09-30 22:10:21.654921', 'step': 388, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:21.715235', 'step': 388, 'epoch': 1} +{'type': 'loss', 'content': 0.016233079135417938, 'timestamp': '2025-09-30 22:10:21.717707', 'step': 389, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:21.775916', 'step': 389, 'epoch': 1} +{'type': 'loss', 'content': 0.02938160113990307, 'timestamp': '2025-09-30 22:10:21.778133', 'step': 390, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:21.833207', 'step': 390, 'epoch': 1} +{'type': 'loss', 'content': 0.03876896947622299, 'timestamp': '2025-09-30 22:10:21.835192', 'step': 391, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:21.891248', 'step': 391, 'epoch': 1} +{'type': 'loss', 'content': 0.013861365616321564, 'timestamp': '2025-09-30 22:10:21.897422', 'step': 392, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:21.953983', 'step': 392, 'epoch': 1} +{'type': 'loss', 'content': 0.01384772453457117, 'timestamp': '2025-09-30 22:10:21.956099', 'step': 393, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:10:22.011307', 'step': 393, 'epoch': 1} +{'type': 'loss', 'content': 0.020624876022338867, 'timestamp': '2025-09-30 22:10:22.013443', 'step': 394, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:22.067347', 'step': 394, 'epoch': 1} +{'type': 'loss', 'content': 0.00986799132078886, 'timestamp': '2025-09-30 22:10:22.069504', 'step': 395, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:22.125853', 'step': 395, 'epoch': 1} +{'type': 'loss', 'content': 0.023464640602469444, 'timestamp': '2025-09-30 22:10:22.131474', 'step': 396, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:22.190813', 'step': 396, 'epoch': 1} +{'type': 'loss', 'content': 0.009329124353826046, 'timestamp': '2025-09-30 22:10:22.193032', 'step': 397, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:22.251437', 'step': 397, 'epoch': 1} +{'type': 'loss', 'content': 0.013422129675745964, 'timestamp': '2025-09-30 22:10:22.253601', 'step': 398, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:22.313231', 'step': 398, 'epoch': 1} +{'type': 'loss', 'content': 0.023139040917158127, 'timestamp': '2025-09-30 22:10:22.315268', 'step': 399, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:10:23.660296', 'step': 399, 'epoch': 1} +{'type': 'pplx', 'content': 40589601.62274881, 'timestamp': '2025-09-30 22:10:23.662816', 'step': 399, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:23.715616', 'step': 399, 'epoch': 1} +{'type': 'loss', 'content': 0.01724778302013874, 'timestamp': '2025-09-30 22:10:23.721952', 'step': 400, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:23.777851', 'step': 400, 'epoch': 1} +{'type': 'loss', 'content': 0.008374908939003944, 'timestamp': '2025-09-30 22:10:23.780509', 'step': 401, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:23.835680', 'step': 401, 'epoch': 1} +{'type': 'loss', 'content': 0.01957196369767189, 'timestamp': '2025-09-30 22:10:23.838371', 'step': 402, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:23.894866', 'step': 402, 'epoch': 1} +{'type': 'loss', 'content': 0.018934715539216995, 'timestamp': '2025-09-30 22:10:23.897082', 'step': 403, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:23.953152', 'step': 403, 'epoch': 1} +{'type': 'loss', 'content': 0.03373418375849724, 'timestamp': '2025-09-30 22:10:23.962739', 'step': 404, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:24.018580', 'step': 404, 'epoch': 1} +{'type': 'loss', 'content': 0.027509469538927078, 'timestamp': '2025-09-30 22:10:24.022584', 'step': 405, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:24.078777', 'step': 405, 'epoch': 1} +{'type': 'loss', 'content': 0.03421385958790779, 'timestamp': '2025-09-30 22:10:24.081776', 'step': 406, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:24.138442', 'step': 406, 'epoch': 1} +{'type': 'loss', 'content': 0.011767718009650707, 'timestamp': '2025-09-30 22:10:24.140810', 'step': 407, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:24.197396', 'step': 407, 'epoch': 1} +{'type': 'loss', 'content': 0.009523920714855194, 'timestamp': '2025-09-30 22:10:24.203005', 'step': 408, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:24.258085', 'step': 408, 'epoch': 1} +{'type': 'loss', 'content': 0.023020844906568527, 'timestamp': '2025-09-30 22:10:24.261501', 'step': 409, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:24.321430', 'step': 409, 'epoch': 1} +{'type': 'loss', 'content': 0.020535219460725784, 'timestamp': '2025-09-30 22:10:24.324678', 'step': 410, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:24.384770', 'step': 410, 'epoch': 1} +{'type': 'loss', 'content': 0.01701906882226467, 'timestamp': '2025-09-30 22:10:24.391280', 'step': 411, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:24.449399', 'step': 411, 'epoch': 1} +{'type': 'loss', 'content': 0.03201786056160927, 'timestamp': '2025-09-30 22:10:24.457737', 'step': 412, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:24.512109', 'step': 412, 'epoch': 1} +{'type': 'loss', 'content': 0.011379254050552845, 'timestamp': '2025-09-30 22:10:24.515992', 'step': 413, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:24.584178', 'step': 413, 'epoch': 1} +{'type': 'loss', 'content': 0.03204244002699852, 'timestamp': '2025-09-30 22:10:24.589115', 'step': 414, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:24.645428', 'step': 414, 'epoch': 1} +{'type': 'loss', 'content': 0.035000745207071304, 'timestamp': '2025-09-30 22:10:24.648528', 'step': 415, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:24.708676', 'step': 415, 'epoch': 1} +{'type': 'loss', 'content': 0.011228829622268677, 'timestamp': '2025-09-30 22:10:24.715697', 'step': 416, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:24.774441', 'step': 416, 'epoch': 1} +{'type': 'loss', 'content': 0.015902062878012657, 'timestamp': '2025-09-30 22:10:24.778489', 'step': 417, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:24.837328', 'step': 417, 'epoch': 1} +{'type': 'loss', 'content': 0.029384631663560867, 'timestamp': '2025-09-30 22:10:24.839269', 'step': 418, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:24.893599', 'step': 418, 'epoch': 1} +{'type': 'loss', 'content': 0.022646890953183174, 'timestamp': '2025-09-30 22:10:24.897402', 'step': 419, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:24.953933', 'step': 419, 'epoch': 1} +{'type': 'loss', 'content': 0.015414858236908913, 'timestamp': '2025-09-30 22:10:24.960738', 'step': 420, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:25.018909', 'step': 420, 'epoch': 1} +{'type': 'loss', 'content': 0.015387741848826408, 'timestamp': '2025-09-30 22:10:25.021339', 'step': 421, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:25.076031', 'step': 421, 'epoch': 1} +{'type': 'loss', 'content': 0.016175920143723488, 'timestamp': '2025-09-30 22:10:25.078361', 'step': 422, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:25.140468', 'step': 422, 'epoch': 1} +{'type': 'loss', 'content': 0.020924212411046028, 'timestamp': '2025-09-30 22:10:25.143111', 'step': 423, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:25.198775', 'step': 423, 'epoch': 1} +{'type': 'loss', 'content': 0.015957873314619064, 'timestamp': '2025-09-30 22:10:25.204587', 'step': 424, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:25.257981', 'step': 424, 'epoch': 1} +{'type': 'loss', 'content': 0.018746186047792435, 'timestamp': '2025-09-30 22:10:25.260168', 'step': 425, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:25.317623', 'step': 425, 'epoch': 1} +{'type': 'loss', 'content': 0.028188351541757584, 'timestamp': '2025-09-30 22:10:25.319833', 'step': 426, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:25.374649', 'step': 426, 'epoch': 1} +{'type': 'loss', 'content': 0.015842631459236145, 'timestamp': '2025-09-30 22:10:25.377221', 'step': 427, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:25.436214', 'step': 427, 'epoch': 1} +{'type': 'loss', 'content': 0.016134170815348625, 'timestamp': '2025-09-30 22:10:25.442167', 'step': 428, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:25.498915', 'step': 428, 'epoch': 1} +{'type': 'loss', 'content': 0.040546927601099014, 'timestamp': '2025-09-30 22:10:25.500908', 'step': 429, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:25.561078', 'step': 429, 'epoch': 1} +{'type': 'loss', 'content': 0.016925180330872536, 'timestamp': '2025-09-30 22:10:25.565189', 'step': 430, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:25.620787', 'step': 430, 'epoch': 1} +{'type': 'loss', 'content': 0.01041901670396328, 'timestamp': '2025-09-30 22:10:25.627804', 'step': 431, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:25.688574', 'step': 431, 'epoch': 1} +{'type': 'loss', 'content': 0.01679927296936512, 'timestamp': '2025-09-30 22:10:25.694911', 'step': 432, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:25.754989', 'step': 432, 'epoch': 1} +{'type': 'loss', 'content': 0.01948891021311283, 'timestamp': '2025-09-30 22:10:25.758013', 'step': 433, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:10:25.813575', 'step': 433, 'epoch': 1} +{'type': 'loss', 'content': 0.026692839339375496, 'timestamp': '2025-09-30 22:10:25.820015', 'step': 434, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:25.876179', 'step': 434, 'epoch': 1} +{'type': 'loss', 'content': 0.021725250408053398, 'timestamp': '2025-09-30 22:10:25.880139', 'step': 435, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:25.934999', 'step': 435, 'epoch': 1} +{'type': 'loss', 'content': 0.016403162851929665, 'timestamp': '2025-09-30 22:10:25.953512', 'step': 436, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:26.011910', 'step': 436, 'epoch': 1} +{'type': 'loss', 'content': 0.02581772208213806, 'timestamp': '2025-09-30 22:10:26.019119', 'step': 437, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:26.077043', 'step': 437, 'epoch': 1} +{'type': 'loss', 'content': 0.023395583033561707, 'timestamp': '2025-09-30 22:10:26.081776', 'step': 438, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:26.138996', 'step': 438, 'epoch': 1} +{'type': 'loss', 'content': 0.03953873738646507, 'timestamp': '2025-09-30 22:10:26.142696', 'step': 439, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:26.198275', 'step': 439, 'epoch': 1} +{'type': 'loss', 'content': 0.018297594040632248, 'timestamp': '2025-09-30 22:10:26.206228', 'step': 440, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:26.262605', 'step': 440, 'epoch': 1} +{'type': 'loss', 'content': 0.030052199959754944, 'timestamp': '2025-09-30 22:10:26.266573', 'step': 441, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:26.326102', 'step': 441, 'epoch': 1} +{'type': 'loss', 'content': 0.029010707512497902, 'timestamp': '2025-09-30 22:10:26.329939', 'step': 442, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:26.392124', 'step': 442, 'epoch': 1} +{'type': 'loss', 'content': 0.013197534717619419, 'timestamp': '2025-09-30 22:10:26.395865', 'step': 443, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:26.455099', 'step': 443, 'epoch': 1} +{'type': 'loss', 'content': 0.03683071583509445, 'timestamp': '2025-09-30 22:10:26.462646', 'step': 444, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:26.518023', 'step': 444, 'epoch': 1} +{'type': 'loss', 'content': 0.01791354827582836, 'timestamp': '2025-09-30 22:10:26.522558', 'step': 445, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:26.580204', 'step': 445, 'epoch': 1} +{'type': 'loss', 'content': 0.01698167435824871, 'timestamp': '2025-09-30 22:10:26.584622', 'step': 446, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:26.644908', 'step': 446, 'epoch': 1} +{'type': 'loss', 'content': 0.012840881943702698, 'timestamp': '2025-09-30 22:10:26.649589', 'step': 447, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:26.706363', 'step': 447, 'epoch': 1} +{'type': 'loss', 'content': 0.021327754482626915, 'timestamp': '2025-09-30 22:10:26.712053', 'step': 448, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:26.770017', 'step': 448, 'epoch': 1} +{'type': 'loss', 'content': 0.012066961266100407, 'timestamp': '2025-09-30 22:10:26.775318', 'step': 449, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:26.837781', 'step': 449, 'epoch': 1} +{'type': 'loss', 'content': 0.028330016881227493, 'timestamp': '2025-09-30 22:10:26.846404', 'step': 450, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:26.909229', 'step': 450, 'epoch': 1} +{'type': 'loss', 'content': 0.023282703012228012, 'timestamp': '2025-09-30 22:10:26.912219', 'step': 451, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:26.966278', 'step': 451, 'epoch': 1} +{'type': 'loss', 'content': 0.010071339085698128, 'timestamp': '2025-09-30 22:10:26.976341', 'step': 452, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:27.043117', 'step': 452, 'epoch': 1} +{'type': 'loss', 'content': 0.01836954988539219, 'timestamp': '2025-09-30 22:10:27.046716', 'step': 453, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:10:27.101355', 'step': 453, 'epoch': 1} +{'type': 'loss', 'content': 0.025107625871896744, 'timestamp': '2025-09-30 22:10:27.103861', 'step': 454, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:27.166366', 'step': 454, 'epoch': 1} +{'type': 'loss', 'content': 0.02213280089199543, 'timestamp': '2025-09-30 22:10:27.172278', 'step': 455, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:27.230762', 'step': 455, 'epoch': 1} +{'type': 'loss', 'content': 0.012315492145717144, 'timestamp': '2025-09-30 22:10:27.239089', 'step': 456, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:10:28.975277', 'step': 456, 'epoch': 1} +{'type': 'pplx', 'content': 43492757.85152588, 'timestamp': '2025-09-30 22:10:28.977567', 'step': 456, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.039128', 'step': 456, 'epoch': 1} +{'type': 'loss', 'content': 0.02141198329627514, 'timestamp': '2025-09-30 22:10:29.048866', 'step': 457, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:29.110780', 'step': 457, 'epoch': 1} +{'type': 'loss', 'content': 0.0336587019264698, 'timestamp': '2025-09-30 22:10:29.115494', 'step': 458, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.173645', 'step': 458, 'epoch': 1} +{'type': 'loss', 'content': 0.014187236316502094, 'timestamp': '2025-09-30 22:10:29.180707', 'step': 459, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:29.240772', 'step': 459, 'epoch': 1} +{'type': 'loss', 'content': 0.026401251554489136, 'timestamp': '2025-09-30 22:10:29.247468', 'step': 460, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.310695', 'step': 460, 'epoch': 1} +{'type': 'loss', 'content': 0.020034367218613625, 'timestamp': '2025-09-30 22:10:29.317981', 'step': 461, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.376268', 'step': 461, 'epoch': 1} +{'type': 'loss', 'content': 0.011318235658109188, 'timestamp': '2025-09-30 22:10:29.382341', 'step': 462, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.443870', 'step': 462, 'epoch': 1} +{'type': 'loss', 'content': 0.009131813421845436, 'timestamp': '2025-09-30 22:10:29.452675', 'step': 463, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.513953', 'step': 463, 'epoch': 1} +{'type': 'loss', 'content': 0.020363813266158104, 'timestamp': '2025-09-30 22:10:29.520672', 'step': 464, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.585071', 'step': 464, 'epoch': 1} +{'type': 'loss', 'content': 0.019311266019940376, 'timestamp': '2025-09-30 22:10:29.587763', 'step': 465, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.642703', 'step': 465, 'epoch': 1} +{'type': 'loss', 'content': 0.01495366357266903, 'timestamp': '2025-09-30 22:10:29.645362', 'step': 466, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.709787', 'step': 466, 'epoch': 1} +{'type': 'loss', 'content': 0.023592934012413025, 'timestamp': '2025-09-30 22:10:29.715529', 'step': 467, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.776645', 'step': 467, 'epoch': 1} +{'type': 'loss', 'content': 0.027076885104179382, 'timestamp': '2025-09-30 22:10:29.785682', 'step': 468, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.843777', 'step': 468, 'epoch': 1} +{'type': 'loss', 'content': 0.02357049658894539, 'timestamp': '2025-09-30 22:10:29.849824', 'step': 469, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.915063', 'step': 469, 'epoch': 1} +{'type': 'loss', 'content': 0.02975117228925228, 'timestamp': '2025-09-30 22:10:29.917443', 'step': 470, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:29.974919', 'step': 470, 'epoch': 1} +{'type': 'loss', 'content': 0.0073095522820949554, 'timestamp': '2025-09-30 22:10:29.981202', 'step': 471, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:30.042258', 'step': 471, 'epoch': 1} +{'type': 'loss', 'content': 0.01751861535012722, 'timestamp': '2025-09-30 22:10:30.048005', 'step': 472, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:30.104524', 'step': 472, 'epoch': 1} +{'type': 'loss', 'content': 0.017867306247353554, 'timestamp': '2025-09-30 22:10:30.107760', 'step': 473, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:30.163190', 'step': 473, 'epoch': 1} +{'type': 'loss', 'content': 0.018450839444994926, 'timestamp': '2025-09-30 22:10:30.165609', 'step': 474, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:30.243448', 'step': 474, 'epoch': 1} +{'type': 'loss', 'content': 0.011997430585324764, 'timestamp': '2025-09-30 22:10:30.245751', 'step': 475, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:30.303755', 'step': 475, 'epoch': 1} +{'type': 'loss', 'content': 0.012514320202171803, 'timestamp': '2025-09-30 22:10:30.310204', 'step': 476, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:30.369301', 'step': 476, 'epoch': 1} +{'type': 'loss', 'content': 0.017777670174837112, 'timestamp': '2025-09-30 22:10:30.371917', 'step': 477, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:30.429023', 'step': 477, 'epoch': 1} +{'type': 'loss', 'content': 0.01758970133960247, 'timestamp': '2025-09-30 22:10:30.435678', 'step': 478, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:30.492913', 'step': 478, 'epoch': 1} +{'type': 'loss', 'content': 0.01861104555428028, 'timestamp': '2025-09-30 22:10:30.495572', 'step': 479, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:30.558831', 'step': 479, 'epoch': 1} +{'type': 'loss', 'content': 0.012014762498438358, 'timestamp': '2025-09-30 22:10:30.565157', 'step': 480, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:30.623237', 'step': 480, 'epoch': 1} +{'type': 'loss', 'content': 0.022567814216017723, 'timestamp': '2025-09-30 22:10:30.632032', 'step': 481, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:30.696343', 'step': 481, 'epoch': 1} +{'type': 'loss', 'content': 0.02221139892935753, 'timestamp': '2025-09-30 22:10:30.700014', 'step': 482, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:30.755962', 'step': 482, 'epoch': 1} +{'type': 'loss', 'content': 0.014111381955444813, 'timestamp': '2025-09-30 22:10:30.767462', 'step': 483, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:30.829136', 'step': 483, 'epoch': 1} +{'type': 'loss', 'content': 0.01386257540434599, 'timestamp': '2025-09-30 22:10:30.844391', 'step': 484, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:30.905039', 'step': 484, 'epoch': 1} +{'type': 'loss', 'content': 0.019835516810417175, 'timestamp': '2025-09-30 22:10:30.914402', 'step': 485, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:30.979295', 'step': 485, 'epoch': 1} +{'type': 'loss', 'content': 0.015690794214606285, 'timestamp': '2025-09-30 22:10:30.984650', 'step': 486, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:31.042843', 'step': 486, 'epoch': 1} +{'type': 'loss', 'content': 0.028532350435853004, 'timestamp': '2025-09-30 22:10:31.045845', 'step': 487, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:31.101781', 'step': 487, 'epoch': 1} +{'type': 'loss', 'content': 0.024972129613161087, 'timestamp': '2025-09-30 22:10:31.112277', 'step': 488, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:31.171834', 'step': 488, 'epoch': 1} +{'type': 'loss', 'content': 0.018831370398402214, 'timestamp': '2025-09-30 22:10:31.174381', 'step': 489, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:31.230777', 'step': 489, 'epoch': 1} +{'type': 'loss', 'content': 0.014749753288924694, 'timestamp': '2025-09-30 22:10:31.233694', 'step': 490, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:31.289017', 'step': 490, 'epoch': 1} +{'type': 'loss', 'content': 0.013099046424031258, 'timestamp': '2025-09-30 22:10:31.298818', 'step': 491, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:31.370536', 'step': 491, 'epoch': 1} +{'type': 'loss', 'content': 0.009160849265754223, 'timestamp': '2025-09-30 22:10:31.377012', 'step': 492, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:31.442153', 'step': 492, 'epoch': 1} +{'type': 'loss', 'content': 0.016461042687296867, 'timestamp': '2025-09-30 22:10:31.451114', 'step': 493, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:31.512076', 'step': 493, 'epoch': 1} +{'type': 'loss', 'content': 0.043794337660074234, 'timestamp': '2025-09-30 22:10:31.515396', 'step': 494, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:31.573231', 'step': 494, 'epoch': 1} +{'type': 'loss', 'content': 0.022364290431141853, 'timestamp': '2025-09-30 22:10:31.582922', 'step': 495, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:31.640558', 'step': 495, 'epoch': 1} +{'type': 'loss', 'content': 0.021926935762166977, 'timestamp': '2025-09-30 22:10:31.647384', 'step': 496, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:31.707818', 'step': 496, 'epoch': 1} +{'type': 'loss', 'content': 0.00889673549681902, 'timestamp': '2025-09-30 22:10:31.709679', 'step': 497, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:31.796642', 'step': 497, 'epoch': 1} +{'type': 'loss', 'content': 0.01593521051108837, 'timestamp': '2025-09-30 22:10:31.799398', 'step': 498, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:31.855056', 'step': 498, 'epoch': 1} +{'type': 'loss', 'content': 0.01839728094637394, 'timestamp': '2025-09-30 22:10:31.872085', 'step': 499, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:31.931769', 'step': 499, 'epoch': 1} +{'type': 'loss', 'content': 0.010415417142212391, 'timestamp': '2025-09-30 22:10:31.944103', 'step': 500, 'epoch': 1} +{'type': 'info', 'content': 'Checkpoint saved at step 500', 'timestamp': '2025-09-30 22:10:32.385970', 'step': 500, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:32.457155', 'step': 500, 'epoch': 1} +{'type': 'loss', 'content': 0.009094706736505032, 'timestamp': '2025-09-30 22:10:32.460338', 'step': 501, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:32.530121', 'step': 501, 'epoch': 1} +{'type': 'loss', 'content': 0.021521523594856262, 'timestamp': '2025-09-30 22:10:32.534049', 'step': 502, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:32.597198', 'step': 502, 'epoch': 1} +{'type': 'loss', 'content': 0.01767658442258835, 'timestamp': '2025-09-30 22:10:32.600463', 'step': 503, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:32.662752', 'step': 503, 'epoch': 1} +{'type': 'loss', 'content': 0.01816992275416851, 'timestamp': '2025-09-30 22:10:32.670715', 'step': 504, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:32.744440', 'step': 504, 'epoch': 1} +{'type': 'loss', 'content': 0.005588170140981674, 'timestamp': '2025-09-30 22:10:32.747345', 'step': 505, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:32.802870', 'step': 505, 'epoch': 1} +{'type': 'loss', 'content': 0.04537493363022804, 'timestamp': '2025-09-30 22:10:32.805853', 'step': 506, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:32.886649', 'step': 506, 'epoch': 1} +{'type': 'loss', 'content': 0.028734682127833366, 'timestamp': '2025-09-30 22:10:32.891721', 'step': 507, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:32.961030', 'step': 507, 'epoch': 1} +{'type': 'loss', 'content': 0.030345702543854713, 'timestamp': '2025-09-30 22:10:32.973246', 'step': 508, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:33.039773', 'step': 508, 'epoch': 1} +{'type': 'loss', 'content': 0.008057617582380772, 'timestamp': '2025-09-30 22:10:33.042933', 'step': 509, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:33.100278', 'step': 509, 'epoch': 1} +{'type': 'loss', 'content': 0.013696548528969288, 'timestamp': '2025-09-30 22:10:33.102771', 'step': 510, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:33.162158', 'step': 510, 'epoch': 1} +{'type': 'loss', 'content': 0.024826964363455772, 'timestamp': '2025-09-30 22:10:33.166225', 'step': 511, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:33.224101', 'step': 511, 'epoch': 1} +{'type': 'loss', 'content': 0.00507523724809289, 'timestamp': '2025-09-30 22:10:33.231387', 'step': 512, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:33.287069', 'step': 512, 'epoch': 1} +{'type': 'loss', 'content': 0.019475262612104416, 'timestamp': '2025-09-30 22:10:33.292055', 'step': 513, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:10:34.768428', 'step': 513, 'epoch': 1} +{'type': 'pplx', 'content': 46595227.32665869, 'timestamp': '2025-09-30 22:10:34.775585', 'step': 513, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:34.833990', 'step': 513, 'epoch': 1} +{'type': 'loss', 'content': 0.007841977290809155, 'timestamp': '2025-09-30 22:10:34.845351', 'step': 514, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:34.908910', 'step': 514, 'epoch': 1} +{'type': 'loss', 'content': 0.02670980803668499, 'timestamp': '2025-09-30 22:10:34.911739', 'step': 515, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:34.971584', 'step': 515, 'epoch': 1} +{'type': 'loss', 'content': 0.017184732481837273, 'timestamp': '2025-09-30 22:10:34.984326', 'step': 516, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:35.047781', 'step': 516, 'epoch': 1} +{'type': 'loss', 'content': 0.0331471748650074, 'timestamp': '2025-09-30 22:10:35.055702', 'step': 517, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:35.112052', 'step': 517, 'epoch': 1} +{'type': 'loss', 'content': 0.026939330622553825, 'timestamp': '2025-09-30 22:10:35.114808', 'step': 518, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:35.171714', 'step': 518, 'epoch': 1} +{'type': 'loss', 'content': 0.02031738869845867, 'timestamp': '2025-09-30 22:10:35.181994', 'step': 519, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:35.247523', 'step': 519, 'epoch': 1} +{'type': 'loss', 'content': 0.009875715710222721, 'timestamp': '2025-09-30 22:10:35.260487', 'step': 520, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:35.326048', 'step': 520, 'epoch': 1} +{'type': 'loss', 'content': 0.022956620901823044, 'timestamp': '2025-09-30 22:10:35.329588', 'step': 521, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:35.405846', 'step': 521, 'epoch': 1} +{'type': 'loss', 'content': 0.02863324247300625, 'timestamp': '2025-09-30 22:10:35.415191', 'step': 522, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:35.480290', 'step': 522, 'epoch': 1} +{'type': 'loss', 'content': 0.010198934003710747, 'timestamp': '2025-09-30 22:10:35.488332', 'step': 523, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:35.554304', 'step': 523, 'epoch': 1} +{'type': 'loss', 'content': 0.006696059834212065, 'timestamp': '2025-09-30 22:10:35.561717', 'step': 524, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:35.625076', 'step': 524, 'epoch': 1} +{'type': 'loss', 'content': 0.014501352794468403, 'timestamp': '2025-09-30 22:10:35.629654', 'step': 525, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:35.686952', 'step': 525, 'epoch': 1} +{'type': 'loss', 'content': 0.024316150695085526, 'timestamp': '2025-09-30 22:10:35.689892', 'step': 526, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:35.746425', 'step': 526, 'epoch': 1} +{'type': 'loss', 'content': 0.014118111692368984, 'timestamp': '2025-09-30 22:10:35.750028', 'step': 527, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:35.817322', 'step': 527, 'epoch': 1} +{'type': 'loss', 'content': 0.013456260785460472, 'timestamp': '2025-09-30 22:10:35.829823', 'step': 528, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:35.887982', 'step': 528, 'epoch': 1} +{'type': 'loss', 'content': 0.010260426439344883, 'timestamp': '2025-09-30 22:10:35.895282', 'step': 529, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:35.958752', 'step': 529, 'epoch': 1} +{'type': 'loss', 'content': 0.01042400486767292, 'timestamp': '2025-09-30 22:10:35.962803', 'step': 530, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:36.028568', 'step': 530, 'epoch': 1} +{'type': 'loss', 'content': 0.027442054823040962, 'timestamp': '2025-09-30 22:10:36.031254', 'step': 531, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:36.092057', 'step': 531, 'epoch': 1} +{'type': 'loss', 'content': 0.022459326311945915, 'timestamp': '2025-09-30 22:10:36.104947', 'step': 532, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:36.168237', 'step': 532, 'epoch': 1} +{'type': 'loss', 'content': 0.02310291863977909, 'timestamp': '2025-09-30 22:10:36.171118', 'step': 533, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:36.227533', 'step': 533, 'epoch': 1} +{'type': 'loss', 'content': 0.02492346242070198, 'timestamp': '2025-09-30 22:10:36.233507', 'step': 534, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:36.294215', 'step': 534, 'epoch': 1} +{'type': 'loss', 'content': 0.01682530902326107, 'timestamp': '2025-09-30 22:10:36.299767', 'step': 535, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:36.361267', 'step': 535, 'epoch': 1} +{'type': 'loss', 'content': 0.007360723335295916, 'timestamp': '2025-09-30 22:10:36.367897', 'step': 536, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:36.423199', 'step': 536, 'epoch': 1} +{'type': 'loss', 'content': 0.03802620619535446, 'timestamp': '2025-09-30 22:10:36.426126', 'step': 537, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:36.486427', 'step': 537, 'epoch': 1} +{'type': 'loss', 'content': 0.014312833547592163, 'timestamp': '2025-09-30 22:10:36.491959', 'step': 538, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:36.552805', 'step': 538, 'epoch': 1} +{'type': 'loss', 'content': 0.04087325558066368, 'timestamp': '2025-09-30 22:10:36.559559', 'step': 539, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:36.628565', 'step': 539, 'epoch': 1} +{'type': 'loss', 'content': 0.015957888215780258, 'timestamp': '2025-09-30 22:10:36.638319', 'step': 540, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:36.697802', 'step': 540, 'epoch': 1} +{'type': 'loss', 'content': 0.02199072763323784, 'timestamp': '2025-09-30 22:10:36.707124', 'step': 541, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:36.770624', 'step': 541, 'epoch': 1} +{'type': 'loss', 'content': 0.018141640350222588, 'timestamp': '2025-09-30 22:10:36.774492', 'step': 542, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:36.840242', 'step': 542, 'epoch': 1} +{'type': 'loss', 'content': 0.011592322029173374, 'timestamp': '2025-09-30 22:10:36.843581', 'step': 543, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:36.909312', 'step': 543, 'epoch': 1} +{'type': 'loss', 'content': 0.023553509265184402, 'timestamp': '2025-09-30 22:10:36.920894', 'step': 544, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:36.983571', 'step': 544, 'epoch': 1} +{'type': 'loss', 'content': 0.025153178721666336, 'timestamp': '2025-09-30 22:10:36.990509', 'step': 545, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:37.055055', 'step': 545, 'epoch': 1} +{'type': 'loss', 'content': 0.024465525522828102, 'timestamp': '2025-09-30 22:10:37.061105', 'step': 546, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:37.122633', 'step': 546, 'epoch': 1} +{'type': 'loss', 'content': 0.036670148372650146, 'timestamp': '2025-09-30 22:10:37.129338', 'step': 547, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:37.190024', 'step': 547, 'epoch': 1} +{'type': 'loss', 'content': 0.022738128900527954, 'timestamp': '2025-09-30 22:10:37.197170', 'step': 548, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:37.258684', 'step': 548, 'epoch': 1} +{'type': 'loss', 'content': 0.03440006449818611, 'timestamp': '2025-09-30 22:10:37.265618', 'step': 549, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:37.323252', 'step': 549, 'epoch': 1} +{'type': 'loss', 'content': 0.030233899131417274, 'timestamp': '2025-09-30 22:10:37.326686', 'step': 550, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:37.395601', 'step': 550, 'epoch': 1} +{'type': 'loss', 'content': 0.021326003596186638, 'timestamp': '2025-09-30 22:10:37.399142', 'step': 551, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:37.460428', 'step': 551, 'epoch': 1} +{'type': 'loss', 'content': 0.03364909067749977, 'timestamp': '2025-09-30 22:10:37.470963', 'step': 552, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:37.532526', 'step': 552, 'epoch': 1} +{'type': 'loss', 'content': 0.027066290378570557, 'timestamp': '2025-09-30 22:10:37.538400', 'step': 553, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:37.605612', 'step': 553, 'epoch': 1} +{'type': 'loss', 'content': 0.02852269820868969, 'timestamp': '2025-09-30 22:10:37.615557', 'step': 554, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:37.696930', 'step': 554, 'epoch': 1} +{'type': 'loss', 'content': 0.009644846431910992, 'timestamp': '2025-09-30 22:10:37.706231', 'step': 555, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:37.782232', 'step': 555, 'epoch': 1} +{'type': 'loss', 'content': 0.024163635447621346, 'timestamp': '2025-09-30 22:10:37.791433', 'step': 556, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:37.848943', 'step': 556, 'epoch': 1} +{'type': 'loss', 'content': 0.017797913402318954, 'timestamp': '2025-09-30 22:10:37.851761', 'step': 557, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:37.911890', 'step': 557, 'epoch': 1} +{'type': 'loss', 'content': 0.015675710514187813, 'timestamp': '2025-09-30 22:10:37.914217', 'step': 558, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:37.983298', 'step': 558, 'epoch': 1} +{'type': 'loss', 'content': 0.017431657761335373, 'timestamp': '2025-09-30 22:10:37.987332', 'step': 559, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:38.065562', 'step': 559, 'epoch': 1} +{'type': 'loss', 'content': 0.011734207160770893, 'timestamp': '2025-09-30 22:10:38.072800', 'step': 560, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:38.128930', 'step': 560, 'epoch': 1} +{'type': 'loss', 'content': 0.016993921250104904, 'timestamp': '2025-09-30 22:10:38.131406', 'step': 561, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:38.190303', 'step': 561, 'epoch': 1} +{'type': 'loss', 'content': 0.019728850573301315, 'timestamp': '2025-09-30 22:10:38.193454', 'step': 562, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:38.261817', 'step': 562, 'epoch': 1} +{'type': 'loss', 'content': 0.02789020538330078, 'timestamp': '2025-09-30 22:10:38.274576', 'step': 563, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:38.332263', 'step': 563, 'epoch': 1} +{'type': 'loss', 'content': 0.030427774414420128, 'timestamp': '2025-09-30 22:10:38.339142', 'step': 564, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:38.396018', 'step': 564, 'epoch': 1} +{'type': 'loss', 'content': 0.02213878743350506, 'timestamp': '2025-09-30 22:10:38.409641', 'step': 565, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:38.472316', 'step': 565, 'epoch': 1} +{'type': 'loss', 'content': 0.019610082730650902, 'timestamp': '2025-09-30 22:10:38.474674', 'step': 566, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:38.533261', 'step': 566, 'epoch': 1} +{'type': 'loss', 'content': 0.01741156354546547, 'timestamp': '2025-09-30 22:10:38.540039', 'step': 567, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:10:38.601991', 'step': 567, 'epoch': 1} +{'type': 'loss', 'content': 0.022262096405029297, 'timestamp': '2025-09-30 22:10:38.609836', 'step': 568, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:38.667090', 'step': 568, 'epoch': 1} +{'type': 'loss', 'content': 0.024254482239484787, 'timestamp': '2025-09-30 22:10:38.671827', 'step': 569, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:38.733470', 'step': 569, 'epoch': 1} +{'type': 'loss', 'content': 0.017775481566786766, 'timestamp': '2025-09-30 22:10:38.736235', 'step': 570, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:10:40.258822', 'step': 570, 'epoch': 1} +{'type': 'pplx', 'content': 47270191.72182987, 'timestamp': '2025-09-30 22:10:40.261088', 'step': 570, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:40.317993', 'step': 570, 'epoch': 1} +{'type': 'loss', 'content': 0.012138426303863525, 'timestamp': '2025-09-30 22:10:40.320234', 'step': 571, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:40.376160', 'step': 571, 'epoch': 1} +{'type': 'loss', 'content': 0.011941500008106232, 'timestamp': '2025-09-30 22:10:40.382939', 'step': 572, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:40.458446', 'step': 572, 'epoch': 1} +{'type': 'loss', 'content': 0.045111771672964096, 'timestamp': '2025-09-30 22:10:40.468771', 'step': 573, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:40.533176', 'step': 573, 'epoch': 1} +{'type': 'loss', 'content': 0.026581604033708572, 'timestamp': '2025-09-30 22:10:40.535322', 'step': 574, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:40.592110', 'step': 574, 'epoch': 1} +{'type': 'loss', 'content': 0.017682049423456192, 'timestamp': '2025-09-30 22:10:40.594331', 'step': 575, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:40.650481', 'step': 575, 'epoch': 1} +{'type': 'loss', 'content': 0.010170449502766132, 'timestamp': '2025-09-30 22:10:40.657729', 'step': 576, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:40.715792', 'step': 576, 'epoch': 1} +{'type': 'loss', 'content': 0.02111336775124073, 'timestamp': '2025-09-30 22:10:40.718679', 'step': 577, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:10:40.782680', 'step': 577, 'epoch': 1} +{'type': 'loss', 'content': 0.026711730286478996, 'timestamp': '2025-09-30 22:10:40.787449', 'step': 578, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:40.854518', 'step': 578, 'epoch': 1} +{'type': 'loss', 'content': 0.0173348356038332, 'timestamp': '2025-09-30 22:10:40.857175', 'step': 579, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:40.915291', 'step': 579, 'epoch': 1} +{'type': 'loss', 'content': 0.015113255940377712, 'timestamp': '2025-09-30 22:10:40.926331', 'step': 580, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:40.990132', 'step': 580, 'epoch': 1} +{'type': 'loss', 'content': 0.014600790105760098, 'timestamp': '2025-09-30 22:10:40.997747', 'step': 581, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:41.073398', 'step': 581, 'epoch': 1} +{'type': 'loss', 'content': 0.007143207360059023, 'timestamp': '2025-09-30 22:10:41.076344', 'step': 582, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:41.138532', 'step': 582, 'epoch': 1} +{'type': 'loss', 'content': 0.020687399432063103, 'timestamp': '2025-09-30 22:10:41.141022', 'step': 583, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:41.201736', 'step': 583, 'epoch': 1} +{'type': 'loss', 'content': 0.01433294266462326, 'timestamp': '2025-09-30 22:10:41.213194', 'step': 584, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:41.273627', 'step': 584, 'epoch': 1} +{'type': 'loss', 'content': 0.016348710283637047, 'timestamp': '2025-09-30 22:10:41.280786', 'step': 585, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:41.342328', 'step': 585, 'epoch': 1} +{'type': 'loss', 'content': 0.0035052443854510784, 'timestamp': '2025-09-30 22:10:41.347937', 'step': 586, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:41.414579', 'step': 586, 'epoch': 1} +{'type': 'loss', 'content': 0.002743300748988986, 'timestamp': '2025-09-30 22:10:41.416873', 'step': 587, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:41.472859', 'step': 587, 'epoch': 1} +{'type': 'loss', 'content': 0.047513365745544434, 'timestamp': '2025-09-30 22:10:41.480545', 'step': 588, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:41.546176', 'step': 588, 'epoch': 1} +{'type': 'loss', 'content': 0.040245767682790756, 'timestamp': '2025-09-30 22:10:41.549512', 'step': 589, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:41.609742', 'step': 589, 'epoch': 1} +{'type': 'loss', 'content': 0.02579033002257347, 'timestamp': '2025-09-30 22:10:41.612831', 'step': 590, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-30 22:10:41.696774', 'step': 590, 'epoch': 1} +{'type': 'loss', 'content': 0.0405263788998127, 'timestamp': '2025-09-30 22:10:41.700437', 'step': 591, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:41.757185', 'step': 591, 'epoch': 1} +{'type': 'loss', 'content': 0.021822741255164146, 'timestamp': '2025-09-30 22:10:41.764096', 'step': 592, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:41.820886', 'step': 592, 'epoch': 1} +{'type': 'loss', 'content': 0.027860483154654503, 'timestamp': '2025-09-30 22:10:41.828222', 'step': 593, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:41.885937', 'step': 593, 'epoch': 1} +{'type': 'loss', 'content': 0.004523556213825941, 'timestamp': '2025-09-30 22:10:41.894777', 'step': 594, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:41.958205', 'step': 594, 'epoch': 1} +{'type': 'loss', 'content': 0.02394627407193184, 'timestamp': '2025-09-30 22:10:41.962740', 'step': 595, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:42.021860', 'step': 595, 'epoch': 1} +{'type': 'loss', 'content': 0.038400568068027496, 'timestamp': '2025-09-30 22:10:42.029446', 'step': 596, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:42.091560', 'step': 596, 'epoch': 1} +{'type': 'loss', 'content': 0.020109396427869797, 'timestamp': '2025-09-30 22:10:42.094174', 'step': 597, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:42.165711', 'step': 597, 'epoch': 1} +{'type': 'loss', 'content': 0.01921306923031807, 'timestamp': '2025-09-30 22:10:42.174038', 'step': 598, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:42.237330', 'step': 598, 'epoch': 1} +{'type': 'loss', 'content': 0.035011257976293564, 'timestamp': '2025-09-30 22:10:42.240295', 'step': 599, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:42.298368', 'step': 599, 'epoch': 1} +{'type': 'loss', 'content': 0.022402983158826828, 'timestamp': '2025-09-30 22:10:42.306110', 'step': 600, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:42.369255', 'step': 600, 'epoch': 1} +{'type': 'loss', 'content': 0.023800494149327278, 'timestamp': '2025-09-30 22:10:42.372114', 'step': 601, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:42.428097', 'step': 601, 'epoch': 1} +{'type': 'loss', 'content': 0.01819646917283535, 'timestamp': '2025-09-30 22:10:42.431036', 'step': 602, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:42.490355', 'step': 602, 'epoch': 1} +{'type': 'loss', 'content': 0.02224593423306942, 'timestamp': '2025-09-30 22:10:42.497290', 'step': 603, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:42.558386', 'step': 603, 'epoch': 1} +{'type': 'loss', 'content': 0.0203517097979784, 'timestamp': '2025-09-30 22:10:42.564695', 'step': 604, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:42.627576', 'step': 604, 'epoch': 1} +{'type': 'loss', 'content': 0.019523965194821358, 'timestamp': '2025-09-30 22:10:42.631531', 'step': 605, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:42.688275', 'step': 605, 'epoch': 1} +{'type': 'loss', 'content': 0.027153076604008675, 'timestamp': '2025-09-30 22:10:42.690498', 'step': 606, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:42.755120', 'step': 606, 'epoch': 1} +{'type': 'loss', 'content': 0.02785666286945343, 'timestamp': '2025-09-30 22:10:42.757460', 'step': 607, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:42.815434', 'step': 607, 'epoch': 1} +{'type': 'loss', 'content': 0.021869728341698647, 'timestamp': '2025-09-30 22:10:42.821966', 'step': 608, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:42.884795', 'step': 608, 'epoch': 1} +{'type': 'loss', 'content': 0.018342459574341774, 'timestamp': '2025-09-30 22:10:42.888293', 'step': 609, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:42.944945', 'step': 609, 'epoch': 1} +{'type': 'loss', 'content': 0.028766410425305367, 'timestamp': '2025-09-30 22:10:42.947387', 'step': 610, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.005852', 'step': 610, 'epoch': 1} +{'type': 'loss', 'content': 0.018140679225325584, 'timestamp': '2025-09-30 22:10:43.009253', 'step': 611, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.067906', 'step': 611, 'epoch': 1} +{'type': 'loss', 'content': 0.045255787670612335, 'timestamp': '2025-09-30 22:10:43.074222', 'step': 612, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.128319', 'step': 612, 'epoch': 1} +{'type': 'loss', 'content': 0.0265056025236845, 'timestamp': '2025-09-30 22:10:43.132320', 'step': 613, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.198083', 'step': 613, 'epoch': 1} +{'type': 'loss', 'content': 0.01379645336419344, 'timestamp': '2025-09-30 22:10:43.203353', 'step': 614, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.263000', 'step': 614, 'epoch': 1} +{'type': 'loss', 'content': 0.027471302077174187, 'timestamp': '2025-09-30 22:10:43.269960', 'step': 615, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.333110', 'step': 615, 'epoch': 1} +{'type': 'loss', 'content': 0.024327319115400314, 'timestamp': '2025-09-30 22:10:43.346979', 'step': 616, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:43.409256', 'step': 616, 'epoch': 1} +{'type': 'loss', 'content': 0.03087471053004265, 'timestamp': '2025-09-30 22:10:43.412560', 'step': 617, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.470023', 'step': 617, 'epoch': 1} +{'type': 'loss', 'content': 0.01593298651278019, 'timestamp': '2025-09-30 22:10:43.472721', 'step': 618, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.535159', 'step': 618, 'epoch': 1} +{'type': 'loss', 'content': 0.025733623653650284, 'timestamp': '2025-09-30 22:10:43.542048', 'step': 619, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.600543', 'step': 619, 'epoch': 1} +{'type': 'loss', 'content': 0.014233306981623173, 'timestamp': '2025-09-30 22:10:43.610569', 'step': 620, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:43.668833', 'step': 620, 'epoch': 1} +{'type': 'loss', 'content': 0.016759194433689117, 'timestamp': '2025-09-30 22:10:43.673266', 'step': 621, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:43.732562', 'step': 621, 'epoch': 1} +{'type': 'loss', 'content': 0.017879139631986618, 'timestamp': '2025-09-30 22:10:43.734884', 'step': 622, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.790516', 'step': 622, 'epoch': 1} +{'type': 'loss', 'content': 0.015650030225515366, 'timestamp': '2025-09-30 22:10:43.793825', 'step': 623, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.854359', 'step': 623, 'epoch': 1} +{'type': 'loss', 'content': 0.03968540579080582, 'timestamp': '2025-09-30 22:10:43.860624', 'step': 624, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.915520', 'step': 624, 'epoch': 1} +{'type': 'loss', 'content': 0.02014957368373871, 'timestamp': '2025-09-30 22:10:43.919115', 'step': 625, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:43.983591', 'step': 625, 'epoch': 1} +{'type': 'loss', 'content': 0.013079083524644375, 'timestamp': '2025-09-30 22:10:43.986020', 'step': 626, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:44.049977', 'step': 626, 'epoch': 1} +{'type': 'loss', 'content': 0.008377542719244957, 'timestamp': '2025-09-30 22:10:44.052699', 'step': 627, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:10:45.478375', 'step': 627, 'epoch': 1} +{'type': 'pplx', 'content': 36910385.758393474, 'timestamp': '2025-09-30 22:10:45.481360', 'step': 627, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:45.537084', 'step': 627, 'epoch': 1} +{'type': 'loss', 'content': 0.00950475875288248, 'timestamp': '2025-09-30 22:10:45.543599', 'step': 628, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:45.607483', 'step': 628, 'epoch': 1} +{'type': 'loss', 'content': 0.04431208595633507, 'timestamp': '2025-09-30 22:10:45.615877', 'step': 629, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:45.695439', 'step': 629, 'epoch': 1} +{'type': 'loss', 'content': 0.03406307473778725, 'timestamp': '2025-09-30 22:10:45.699071', 'step': 630, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:45.761839', 'step': 630, 'epoch': 1} +{'type': 'loss', 'content': 0.020670806989073753, 'timestamp': '2025-09-30 22:10:45.764255', 'step': 631, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:45.826547', 'step': 631, 'epoch': 1} +{'type': 'loss', 'content': 0.01893521286547184, 'timestamp': '2025-09-30 22:10:45.833509', 'step': 632, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:45.888363', 'step': 632, 'epoch': 1} +{'type': 'loss', 'content': 0.004280415363609791, 'timestamp': '2025-09-30 22:10:45.892234', 'step': 633, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:45.959896', 'step': 633, 'epoch': 1} +{'type': 'loss', 'content': 0.011709028854966164, 'timestamp': '2025-09-30 22:10:45.962642', 'step': 634, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:46.031711', 'step': 634, 'epoch': 1} +{'type': 'loss', 'content': 0.027526134625077248, 'timestamp': '2025-09-30 22:10:46.034267', 'step': 635, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:46.103348', 'step': 635, 'epoch': 1} +{'type': 'loss', 'content': 0.00565726263448596, 'timestamp': '2025-09-30 22:10:46.109439', 'step': 636, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:46.168041', 'step': 636, 'epoch': 1} +{'type': 'loss', 'content': 0.003934341017156839, 'timestamp': '2025-09-30 22:10:46.170929', 'step': 637, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:46.255593', 'step': 637, 'epoch': 1} +{'type': 'loss', 'content': 0.04277295991778374, 'timestamp': '2025-09-30 22:10:46.258249', 'step': 638, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:46.328094', 'step': 638, 'epoch': 1} +{'type': 'loss', 'content': 0.03677859529852867, 'timestamp': '2025-09-30 22:10:46.331919', 'step': 639, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:46.388579', 'step': 639, 'epoch': 1} +{'type': 'loss', 'content': 0.003129987744614482, 'timestamp': '2025-09-30 22:10:46.395424', 'step': 640, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:46.456612', 'step': 640, 'epoch': 1} +{'type': 'loss', 'content': 0.006956647150218487, 'timestamp': '2025-09-30 22:10:46.458826', 'step': 641, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:46.522331', 'step': 641, 'epoch': 1} +{'type': 'loss', 'content': 0.016979986801743507, 'timestamp': '2025-09-30 22:10:46.524475', 'step': 642, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:46.599315', 'step': 642, 'epoch': 1} +{'type': 'loss', 'content': 0.0076353359036147594, 'timestamp': '2025-09-30 22:10:46.602275', 'step': 643, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:46.662323', 'step': 643, 'epoch': 1} +{'type': 'loss', 'content': 0.0387401208281517, 'timestamp': '2025-09-30 22:10:46.668774', 'step': 644, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:46.750641', 'step': 644, 'epoch': 1} +{'type': 'loss', 'content': 0.011696984991431236, 'timestamp': '2025-09-30 22:10:46.752924', 'step': 645, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:46.809448', 'step': 645, 'epoch': 1} +{'type': 'loss', 'content': 0.03578943759202957, 'timestamp': '2025-09-30 22:10:46.812025', 'step': 646, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:46.883057', 'step': 646, 'epoch': 1} +{'type': 'loss', 'content': 0.007673552725464106, 'timestamp': '2025-09-30 22:10:46.885598', 'step': 647, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:46.947206', 'step': 647, 'epoch': 1} +{'type': 'loss', 'content': 0.0077004688791930676, 'timestamp': '2025-09-30 22:10:46.954082', 'step': 648, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:47.017350', 'step': 648, 'epoch': 1} +{'type': 'loss', 'content': 0.009061289019882679, 'timestamp': '2025-09-30 22:10:47.021631', 'step': 649, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:47.077628', 'step': 649, 'epoch': 1} +{'type': 'loss', 'content': 0.008708625100553036, 'timestamp': '2025-09-30 22:10:47.080916', 'step': 650, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:47.135851', 'step': 650, 'epoch': 1} +{'type': 'loss', 'content': 0.02237722836434841, 'timestamp': '2025-09-30 22:10:47.138706', 'step': 651, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:47.193613', 'step': 651, 'epoch': 1} +{'type': 'loss', 'content': 0.015081758610904217, 'timestamp': '2025-09-30 22:10:47.204993', 'step': 652, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:47.261802', 'step': 652, 'epoch': 1} +{'type': 'loss', 'content': 0.020020443946123123, 'timestamp': '2025-09-30 22:10:47.264706', 'step': 653, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:47.325696', 'step': 653, 'epoch': 1} +{'type': 'loss', 'content': 0.029474765062332153, 'timestamp': '2025-09-30 22:10:47.335406', 'step': 654, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:47.396809', 'step': 654, 'epoch': 1} +{'type': 'loss', 'content': 0.0116732781752944, 'timestamp': '2025-09-30 22:10:47.399639', 'step': 655, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:47.458257', 'step': 655, 'epoch': 1} +{'type': 'loss', 'content': 0.007577078882604837, 'timestamp': '2025-09-30 22:10:47.464479', 'step': 656, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:47.526732', 'step': 656, 'epoch': 1} +{'type': 'loss', 'content': 0.0052547636441886425, 'timestamp': '2025-09-30 22:10:47.529806', 'step': 657, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:47.586913', 'step': 657, 'epoch': 1} +{'type': 'loss', 'content': 0.018849587067961693, 'timestamp': '2025-09-30 22:10:47.593218', 'step': 658, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:47.660670', 'step': 658, 'epoch': 1} +{'type': 'loss', 'content': 0.009038467891514301, 'timestamp': '2025-09-30 22:10:47.665059', 'step': 659, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:47.724782', 'step': 659, 'epoch': 1} +{'type': 'loss', 'content': 0.01595907285809517, 'timestamp': '2025-09-30 22:10:47.732513', 'step': 660, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:47.789691', 'step': 660, 'epoch': 1} +{'type': 'loss', 'content': 0.013368090614676476, 'timestamp': '2025-09-30 22:10:47.791930', 'step': 661, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:47.850005', 'step': 661, 'epoch': 1} +{'type': 'loss', 'content': 0.006896801292896271, 'timestamp': '2025-09-30 22:10:47.854407', 'step': 662, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:47.916187', 'step': 662, 'epoch': 1} +{'type': 'loss', 'content': 0.01652236096560955, 'timestamp': '2025-09-30 22:10:47.918980', 'step': 663, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:47.979786', 'step': 663, 'epoch': 1} +{'type': 'loss', 'content': 0.024208780378103256, 'timestamp': '2025-09-30 22:10:47.995948', 'step': 664, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.049264', 'step': 664, 'epoch': 1} +{'type': 'loss', 'content': 0.01214011013507843, 'timestamp': '2025-09-30 22:10:48.051480', 'step': 665, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.105936', 'step': 665, 'epoch': 1} +{'type': 'loss', 'content': 0.00862074550241232, 'timestamp': '2025-09-30 22:10:48.109173', 'step': 666, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.163901', 'step': 666, 'epoch': 1} +{'type': 'loss', 'content': 0.015050886198878288, 'timestamp': '2025-09-30 22:10:48.167571', 'step': 667, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.222532', 'step': 667, 'epoch': 1} +{'type': 'loss', 'content': 0.013227096758782864, 'timestamp': '2025-09-30 22:10:48.228725', 'step': 668, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.283259', 'step': 668, 'epoch': 1} +{'type': 'loss', 'content': 0.038987185806035995, 'timestamp': '2025-09-30 22:10:48.288928', 'step': 669, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:48.351701', 'step': 669, 'epoch': 1} +{'type': 'loss', 'content': 0.04726878181099892, 'timestamp': '2025-09-30 22:10:48.354165', 'step': 670, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.410143', 'step': 670, 'epoch': 1} +{'type': 'loss', 'content': 0.008983231149613857, 'timestamp': '2025-09-30 22:10:48.413801', 'step': 671, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.468268', 'step': 671, 'epoch': 1} +{'type': 'loss', 'content': 0.018243545666337013, 'timestamp': '2025-09-30 22:10:48.475102', 'step': 672, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.531770', 'step': 672, 'epoch': 1} +{'type': 'loss', 'content': 0.02222895435988903, 'timestamp': '2025-09-30 22:10:48.536427', 'step': 673, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.596231', 'step': 673, 'epoch': 1} +{'type': 'loss', 'content': 0.02621746063232422, 'timestamp': '2025-09-30 22:10:48.598374', 'step': 674, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.653154', 'step': 674, 'epoch': 1} +{'type': 'loss', 'content': 0.04202299192547798, 'timestamp': '2025-09-30 22:10:48.656988', 'step': 675, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.716543', 'step': 675, 'epoch': 1} +{'type': 'loss', 'content': 0.014642206020653248, 'timestamp': '2025-09-30 22:10:48.724600', 'step': 676, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.784864', 'step': 676, 'epoch': 1} +{'type': 'loss', 'content': 0.020012324675917625, 'timestamp': '2025-09-30 22:10:48.787118', 'step': 677, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:48.846088', 'step': 677, 'epoch': 1} +{'type': 'loss', 'content': 0.00840475969016552, 'timestamp': '2025-09-30 22:10:48.848230', 'step': 678, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.902428', 'step': 678, 'epoch': 1} +{'type': 'loss', 'content': 0.007752344012260437, 'timestamp': '2025-09-30 22:10:48.909039', 'step': 679, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:48.967559', 'step': 679, 'epoch': 1} +{'type': 'loss', 'content': 0.013558789156377316, 'timestamp': '2025-09-30 22:10:48.975134', 'step': 680, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:49.029934', 'step': 680, 'epoch': 1} +{'type': 'loss', 'content': 0.010377148166298866, 'timestamp': '2025-09-30 22:10:49.044282', 'step': 681, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:49.099360', 'step': 681, 'epoch': 1} +{'type': 'loss', 'content': 0.012168629094958305, 'timestamp': '2025-09-30 22:10:49.101658', 'step': 682, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:49.166872', 'step': 682, 'epoch': 1} +{'type': 'loss', 'content': 0.015606013126671314, 'timestamp': '2025-09-30 22:10:49.169829', 'step': 683, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:49.234287', 'step': 683, 'epoch': 1} +{'type': 'loss', 'content': 0.009783417917788029, 'timestamp': '2025-09-30 22:10:49.251208', 'step': 684, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:10:50.679333', 'step': 684, 'epoch': 1} +{'type': 'pplx', 'content': 32421580.472615503, 'timestamp': '2025-09-30 22:10:50.682440', 'step': 684, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:50.737433', 'step': 684, 'epoch': 1} +{'type': 'loss', 'content': 0.030063265934586525, 'timestamp': '2025-09-30 22:10:50.740440', 'step': 685, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:50.803382', 'step': 685, 'epoch': 1} +{'type': 'loss', 'content': 0.01967502571642399, 'timestamp': '2025-09-30 22:10:50.806394', 'step': 686, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:50.862605', 'step': 686, 'epoch': 1} +{'type': 'loss', 'content': 0.008366209454834461, 'timestamp': '2025-09-30 22:10:50.865456', 'step': 687, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:50.922320', 'step': 687, 'epoch': 1} +{'type': 'loss', 'content': 0.010231166146695614, 'timestamp': '2025-09-30 22:10:50.928995', 'step': 688, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:50.983225', 'step': 688, 'epoch': 1} +{'type': 'loss', 'content': 0.027960339561104774, 'timestamp': '2025-09-30 22:10:50.985402', 'step': 689, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:51.048020', 'step': 689, 'epoch': 1} +{'type': 'loss', 'content': 0.008330179378390312, 'timestamp': '2025-09-30 22:10:51.052493', 'step': 690, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:51.112208', 'step': 690, 'epoch': 1} +{'type': 'loss', 'content': 0.01055544801056385, 'timestamp': '2025-09-30 22:10:51.115301', 'step': 691, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:51.173114', 'step': 691, 'epoch': 1} +{'type': 'loss', 'content': 0.010108939372003078, 'timestamp': '2025-09-30 22:10:51.179615', 'step': 692, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:51.235667', 'step': 692, 'epoch': 1} +{'type': 'loss', 'content': 0.021281301975250244, 'timestamp': '2025-09-30 22:10:51.239421', 'step': 693, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:51.294223', 'step': 693, 'epoch': 1} +{'type': 'loss', 'content': 0.05024373531341553, 'timestamp': '2025-09-30 22:10:51.296468', 'step': 694, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:51.351440', 'step': 694, 'epoch': 1} +{'type': 'loss', 'content': 0.005921120289713144, 'timestamp': '2025-09-30 22:10:51.354235', 'step': 695, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:51.409886', 'step': 695, 'epoch': 1} +{'type': 'loss', 'content': 0.02190905436873436, 'timestamp': '2025-09-30 22:10:51.415774', 'step': 696, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:51.473821', 'step': 696, 'epoch': 1} +{'type': 'loss', 'content': 0.012462352402508259, 'timestamp': '2025-09-30 22:10:51.476265', 'step': 697, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:51.532028', 'step': 697, 'epoch': 1} +{'type': 'loss', 'content': 0.009990805760025978, 'timestamp': '2025-09-30 22:10:51.534677', 'step': 698, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:51.589732', 'step': 698, 'epoch': 1} +{'type': 'loss', 'content': 0.023069290444254875, 'timestamp': '2025-09-30 22:10:51.596286', 'step': 699, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:51.650913', 'step': 699, 'epoch': 1} +{'type': 'loss', 'content': 0.025465210899710655, 'timestamp': '2025-09-30 22:10:51.657683', 'step': 700, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:51.729574', 'step': 700, 'epoch': 1} +{'type': 'loss', 'content': 0.007522939704358578, 'timestamp': '2025-09-30 22:10:51.731746', 'step': 701, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:51.786127', 'step': 701, 'epoch': 1} +{'type': 'loss', 'content': 0.023840337991714478, 'timestamp': '2025-09-30 22:10:51.788879', 'step': 702, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:51.843789', 'step': 702, 'epoch': 1} +{'type': 'loss', 'content': 0.01778830774128437, 'timestamp': '2025-09-30 22:10:51.860122', 'step': 703, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:51.917114', 'step': 703, 'epoch': 1} +{'type': 'loss', 'content': 0.013482254929840565, 'timestamp': '2025-09-30 22:10:51.923966', 'step': 704, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:51.978573', 'step': 704, 'epoch': 1} +{'type': 'loss', 'content': 0.018839532509446144, 'timestamp': '2025-09-30 22:10:51.980911', 'step': 705, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:52.038927', 'step': 705, 'epoch': 1} +{'type': 'loss', 'content': 0.006275936495512724, 'timestamp': '2025-09-30 22:10:52.041640', 'step': 706, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:52.096729', 'step': 706, 'epoch': 1} +{'type': 'loss', 'content': 0.006527402438223362, 'timestamp': '2025-09-30 22:10:52.099087', 'step': 707, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:52.152269', 'step': 707, 'epoch': 1} +{'type': 'loss', 'content': 0.016934264451265335, 'timestamp': '2025-09-30 22:10:52.158593', 'step': 708, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:52.217087', 'step': 708, 'epoch': 1} +{'type': 'loss', 'content': 0.013991935178637505, 'timestamp': '2025-09-30 22:10:52.220158', 'step': 709, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:52.273558', 'step': 709, 'epoch': 1} +{'type': 'loss', 'content': 0.043482519686222076, 'timestamp': '2025-09-30 22:10:52.277808', 'step': 710, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:10:52.335518', 'step': 710, 'epoch': 1} +{'type': 'loss', 'content': 0.02595517970621586, 'timestamp': '2025-09-30 22:10:52.338420', 'step': 711, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:52.398550', 'step': 711, 'epoch': 1} +{'type': 'loss', 'content': 0.03393742814660072, 'timestamp': '2025-09-30 22:10:52.404498', 'step': 712, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:52.467147', 'step': 712, 'epoch': 1} +{'type': 'loss', 'content': 0.019353583455085754, 'timestamp': '2025-09-30 22:10:52.470659', 'step': 713, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:52.529899', 'step': 713, 'epoch': 1} +{'type': 'loss', 'content': 0.01128054317086935, 'timestamp': '2025-09-30 22:10:52.535951', 'step': 714, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:52.590776', 'step': 714, 'epoch': 1} +{'type': 'loss', 'content': 0.021564552560448647, 'timestamp': '2025-09-30 22:10:52.595225', 'step': 715, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:52.655656', 'step': 715, 'epoch': 1} +{'type': 'loss', 'content': 0.03504403680562973, 'timestamp': '2025-09-30 22:10:52.663444', 'step': 716, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:52.719328', 'step': 716, 'epoch': 1} +{'type': 'loss', 'content': 0.0178972315043211, 'timestamp': '2025-09-30 22:10:52.723225', 'step': 717, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:52.784679', 'step': 717, 'epoch': 1} +{'type': 'loss', 'content': 0.019097017124295235, 'timestamp': '2025-09-30 22:10:52.791581', 'step': 718, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:52.847779', 'step': 718, 'epoch': 1} +{'type': 'loss', 'content': 0.01555024366825819, 'timestamp': '2025-09-30 22:10:52.851233', 'step': 719, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:52.905574', 'step': 719, 'epoch': 1} +{'type': 'loss', 'content': 0.03621254116296768, 'timestamp': '2025-09-30 22:10:52.912518', 'step': 720, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:52.965224', 'step': 720, 'epoch': 1} +{'type': 'loss', 'content': 0.00840581115335226, 'timestamp': '2025-09-30 22:10:52.968506', 'step': 721, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:53.026689', 'step': 721, 'epoch': 1} +{'type': 'loss', 'content': 0.007782516535371542, 'timestamp': '2025-09-30 22:10:53.030738', 'step': 722, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:53.085295', 'step': 722, 'epoch': 1} +{'type': 'loss', 'content': 0.012187975458800793, 'timestamp': '2025-09-30 22:10:53.089215', 'step': 723, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:53.144371', 'step': 723, 'epoch': 1} +{'type': 'loss', 'content': 0.0311259888112545, 'timestamp': '2025-09-30 22:10:53.150171', 'step': 724, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:53.212857', 'step': 724, 'epoch': 1} +{'type': 'loss', 'content': 0.030853962525725365, 'timestamp': '2025-09-30 22:10:53.216394', 'step': 725, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:53.272679', 'step': 725, 'epoch': 1} +{'type': 'loss', 'content': 0.02167673408985138, 'timestamp': '2025-09-30 22:10:53.275493', 'step': 726, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:53.332586', 'step': 726, 'epoch': 1} +{'type': 'loss', 'content': 0.014911350794136524, 'timestamp': '2025-09-30 22:10:53.338789', 'step': 727, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:53.401157', 'step': 727, 'epoch': 1} +{'type': 'loss', 'content': 0.009337635710835457, 'timestamp': '2025-09-30 22:10:53.413064', 'step': 728, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:53.469760', 'step': 728, 'epoch': 1} +{'type': 'loss', 'content': 0.007847541943192482, 'timestamp': '2025-09-30 22:10:53.473222', 'step': 729, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:53.536806', 'step': 729, 'epoch': 1} +{'type': 'loss', 'content': 0.021578481420874596, 'timestamp': '2025-09-30 22:10:53.540382', 'step': 730, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:53.601369', 'step': 730, 'epoch': 1} +{'type': 'loss', 'content': 0.009829229675233364, 'timestamp': '2025-09-30 22:10:53.604142', 'step': 731, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:53.675425', 'step': 731, 'epoch': 1} +{'type': 'loss', 'content': 0.022942425683140755, 'timestamp': '2025-09-30 22:10:53.683426', 'step': 732, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:53.740581', 'step': 732, 'epoch': 1} +{'type': 'loss', 'content': 0.018000300973653793, 'timestamp': '2025-09-30 22:10:53.743633', 'step': 733, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:53.802944', 'step': 733, 'epoch': 1} +{'type': 'loss', 'content': 0.01805448904633522, 'timestamp': '2025-09-30 22:10:53.806262', 'step': 734, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:53.865277', 'step': 734, 'epoch': 1} +{'type': 'loss', 'content': 0.01143564097583294, 'timestamp': '2025-09-30 22:10:53.868349', 'step': 735, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:53.929590', 'step': 735, 'epoch': 1} +{'type': 'loss', 'content': 0.010203097946941853, 'timestamp': '2025-09-30 22:10:53.935468', 'step': 736, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:54.001180', 'step': 736, 'epoch': 1} +{'type': 'loss', 'content': 0.028915002942085266, 'timestamp': '2025-09-30 22:10:54.003750', 'step': 737, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:54.064463', 'step': 737, 'epoch': 1} +{'type': 'loss', 'content': 0.01114210207015276, 'timestamp': '2025-09-30 22:10:54.067247', 'step': 738, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:54.121329', 'step': 738, 'epoch': 1} +{'type': 'loss', 'content': 0.02133123390376568, 'timestamp': '2025-09-30 22:10:54.124058', 'step': 739, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:54.189872', 'step': 739, 'epoch': 1} +{'type': 'loss', 'content': 0.034789033234119415, 'timestamp': '2025-09-30 22:10:54.196789', 'step': 740, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:54.264175', 'step': 740, 'epoch': 1} +{'type': 'loss', 'content': 0.029856473207473755, 'timestamp': '2025-09-30 22:10:54.266805', 'step': 741, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:10:55.578341', 'step': 741, 'epoch': 1} +{'type': 'pplx', 'content': 32540290.98788411, 'timestamp': '2025-09-30 22:10:55.580487', 'step': 741, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:55.634594', 'step': 741, 'epoch': 1} +{'type': 'loss', 'content': 0.011041047051548958, 'timestamp': '2025-09-30 22:10:55.637911', 'step': 742, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:55.693181', 'step': 742, 'epoch': 1} +{'type': 'loss', 'content': 0.013807429000735283, 'timestamp': '2025-09-30 22:10:55.695078', 'step': 743, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:55.751626', 'step': 743, 'epoch': 1} +{'type': 'loss', 'content': 0.03291318565607071, 'timestamp': '2025-09-30 22:10:55.757522', 'step': 744, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:55.812977', 'step': 744, 'epoch': 1} +{'type': 'loss', 'content': 0.017772836610674858, 'timestamp': '2025-09-30 22:10:55.817366', 'step': 745, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:55.870542', 'step': 745, 'epoch': 1} +{'type': 'loss', 'content': 0.010666130110621452, 'timestamp': '2025-09-30 22:10:55.872940', 'step': 746, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:55.927591', 'step': 746, 'epoch': 1} +{'type': 'loss', 'content': 0.008871505968272686, 'timestamp': '2025-09-30 22:10:55.929995', 'step': 747, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:55.983417', 'step': 747, 'epoch': 1} +{'type': 'loss', 'content': 0.02815295197069645, 'timestamp': '2025-09-30 22:10:55.989362', 'step': 748, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:56.043054', 'step': 748, 'epoch': 1} +{'type': 'loss', 'content': 0.018090158700942993, 'timestamp': '2025-09-30 22:10:56.045234', 'step': 749, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:56.098937', 'step': 749, 'epoch': 1} +{'type': 'loss', 'content': 0.012871643528342247, 'timestamp': '2025-09-30 22:10:56.101068', 'step': 750, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:56.155911', 'step': 750, 'epoch': 1} +{'type': 'loss', 'content': 0.02255532518029213, 'timestamp': '2025-09-30 22:10:56.167119', 'step': 751, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:56.225904', 'step': 751, 'epoch': 1} +{'type': 'loss', 'content': 0.03662119060754776, 'timestamp': '2025-09-30 22:10:56.231510', 'step': 752, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:56.285709', 'step': 752, 'epoch': 1} +{'type': 'loss', 'content': 0.01848854124546051, 'timestamp': '2025-09-30 22:10:56.289641', 'step': 753, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:56.347354', 'step': 753, 'epoch': 1} +{'type': 'loss', 'content': 0.010717155411839485, 'timestamp': '2025-09-30 22:10:56.355255', 'step': 754, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:56.422808', 'step': 754, 'epoch': 1} +{'type': 'loss', 'content': 0.014511531218886375, 'timestamp': '2025-09-30 22:10:56.424913', 'step': 755, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:56.482154', 'step': 755, 'epoch': 1} +{'type': 'loss', 'content': 0.026509271934628487, 'timestamp': '2025-09-30 22:10:56.489952', 'step': 756, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:56.549793', 'step': 756, 'epoch': 1} +{'type': 'loss', 'content': 0.004727398511022329, 'timestamp': '2025-09-30 22:10:56.553813', 'step': 757, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:56.613752', 'step': 757, 'epoch': 1} +{'type': 'loss', 'content': 0.0046147494576871395, 'timestamp': '2025-09-30 22:10:56.620038', 'step': 758, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:56.678010', 'step': 758, 'epoch': 1} +{'type': 'loss', 'content': 0.009979399852454662, 'timestamp': '2025-09-30 22:10:56.680933', 'step': 759, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:56.742069', 'step': 759, 'epoch': 1} +{'type': 'loss', 'content': 0.0126791438087821, 'timestamp': '2025-09-30 22:10:56.750293', 'step': 760, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:56.814648', 'step': 760, 'epoch': 1} +{'type': 'loss', 'content': 0.005177278071641922, 'timestamp': '2025-09-30 22:10:56.817288', 'step': 761, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:10:56.881049', 'step': 761, 'epoch': 1} +{'type': 'loss', 'content': 0.009649130515754223, 'timestamp': '2025-09-30 22:10:56.886005', 'step': 762, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:56.949170', 'step': 762, 'epoch': 1} +{'type': 'loss', 'content': 0.036273036152124405, 'timestamp': '2025-09-30 22:10:56.957677', 'step': 763, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:57.012209', 'step': 763, 'epoch': 1} +{'type': 'loss', 'content': 0.009934181347489357, 'timestamp': '2025-09-30 22:10:57.020192', 'step': 764, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:57.081575', 'step': 764, 'epoch': 1} +{'type': 'loss', 'content': 0.009199343621730804, 'timestamp': '2025-09-30 22:10:57.095822', 'step': 765, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:57.156959', 'step': 765, 'epoch': 1} +{'type': 'loss', 'content': 0.013879785314202309, 'timestamp': '2025-09-30 22:10:57.159436', 'step': 766, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:57.212583', 'step': 766, 'epoch': 1} +{'type': 'loss', 'content': 0.00856445636600256, 'timestamp': '2025-09-30 22:10:57.216284', 'step': 767, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:57.276051', 'step': 767, 'epoch': 1} +{'type': 'loss', 'content': 0.04576727747917175, 'timestamp': '2025-09-30 22:10:57.284101', 'step': 768, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:57.339006', 'step': 768, 'epoch': 1} +{'type': 'loss', 'content': 0.01549856923520565, 'timestamp': '2025-09-30 22:10:57.347060', 'step': 769, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:57.406865', 'step': 769, 'epoch': 1} +{'type': 'loss', 'content': 0.035134777426719666, 'timestamp': '2025-09-30 22:10:57.425285', 'step': 770, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:57.479947', 'step': 770, 'epoch': 1} +{'type': 'loss', 'content': 0.027558207511901855, 'timestamp': '2025-09-30 22:10:57.500451', 'step': 771, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:57.572535', 'step': 771, 'epoch': 1} +{'type': 'loss', 'content': 0.008749599568545818, 'timestamp': '2025-09-30 22:10:57.578703', 'step': 772, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:57.632785', 'step': 772, 'epoch': 1} +{'type': 'loss', 'content': 0.0587838776409626, 'timestamp': '2025-09-30 22:10:57.634782', 'step': 773, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:57.702651', 'step': 773, 'epoch': 1} +{'type': 'loss', 'content': 0.004769704304635525, 'timestamp': '2025-09-30 22:10:57.704741', 'step': 774, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:57.762540', 'step': 774, 'epoch': 1} +{'type': 'loss', 'content': 0.030832087621092796, 'timestamp': '2025-09-30 22:10:57.764624', 'step': 775, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:57.818884', 'step': 775, 'epoch': 1} +{'type': 'loss', 'content': 0.02731013298034668, 'timestamp': '2025-09-30 22:10:57.824632', 'step': 776, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:57.877914', 'step': 776, 'epoch': 1} +{'type': 'loss', 'content': 0.007288725581020117, 'timestamp': '2025-09-30 22:10:57.880273', 'step': 777, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:57.936165', 'step': 777, 'epoch': 1} +{'type': 'loss', 'content': 0.003071850398555398, 'timestamp': '2025-09-30 22:10:57.938334', 'step': 778, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:57.997184', 'step': 778, 'epoch': 1} +{'type': 'loss', 'content': 0.02605491690337658, 'timestamp': '2025-09-30 22:10:57.999595', 'step': 779, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:58.055531', 'step': 779, 'epoch': 1} +{'type': 'loss', 'content': 0.006960700731724501, 'timestamp': '2025-09-30 22:10:58.061276', 'step': 780, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:58.121422', 'step': 780, 'epoch': 1} +{'type': 'loss', 'content': 0.018633553758263588, 'timestamp': '2025-09-30 22:10:58.124436', 'step': 781, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:58.186438', 'step': 781, 'epoch': 1} +{'type': 'loss', 'content': 0.0156040508300066, 'timestamp': '2025-09-30 22:10:58.188684', 'step': 782, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:58.241872', 'step': 782, 'epoch': 1} +{'type': 'loss', 'content': 0.024231048300862312, 'timestamp': '2025-09-30 22:10:58.244128', 'step': 783, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:58.298950', 'step': 783, 'epoch': 1} +{'type': 'loss', 'content': 0.01897078938782215, 'timestamp': '2025-09-30 22:10:58.304930', 'step': 784, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:58.362591', 'step': 784, 'epoch': 1} +{'type': 'loss', 'content': 0.007195747457444668, 'timestamp': '2025-09-30 22:10:58.364851', 'step': 785, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:58.419197', 'step': 785, 'epoch': 1} +{'type': 'loss', 'content': 0.004759868141263723, 'timestamp': '2025-09-30 22:10:58.421346', 'step': 786, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:58.475595', 'step': 786, 'epoch': 1} +{'type': 'loss', 'content': 0.006611085496842861, 'timestamp': '2025-09-30 22:10:58.477615', 'step': 787, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:58.538663', 'step': 787, 'epoch': 1} +{'type': 'loss', 'content': 0.024320388212800026, 'timestamp': '2025-09-30 22:10:58.544699', 'step': 788, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:58.604428', 'step': 788, 'epoch': 1} +{'type': 'loss', 'content': 0.01196072157472372, 'timestamp': '2025-09-30 22:10:58.606592', 'step': 789, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:10:58.664400', 'step': 789, 'epoch': 1} +{'type': 'loss', 'content': 0.00526924803853035, 'timestamp': '2025-09-30 22:10:58.666685', 'step': 790, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:58.721426', 'step': 790, 'epoch': 1} +{'type': 'loss', 'content': 0.013081463053822517, 'timestamp': '2025-09-30 22:10:58.724083', 'step': 791, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:58.780350', 'step': 791, 'epoch': 1} +{'type': 'loss', 'content': 0.021804455667734146, 'timestamp': '2025-09-30 22:10:58.786275', 'step': 792, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:58.839769', 'step': 792, 'epoch': 1} +{'type': 'loss', 'content': 0.014370249584317207, 'timestamp': '2025-09-30 22:10:58.843237', 'step': 793, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:58.897409', 'step': 793, 'epoch': 1} +{'type': 'loss', 'content': 0.03542889282107353, 'timestamp': '2025-09-30 22:10:58.899546', 'step': 794, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:58.957528', 'step': 794, 'epoch': 1} +{'type': 'loss', 'content': 0.014196035452187061, 'timestamp': '2025-09-30 22:10:58.959878', 'step': 795, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:59.013285', 'step': 795, 'epoch': 1} +{'type': 'loss', 'content': 0.006897695828229189, 'timestamp': '2025-09-30 22:10:59.019364', 'step': 796, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:10:59.072892', 'step': 796, 'epoch': 1} +{'type': 'loss', 'content': 0.009900706820189953, 'timestamp': '2025-09-30 22:10:59.078591', 'step': 797, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:10:59.133963', 'step': 797, 'epoch': 1} +{'type': 'loss', 'content': 0.017876118421554565, 'timestamp': '2025-09-30 22:10:59.136126', 'step': 798, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:00.461430', 'step': 798, 'epoch': 1} +{'type': 'pplx', 'content': 35240375.419794545, 'timestamp': '2025-09-30 22:11:00.463470', 'step': 798, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:00.520434', 'step': 798, 'epoch': 1} +{'type': 'loss', 'content': 0.0068840510211884975, 'timestamp': '2025-09-30 22:11:00.522737', 'step': 799, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:00.575833', 'step': 799, 'epoch': 1} +{'type': 'loss', 'content': 0.032674796879291534, 'timestamp': '2025-09-30 22:11:00.583553', 'step': 800, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:00.636895', 'step': 800, 'epoch': 1} +{'type': 'loss', 'content': 0.026194969192147255, 'timestamp': '2025-09-30 22:11:00.640345', 'step': 801, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:00.694348', 'step': 801, 'epoch': 1} +{'type': 'loss', 'content': 0.029077323153614998, 'timestamp': '2025-09-30 22:11:00.696603', 'step': 802, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:00.751803', 'step': 802, 'epoch': 1} +{'type': 'loss', 'content': 0.021133294329047203, 'timestamp': '2025-09-30 22:11:00.754628', 'step': 803, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:00.810255', 'step': 803, 'epoch': 1} +{'type': 'loss', 'content': 0.008854121901094913, 'timestamp': '2025-09-30 22:11:00.816284', 'step': 804, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:00.882520', 'step': 804, 'epoch': 1} +{'type': 'loss', 'content': 0.025976279750466347, 'timestamp': '2025-09-30 22:11:00.885485', 'step': 805, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:00.940450', 'step': 805, 'epoch': 1} +{'type': 'loss', 'content': 0.02907080017030239, 'timestamp': '2025-09-30 22:11:00.942441', 'step': 806, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:00.995813', 'step': 806, 'epoch': 1} +{'type': 'loss', 'content': 0.009363925084471703, 'timestamp': '2025-09-30 22:11:00.998106', 'step': 807, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:01.051881', 'step': 807, 'epoch': 1} +{'type': 'loss', 'content': 0.017092768102884293, 'timestamp': '2025-09-30 22:11:01.057512', 'step': 808, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:01.111311', 'step': 808, 'epoch': 1} +{'type': 'loss', 'content': 0.00928075797855854, 'timestamp': '2025-09-30 22:11:01.113383', 'step': 809, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:01.168876', 'step': 809, 'epoch': 1} +{'type': 'loss', 'content': 0.015542738139629364, 'timestamp': '2025-09-30 22:11:01.171649', 'step': 810, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:01.227333', 'step': 810, 'epoch': 1} +{'type': 'loss', 'content': 0.022915521636605263, 'timestamp': '2025-09-30 22:11:01.229426', 'step': 811, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:01.282472', 'step': 811, 'epoch': 1} +{'type': 'loss', 'content': 0.012876084074378014, 'timestamp': '2025-09-30 22:11:01.288399', 'step': 812, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:01.342096', 'step': 812, 'epoch': 1} +{'type': 'loss', 'content': 0.0221566129475832, 'timestamp': '2025-09-30 22:11:01.344402', 'step': 813, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:01.397809', 'step': 813, 'epoch': 1} +{'type': 'loss', 'content': 0.017164286226034164, 'timestamp': '2025-09-30 22:11:01.399891', 'step': 814, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:01.453987', 'step': 814, 'epoch': 1} +{'type': 'loss', 'content': 0.009646688587963581, 'timestamp': '2025-09-30 22:11:01.457594', 'step': 815, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:01.513635', 'step': 815, 'epoch': 1} +{'type': 'loss', 'content': 0.015178002417087555, 'timestamp': '2025-09-30 22:11:01.522945', 'step': 816, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:01.577810', 'step': 816, 'epoch': 1} +{'type': 'loss', 'content': 0.030943872407078743, 'timestamp': '2025-09-30 22:11:01.580407', 'step': 817, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:01.635707', 'step': 817, 'epoch': 1} +{'type': 'loss', 'content': 0.016377057880163193, 'timestamp': '2025-09-30 22:11:01.638511', 'step': 818, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:01.692527', 'step': 818, 'epoch': 1} +{'type': 'loss', 'content': 0.007752551231533289, 'timestamp': '2025-09-30 22:11:01.694955', 'step': 819, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:01.748220', 'step': 819, 'epoch': 1} +{'type': 'loss', 'content': 0.011008880101144314, 'timestamp': '2025-09-30 22:11:01.754611', 'step': 820, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:01.808426', 'step': 820, 'epoch': 1} +{'type': 'loss', 'content': 0.011034859344363213, 'timestamp': '2025-09-30 22:11:01.813052', 'step': 821, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:01.869500', 'step': 821, 'epoch': 1} +{'type': 'loss', 'content': 0.01996428705751896, 'timestamp': '2025-09-30 22:11:01.873235', 'step': 822, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:01.935142', 'step': 822, 'epoch': 1} +{'type': 'loss', 'content': 0.017574485391378403, 'timestamp': '2025-09-30 22:11:01.944753', 'step': 823, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:02.005799', 'step': 823, 'epoch': 1} +{'type': 'loss', 'content': 0.024993544444441795, 'timestamp': '2025-09-30 22:11:02.012099', 'step': 824, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:02.066524', 'step': 824, 'epoch': 1} +{'type': 'loss', 'content': 0.010080697014927864, 'timestamp': '2025-09-30 22:11:02.072252', 'step': 825, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:02.126597', 'step': 825, 'epoch': 1} +{'type': 'loss', 'content': 0.015512553974986076, 'timestamp': '2025-09-30 22:11:02.129066', 'step': 826, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:02.186535', 'step': 826, 'epoch': 1} +{'type': 'loss', 'content': 0.028046919032931328, 'timestamp': '2025-09-30 22:11:02.192835', 'step': 827, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:02.258556', 'step': 827, 'epoch': 1} +{'type': 'loss', 'content': 0.01140694972127676, 'timestamp': '2025-09-30 22:11:02.264406', 'step': 828, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:02.325633', 'step': 828, 'epoch': 1} +{'type': 'loss', 'content': 0.01551117654889822, 'timestamp': '2025-09-30 22:11:02.330248', 'step': 829, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:02.383542', 'step': 829, 'epoch': 1} +{'type': 'loss', 'content': 0.005076105706393719, 'timestamp': '2025-09-30 22:11:02.390134', 'step': 830, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:02.447396', 'step': 830, 'epoch': 1} +{'type': 'loss', 'content': 0.007691748905926943, 'timestamp': '2025-09-30 22:11:02.464235', 'step': 831, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:02.521544', 'step': 831, 'epoch': 1} +{'type': 'loss', 'content': 0.008671620860695839, 'timestamp': '2025-09-30 22:11:02.535575', 'step': 832, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:02.590330', 'step': 832, 'epoch': 1} +{'type': 'loss', 'content': 0.031672995537519455, 'timestamp': '2025-09-30 22:11:02.594947', 'step': 833, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:02.650326', 'step': 833, 'epoch': 1} +{'type': 'loss', 'content': 0.01112611498683691, 'timestamp': '2025-09-30 22:11:02.662307', 'step': 834, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:02.727931', 'step': 834, 'epoch': 1} +{'type': 'loss', 'content': 0.0036083075683563948, 'timestamp': '2025-09-30 22:11:02.731299', 'step': 835, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:02.789283', 'step': 835, 'epoch': 1} +{'type': 'loss', 'content': 0.008130187168717384, 'timestamp': '2025-09-30 22:11:02.799273', 'step': 836, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:02.857425', 'step': 836, 'epoch': 1} +{'type': 'loss', 'content': 0.036892350763082504, 'timestamp': '2025-09-30 22:11:02.860090', 'step': 837, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:02.922647', 'step': 837, 'epoch': 1} +{'type': 'loss', 'content': 0.018781444057822227, 'timestamp': '2025-09-30 22:11:02.931661', 'step': 838, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:02.986358', 'step': 838, 'epoch': 1} +{'type': 'loss', 'content': 0.029954804107546806, 'timestamp': '2025-09-30 22:11:02.988486', 'step': 839, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:03.047723', 'step': 839, 'epoch': 1} +{'type': 'loss', 'content': 0.01475707720965147, 'timestamp': '2025-09-30 22:11:03.054607', 'step': 840, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:03.117898', 'step': 840, 'epoch': 1} +{'type': 'loss', 'content': 0.01812085136771202, 'timestamp': '2025-09-30 22:11:03.120429', 'step': 841, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:03.174268', 'step': 841, 'epoch': 1} +{'type': 'loss', 'content': 0.030853111296892166, 'timestamp': '2025-09-30 22:11:03.176617', 'step': 842, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:03.235615', 'step': 842, 'epoch': 1} +{'type': 'loss', 'content': 0.014583474956452847, 'timestamp': '2025-09-30 22:11:03.237574', 'step': 843, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:03.290976', 'step': 843, 'epoch': 1} +{'type': 'loss', 'content': 0.020900966599583626, 'timestamp': '2025-09-30 22:11:03.301990', 'step': 844, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:03.356268', 'step': 844, 'epoch': 1} +{'type': 'loss', 'content': 0.0058554792776703835, 'timestamp': '2025-09-30 22:11:03.358429', 'step': 845, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:03.413993', 'step': 845, 'epoch': 1} +{'type': 'loss', 'content': 0.001921547343954444, 'timestamp': '2025-09-30 22:11:03.416165', 'step': 846, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:03.470304', 'step': 846, 'epoch': 1} +{'type': 'loss', 'content': 0.033496271818876266, 'timestamp': '2025-09-30 22:11:03.472297', 'step': 847, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:03.525170', 'step': 847, 'epoch': 1} +{'type': 'loss', 'content': 0.01942761428654194, 'timestamp': '2025-09-30 22:11:03.530403', 'step': 848, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:03.586709', 'step': 848, 'epoch': 1} +{'type': 'loss', 'content': 0.003972503822296858, 'timestamp': '2025-09-30 22:11:03.588508', 'step': 849, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:03.646023', 'step': 849, 'epoch': 1} +{'type': 'loss', 'content': 0.00408882787451148, 'timestamp': '2025-09-30 22:11:03.648241', 'step': 850, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:03.706513', 'step': 850, 'epoch': 1} +{'type': 'loss', 'content': 0.00527701573446393, 'timestamp': '2025-09-30 22:11:03.708413', 'step': 851, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:03.762487', 'step': 851, 'epoch': 1} +{'type': 'loss', 'content': 0.016006560996174812, 'timestamp': '2025-09-30 22:11:03.768331', 'step': 852, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:03.821876', 'step': 852, 'epoch': 1} +{'type': 'loss', 'content': 0.006566441617906094, 'timestamp': '2025-09-30 22:11:03.823974', 'step': 853, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:03.879460', 'step': 853, 'epoch': 1} +{'type': 'loss', 'content': 0.02836592122912407, 'timestamp': '2025-09-30 22:11:03.881726', 'step': 854, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:03.937448', 'step': 854, 'epoch': 1} +{'type': 'loss', 'content': 0.006195141933858395, 'timestamp': '2025-09-30 22:11:03.939682', 'step': 855, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:05.130762', 'step': 855, 'epoch': 1} +{'type': 'pplx', 'content': 37193476.96670681, 'timestamp': '2025-09-30 22:11:05.132796', 'step': 855, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:05.183818', 'step': 855, 'epoch': 1} +{'type': 'loss', 'content': 0.005694164428859949, 'timestamp': '2025-09-30 22:11:05.189422', 'step': 856, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:05.241745', 'step': 856, 'epoch': 1} +{'type': 'loss', 'content': 0.012527307495474815, 'timestamp': '2025-09-30 22:11:05.248245', 'step': 857, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:05.301596', 'step': 857, 'epoch': 1} +{'type': 'loss', 'content': 0.010429566726088524, 'timestamp': '2025-09-30 22:11:05.303733', 'step': 858, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:05.357377', 'step': 858, 'epoch': 1} +{'type': 'loss', 'content': 0.01283800695091486, 'timestamp': '2025-09-30 22:11:05.359112', 'step': 859, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:05.412159', 'step': 859, 'epoch': 1} +{'type': 'loss', 'content': 0.027922047302126884, 'timestamp': '2025-09-30 22:11:05.417257', 'step': 860, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:05.470079', 'step': 860, 'epoch': 1} +{'type': 'loss', 'content': 0.019196191802620888, 'timestamp': '2025-09-30 22:11:05.471898', 'step': 861, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:05.525280', 'step': 861, 'epoch': 1} +{'type': 'loss', 'content': 0.006581494119018316, 'timestamp': '2025-09-30 22:11:05.529503', 'step': 862, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:05.585152', 'step': 862, 'epoch': 1} +{'type': 'loss', 'content': 0.019029339775443077, 'timestamp': '2025-09-30 22:11:05.589783', 'step': 863, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:05.645567', 'step': 863, 'epoch': 1} +{'type': 'loss', 'content': 0.013398184441030025, 'timestamp': '2025-09-30 22:11:05.651621', 'step': 864, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:05.705265', 'step': 864, 'epoch': 1} +{'type': 'loss', 'content': 0.02754964306950569, 'timestamp': '2025-09-30 22:11:05.708449', 'step': 865, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:05.778912', 'step': 865, 'epoch': 1} +{'type': 'loss', 'content': 0.006636460777372122, 'timestamp': '2025-09-30 22:11:05.785334', 'step': 866, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:05.846711', 'step': 866, 'epoch': 1} +{'type': 'loss', 'content': 0.0036353315226733685, 'timestamp': '2025-09-30 22:11:05.849779', 'step': 867, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:05.904919', 'step': 867, 'epoch': 1} +{'type': 'loss', 'content': 0.011577283963561058, 'timestamp': '2025-09-30 22:11:05.912851', 'step': 868, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:05.976102', 'step': 868, 'epoch': 1} +{'type': 'loss', 'content': 0.016597332432866096, 'timestamp': '2025-09-30 22:11:05.980116', 'step': 869, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:06.039350', 'step': 869, 'epoch': 1} +{'type': 'loss', 'content': 0.044374849647283554, 'timestamp': '2025-09-30 22:11:06.043036', 'step': 870, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:06.101559', 'step': 870, 'epoch': 1} +{'type': 'loss', 'content': 0.005777058191597462, 'timestamp': '2025-09-30 22:11:06.105594', 'step': 871, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:06.171476', 'step': 871, 'epoch': 1} +{'type': 'loss', 'content': 0.000960107718128711, 'timestamp': '2025-09-30 22:11:06.179702', 'step': 872, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:06.249129', 'step': 872, 'epoch': 1} +{'type': 'loss', 'content': 0.018397826701402664, 'timestamp': '2025-09-30 22:11:06.254591', 'step': 873, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:06.311920', 'step': 873, 'epoch': 1} +{'type': 'loss', 'content': 0.009636670351028442, 'timestamp': '2025-09-30 22:11:06.314872', 'step': 874, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:06.372349', 'step': 874, 'epoch': 1} +{'type': 'loss', 'content': 0.02451557293534279, 'timestamp': '2025-09-30 22:11:06.376520', 'step': 875, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:06.451044', 'step': 875, 'epoch': 1} +{'type': 'loss', 'content': 0.036388419568538666, 'timestamp': '2025-09-30 22:11:06.458139', 'step': 876, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:06.511108', 'step': 876, 'epoch': 1} +{'type': 'loss', 'content': 0.00405623484402895, 'timestamp': '2025-09-30 22:11:06.514565', 'step': 877, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:06.585352', 'step': 877, 'epoch': 1} +{'type': 'loss', 'content': 0.00108911597635597, 'timestamp': '2025-09-30 22:11:06.589256', 'step': 878, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:06.647797', 'step': 878, 'epoch': 1} +{'type': 'loss', 'content': 0.040293216705322266, 'timestamp': '2025-09-30 22:11:06.650671', 'step': 879, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:06.705666', 'step': 879, 'epoch': 1} +{'type': 'loss', 'content': 0.053030043840408325, 'timestamp': '2025-09-30 22:11:06.712051', 'step': 880, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:06.777617', 'step': 880, 'epoch': 1} +{'type': 'loss', 'content': 0.029138272628188133, 'timestamp': '2025-09-30 22:11:06.780854', 'step': 881, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:06.835173', 'step': 881, 'epoch': 1} +{'type': 'loss', 'content': 0.04902893677353859, 'timestamp': '2025-09-30 22:11:06.841768', 'step': 882, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:06.908278', 'step': 882, 'epoch': 1} +{'type': 'loss', 'content': 0.03171558305621147, 'timestamp': '2025-09-30 22:11:06.910742', 'step': 883, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:06.973977', 'step': 883, 'epoch': 1} +{'type': 'loss', 'content': 0.0021508908830583096, 'timestamp': '2025-09-30 22:11:06.979781', 'step': 884, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:07.038739', 'step': 884, 'epoch': 1} +{'type': 'loss', 'content': 0.005451333709061146, 'timestamp': '2025-09-30 22:11:07.043018', 'step': 885, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:07.103661', 'step': 885, 'epoch': 1} +{'type': 'loss', 'content': 0.021850435063242912, 'timestamp': '2025-09-30 22:11:07.118706', 'step': 886, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:07.182709', 'step': 886, 'epoch': 1} +{'type': 'loss', 'content': 0.007379130460321903, 'timestamp': '2025-09-30 22:11:07.196464', 'step': 887, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:07.256055', 'step': 887, 'epoch': 1} +{'type': 'loss', 'content': 0.024785201996564865, 'timestamp': '2025-09-30 22:11:07.262829', 'step': 888, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:07.316559', 'step': 888, 'epoch': 1} +{'type': 'loss', 'content': 0.018535610288381577, 'timestamp': '2025-09-30 22:11:07.319236', 'step': 889, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:07.373407', 'step': 889, 'epoch': 1} +{'type': 'loss', 'content': 0.03353133052587509, 'timestamp': '2025-09-30 22:11:07.379519', 'step': 890, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:07.439250', 'step': 890, 'epoch': 1} +{'type': 'loss', 'content': 0.018443452194333076, 'timestamp': '2025-09-30 22:11:07.442180', 'step': 891, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:07.496055', 'step': 891, 'epoch': 1} +{'type': 'loss', 'content': 0.018285350874066353, 'timestamp': '2025-09-30 22:11:07.503954', 'step': 892, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:07.557501', 'step': 892, 'epoch': 1} +{'type': 'loss', 'content': 0.012829815968871117, 'timestamp': '2025-09-30 22:11:07.567651', 'step': 893, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:07.630050', 'step': 893, 'epoch': 1} +{'type': 'loss', 'content': 0.05475155636668205, 'timestamp': '2025-09-30 22:11:07.641354', 'step': 894, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:07.705736', 'step': 894, 'epoch': 1} +{'type': 'loss', 'content': 0.021429507061839104, 'timestamp': '2025-09-30 22:11:07.720924', 'step': 895, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:07.786174', 'step': 895, 'epoch': 1} +{'type': 'loss', 'content': 0.02315603196620941, 'timestamp': '2025-09-30 22:11:07.796568', 'step': 896, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:07.850106', 'step': 896, 'epoch': 1} +{'type': 'loss', 'content': 0.03501684218645096, 'timestamp': '2025-09-30 22:11:07.863817', 'step': 897, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:07.919325', 'step': 897, 'epoch': 1} +{'type': 'loss', 'content': 0.013003984466195107, 'timestamp': '2025-09-30 22:11:07.923166', 'step': 898, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:07.980445', 'step': 898, 'epoch': 1} +{'type': 'loss', 'content': 0.010267031379044056, 'timestamp': '2025-09-30 22:11:07.984239', 'step': 899, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:08.038645', 'step': 899, 'epoch': 1} +{'type': 'loss', 'content': 0.018157100304961205, 'timestamp': '2025-09-30 22:11:08.049287', 'step': 900, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:08.104784', 'step': 900, 'epoch': 1} +{'type': 'loss', 'content': 0.029190553352236748, 'timestamp': '2025-09-30 22:11:08.114701', 'step': 901, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:08.172726', 'step': 901, 'epoch': 1} +{'type': 'loss', 'content': 0.026753077283501625, 'timestamp': '2025-09-30 22:11:08.179576', 'step': 902, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:08.240865', 'step': 902, 'epoch': 1} +{'type': 'loss', 'content': 0.01073368452489376, 'timestamp': '2025-09-30 22:11:08.248813', 'step': 903, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:08.309720', 'step': 903, 'epoch': 1} +{'type': 'loss', 'content': 0.03276833891868591, 'timestamp': '2025-09-30 22:11:08.323380', 'step': 904, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:08.380159', 'step': 904, 'epoch': 1} +{'type': 'loss', 'content': 0.01617128774523735, 'timestamp': '2025-09-30 22:11:08.395340', 'step': 905, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:08.456218', 'step': 905, 'epoch': 1} +{'type': 'loss', 'content': 0.010833910666406155, 'timestamp': '2025-09-30 22:11:08.460300', 'step': 906, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:08.515914', 'step': 906, 'epoch': 1} +{'type': 'loss', 'content': 0.027199676260352135, 'timestamp': '2025-09-30 22:11:08.519480', 'step': 907, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:08.576222', 'step': 907, 'epoch': 1} +{'type': 'loss', 'content': 0.018916381523013115, 'timestamp': '2025-09-30 22:11:08.583916', 'step': 908, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:08.641627', 'step': 908, 'epoch': 1} +{'type': 'loss', 'content': 0.02062804065644741, 'timestamp': '2025-09-30 22:11:08.644997', 'step': 909, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:08.706192', 'step': 909, 'epoch': 1} +{'type': 'loss', 'content': 0.0058442666195333, 'timestamp': '2025-09-30 22:11:08.709737', 'step': 910, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:08.766142', 'step': 910, 'epoch': 1} +{'type': 'loss', 'content': 0.01607462391257286, 'timestamp': '2025-09-30 22:11:08.778255', 'step': 911, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:08.836438', 'step': 911, 'epoch': 1} +{'type': 'loss', 'content': 0.007725600618869066, 'timestamp': '2025-09-30 22:11:08.842851', 'step': 912, 'epoch': 1} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:10.078308', 'step': 912, 'epoch': 1} +{'type': 'pplx', 'content': 32940359.600703914, 'timestamp': '2025-09-30 22:11:10.090732', 'step': 912, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:10.153675', 'step': 912, 'epoch': 1} +{'type': 'loss', 'content': 0.03332886844873428, 'timestamp': '2025-09-30 22:11:10.157740', 'step': 913, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:10.214129', 'step': 913, 'epoch': 1} +{'type': 'loss', 'content': 0.013211602345108986, 'timestamp': '2025-09-30 22:11:10.218989', 'step': 914, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:10.272896', 'step': 914, 'epoch': 1} +{'type': 'loss', 'content': 0.009566955268383026, 'timestamp': '2025-09-30 22:11:10.284016', 'step': 915, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:10.348896', 'step': 915, 'epoch': 1} +{'type': 'loss', 'content': 0.008288740180432796, 'timestamp': '2025-09-30 22:11:10.356530', 'step': 916, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:10.411350', 'step': 916, 'epoch': 1} +{'type': 'loss', 'content': 0.028552187606692314, 'timestamp': '2025-09-30 22:11:10.421464', 'step': 917, 'epoch': 1} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:10.483632', 'step': 917, 'epoch': 2} +{'type': 'loss', 'content': 0.04483124613761902, 'timestamp': '2025-09-30 22:11:10.487570', 'step': 918, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:10.541429', 'step': 918, 'epoch': 2} +{'type': 'loss', 'content': 0.019391506910324097, 'timestamp': '2025-09-30 22:11:10.557400', 'step': 919, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:10.611653', 'step': 919, 'epoch': 2} +{'type': 'loss', 'content': 0.054323118180036545, 'timestamp': '2025-09-30 22:11:10.629999', 'step': 920, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:10.691250', 'step': 920, 'epoch': 2} +{'type': 'loss', 'content': 0.02535811997950077, 'timestamp': '2025-09-30 22:11:10.698154', 'step': 921, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:10.752424', 'step': 921, 'epoch': 2} +{'type': 'loss', 'content': 0.04979207366704941, 'timestamp': '2025-09-30 22:11:10.756092', 'step': 922, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:10.811296', 'step': 922, 'epoch': 2} +{'type': 'loss', 'content': 0.02441842295229435, 'timestamp': '2025-09-30 22:11:10.814036', 'step': 923, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:10.867564', 'step': 923, 'epoch': 2} +{'type': 'loss', 'content': 0.04180010035634041, 'timestamp': '2025-09-30 22:11:10.874041', 'step': 924, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:10.927346', 'step': 924, 'epoch': 2} +{'type': 'loss', 'content': 0.01361654233187437, 'timestamp': '2025-09-30 22:11:10.937917', 'step': 925, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:10.993247', 'step': 925, 'epoch': 2} +{'type': 'loss', 'content': 0.003717446932569146, 'timestamp': '2025-09-30 22:11:10.997274', 'step': 926, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.051890', 'step': 926, 'epoch': 2} +{'type': 'loss', 'content': 0.03469868749380112, 'timestamp': '2025-09-30 22:11:11.055014', 'step': 927, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.109765', 'step': 927, 'epoch': 2} +{'type': 'loss', 'content': 0.022886861115694046, 'timestamp': '2025-09-30 22:11:11.116739', 'step': 928, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.176655', 'step': 928, 'epoch': 2} +{'type': 'loss', 'content': 0.005226781126111746, 'timestamp': '2025-09-30 22:11:11.179549', 'step': 929, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.233548', 'step': 929, 'epoch': 2} +{'type': 'loss', 'content': 0.017794528976082802, 'timestamp': '2025-09-30 22:11:11.237401', 'step': 930, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.291906', 'step': 930, 'epoch': 2} +{'type': 'loss', 'content': 0.034500379115343094, 'timestamp': '2025-09-30 22:11:11.294980', 'step': 931, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.349397', 'step': 931, 'epoch': 2} +{'type': 'loss', 'content': 0.030988583341240883, 'timestamp': '2025-09-30 22:11:11.355998', 'step': 932, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.410976', 'step': 932, 'epoch': 2} +{'type': 'loss', 'content': 0.017014091834425926, 'timestamp': '2025-09-30 22:11:11.414367', 'step': 933, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.468837', 'step': 933, 'epoch': 2} +{'type': 'loss', 'content': 0.010941265150904655, 'timestamp': '2025-09-30 22:11:11.479844', 'step': 934, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.536379', 'step': 934, 'epoch': 2} +{'type': 'loss', 'content': 0.01286663394421339, 'timestamp': '2025-09-30 22:11:11.539828', 'step': 935, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:11.595693', 'step': 935, 'epoch': 2} +{'type': 'loss', 'content': 0.015257543884217739, 'timestamp': '2025-09-30 22:11:11.602568', 'step': 936, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.656340', 'step': 936, 'epoch': 2} +{'type': 'loss', 'content': 0.021351372823119164, 'timestamp': '2025-09-30 22:11:11.659197', 'step': 937, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:11.727179', 'step': 937, 'epoch': 2} +{'type': 'loss', 'content': 0.02581597864627838, 'timestamp': '2025-09-30 22:11:11.731138', 'step': 938, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.786742', 'step': 938, 'epoch': 2} +{'type': 'loss', 'content': 0.026747452095150948, 'timestamp': '2025-09-30 22:11:11.790349', 'step': 939, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.845472', 'step': 939, 'epoch': 2} +{'type': 'loss', 'content': 0.017408985644578934, 'timestamp': '2025-09-30 22:11:11.851936', 'step': 940, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.905233', 'step': 940, 'epoch': 2} +{'type': 'loss', 'content': 0.023534327745437622, 'timestamp': '2025-09-30 22:11:11.907889', 'step': 941, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:11.963045', 'step': 941, 'epoch': 2} +{'type': 'loss', 'content': 0.018282432109117508, 'timestamp': '2025-09-30 22:11:11.968359', 'step': 942, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:12.025145', 'step': 942, 'epoch': 2} +{'type': 'loss', 'content': 0.016469817608594894, 'timestamp': '2025-09-30 22:11:12.029379', 'step': 943, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:12.085252', 'step': 943, 'epoch': 2} +{'type': 'loss', 'content': 0.014783737249672413, 'timestamp': '2025-09-30 22:11:12.099785', 'step': 944, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.160940', 'step': 944, 'epoch': 2} +{'type': 'loss', 'content': 0.024034133180975914, 'timestamp': '2025-09-30 22:11:12.164325', 'step': 945, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.217753', 'step': 945, 'epoch': 2} +{'type': 'loss', 'content': 0.021934911608695984, 'timestamp': '2025-09-30 22:11:12.220707', 'step': 946, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.280206', 'step': 946, 'epoch': 2} +{'type': 'loss', 'content': 0.020119894295930862, 'timestamp': '2025-09-30 22:11:12.283544', 'step': 947, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.337534', 'step': 947, 'epoch': 2} +{'type': 'loss', 'content': 0.023923667147755623, 'timestamp': '2025-09-30 22:11:12.343895', 'step': 948, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.398463', 'step': 948, 'epoch': 2} +{'type': 'loss', 'content': 0.022407446056604385, 'timestamp': '2025-09-30 22:11:12.402721', 'step': 949, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.456921', 'step': 949, 'epoch': 2} +{'type': 'loss', 'content': 0.013444255106151104, 'timestamp': '2025-09-30 22:11:12.467226', 'step': 950, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.530025', 'step': 950, 'epoch': 2} +{'type': 'loss', 'content': 0.027343014255166054, 'timestamp': '2025-09-30 22:11:12.542380', 'step': 951, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.595965', 'step': 951, 'epoch': 2} +{'type': 'loss', 'content': 0.01585647463798523, 'timestamp': '2025-09-30 22:11:12.612721', 'step': 952, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.666604', 'step': 952, 'epoch': 2} +{'type': 'loss', 'content': 0.021314824000000954, 'timestamp': '2025-09-30 22:11:12.669789', 'step': 953, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.723426', 'step': 953, 'epoch': 2} +{'type': 'loss', 'content': 0.025206713005900383, 'timestamp': '2025-09-30 22:11:12.738101', 'step': 954, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.791302', 'step': 954, 'epoch': 2} +{'type': 'loss', 'content': 0.019110916182398796, 'timestamp': '2025-09-30 22:11:12.794999', 'step': 955, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:12.850074', 'step': 955, 'epoch': 2} +{'type': 'loss', 'content': 0.017940467223525047, 'timestamp': '2025-09-30 22:11:12.857144', 'step': 956, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.912948', 'step': 956, 'epoch': 2} +{'type': 'loss', 'content': 0.01818234659731388, 'timestamp': '2025-09-30 22:11:12.916307', 'step': 957, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:12.971346', 'step': 957, 'epoch': 2} +{'type': 'loss', 'content': 0.0146263986825943, 'timestamp': '2025-09-30 22:11:12.975228', 'step': 958, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:13.028830', 'step': 958, 'epoch': 2} +{'type': 'loss', 'content': 0.01784416474401951, 'timestamp': '2025-09-30 22:11:13.032715', 'step': 959, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:13.087234', 'step': 959, 'epoch': 2} +{'type': 'loss', 'content': 0.01846710965037346, 'timestamp': '2025-09-30 22:11:13.100827', 'step': 960, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:13.162630', 'step': 960, 'epoch': 2} +{'type': 'loss', 'content': 0.006946962792426348, 'timestamp': '2025-09-30 22:11:13.165285', 'step': 961, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:13.223612', 'step': 961, 'epoch': 2} +{'type': 'loss', 'content': 0.013810682110488415, 'timestamp': '2025-09-30 22:11:13.227015', 'step': 962, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:13.281240', 'step': 962, 'epoch': 2} +{'type': 'loss', 'content': 0.011714011430740356, 'timestamp': '2025-09-30 22:11:13.284618', 'step': 963, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:13.338804', 'step': 963, 'epoch': 2} +{'type': 'loss', 'content': 0.01336191687732935, 'timestamp': '2025-09-30 22:11:13.344647', 'step': 964, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:13.399856', 'step': 964, 'epoch': 2} +{'type': 'loss', 'content': 0.0091180969029665, 'timestamp': '2025-09-30 22:11:13.407761', 'step': 965, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:13.462252', 'step': 965, 'epoch': 2} +{'type': 'loss', 'content': 0.006548265926539898, 'timestamp': '2025-09-30 22:11:13.472964', 'step': 966, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:13.528676', 'step': 966, 'epoch': 2} +{'type': 'loss', 'content': 0.02103179506957531, 'timestamp': '2025-09-30 22:11:13.531427', 'step': 967, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:13.585497', 'step': 967, 'epoch': 2} +{'type': 'loss', 'content': 0.010586266405880451, 'timestamp': '2025-09-30 22:11:13.598192', 'step': 968, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:13.651417', 'step': 968, 'epoch': 2} +{'type': 'loss', 'content': 0.03246624767780304, 'timestamp': '2025-09-30 22:11:13.654241', 'step': 969, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:14.864683', 'step': 969, 'epoch': 2} +{'type': 'pplx', 'content': 28798242.8768259, 'timestamp': '2025-09-30 22:11:14.874517', 'step': 969, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:14.929301', 'step': 969, 'epoch': 2} +{'type': 'loss', 'content': 0.02270360477268696, 'timestamp': '2025-09-30 22:11:14.932975', 'step': 970, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:14.989521', 'step': 970, 'epoch': 2} +{'type': 'loss', 'content': 0.023568162694573402, 'timestamp': '2025-09-30 22:11:14.993038', 'step': 971, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:15.047908', 'step': 971, 'epoch': 2} +{'type': 'loss', 'content': 0.014281951822340488, 'timestamp': '2025-09-30 22:11:15.056516', 'step': 972, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:15.115587', 'step': 972, 'epoch': 2} +{'type': 'loss', 'content': 0.02344512939453125, 'timestamp': '2025-09-30 22:11:15.118054', 'step': 973, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:15.171272', 'step': 973, 'epoch': 2} +{'type': 'loss', 'content': 0.015097950585186481, 'timestamp': '2025-09-30 22:11:15.180781', 'step': 974, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:15.246952', 'step': 974, 'epoch': 2} +{'type': 'loss', 'content': 0.015720214694738388, 'timestamp': '2025-09-30 22:11:15.250015', 'step': 975, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:15.307234', 'step': 975, 'epoch': 2} +{'type': 'loss', 'content': 0.017806116491556168, 'timestamp': '2025-09-30 22:11:15.320373', 'step': 976, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:15.379570', 'step': 976, 'epoch': 2} +{'type': 'loss', 'content': 0.028563665226101875, 'timestamp': '2025-09-30 22:11:15.383120', 'step': 977, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:15.437607', 'step': 977, 'epoch': 2} +{'type': 'loss', 'content': 0.01088719256222248, 'timestamp': '2025-09-30 22:11:15.449621', 'step': 978, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:15.509702', 'step': 978, 'epoch': 2} +{'type': 'loss', 'content': 0.01909536123275757, 'timestamp': '2025-09-30 22:11:15.523252', 'step': 979, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:15.588137', 'step': 979, 'epoch': 2} +{'type': 'loss', 'content': 0.0036833793856203556, 'timestamp': '2025-09-30 22:11:15.594879', 'step': 980, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:15.648809', 'step': 980, 'epoch': 2} +{'type': 'loss', 'content': 0.0032408982515335083, 'timestamp': '2025-09-30 22:11:15.651580', 'step': 981, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:15.713924', 'step': 981, 'epoch': 2} +{'type': 'loss', 'content': 0.0031015234999358654, 'timestamp': '2025-09-30 22:11:15.727931', 'step': 982, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:15.785197', 'step': 982, 'epoch': 2} +{'type': 'loss', 'content': 0.03271304816007614, 'timestamp': '2025-09-30 22:11:15.788925', 'step': 983, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:15.844036', 'step': 983, 'epoch': 2} +{'type': 'loss', 'content': 0.004379452671855688, 'timestamp': '2025-09-30 22:11:15.851440', 'step': 984, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:15.905298', 'step': 984, 'epoch': 2} +{'type': 'loss', 'content': 0.027334626764059067, 'timestamp': '2025-09-30 22:11:15.909656', 'step': 985, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:15.969612', 'step': 985, 'epoch': 2} +{'type': 'loss', 'content': 0.03557107597589493, 'timestamp': '2025-09-30 22:11:15.973229', 'step': 986, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:16.028026', 'step': 986, 'epoch': 2} +{'type': 'loss', 'content': 0.02836051769554615, 'timestamp': '2025-09-30 22:11:16.030794', 'step': 987, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:16.086426', 'step': 987, 'epoch': 2} +{'type': 'loss', 'content': 0.0021814878564327955, 'timestamp': '2025-09-30 22:11:16.100933', 'step': 988, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:16.162657', 'step': 988, 'epoch': 2} +{'type': 'loss', 'content': 0.045109979808330536, 'timestamp': '2025-09-30 22:11:16.165248', 'step': 989, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:16.221746', 'step': 989, 'epoch': 2} +{'type': 'loss', 'content': 0.008987918496131897, 'timestamp': '2025-09-30 22:11:16.231912', 'step': 990, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:16.293739', 'step': 990, 'epoch': 2} +{'type': 'loss', 'content': 0.05426352098584175, 'timestamp': '2025-09-30 22:11:16.297126', 'step': 991, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:16.352232', 'step': 991, 'epoch': 2} +{'type': 'loss', 'content': 0.023926403373479843, 'timestamp': '2025-09-30 22:11:16.365565', 'step': 992, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:16.430875', 'step': 992, 'epoch': 2} +{'type': 'loss', 'content': 0.021941417828202248, 'timestamp': '2025-09-30 22:11:16.439585', 'step': 993, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:16.498898', 'step': 993, 'epoch': 2} +{'type': 'loss', 'content': 0.022387471050024033, 'timestamp': '2025-09-30 22:11:16.503554', 'step': 994, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:16.559981', 'step': 994, 'epoch': 2} +{'type': 'loss', 'content': 0.034199006855487823, 'timestamp': '2025-09-30 22:11:16.563770', 'step': 995, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:16.626143', 'step': 995, 'epoch': 2} +{'type': 'loss', 'content': 0.03138425573706627, 'timestamp': '2025-09-30 22:11:16.633281', 'step': 996, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:16.699866', 'step': 996, 'epoch': 2} +{'type': 'loss', 'content': 0.01781405135989189, 'timestamp': '2025-09-30 22:11:16.711858', 'step': 997, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:16.774422', 'step': 997, 'epoch': 2} +{'type': 'loss', 'content': 0.01842852309346199, 'timestamp': '2025-09-30 22:11:16.778088', 'step': 998, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:16.837439', 'step': 998, 'epoch': 2} +{'type': 'loss', 'content': 0.015847353264689445, 'timestamp': '2025-09-30 22:11:16.840628', 'step': 999, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:16.905813', 'step': 999, 'epoch': 2} +{'type': 'loss', 'content': 0.023063872009515762, 'timestamp': '2025-09-30 22:11:16.922447', 'step': 1000, 'epoch': 2} +{'type': 'info', 'content': 'Checkpoint saved at step 1000', 'timestamp': '2025-09-30 22:11:17.342366', 'step': 1000, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:17.406168', 'step': 1000, 'epoch': 2} +{'type': 'loss', 'content': 0.01709248684346676, 'timestamp': '2025-09-30 22:11:17.410020', 'step': 1001, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:17.475350', 'step': 1001, 'epoch': 2} +{'type': 'loss', 'content': 0.020289698615670204, 'timestamp': '2025-09-30 22:11:17.479027', 'step': 1002, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:17.537016', 'step': 1002, 'epoch': 2} +{'type': 'loss', 'content': 0.008975411765277386, 'timestamp': '2025-09-30 22:11:17.540174', 'step': 1003, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:17.612218', 'step': 1003, 'epoch': 2} +{'type': 'loss', 'content': 0.008156144060194492, 'timestamp': '2025-09-30 22:11:17.625039', 'step': 1004, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:17.681030', 'step': 1004, 'epoch': 2} +{'type': 'loss', 'content': 0.01333966851234436, 'timestamp': '2025-09-30 22:11:17.684917', 'step': 1005, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:17.739488', 'step': 1005, 'epoch': 2} +{'type': 'loss', 'content': 0.017511749640107155, 'timestamp': '2025-09-30 22:11:17.749395', 'step': 1006, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:17.812097', 'step': 1006, 'epoch': 2} +{'type': 'loss', 'content': 0.015036171302199364, 'timestamp': '2025-09-30 22:11:17.814729', 'step': 1007, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:17.870282', 'step': 1007, 'epoch': 2} +{'type': 'loss', 'content': 0.027377020567655563, 'timestamp': '2025-09-30 22:11:17.884203', 'step': 1008, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:17.939068', 'step': 1008, 'epoch': 2} +{'type': 'loss', 'content': 0.025792699307203293, 'timestamp': '2025-09-30 22:11:17.941916', 'step': 1009, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:17.997774', 'step': 1009, 'epoch': 2} +{'type': 'loss', 'content': 0.014380039647221565, 'timestamp': '2025-09-30 22:11:18.000089', 'step': 1010, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.061728', 'step': 1010, 'epoch': 2} +{'type': 'loss', 'content': 0.010104767978191376, 'timestamp': '2025-09-30 22:11:18.065231', 'step': 1011, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.119492', 'step': 1011, 'epoch': 2} +{'type': 'loss', 'content': 0.026348626241087914, 'timestamp': '2025-09-30 22:11:18.136369', 'step': 1012, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.198516', 'step': 1012, 'epoch': 2} +{'type': 'loss', 'content': 0.04897860437631607, 'timestamp': '2025-09-30 22:11:18.202468', 'step': 1013, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.256848', 'step': 1013, 'epoch': 2} +{'type': 'loss', 'content': 0.022462058812379837, 'timestamp': '2025-09-30 22:11:18.260532', 'step': 1014, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.315123', 'step': 1014, 'epoch': 2} +{'type': 'loss', 'content': 0.01851494610309601, 'timestamp': '2025-09-30 22:11:18.318588', 'step': 1015, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.372184', 'step': 1015, 'epoch': 2} +{'type': 'loss', 'content': 0.01477868389338255, 'timestamp': '2025-09-30 22:11:18.380950', 'step': 1016, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.435393', 'step': 1016, 'epoch': 2} +{'type': 'loss', 'content': 0.024721411988139153, 'timestamp': '2025-09-30 22:11:18.439119', 'step': 1017, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.494363', 'step': 1017, 'epoch': 2} +{'type': 'loss', 'content': 0.01894466020166874, 'timestamp': '2025-09-30 22:11:18.497219', 'step': 1018, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.558038', 'step': 1018, 'epoch': 2} +{'type': 'loss', 'content': 0.02108973078429699, 'timestamp': '2025-09-30 22:11:18.562002', 'step': 1019, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.626079', 'step': 1019, 'epoch': 2} +{'type': 'loss', 'content': 0.014491016045212746, 'timestamp': '2025-09-30 22:11:18.639370', 'step': 1020, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.695578', 'step': 1020, 'epoch': 2} +{'type': 'loss', 'content': 0.01586849056184292, 'timestamp': '2025-09-30 22:11:18.704763', 'step': 1021, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.758234', 'step': 1021, 'epoch': 2} +{'type': 'loss', 'content': 0.024214720353484154, 'timestamp': '2025-09-30 22:11:18.761605', 'step': 1022, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.815061', 'step': 1022, 'epoch': 2} +{'type': 'loss', 'content': 0.01589794084429741, 'timestamp': '2025-09-30 22:11:18.820225', 'step': 1023, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:18.875416', 'step': 1023, 'epoch': 2} +{'type': 'loss', 'content': 0.013331546448171139, 'timestamp': '2025-09-30 22:11:18.882243', 'step': 1024, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:18.936663', 'step': 1024, 'epoch': 2} +{'type': 'loss', 'content': 0.024225672706961632, 'timestamp': '2025-09-30 22:11:18.940125', 'step': 1025, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:18.996413', 'step': 1025, 'epoch': 2} +{'type': 'loss', 'content': 0.025553515180945396, 'timestamp': '2025-09-30 22:11:19.013028', 'step': 1026, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:20.209695', 'step': 1026, 'epoch': 2} +{'type': 'pplx', 'content': 28815333.87535932, 'timestamp': '2025-09-30 22:11:20.215325', 'step': 1026, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:11:20.269869', 'step': 1026, 'epoch': 2} +{'type': 'loss', 'content': 0.03137680143117905, 'timestamp': '2025-09-30 22:11:20.275244', 'step': 1027, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:20.330426', 'step': 1027, 'epoch': 2} +{'type': 'loss', 'content': 0.012752371840178967, 'timestamp': '2025-09-30 22:11:20.338824', 'step': 1028, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:20.392801', 'step': 1028, 'epoch': 2} +{'type': 'loss', 'content': 0.01019949372857809, 'timestamp': '2025-09-30 22:11:20.395750', 'step': 1029, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:20.449777', 'step': 1029, 'epoch': 2} +{'type': 'loss', 'content': 0.00877032708376646, 'timestamp': '2025-09-30 22:11:20.454485', 'step': 1030, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:20.510207', 'step': 1030, 'epoch': 2} +{'type': 'loss', 'content': 0.011759008280932903, 'timestamp': '2025-09-30 22:11:20.513472', 'step': 1031, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:20.567688', 'step': 1031, 'epoch': 2} +{'type': 'loss', 'content': 0.011547918431460857, 'timestamp': '2025-09-30 22:11:20.574407', 'step': 1032, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:20.629320', 'step': 1032, 'epoch': 2} +{'type': 'loss', 'content': 0.010614079423248768, 'timestamp': '2025-09-30 22:11:20.631833', 'step': 1033, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:20.693831', 'step': 1033, 'epoch': 2} +{'type': 'loss', 'content': 0.021462971344590187, 'timestamp': '2025-09-30 22:11:20.698081', 'step': 1034, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:20.752860', 'step': 1034, 'epoch': 2} +{'type': 'loss', 'content': 0.012604753486812115, 'timestamp': '2025-09-30 22:11:20.755848', 'step': 1035, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:20.811454', 'step': 1035, 'epoch': 2} +{'type': 'loss', 'content': 0.0071957902982831, 'timestamp': '2025-09-30 22:11:20.818646', 'step': 1036, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:20.871794', 'step': 1036, 'epoch': 2} +{'type': 'loss', 'content': 0.007674416061490774, 'timestamp': '2025-09-30 22:11:20.875567', 'step': 1037, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:20.930788', 'step': 1037, 'epoch': 2} +{'type': 'loss', 'content': 0.030532391741871834, 'timestamp': '2025-09-30 22:11:20.935486', 'step': 1038, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:20.997287', 'step': 1038, 'epoch': 2} +{'type': 'loss', 'content': 0.020820874720811844, 'timestamp': '2025-09-30 22:11:21.000735', 'step': 1039, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:21.056898', 'step': 1039, 'epoch': 2} +{'type': 'loss', 'content': 0.02545594982802868, 'timestamp': '2025-09-30 22:11:21.074256', 'step': 1040, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:21.132305', 'step': 1040, 'epoch': 2} +{'type': 'loss', 'content': 0.009167616255581379, 'timestamp': '2025-09-30 22:11:21.135789', 'step': 1041, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:21.193450', 'step': 1041, 'epoch': 2} +{'type': 'loss', 'content': 0.009028018452227116, 'timestamp': '2025-09-30 22:11:21.197632', 'step': 1042, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:21.257441', 'step': 1042, 'epoch': 2} +{'type': 'loss', 'content': 0.05561920627951622, 'timestamp': '2025-09-30 22:11:21.261241', 'step': 1043, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:21.317907', 'step': 1043, 'epoch': 2} +{'type': 'loss', 'content': 0.010370013304054737, 'timestamp': '2025-09-30 22:11:21.325239', 'step': 1044, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:21.389422', 'step': 1044, 'epoch': 2} +{'type': 'loss', 'content': 0.00789992231875658, 'timestamp': '2025-09-30 22:11:21.396542', 'step': 1045, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:21.458698', 'step': 1045, 'epoch': 2} +{'type': 'loss', 'content': 0.015907449647784233, 'timestamp': '2025-09-30 22:11:21.461609', 'step': 1046, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:21.522637', 'step': 1046, 'epoch': 2} +{'type': 'loss', 'content': 0.03674934431910515, 'timestamp': '2025-09-30 22:11:21.525264', 'step': 1047, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:21.583737', 'step': 1047, 'epoch': 2} +{'type': 'loss', 'content': 0.01920946128666401, 'timestamp': '2025-09-30 22:11:21.590229', 'step': 1048, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:21.645190', 'step': 1048, 'epoch': 2} +{'type': 'loss', 'content': 0.009556346572935581, 'timestamp': '2025-09-30 22:11:21.653972', 'step': 1049, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:21.718981', 'step': 1049, 'epoch': 2} +{'type': 'loss', 'content': 0.007382354233413935, 'timestamp': '2025-09-30 22:11:21.723397', 'step': 1050, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:21.787348', 'step': 1050, 'epoch': 2} +{'type': 'loss', 'content': 0.013103391043841839, 'timestamp': '2025-09-30 22:11:21.796826', 'step': 1051, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:21.859537', 'step': 1051, 'epoch': 2} +{'type': 'loss', 'content': 0.012127166613936424, 'timestamp': '2025-09-30 22:11:21.870499', 'step': 1052, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:21.929892', 'step': 1052, 'epoch': 2} +{'type': 'loss', 'content': 0.015366926789283752, 'timestamp': '2025-09-30 22:11:21.938332', 'step': 1053, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:21.999245', 'step': 1053, 'epoch': 2} +{'type': 'loss', 'content': 0.014787280932068825, 'timestamp': '2025-09-30 22:11:22.008025', 'step': 1054, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:22.068794', 'step': 1054, 'epoch': 2} +{'type': 'loss', 'content': 0.03266071528196335, 'timestamp': '2025-09-30 22:11:22.072613', 'step': 1055, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:22.132559', 'step': 1055, 'epoch': 2} +{'type': 'loss', 'content': 0.02072693593800068, 'timestamp': '2025-09-30 22:11:22.144141', 'step': 1056, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:22.204106', 'step': 1056, 'epoch': 2} +{'type': 'loss', 'content': 0.011774016544222832, 'timestamp': '2025-09-30 22:11:22.213022', 'step': 1057, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:22.273981', 'step': 1057, 'epoch': 2} +{'type': 'loss', 'content': 0.016940701752901077, 'timestamp': '2025-09-30 22:11:22.279224', 'step': 1058, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:22.337089', 'step': 1058, 'epoch': 2} +{'type': 'loss', 'content': 0.022047163918614388, 'timestamp': '2025-09-30 22:11:22.347032', 'step': 1059, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:22.420003', 'step': 1059, 'epoch': 2} +{'type': 'loss', 'content': 0.011164650321006775, 'timestamp': '2025-09-30 22:11:22.433830', 'step': 1060, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:22.504073', 'step': 1060, 'epoch': 2} +{'type': 'loss', 'content': 0.011306509375572205, 'timestamp': '2025-09-30 22:11:22.506988', 'step': 1061, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:22.569632', 'step': 1061, 'epoch': 2} +{'type': 'loss', 'content': 0.012297945097088814, 'timestamp': '2025-09-30 22:11:22.578635', 'step': 1062, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:22.642090', 'step': 1062, 'epoch': 2} +{'type': 'loss', 'content': 0.005849027074873447, 'timestamp': '2025-09-30 22:11:22.645365', 'step': 1063, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:22.702258', 'step': 1063, 'epoch': 2} +{'type': 'loss', 'content': 0.009956971742212772, 'timestamp': '2025-09-30 22:11:22.708813', 'step': 1064, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:22.767276', 'step': 1064, 'epoch': 2} +{'type': 'loss', 'content': 0.02906261757016182, 'timestamp': '2025-09-30 22:11:22.770910', 'step': 1065, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:22.831779', 'step': 1065, 'epoch': 2} +{'type': 'loss', 'content': 0.034898433834314346, 'timestamp': '2025-09-30 22:11:22.842828', 'step': 1066, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:22.919431', 'step': 1066, 'epoch': 2} +{'type': 'loss', 'content': 0.02677224576473236, 'timestamp': '2025-09-30 22:11:22.922878', 'step': 1067, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:22.981516', 'step': 1067, 'epoch': 2} +{'type': 'loss', 'content': 0.011528550647199154, 'timestamp': '2025-09-30 22:11:22.990009', 'step': 1068, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:23.046267', 'step': 1068, 'epoch': 2} +{'type': 'loss', 'content': 0.005817048251628876, 'timestamp': '2025-09-30 22:11:23.054922', 'step': 1069, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:23.110207', 'step': 1069, 'epoch': 2} +{'type': 'loss', 'content': 0.014719395898282528, 'timestamp': '2025-09-30 22:11:23.113003', 'step': 1070, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:23.170356', 'step': 1070, 'epoch': 2} +{'type': 'loss', 'content': 0.02249346859753132, 'timestamp': '2025-09-30 22:11:23.173044', 'step': 1071, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:23.226950', 'step': 1071, 'epoch': 2} +{'type': 'loss', 'content': 0.035828422755002975, 'timestamp': '2025-09-30 22:11:23.233136', 'step': 1072, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:23.286002', 'step': 1072, 'epoch': 2} +{'type': 'loss', 'content': 0.010695637203752995, 'timestamp': '2025-09-30 22:11:23.288722', 'step': 1073, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:23.345359', 'step': 1073, 'epoch': 2} +{'type': 'loss', 'content': 0.0401848740875721, 'timestamp': '2025-09-30 22:11:23.352471', 'step': 1074, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:23.413270', 'step': 1074, 'epoch': 2} +{'type': 'loss', 'content': 0.05746329203248024, 'timestamp': '2025-09-30 22:11:23.416088', 'step': 1075, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:23.479871', 'step': 1075, 'epoch': 2} +{'type': 'loss', 'content': 0.04341733828186989, 'timestamp': '2025-09-30 22:11:23.494593', 'step': 1076, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:23.552516', 'step': 1076, 'epoch': 2} +{'type': 'loss', 'content': 0.02323761023581028, 'timestamp': '2025-09-30 22:11:23.555673', 'step': 1077, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:23.610535', 'step': 1077, 'epoch': 2} +{'type': 'loss', 'content': 0.02646273747086525, 'timestamp': '2025-09-30 22:11:23.617313', 'step': 1078, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:23.679251', 'step': 1078, 'epoch': 2} +{'type': 'loss', 'content': 0.017116302624344826, 'timestamp': '2025-09-30 22:11:23.684229', 'step': 1079, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:23.739927', 'step': 1079, 'epoch': 2} +{'type': 'loss', 'content': 0.018749186769127846, 'timestamp': '2025-09-30 22:11:23.757600', 'step': 1080, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:23.813780', 'step': 1080, 'epoch': 2} +{'type': 'loss', 'content': 0.025141606107354164, 'timestamp': '2025-09-30 22:11:23.828231', 'step': 1081, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:23.890960', 'step': 1081, 'epoch': 2} +{'type': 'loss', 'content': 0.026854228228330612, 'timestamp': '2025-09-30 22:11:23.895100', 'step': 1082, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:23.953254', 'step': 1082, 'epoch': 2} +{'type': 'loss', 'content': 0.019390176981687546, 'timestamp': '2025-09-30 22:11:23.956489', 'step': 1083, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:25.186778', 'step': 1083, 'epoch': 2} +{'type': 'pplx', 'content': 30432312.81053753, 'timestamp': '2025-09-30 22:11:25.198564', 'step': 1083, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:25.261335', 'step': 1083, 'epoch': 2} +{'type': 'loss', 'content': 0.01611759141087532, 'timestamp': '2025-09-30 22:11:25.269310', 'step': 1084, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:25.325196', 'step': 1084, 'epoch': 2} +{'type': 'loss', 'content': 0.006585521157830954, 'timestamp': '2025-09-30 22:11:25.329013', 'step': 1085, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:25.389748', 'step': 1085, 'epoch': 2} +{'type': 'loss', 'content': 0.01776345819234848, 'timestamp': '2025-09-30 22:11:25.393830', 'step': 1086, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:25.449930', 'step': 1086, 'epoch': 2} +{'type': 'loss', 'content': 0.02161712571978569, 'timestamp': '2025-09-30 22:11:25.462318', 'step': 1087, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:25.525802', 'step': 1087, 'epoch': 2} +{'type': 'loss', 'content': 0.008740507997572422, 'timestamp': '2025-09-30 22:11:25.533504', 'step': 1088, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:25.591122', 'step': 1088, 'epoch': 2} +{'type': 'loss', 'content': 0.03131970018148422, 'timestamp': '2025-09-30 22:11:25.595109', 'step': 1089, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:25.649810', 'step': 1089, 'epoch': 2} +{'type': 'loss', 'content': 0.021836699917912483, 'timestamp': '2025-09-30 22:11:25.652709', 'step': 1090, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:25.715906', 'step': 1090, 'epoch': 2} +{'type': 'loss', 'content': 0.02508729137480259, 'timestamp': '2025-09-30 22:11:25.727891', 'step': 1091, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:25.791875', 'step': 1091, 'epoch': 2} +{'type': 'loss', 'content': 0.013201175257563591, 'timestamp': '2025-09-30 22:11:25.799459', 'step': 1092, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:25.854161', 'step': 1092, 'epoch': 2} +{'type': 'loss', 'content': 0.025001246482133865, 'timestamp': '2025-09-30 22:11:25.860494', 'step': 1093, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:25.918284', 'step': 1093, 'epoch': 2} +{'type': 'loss', 'content': 0.018011152744293213, 'timestamp': '2025-09-30 22:11:25.922173', 'step': 1094, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:25.977889', 'step': 1094, 'epoch': 2} +{'type': 'loss', 'content': 0.015894226729869843, 'timestamp': '2025-09-30 22:11:25.980986', 'step': 1095, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:26.037531', 'step': 1095, 'epoch': 2} +{'type': 'loss', 'content': 0.008476397953927517, 'timestamp': '2025-09-30 22:11:26.044321', 'step': 1096, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:26.097394', 'step': 1096, 'epoch': 2} +{'type': 'loss', 'content': 0.017200523987412453, 'timestamp': '2025-09-30 22:11:26.100930', 'step': 1097, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:26.158557', 'step': 1097, 'epoch': 2} +{'type': 'loss', 'content': 0.020911503583192825, 'timestamp': '2025-09-30 22:11:26.161645', 'step': 1098, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:26.216231', 'step': 1098, 'epoch': 2} +{'type': 'loss', 'content': 0.0157220046967268, 'timestamp': '2025-09-30 22:11:26.228058', 'step': 1099, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:26.283086', 'step': 1099, 'epoch': 2} +{'type': 'loss', 'content': 0.020792776718735695, 'timestamp': '2025-09-30 22:11:26.291051', 'step': 1100, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:26.345330', 'step': 1100, 'epoch': 2} +{'type': 'loss', 'content': 0.026384010910987854, 'timestamp': '2025-09-30 22:11:26.349217', 'step': 1101, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:26.404260', 'step': 1101, 'epoch': 2} +{'type': 'loss', 'content': 0.00842324923723936, 'timestamp': '2025-09-30 22:11:26.408124', 'step': 1102, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:26.468374', 'step': 1102, 'epoch': 2} +{'type': 'loss', 'content': 0.018321281298995018, 'timestamp': '2025-09-30 22:11:26.480784', 'step': 1103, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:26.545071', 'step': 1103, 'epoch': 2} +{'type': 'loss', 'content': 0.016511019319295883, 'timestamp': '2025-09-30 22:11:26.556678', 'step': 1104, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:26.610703', 'step': 1104, 'epoch': 2} +{'type': 'loss', 'content': 0.010460903868079185, 'timestamp': '2025-09-30 22:11:26.622778', 'step': 1105, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:26.687316', 'step': 1105, 'epoch': 2} +{'type': 'loss', 'content': 0.00898995902389288, 'timestamp': '2025-09-30 22:11:26.690735', 'step': 1106, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:26.759562', 'step': 1106, 'epoch': 2} +{'type': 'loss', 'content': 0.027517573907971382, 'timestamp': '2025-09-30 22:11:26.764616', 'step': 1107, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:26.821596', 'step': 1107, 'epoch': 2} +{'type': 'loss', 'content': 0.021612342447042465, 'timestamp': '2025-09-30 22:11:26.839263', 'step': 1108, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:26.895507', 'step': 1108, 'epoch': 2} +{'type': 'loss', 'content': 0.009821252897381783, 'timestamp': '2025-09-30 22:11:26.899450', 'step': 1109, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:26.955573', 'step': 1109, 'epoch': 2} +{'type': 'loss', 'content': 0.018084069713950157, 'timestamp': '2025-09-30 22:11:26.958415', 'step': 1110, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:27.012521', 'step': 1110, 'epoch': 2} +{'type': 'loss', 'content': 0.014403236098587513, 'timestamp': '2025-09-30 22:11:27.016372', 'step': 1111, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:27.070513', 'step': 1111, 'epoch': 2} +{'type': 'loss', 'content': 0.02946326695382595, 'timestamp': '2025-09-30 22:11:27.085522', 'step': 1112, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:27.139790', 'step': 1112, 'epoch': 2} +{'type': 'loss', 'content': 0.01968367025256157, 'timestamp': '2025-09-30 22:11:27.143655', 'step': 1113, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:27.200252', 'step': 1113, 'epoch': 2} +{'type': 'loss', 'content': 0.00848131999373436, 'timestamp': '2025-09-30 22:11:27.204535', 'step': 1114, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:27.268243', 'step': 1114, 'epoch': 2} +{'type': 'loss', 'content': 0.021036352962255478, 'timestamp': '2025-09-30 22:11:27.271299', 'step': 1115, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:27.325982', 'step': 1115, 'epoch': 2} +{'type': 'loss', 'content': 0.023403389379382133, 'timestamp': '2025-09-30 22:11:27.332962', 'step': 1116, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:27.396211', 'step': 1116, 'epoch': 2} +{'type': 'loss', 'content': 0.023917311802506447, 'timestamp': '2025-09-30 22:11:27.401426', 'step': 1117, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:27.455716', 'step': 1117, 'epoch': 2} +{'type': 'loss', 'content': 0.013939259573817253, 'timestamp': '2025-09-30 22:11:27.458932', 'step': 1118, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:27.515666', 'step': 1118, 'epoch': 2} +{'type': 'loss', 'content': 0.0158989354968071, 'timestamp': '2025-09-30 22:11:27.519650', 'step': 1119, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:27.575322', 'step': 1119, 'epoch': 2} +{'type': 'loss', 'content': 0.02595258131623268, 'timestamp': '2025-09-30 22:11:27.583748', 'step': 1120, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:27.640282', 'step': 1120, 'epoch': 2} +{'type': 'loss', 'content': 0.021582497283816338, 'timestamp': '2025-09-30 22:11:27.644484', 'step': 1121, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:27.699420', 'step': 1121, 'epoch': 2} +{'type': 'loss', 'content': 0.005752278957515955, 'timestamp': '2025-09-30 22:11:27.704823', 'step': 1122, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:27.765007', 'step': 1122, 'epoch': 2} +{'type': 'loss', 'content': 0.010556980967521667, 'timestamp': '2025-09-30 22:11:27.767848', 'step': 1123, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:27.823206', 'step': 1123, 'epoch': 2} +{'type': 'loss', 'content': 0.01841617561876774, 'timestamp': '2025-09-30 22:11:27.835200', 'step': 1124, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:27.898988', 'step': 1124, 'epoch': 2} +{'type': 'loss', 'content': 0.00747791538015008, 'timestamp': '2025-09-30 22:11:27.903636', 'step': 1125, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:27.958240', 'step': 1125, 'epoch': 2} +{'type': 'loss', 'content': 0.023193703964352608, 'timestamp': '2025-09-30 22:11:27.969994', 'step': 1126, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:28.033276', 'step': 1126, 'epoch': 2} +{'type': 'loss', 'content': 0.03435632213950157, 'timestamp': '2025-09-30 22:11:28.037037', 'step': 1127, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:28.092858', 'step': 1127, 'epoch': 2} +{'type': 'loss', 'content': 0.006761268712580204, 'timestamp': '2025-09-30 22:11:28.109904', 'step': 1128, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:28.175862', 'step': 1128, 'epoch': 2} +{'type': 'loss', 'content': 0.055748604238033295, 'timestamp': '2025-09-30 22:11:28.189431', 'step': 1129, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:28.252544', 'step': 1129, 'epoch': 2} +{'type': 'loss', 'content': 0.035190846771001816, 'timestamp': '2025-09-30 22:11:28.257262', 'step': 1130, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:28.322185', 'step': 1130, 'epoch': 2} +{'type': 'loss', 'content': 0.01312336977571249, 'timestamp': '2025-09-30 22:11:28.334537', 'step': 1131, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:28.400136', 'step': 1131, 'epoch': 2} +{'type': 'loss', 'content': 0.01045858021825552, 'timestamp': '2025-09-30 22:11:28.409011', 'step': 1132, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:28.471113', 'step': 1132, 'epoch': 2} +{'type': 'loss', 'content': 0.028751153498888016, 'timestamp': '2025-09-30 22:11:28.483658', 'step': 1133, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:28.550451', 'step': 1133, 'epoch': 2} +{'type': 'loss', 'content': 0.01273017842322588, 'timestamp': '2025-09-30 22:11:28.563256', 'step': 1134, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:28.619462', 'step': 1134, 'epoch': 2} +{'type': 'loss', 'content': 0.0197049081325531, 'timestamp': '2025-09-30 22:11:28.623295', 'step': 1135, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:28.688139', 'step': 1135, 'epoch': 2} +{'type': 'loss', 'content': 0.007282760459929705, 'timestamp': '2025-09-30 22:11:28.695520', 'step': 1136, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:28.766569', 'step': 1136, 'epoch': 2} +{'type': 'loss', 'content': 0.007011772133409977, 'timestamp': '2025-09-30 22:11:28.778955', 'step': 1137, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:28.834229', 'step': 1137, 'epoch': 2} +{'type': 'loss', 'content': 0.01817786693572998, 'timestamp': '2025-09-30 22:11:28.838713', 'step': 1138, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:28.894352', 'step': 1138, 'epoch': 2} +{'type': 'loss', 'content': 0.009522279724478722, 'timestamp': '2025-09-30 22:11:28.904447', 'step': 1139, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:28.958831', 'step': 1139, 'epoch': 2} +{'type': 'loss', 'content': 0.015000018291175365, 'timestamp': '2025-09-30 22:11:28.976470', 'step': 1140, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:30.202755', 'step': 1140, 'epoch': 2} +{'type': 'pplx', 'content': 31761433.22399976, 'timestamp': '2025-09-30 22:11:30.207738', 'step': 1140, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:30.261288', 'step': 1140, 'epoch': 2} +{'type': 'loss', 'content': 0.023212244734168053, 'timestamp': '2025-09-30 22:11:30.264791', 'step': 1141, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:30.320013', 'step': 1141, 'epoch': 2} +{'type': 'loss', 'content': 0.005753274541348219, 'timestamp': '2025-09-30 22:11:30.324165', 'step': 1142, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:30.383960', 'step': 1142, 'epoch': 2} +{'type': 'loss', 'content': 0.016708120703697205, 'timestamp': '2025-09-30 22:11:30.388811', 'step': 1143, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:30.453550', 'step': 1143, 'epoch': 2} +{'type': 'loss', 'content': 0.03149080649018288, 'timestamp': '2025-09-30 22:11:30.460546', 'step': 1144, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:30.515308', 'step': 1144, 'epoch': 2} +{'type': 'loss', 'content': 0.0065864152275025845, 'timestamp': '2025-09-30 22:11:30.519706', 'step': 1145, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:30.575127', 'step': 1145, 'epoch': 2} +{'type': 'loss', 'content': 0.015503957867622375, 'timestamp': '2025-09-30 22:11:30.578397', 'step': 1146, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:30.633640', 'step': 1146, 'epoch': 2} +{'type': 'loss', 'content': 0.011548278853297234, 'timestamp': '2025-09-30 22:11:30.639347', 'step': 1147, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:30.695135', 'step': 1147, 'epoch': 2} +{'type': 'loss', 'content': 0.014130154624581337, 'timestamp': '2025-09-30 22:11:30.710275', 'step': 1148, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:30.764519', 'step': 1148, 'epoch': 2} +{'type': 'loss', 'content': 0.022189956158399582, 'timestamp': '2025-09-30 22:11:30.769789', 'step': 1149, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:30.824056', 'step': 1149, 'epoch': 2} +{'type': 'loss', 'content': 0.01809052750468254, 'timestamp': '2025-09-30 22:11:30.828934', 'step': 1150, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:30.884363', 'step': 1150, 'epoch': 2} +{'type': 'loss', 'content': 0.0377984382212162, 'timestamp': '2025-09-30 22:11:30.888947', 'step': 1151, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:30.943518', 'step': 1151, 'epoch': 2} +{'type': 'loss', 'content': 0.014366830699145794, 'timestamp': '2025-09-30 22:11:30.953335', 'step': 1152, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:31.007608', 'step': 1152, 'epoch': 2} +{'type': 'loss', 'content': 0.019895801320672035, 'timestamp': '2025-09-30 22:11:31.017945', 'step': 1153, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:31.079661', 'step': 1153, 'epoch': 2} +{'type': 'loss', 'content': 0.01277296245098114, 'timestamp': '2025-09-30 22:11:31.086770', 'step': 1154, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:31.140713', 'step': 1154, 'epoch': 2} +{'type': 'loss', 'content': 0.027852704748511314, 'timestamp': '2025-09-30 22:11:31.146979', 'step': 1155, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:31.200899', 'step': 1155, 'epoch': 2} +{'type': 'loss', 'content': 0.01802685298025608, 'timestamp': '2025-09-30 22:11:31.207518', 'step': 1156, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:31.261180', 'step': 1156, 'epoch': 2} +{'type': 'loss', 'content': 0.0185939259827137, 'timestamp': '2025-09-30 22:11:31.264827', 'step': 1157, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:31.321946', 'step': 1157, 'epoch': 2} +{'type': 'loss', 'content': 0.00952514261007309, 'timestamp': '2025-09-30 22:11:31.325302', 'step': 1158, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:31.380426', 'step': 1158, 'epoch': 2} +{'type': 'loss', 'content': 0.014004341326653957, 'timestamp': '2025-09-30 22:11:31.383957', 'step': 1159, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:31.437605', 'step': 1159, 'epoch': 2} +{'type': 'loss', 'content': 0.011020747944712639, 'timestamp': '2025-09-30 22:11:31.444549', 'step': 1160, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:31.508814', 'step': 1160, 'epoch': 2} +{'type': 'loss', 'content': 0.007963932119309902, 'timestamp': '2025-09-30 22:11:31.512980', 'step': 1161, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:31.568675', 'step': 1161, 'epoch': 2} +{'type': 'loss', 'content': 0.029965534806251526, 'timestamp': '2025-09-30 22:11:31.574911', 'step': 1162, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:31.644846', 'step': 1162, 'epoch': 2} +{'type': 'loss', 'content': 0.012620776891708374, 'timestamp': '2025-09-30 22:11:31.658105', 'step': 1163, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:31.715055', 'step': 1163, 'epoch': 2} +{'type': 'loss', 'content': 0.017773713916540146, 'timestamp': '2025-09-30 22:11:31.722580', 'step': 1164, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:31.776212', 'step': 1164, 'epoch': 2} +{'type': 'loss', 'content': 0.015509136021137238, 'timestamp': '2025-09-30 22:11:31.780116', 'step': 1165, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:31.834854', 'step': 1165, 'epoch': 2} +{'type': 'loss', 'content': 0.01890859194099903, 'timestamp': '2025-09-30 22:11:31.837981', 'step': 1166, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:31.904513', 'step': 1166, 'epoch': 2} +{'type': 'loss', 'content': 0.010066986083984375, 'timestamp': '2025-09-30 22:11:31.911328', 'step': 1167, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:31.965873', 'step': 1167, 'epoch': 2} +{'type': 'loss', 'content': 0.03768039494752884, 'timestamp': '2025-09-30 22:11:31.974127', 'step': 1168, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:32.038694', 'step': 1168, 'epoch': 2} +{'type': 'loss', 'content': 0.01412307471036911, 'timestamp': '2025-09-30 22:11:32.042393', 'step': 1169, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:32.097717', 'step': 1169, 'epoch': 2} +{'type': 'loss', 'content': 0.005952424369752407, 'timestamp': '2025-09-30 22:11:32.105721', 'step': 1170, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:32.162544', 'step': 1170, 'epoch': 2} +{'type': 'loss', 'content': 0.02057734504342079, 'timestamp': '2025-09-30 22:11:32.166457', 'step': 1171, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:32.224240', 'step': 1171, 'epoch': 2} +{'type': 'loss', 'content': 0.008706753142178059, 'timestamp': '2025-09-30 22:11:32.231341', 'step': 1172, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:11:32.285741', 'step': 1172, 'epoch': 2} +{'type': 'loss', 'content': 0.011364555917680264, 'timestamp': '2025-09-30 22:11:32.289356', 'step': 1173, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:32.355027', 'step': 1173, 'epoch': 2} +{'type': 'loss', 'content': 0.008187648840248585, 'timestamp': '2025-09-30 22:11:32.369898', 'step': 1174, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:32.433314', 'step': 1174, 'epoch': 2} +{'type': 'loss', 'content': 0.01807296834886074, 'timestamp': '2025-09-30 22:11:32.445908', 'step': 1175, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:32.500680', 'step': 1175, 'epoch': 2} +{'type': 'loss', 'content': 0.009041151963174343, 'timestamp': '2025-09-30 22:11:32.517829', 'step': 1176, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:32.575103', 'step': 1176, 'epoch': 2} +{'type': 'loss', 'content': 0.022169558331370354, 'timestamp': '2025-09-30 22:11:32.587014', 'step': 1177, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:32.661750', 'step': 1177, 'epoch': 2} +{'type': 'loss', 'content': 0.009721535257995129, 'timestamp': '2025-09-30 22:11:32.674137', 'step': 1178, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:32.737930', 'step': 1178, 'epoch': 2} +{'type': 'loss', 'content': 0.029778840020298958, 'timestamp': '2025-09-30 22:11:32.741818', 'step': 1179, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:32.799497', 'step': 1179, 'epoch': 2} +{'type': 'loss', 'content': 0.012037808075547218, 'timestamp': '2025-09-30 22:11:32.808244', 'step': 1180, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:32.863512', 'step': 1180, 'epoch': 2} +{'type': 'loss', 'content': 0.028365235775709152, 'timestamp': '2025-09-30 22:11:32.875714', 'step': 1181, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:32.944132', 'step': 1181, 'epoch': 2} +{'type': 'loss', 'content': 0.03483065962791443, 'timestamp': '2025-09-30 22:11:32.947386', 'step': 1182, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:33.001792', 'step': 1182, 'epoch': 2} +{'type': 'loss', 'content': 0.018574940040707588, 'timestamp': '2025-09-30 22:11:33.005179', 'step': 1183, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:33.062620', 'step': 1183, 'epoch': 2} +{'type': 'loss', 'content': 0.005325800273567438, 'timestamp': '2025-09-30 22:11:33.069033', 'step': 1184, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:33.130798', 'step': 1184, 'epoch': 2} +{'type': 'loss', 'content': 0.020132439211010933, 'timestamp': '2025-09-30 22:11:33.141806', 'step': 1185, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:33.198157', 'step': 1185, 'epoch': 2} +{'type': 'loss', 'content': 0.01631121151149273, 'timestamp': '2025-09-30 22:11:33.210341', 'step': 1186, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:33.273179', 'step': 1186, 'epoch': 2} +{'type': 'loss', 'content': 0.004564606584608555, 'timestamp': '2025-09-30 22:11:33.278063', 'step': 1187, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:33.332605', 'step': 1187, 'epoch': 2} +{'type': 'loss', 'content': 0.014193967916071415, 'timestamp': '2025-09-30 22:11:33.347094', 'step': 1188, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:33.409336', 'step': 1188, 'epoch': 2} +{'type': 'loss', 'content': 0.0075381542555987835, 'timestamp': '2025-09-30 22:11:33.412982', 'step': 1189, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:33.466666', 'step': 1189, 'epoch': 2} +{'type': 'loss', 'content': 0.042488861829042435, 'timestamp': '2025-09-30 22:11:33.471047', 'step': 1190, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:33.534611', 'step': 1190, 'epoch': 2} +{'type': 'loss', 'content': 0.007255176547914743, 'timestamp': '2025-09-30 22:11:33.545876', 'step': 1191, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:33.602630', 'step': 1191, 'epoch': 2} +{'type': 'loss', 'content': 0.02486496791243553, 'timestamp': '2025-09-30 22:11:33.611199', 'step': 1192, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:33.673991', 'step': 1192, 'epoch': 2} +{'type': 'loss', 'content': 0.018168171867728233, 'timestamp': '2025-09-30 22:11:33.679538', 'step': 1193, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:33.735190', 'step': 1193, 'epoch': 2} +{'type': 'loss', 'content': 0.0016164934495463967, 'timestamp': '2025-09-30 22:11:33.740699', 'step': 1194, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:33.799993', 'step': 1194, 'epoch': 2} +{'type': 'loss', 'content': 0.0472748838365078, 'timestamp': '2025-09-30 22:11:33.813257', 'step': 1195, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:33.875956', 'step': 1195, 'epoch': 2} +{'type': 'loss', 'content': 0.018774310126900673, 'timestamp': '2025-09-30 22:11:33.883590', 'step': 1196, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:33.938359', 'step': 1196, 'epoch': 2} +{'type': 'loss', 'content': 0.054423097521066666, 'timestamp': '2025-09-30 22:11:33.949340', 'step': 1197, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:35.187868', 'step': 1197, 'epoch': 2} +{'type': 'pplx', 'content': 32067022.826058734, 'timestamp': '2025-09-30 22:11:35.192378', 'step': 1197, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:35.253335', 'step': 1197, 'epoch': 2} +{'type': 'loss', 'content': 0.007513918448239565, 'timestamp': '2025-09-30 22:11:35.256670', 'step': 1198, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:35.314883', 'step': 1198, 'epoch': 2} +{'type': 'loss', 'content': 0.014755907468497753, 'timestamp': '2025-09-30 22:11:35.318113', 'step': 1199, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:35.378580', 'step': 1199, 'epoch': 2} +{'type': 'loss', 'content': 0.021854715421795845, 'timestamp': '2025-09-30 22:11:35.390404', 'step': 1200, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:35.449304', 'step': 1200, 'epoch': 2} +{'type': 'loss', 'content': 0.041134320199489594, 'timestamp': '2025-09-30 22:11:35.452074', 'step': 1201, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:35.516034', 'step': 1201, 'epoch': 2} +{'type': 'loss', 'content': 0.005287197418510914, 'timestamp': '2025-09-30 22:11:35.520991', 'step': 1202, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:35.577432', 'step': 1202, 'epoch': 2} +{'type': 'loss', 'content': 0.0038279914297163486, 'timestamp': '2025-09-30 22:11:35.581096', 'step': 1203, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:11:35.637823', 'step': 1203, 'epoch': 2} +{'type': 'loss', 'content': 0.003830043599009514, 'timestamp': '2025-09-30 22:11:35.646802', 'step': 1204, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:35.700118', 'step': 1204, 'epoch': 2} +{'type': 'loss', 'content': 0.032837968319654465, 'timestamp': '2025-09-30 22:11:35.704811', 'step': 1205, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:35.760611', 'step': 1205, 'epoch': 2} +{'type': 'loss', 'content': 0.03423704952001572, 'timestamp': '2025-09-30 22:11:35.763826', 'step': 1206, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:35.823186', 'step': 1206, 'epoch': 2} +{'type': 'loss', 'content': 0.008218946866691113, 'timestamp': '2025-09-30 22:11:35.826450', 'step': 1207, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:35.880558', 'step': 1207, 'epoch': 2} +{'type': 'loss', 'content': 0.01721678301692009, 'timestamp': '2025-09-30 22:11:35.887864', 'step': 1208, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:35.941792', 'step': 1208, 'epoch': 2} +{'type': 'loss', 'content': 0.019656702876091003, 'timestamp': '2025-09-30 22:11:35.944512', 'step': 1209, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:35.998526', 'step': 1209, 'epoch': 2} +{'type': 'loss', 'content': 0.029059452936053276, 'timestamp': '2025-09-30 22:11:36.003965', 'step': 1210, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:36.058525', 'step': 1210, 'epoch': 2} +{'type': 'loss', 'content': 0.030055100098252296, 'timestamp': '2025-09-30 22:11:36.061611', 'step': 1211, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:36.117192', 'step': 1211, 'epoch': 2} +{'type': 'loss', 'content': 0.02078065276145935, 'timestamp': '2025-09-30 22:11:36.130008', 'step': 1212, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:36.184580', 'step': 1212, 'epoch': 2} +{'type': 'loss', 'content': 0.010412467643618584, 'timestamp': '2025-09-30 22:11:36.193851', 'step': 1213, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:36.250863', 'step': 1213, 'epoch': 2} +{'type': 'loss', 'content': 0.014479429461061954, 'timestamp': '2025-09-30 22:11:36.254945', 'step': 1214, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:36.312801', 'step': 1214, 'epoch': 2} +{'type': 'loss', 'content': 0.016420889645814896, 'timestamp': '2025-09-30 22:11:36.315643', 'step': 1215, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:36.377191', 'step': 1215, 'epoch': 2} +{'type': 'loss', 'content': 0.02330826409161091, 'timestamp': '2025-09-30 22:11:36.383470', 'step': 1216, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:36.437724', 'step': 1216, 'epoch': 2} +{'type': 'loss', 'content': 0.018506629392504692, 'timestamp': '2025-09-30 22:11:36.450976', 'step': 1217, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:36.505571', 'step': 1217, 'epoch': 2} +{'type': 'loss', 'content': 0.006810951977968216, 'timestamp': '2025-09-30 22:11:36.509502', 'step': 1218, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:36.567290', 'step': 1218, 'epoch': 2} +{'type': 'loss', 'content': 0.014696040190756321, 'timestamp': '2025-09-30 22:11:36.574004', 'step': 1219, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:36.631116', 'step': 1219, 'epoch': 2} +{'type': 'loss', 'content': 0.012312375009059906, 'timestamp': '2025-09-30 22:11:36.637927', 'step': 1220, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:36.694225', 'step': 1220, 'epoch': 2} +{'type': 'loss', 'content': 0.007672054227441549, 'timestamp': '2025-09-30 22:11:36.705527', 'step': 1221, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:36.763351', 'step': 1221, 'epoch': 2} +{'type': 'loss', 'content': 0.008700719103217125, 'timestamp': '2025-09-30 22:11:36.768175', 'step': 1222, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:36.821917', 'step': 1222, 'epoch': 2} +{'type': 'loss', 'content': 0.009297819808125496, 'timestamp': '2025-09-30 22:11:36.824822', 'step': 1223, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:36.879397', 'step': 1223, 'epoch': 2} +{'type': 'loss', 'content': 0.01288828905671835, 'timestamp': '2025-09-30 22:11:36.890240', 'step': 1224, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:36.955515', 'step': 1224, 'epoch': 2} +{'type': 'loss', 'content': 0.018189510330557823, 'timestamp': '2025-09-30 22:11:36.958618', 'step': 1225, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:37.012393', 'step': 1225, 'epoch': 2} +{'type': 'loss', 'content': 0.003722636727616191, 'timestamp': '2025-09-30 22:11:37.025019', 'step': 1226, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:37.080604', 'step': 1226, 'epoch': 2} +{'type': 'loss', 'content': 0.014420069754123688, 'timestamp': '2025-09-30 22:11:37.087616', 'step': 1227, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:37.142487', 'step': 1227, 'epoch': 2} +{'type': 'loss', 'content': 0.017777537927031517, 'timestamp': '2025-09-30 22:11:37.148574', 'step': 1228, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:37.202651', 'step': 1228, 'epoch': 2} +{'type': 'loss', 'content': 0.005553639028221369, 'timestamp': '2025-09-30 22:11:37.205175', 'step': 1229, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:37.269419', 'step': 1229, 'epoch': 2} +{'type': 'loss', 'content': 0.02909952774643898, 'timestamp': '2025-09-30 22:11:37.272973', 'step': 1230, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:37.327626', 'step': 1230, 'epoch': 2} +{'type': 'loss', 'content': 0.008751442655920982, 'timestamp': '2025-09-30 22:11:37.331343', 'step': 1231, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:37.386264', 'step': 1231, 'epoch': 2} +{'type': 'loss', 'content': 0.022108396515250206, 'timestamp': '2025-09-30 22:11:37.393911', 'step': 1232, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:37.447858', 'step': 1232, 'epoch': 2} +{'type': 'loss', 'content': 0.0038794793654233217, 'timestamp': '2025-09-30 22:11:37.454350', 'step': 1233, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:37.517045', 'step': 1233, 'epoch': 2} +{'type': 'loss', 'content': 0.009794176556169987, 'timestamp': '2025-09-30 22:11:37.520053', 'step': 1234, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:37.575368', 'step': 1234, 'epoch': 2} +{'type': 'loss', 'content': 0.02226036600768566, 'timestamp': '2025-09-30 22:11:37.578470', 'step': 1235, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:37.638983', 'step': 1235, 'epoch': 2} +{'type': 'loss', 'content': 0.011220982298254967, 'timestamp': '2025-09-30 22:11:37.645391', 'step': 1236, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:37.705527', 'step': 1236, 'epoch': 2} +{'type': 'loss', 'content': 0.010935097001492977, 'timestamp': '2025-09-30 22:11:37.717267', 'step': 1237, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:37.771780', 'step': 1237, 'epoch': 2} +{'type': 'loss', 'content': 0.015008168295025826, 'timestamp': '2025-09-30 22:11:37.774709', 'step': 1238, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:37.828421', 'step': 1238, 'epoch': 2} +{'type': 'loss', 'content': 0.03053663857281208, 'timestamp': '2025-09-30 22:11:37.831613', 'step': 1239, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:37.887977', 'step': 1239, 'epoch': 2} +{'type': 'loss', 'content': 0.02169874683022499, 'timestamp': '2025-09-30 22:11:37.894966', 'step': 1240, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:37.949789', 'step': 1240, 'epoch': 2} +{'type': 'loss', 'content': 0.012753850780427456, 'timestamp': '2025-09-30 22:11:37.952719', 'step': 1241, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:38.006491', 'step': 1241, 'epoch': 2} +{'type': 'loss', 'content': 0.013386455364525318, 'timestamp': '2025-09-30 22:11:38.009169', 'step': 1242, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:38.063334', 'step': 1242, 'epoch': 2} +{'type': 'loss', 'content': 0.029180806130170822, 'timestamp': '2025-09-30 22:11:38.067437', 'step': 1243, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:38.122343', 'step': 1243, 'epoch': 2} +{'type': 'loss', 'content': 0.02196519263088703, 'timestamp': '2025-09-30 22:11:38.135050', 'step': 1244, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:11:38.188177', 'step': 1244, 'epoch': 2} +{'type': 'loss', 'content': 0.02870265766978264, 'timestamp': '2025-09-30 22:11:38.190963', 'step': 1245, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:38.245249', 'step': 1245, 'epoch': 2} +{'type': 'loss', 'content': 0.023608211427927017, 'timestamp': '2025-09-30 22:11:38.257228', 'step': 1246, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:38.322615', 'step': 1246, 'epoch': 2} +{'type': 'loss', 'content': 0.032148636877536774, 'timestamp': '2025-09-30 22:11:38.326587', 'step': 1247, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:38.380869', 'step': 1247, 'epoch': 2} +{'type': 'loss', 'content': 0.014642714522778988, 'timestamp': '2025-09-30 22:11:38.387162', 'step': 1248, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:38.446848', 'step': 1248, 'epoch': 2} +{'type': 'loss', 'content': 0.027036087587475777, 'timestamp': '2025-09-30 22:11:38.450541', 'step': 1249, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:38.505596', 'step': 1249, 'epoch': 2} +{'type': 'loss', 'content': 0.02789353020489216, 'timestamp': '2025-09-30 22:11:38.518420', 'step': 1250, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:38.575018', 'step': 1250, 'epoch': 2} +{'type': 'loss', 'content': 0.02393009327352047, 'timestamp': '2025-09-30 22:11:38.578596', 'step': 1251, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:38.632056', 'step': 1251, 'epoch': 2} +{'type': 'loss', 'content': 0.010679802857339382, 'timestamp': '2025-09-30 22:11:38.638585', 'step': 1252, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:11:38.693945', 'step': 1252, 'epoch': 2} +{'type': 'loss', 'content': 0.005615527741611004, 'timestamp': '2025-09-30 22:11:38.697404', 'step': 1253, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:38.753186', 'step': 1253, 'epoch': 2} +{'type': 'loss', 'content': 0.008227822370827198, 'timestamp': '2025-09-30 22:11:38.756726', 'step': 1254, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:39.978401', 'step': 1254, 'epoch': 2} +{'type': 'pplx', 'content': 30062323.890613366, 'timestamp': '2025-09-30 22:11:39.989728', 'step': 1254, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:40.042732', 'step': 1254, 'epoch': 2} +{'type': 'loss', 'content': 0.007192269433289766, 'timestamp': '2025-09-30 22:11:40.046046', 'step': 1255, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:40.100429', 'step': 1255, 'epoch': 2} +{'type': 'loss', 'content': 0.026882369071245193, 'timestamp': '2025-09-30 22:11:40.106496', 'step': 1256, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:40.159745', 'step': 1256, 'epoch': 2} +{'type': 'loss', 'content': 0.01234396081417799, 'timestamp': '2025-09-30 22:11:40.164224', 'step': 1257, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:40.220058', 'step': 1257, 'epoch': 2} +{'type': 'loss', 'content': 0.02563643269240856, 'timestamp': '2025-09-30 22:11:40.223694', 'step': 1258, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:40.279550', 'step': 1258, 'epoch': 2} +{'type': 'loss', 'content': 0.008174607530236244, 'timestamp': '2025-09-30 22:11:40.283730', 'step': 1259, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:40.344881', 'step': 1259, 'epoch': 2} +{'type': 'loss', 'content': 0.015055462718009949, 'timestamp': '2025-09-30 22:11:40.351884', 'step': 1260, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:40.404793', 'step': 1260, 'epoch': 2} +{'type': 'loss', 'content': 0.044176094233989716, 'timestamp': '2025-09-30 22:11:40.407606', 'step': 1261, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:40.462171', 'step': 1261, 'epoch': 2} +{'type': 'loss', 'content': 0.012365161441266537, 'timestamp': '2025-09-30 22:11:40.466037', 'step': 1262, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:40.526018', 'step': 1262, 'epoch': 2} +{'type': 'loss', 'content': 0.021339308470487595, 'timestamp': '2025-09-30 22:11:40.539809', 'step': 1263, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:40.594211', 'step': 1263, 'epoch': 2} +{'type': 'loss', 'content': 0.013789476826786995, 'timestamp': '2025-09-30 22:11:40.611854', 'step': 1264, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:40.676229', 'step': 1264, 'epoch': 2} +{'type': 'loss', 'content': 0.010516809299588203, 'timestamp': '2025-09-30 22:11:40.679039', 'step': 1265, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:40.733124', 'step': 1265, 'epoch': 2} +{'type': 'loss', 'content': 0.01804107055068016, 'timestamp': '2025-09-30 22:11:40.736506', 'step': 1266, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:40.798080', 'step': 1266, 'epoch': 2} +{'type': 'loss', 'content': 0.016653243452310562, 'timestamp': '2025-09-30 22:11:40.801372', 'step': 1267, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:40.863841', 'step': 1267, 'epoch': 2} +{'type': 'loss', 'content': 0.023181870579719543, 'timestamp': '2025-09-30 22:11:40.870269', 'step': 1268, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:40.923609', 'step': 1268, 'epoch': 2} +{'type': 'loss', 'content': 0.016313303261995316, 'timestamp': '2025-09-30 22:11:40.926436', 'step': 1269, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:40.981079', 'step': 1269, 'epoch': 2} +{'type': 'loss', 'content': 0.018928783014416695, 'timestamp': '2025-09-30 22:11:40.994550', 'step': 1270, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:41.049139', 'step': 1270, 'epoch': 2} +{'type': 'loss', 'content': 0.009127925150096416, 'timestamp': '2025-09-30 22:11:41.057956', 'step': 1271, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:41.120907', 'step': 1271, 'epoch': 2} +{'type': 'loss', 'content': 0.02112032100558281, 'timestamp': '2025-09-30 22:11:41.128416', 'step': 1272, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:41.182515', 'step': 1272, 'epoch': 2} +{'type': 'loss', 'content': 0.009894484654068947, 'timestamp': '2025-09-30 22:11:41.187843', 'step': 1273, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:41.249575', 'step': 1273, 'epoch': 2} +{'type': 'loss', 'content': 0.022361796349287033, 'timestamp': '2025-09-30 22:11:41.252800', 'step': 1274, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:41.313068', 'step': 1274, 'epoch': 2} +{'type': 'loss', 'content': 0.011399311013519764, 'timestamp': '2025-09-30 22:11:41.317342', 'step': 1275, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:41.371745', 'step': 1275, 'epoch': 2} +{'type': 'loss', 'content': 0.009731536731123924, 'timestamp': '2025-09-30 22:11:41.378459', 'step': 1276, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:41.433288', 'step': 1276, 'epoch': 2} +{'type': 'loss', 'content': 0.015632973983883858, 'timestamp': '2025-09-30 22:11:41.436646', 'step': 1277, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:41.497498', 'step': 1277, 'epoch': 2} +{'type': 'loss', 'content': 0.012282473966479301, 'timestamp': '2025-09-30 22:11:41.501010', 'step': 1278, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:41.554689', 'step': 1278, 'epoch': 2} +{'type': 'loss', 'content': 0.03517066687345505, 'timestamp': '2025-09-30 22:11:41.558237', 'step': 1279, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:41.613087', 'step': 1279, 'epoch': 2} +{'type': 'loss', 'content': 0.018857363611459732, 'timestamp': '2025-09-30 22:11:41.629174', 'step': 1280, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:41.684409', 'step': 1280, 'epoch': 2} +{'type': 'loss', 'content': 0.009674946777522564, 'timestamp': '2025-09-30 22:11:41.698238', 'step': 1281, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:41.757905', 'step': 1281, 'epoch': 2} +{'type': 'loss', 'content': 0.01863682270050049, 'timestamp': '2025-09-30 22:11:41.761075', 'step': 1282, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:41.818226', 'step': 1282, 'epoch': 2} +{'type': 'loss', 'content': 0.00765694584697485, 'timestamp': '2025-09-30 22:11:41.821824', 'step': 1283, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:11:41.876313', 'step': 1283, 'epoch': 2} +{'type': 'loss', 'content': 0.009559721685945988, 'timestamp': '2025-09-30 22:11:41.883478', 'step': 1284, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:41.936511', 'step': 1284, 'epoch': 2} +{'type': 'loss', 'content': 0.024069128558039665, 'timestamp': '2025-09-30 22:11:41.939872', 'step': 1285, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:41.993918', 'step': 1285, 'epoch': 2} +{'type': 'loss', 'content': 0.006193791516125202, 'timestamp': '2025-09-30 22:11:41.998037', 'step': 1286, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:42.053332', 'step': 1286, 'epoch': 2} +{'type': 'loss', 'content': 0.015762005001306534, 'timestamp': '2025-09-30 22:11:42.063465', 'step': 1287, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:42.134164', 'step': 1287, 'epoch': 2} +{'type': 'loss', 'content': 0.005604383070021868, 'timestamp': '2025-09-30 22:11:42.143673', 'step': 1288, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:42.197288', 'step': 1288, 'epoch': 2} +{'type': 'loss', 'content': 0.03566458821296692, 'timestamp': '2025-09-30 22:11:42.201038', 'step': 1289, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:42.256727', 'step': 1289, 'epoch': 2} +{'type': 'loss', 'content': 0.012333127669990063, 'timestamp': '2025-09-30 22:11:42.260609', 'step': 1290, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:42.316139', 'step': 1290, 'epoch': 2} +{'type': 'loss', 'content': 0.020386451855301857, 'timestamp': '2025-09-30 22:11:42.321025', 'step': 1291, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:42.375757', 'step': 1291, 'epoch': 2} +{'type': 'loss', 'content': 0.013080189004540443, 'timestamp': '2025-09-30 22:11:42.382534', 'step': 1292, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:42.444980', 'step': 1292, 'epoch': 2} +{'type': 'loss', 'content': 0.004526190459728241, 'timestamp': '2025-09-30 22:11:42.448558', 'step': 1293, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:42.503079', 'step': 1293, 'epoch': 2} +{'type': 'loss', 'content': 0.0055126287043094635, 'timestamp': '2025-09-30 22:11:42.506004', 'step': 1294, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:42.572615', 'step': 1294, 'epoch': 2} +{'type': 'loss', 'content': 0.005155415739864111, 'timestamp': '2025-09-30 22:11:42.575746', 'step': 1295, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:42.629780', 'step': 1295, 'epoch': 2} +{'type': 'loss', 'content': 0.02876908890902996, 'timestamp': '2025-09-30 22:11:42.637312', 'step': 1296, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:42.692052', 'step': 1296, 'epoch': 2} +{'type': 'loss', 'content': 0.019892612472176552, 'timestamp': '2025-09-30 22:11:42.695367', 'step': 1297, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:42.750139', 'step': 1297, 'epoch': 2} +{'type': 'loss', 'content': 0.024674100801348686, 'timestamp': '2025-09-30 22:11:42.753989', 'step': 1298, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:42.809884', 'step': 1298, 'epoch': 2} +{'type': 'loss', 'content': 0.024183692410588264, 'timestamp': '2025-09-30 22:11:42.822187', 'step': 1299, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:42.880441', 'step': 1299, 'epoch': 2} +{'type': 'loss', 'content': 0.0076071759685873985, 'timestamp': '2025-09-30 22:11:42.887514', 'step': 1300, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:42.945521', 'step': 1300, 'epoch': 2} +{'type': 'loss', 'content': 0.014696098864078522, 'timestamp': '2025-09-30 22:11:42.956393', 'step': 1301, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:43.020195', 'step': 1301, 'epoch': 2} +{'type': 'loss', 'content': 0.009948963299393654, 'timestamp': '2025-09-30 22:11:43.024109', 'step': 1302, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:43.079274', 'step': 1302, 'epoch': 2} +{'type': 'loss', 'content': 0.018407296389341354, 'timestamp': '2025-09-30 22:11:43.084424', 'step': 1303, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:43.141234', 'step': 1303, 'epoch': 2} +{'type': 'loss', 'content': 0.006092921365052462, 'timestamp': '2025-09-30 22:11:43.148848', 'step': 1304, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:43.213109', 'step': 1304, 'epoch': 2} +{'type': 'loss', 'content': 0.011458395048975945, 'timestamp': '2025-09-30 22:11:43.216786', 'step': 1305, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:43.270249', 'step': 1305, 'epoch': 2} +{'type': 'loss', 'content': 0.00822626892477274, 'timestamp': '2025-09-30 22:11:43.273048', 'step': 1306, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:43.328334', 'step': 1306, 'epoch': 2} +{'type': 'loss', 'content': 0.016102908179163933, 'timestamp': '2025-09-30 22:11:43.332627', 'step': 1307, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:43.388206', 'step': 1307, 'epoch': 2} +{'type': 'loss', 'content': 0.013590490445494652, 'timestamp': '2025-09-30 22:11:43.395291', 'step': 1308, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:43.449410', 'step': 1308, 'epoch': 2} +{'type': 'loss', 'content': 0.005517884157598019, 'timestamp': '2025-09-30 22:11:43.456366', 'step': 1309, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:43.510445', 'step': 1309, 'epoch': 2} +{'type': 'loss', 'content': 0.014826023019850254, 'timestamp': '2025-09-30 22:11:43.513652', 'step': 1310, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:11:43.577385', 'step': 1310, 'epoch': 2} +{'type': 'loss', 'content': 0.019576190039515495, 'timestamp': '2025-09-30 22:11:43.580375', 'step': 1311, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:44.824236', 'step': 1311, 'epoch': 2} +{'type': 'pplx', 'content': 32609117.843413066, 'timestamp': '2025-09-30 22:11:44.828325', 'step': 1311, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:44.880343', 'step': 1311, 'epoch': 2} +{'type': 'loss', 'content': 0.004271751269698143, 'timestamp': '2025-09-30 22:11:44.887291', 'step': 1312, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:44.943672', 'step': 1312, 'epoch': 2} +{'type': 'loss', 'content': 0.00479540228843689, 'timestamp': '2025-09-30 22:11:44.947951', 'step': 1313, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:45.004542', 'step': 1313, 'epoch': 2} +{'type': 'loss', 'content': 0.0011680542957037687, 'timestamp': '2025-09-30 22:11:45.014421', 'step': 1314, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:45.068628', 'step': 1314, 'epoch': 2} +{'type': 'loss', 'content': 0.014515669085085392, 'timestamp': '2025-09-30 22:11:45.071802', 'step': 1315, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:45.125654', 'step': 1315, 'epoch': 2} +{'type': 'loss', 'content': 0.008202152326703072, 'timestamp': '2025-09-30 22:11:45.133686', 'step': 1316, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:45.188161', 'step': 1316, 'epoch': 2} +{'type': 'loss', 'content': 0.012495539151132107, 'timestamp': '2025-09-30 22:11:45.192099', 'step': 1317, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:45.246718', 'step': 1317, 'epoch': 2} +{'type': 'loss', 'content': 0.0031438537407666445, 'timestamp': '2025-09-30 22:11:45.250209', 'step': 1318, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:45.309314', 'step': 1318, 'epoch': 2} +{'type': 'loss', 'content': 0.018467547371983528, 'timestamp': '2025-09-30 22:11:45.312481', 'step': 1319, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:45.378622', 'step': 1319, 'epoch': 2} +{'type': 'loss', 'content': 0.005382470320910215, 'timestamp': '2025-09-30 22:11:45.389200', 'step': 1320, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:45.449995', 'step': 1320, 'epoch': 2} +{'type': 'loss', 'content': 0.04153914004564285, 'timestamp': '2025-09-30 22:11:45.453345', 'step': 1321, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:45.506615', 'step': 1321, 'epoch': 2} +{'type': 'loss', 'content': 0.023785650730133057, 'timestamp': '2025-09-30 22:11:45.509560', 'step': 1322, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:45.565186', 'step': 1322, 'epoch': 2} +{'type': 'loss', 'content': 0.010426363907754421, 'timestamp': '2025-09-30 22:11:45.569391', 'step': 1323, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:45.623237', 'step': 1323, 'epoch': 2} +{'type': 'loss', 'content': 0.007811260875314474, 'timestamp': '2025-09-30 22:11:45.631937', 'step': 1324, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:45.684753', 'step': 1324, 'epoch': 2} +{'type': 'loss', 'content': 0.00887636374682188, 'timestamp': '2025-09-30 22:11:45.689437', 'step': 1325, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:45.743278', 'step': 1325, 'epoch': 2} +{'type': 'loss', 'content': 0.007312100380659103, 'timestamp': '2025-09-30 22:11:45.746821', 'step': 1326, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:45.801232', 'step': 1326, 'epoch': 2} +{'type': 'loss', 'content': 0.003930172882974148, 'timestamp': '2025-09-30 22:11:45.804113', 'step': 1327, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:45.866226', 'step': 1327, 'epoch': 2} +{'type': 'loss', 'content': 0.011160810478031635, 'timestamp': '2025-09-30 22:11:45.881512', 'step': 1328, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:45.935904', 'step': 1328, 'epoch': 2} +{'type': 'loss', 'content': 0.017904015257954597, 'timestamp': '2025-09-30 22:11:45.942772', 'step': 1329, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:45.997789', 'step': 1329, 'epoch': 2} +{'type': 'loss', 'content': 0.00545323733240366, 'timestamp': '2025-09-30 22:11:46.006031', 'step': 1330, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:46.062131', 'step': 1330, 'epoch': 2} +{'type': 'loss', 'content': 0.04627959802746773, 'timestamp': '2025-09-30 22:11:46.066083', 'step': 1331, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:46.120957', 'step': 1331, 'epoch': 2} +{'type': 'loss', 'content': 0.02013224922120571, 'timestamp': '2025-09-30 22:11:46.127254', 'step': 1332, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:46.180087', 'step': 1332, 'epoch': 2} +{'type': 'loss', 'content': 0.01763824000954628, 'timestamp': '2025-09-30 22:11:46.183576', 'step': 1333, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:46.238119', 'step': 1333, 'epoch': 2} +{'type': 'loss', 'content': 0.01840882934629917, 'timestamp': '2025-09-30 22:11:46.241142', 'step': 1334, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:46.302685', 'step': 1334, 'epoch': 2} +{'type': 'loss', 'content': 0.029163610190153122, 'timestamp': '2025-09-30 22:11:46.306239', 'step': 1335, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:46.365479', 'step': 1335, 'epoch': 2} +{'type': 'loss', 'content': 0.005280309822410345, 'timestamp': '2025-09-30 22:11:46.372879', 'step': 1336, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:46.427591', 'step': 1336, 'epoch': 2} +{'type': 'loss', 'content': 0.016217708587646484, 'timestamp': '2025-09-30 22:11:46.430872', 'step': 1337, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:46.485684', 'step': 1337, 'epoch': 2} +{'type': 'loss', 'content': 0.016818564385175705, 'timestamp': '2025-09-30 22:11:46.491284', 'step': 1338, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:46.566008', 'step': 1338, 'epoch': 2} +{'type': 'loss', 'content': 0.005953342653810978, 'timestamp': '2025-09-30 22:11:46.569427', 'step': 1339, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:46.626744', 'step': 1339, 'epoch': 2} +{'type': 'loss', 'content': 0.019312188029289246, 'timestamp': '2025-09-30 22:11:46.633310', 'step': 1340, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:46.694266', 'step': 1340, 'epoch': 2} +{'type': 'loss', 'content': 0.014265080913901329, 'timestamp': '2025-09-30 22:11:46.697938', 'step': 1341, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:46.759167', 'step': 1341, 'epoch': 2} +{'type': 'loss', 'content': 0.014429310336709023, 'timestamp': '2025-09-30 22:11:46.762088', 'step': 1342, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:46.833476', 'step': 1342, 'epoch': 2} +{'type': 'loss', 'content': 0.01076581608504057, 'timestamp': '2025-09-30 22:11:46.843812', 'step': 1343, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:46.909121', 'step': 1343, 'epoch': 2} +{'type': 'loss', 'content': 0.019898299127817154, 'timestamp': '2025-09-30 22:11:46.917240', 'step': 1344, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:46.977775', 'step': 1344, 'epoch': 2} +{'type': 'loss', 'content': 0.0010426754597574472, 'timestamp': '2025-09-30 22:11:46.981247', 'step': 1345, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:47.039218', 'step': 1345, 'epoch': 2} +{'type': 'loss', 'content': 0.022380618378520012, 'timestamp': '2025-09-30 22:11:47.045477', 'step': 1346, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:47.100496', 'step': 1346, 'epoch': 2} +{'type': 'loss', 'content': 0.012452373281121254, 'timestamp': '2025-09-30 22:11:47.106816', 'step': 1347, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:47.161611', 'step': 1347, 'epoch': 2} +{'type': 'loss', 'content': 0.002341157989576459, 'timestamp': '2025-09-30 22:11:47.169873', 'step': 1348, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:47.224614', 'step': 1348, 'epoch': 2} +{'type': 'loss', 'content': 0.002771148458123207, 'timestamp': '2025-09-30 22:11:47.227269', 'step': 1349, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:47.282287', 'step': 1349, 'epoch': 2} +{'type': 'loss', 'content': 0.010354334488511086, 'timestamp': '2025-09-30 22:11:47.285453', 'step': 1350, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:11:47.347031', 'step': 1350, 'epoch': 2} +{'type': 'loss', 'content': 0.009396574459969997, 'timestamp': '2025-09-30 22:11:47.349952', 'step': 1351, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:47.406731', 'step': 1351, 'epoch': 2} +{'type': 'loss', 'content': 0.003213896183297038, 'timestamp': '2025-09-30 22:11:47.413957', 'step': 1352, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:47.469778', 'step': 1352, 'epoch': 2} +{'type': 'loss', 'content': 0.004314431454986334, 'timestamp': '2025-09-30 22:11:47.473353', 'step': 1353, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:47.529789', 'step': 1353, 'epoch': 2} +{'type': 'loss', 'content': 0.013174169696867466, 'timestamp': '2025-09-30 22:11:47.533184', 'step': 1354, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:47.590717', 'step': 1354, 'epoch': 2} +{'type': 'loss', 'content': 0.015209296718239784, 'timestamp': '2025-09-30 22:11:47.594409', 'step': 1355, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:47.650799', 'step': 1355, 'epoch': 2} +{'type': 'loss', 'content': 0.038054872304201126, 'timestamp': '2025-09-30 22:11:47.658833', 'step': 1356, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:47.712960', 'step': 1356, 'epoch': 2} +{'type': 'loss', 'content': 0.009656942449510098, 'timestamp': '2025-09-30 22:11:47.716647', 'step': 1357, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:47.777202', 'step': 1357, 'epoch': 2} +{'type': 'loss', 'content': 0.01215650700032711, 'timestamp': '2025-09-30 22:11:47.786134', 'step': 1358, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:47.846371', 'step': 1358, 'epoch': 2} +{'type': 'loss', 'content': 0.015105058439075947, 'timestamp': '2025-09-30 22:11:47.854309', 'step': 1359, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:47.914321', 'step': 1359, 'epoch': 2} +{'type': 'loss', 'content': 0.002900704275816679, 'timestamp': '2025-09-30 22:11:47.926334', 'step': 1360, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:47.980560', 'step': 1360, 'epoch': 2} +{'type': 'loss', 'content': 0.020952101796865463, 'timestamp': '2025-09-30 22:11:47.987594', 'step': 1361, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:48.043298', 'step': 1361, 'epoch': 2} +{'type': 'loss', 'content': 0.015315545722842216, 'timestamp': '2025-09-30 22:11:48.050262', 'step': 1362, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:48.107218', 'step': 1362, 'epoch': 2} +{'type': 'loss', 'content': 0.0064353845082223415, 'timestamp': '2025-09-30 22:11:48.110270', 'step': 1363, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:48.169342', 'step': 1363, 'epoch': 2} +{'type': 'loss', 'content': 0.006530344020575285, 'timestamp': '2025-09-30 22:11:48.179583', 'step': 1364, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:48.241263', 'step': 1364, 'epoch': 2} +{'type': 'loss', 'content': 0.017590684816241264, 'timestamp': '2025-09-30 22:11:48.249981', 'step': 1365, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:48.310235', 'step': 1365, 'epoch': 2} +{'type': 'loss', 'content': 0.007951964624226093, 'timestamp': '2025-09-30 22:11:48.318949', 'step': 1366, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:48.374804', 'step': 1366, 'epoch': 2} +{'type': 'loss', 'content': 0.01660231314599514, 'timestamp': '2025-09-30 22:11:48.377100', 'step': 1367, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:48.438207', 'step': 1367, 'epoch': 2} +{'type': 'loss', 'content': 0.007098283153027296, 'timestamp': '2025-09-30 22:11:48.445739', 'step': 1368, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:49.664039', 'step': 1368, 'epoch': 2} +{'type': 'pplx', 'content': 33795719.590478756, 'timestamp': '2025-09-30 22:11:49.666611', 'step': 1368, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:49.719047', 'step': 1368, 'epoch': 2} +{'type': 'loss', 'content': 0.0031422681640833616, 'timestamp': '2025-09-30 22:11:49.721231', 'step': 1369, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:49.775273', 'step': 1369, 'epoch': 2} +{'type': 'loss', 'content': 0.007872308604419231, 'timestamp': '2025-09-30 22:11:49.778345', 'step': 1370, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:11:49.832827', 'step': 1370, 'epoch': 2} +{'type': 'loss', 'content': 0.011166645213961601, 'timestamp': '2025-09-30 22:11:49.838286', 'step': 1371, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:49.892684', 'step': 1371, 'epoch': 2} +{'type': 'loss', 'content': 0.008558930829167366, 'timestamp': '2025-09-30 22:11:49.899692', 'step': 1372, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:49.956402', 'step': 1372, 'epoch': 2} +{'type': 'loss', 'content': 0.01813081093132496, 'timestamp': '2025-09-30 22:11:49.960747', 'step': 1373, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.015076', 'step': 1373, 'epoch': 2} +{'type': 'loss', 'content': 0.014572428539395332, 'timestamp': '2025-09-30 22:11:50.017794', 'step': 1374, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:50.071943', 'step': 1374, 'epoch': 2} +{'type': 'loss', 'content': 0.025154201313853264, 'timestamp': '2025-09-30 22:11:50.074765', 'step': 1375, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.134387', 'step': 1375, 'epoch': 2} +{'type': 'loss', 'content': 0.006909341551363468, 'timestamp': '2025-09-30 22:11:50.143622', 'step': 1376, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:50.199261', 'step': 1376, 'epoch': 2} +{'type': 'loss', 'content': 0.024538232013583183, 'timestamp': '2025-09-30 22:11:50.201749', 'step': 1377, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.258024', 'step': 1377, 'epoch': 2} +{'type': 'loss', 'content': 0.016891105100512505, 'timestamp': '2025-09-30 22:11:50.261650', 'step': 1378, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.316502', 'step': 1378, 'epoch': 2} +{'type': 'loss', 'content': 0.02443709410727024, 'timestamp': '2025-09-30 22:11:50.319617', 'step': 1379, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.373500', 'step': 1379, 'epoch': 2} +{'type': 'loss', 'content': 0.00553086306899786, 'timestamp': '2025-09-30 22:11:50.381388', 'step': 1380, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.435108', 'step': 1380, 'epoch': 2} +{'type': 'loss', 'content': 0.02132841758430004, 'timestamp': '2025-09-30 22:11:50.442804', 'step': 1381, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.500292', 'step': 1381, 'epoch': 2} +{'type': 'loss', 'content': 0.00981497298926115, 'timestamp': '2025-09-30 22:11:50.503681', 'step': 1382, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.557827', 'step': 1382, 'epoch': 2} +{'type': 'loss', 'content': 0.0064003728330135345, 'timestamp': '2025-09-30 22:11:50.561098', 'step': 1383, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.615611', 'step': 1383, 'epoch': 2} +{'type': 'loss', 'content': 0.0022823542822152376, 'timestamp': '2025-09-30 22:11:50.622947', 'step': 1384, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.678363', 'step': 1384, 'epoch': 2} +{'type': 'loss', 'content': 0.02859966643154621, 'timestamp': '2025-09-30 22:11:50.681709', 'step': 1385, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.735485', 'step': 1385, 'epoch': 2} +{'type': 'loss', 'content': 0.017722919583320618, 'timestamp': '2025-09-30 22:11:50.738377', 'step': 1386, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.792609', 'step': 1386, 'epoch': 2} +{'type': 'loss', 'content': 0.03379681333899498, 'timestamp': '2025-09-30 22:11:50.796363', 'step': 1387, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.849961', 'step': 1387, 'epoch': 2} +{'type': 'loss', 'content': 0.003414042294025421, 'timestamp': '2025-09-30 22:11:50.859211', 'step': 1388, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:50.914429', 'step': 1388, 'epoch': 2} +{'type': 'loss', 'content': 0.014491712674498558, 'timestamp': '2025-09-30 22:11:50.921126', 'step': 1389, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:50.977992', 'step': 1389, 'epoch': 2} +{'type': 'loss', 'content': 0.009958495385944843, 'timestamp': '2025-09-30 22:11:50.980673', 'step': 1390, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:51.041085', 'step': 1390, 'epoch': 2} +{'type': 'loss', 'content': 0.006238664500415325, 'timestamp': '2025-09-30 22:11:51.047289', 'step': 1391, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:51.105263', 'step': 1391, 'epoch': 2} +{'type': 'loss', 'content': 0.014195479452610016, 'timestamp': '2025-09-30 22:11:51.118187', 'step': 1392, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:51.175096', 'step': 1392, 'epoch': 2} +{'type': 'loss', 'content': 0.00753815146163106, 'timestamp': '2025-09-30 22:11:51.178464', 'step': 1393, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:51.234581', 'step': 1393, 'epoch': 2} +{'type': 'loss', 'content': 0.013055319897830486, 'timestamp': '2025-09-30 22:11:51.237281', 'step': 1394, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:51.292813', 'step': 1394, 'epoch': 2} +{'type': 'loss', 'content': 0.004737398587167263, 'timestamp': '2025-09-30 22:11:51.294813', 'step': 1395, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:51.348986', 'step': 1395, 'epoch': 2} +{'type': 'loss', 'content': 0.002370731672272086, 'timestamp': '2025-09-30 22:11:51.357468', 'step': 1396, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:51.411158', 'step': 1396, 'epoch': 2} +{'type': 'loss', 'content': 0.002246677177026868, 'timestamp': '2025-09-30 22:11:51.417173', 'step': 1397, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:51.475633', 'step': 1397, 'epoch': 2} +{'type': 'loss', 'content': 0.05915753170847893, 'timestamp': '2025-09-30 22:11:51.481055', 'step': 1398, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:51.536694', 'step': 1398, 'epoch': 2} +{'type': 'loss', 'content': 0.007206898648291826, 'timestamp': '2025-09-30 22:11:51.540164', 'step': 1399, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:51.596906', 'step': 1399, 'epoch': 2} +{'type': 'loss', 'content': 0.003330029547214508, 'timestamp': '2025-09-30 22:11:51.604499', 'step': 1400, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:51.661337', 'step': 1400, 'epoch': 2} +{'type': 'loss', 'content': 0.01776135340332985, 'timestamp': '2025-09-30 22:11:51.664234', 'step': 1401, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:51.719714', 'step': 1401, 'epoch': 2} +{'type': 'loss', 'content': 0.007816934958100319, 'timestamp': '2025-09-30 22:11:51.722380', 'step': 1402, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:51.776606', 'step': 1402, 'epoch': 2} +{'type': 'loss', 'content': 0.006343926768749952, 'timestamp': '2025-09-30 22:11:51.779877', 'step': 1403, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:51.834652', 'step': 1403, 'epoch': 2} +{'type': 'loss', 'content': 0.01646248623728752, 'timestamp': '2025-09-30 22:11:51.842243', 'step': 1404, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:51.897403', 'step': 1404, 'epoch': 2} +{'type': 'loss', 'content': 0.011257813312113285, 'timestamp': '2025-09-30 22:11:51.900431', 'step': 1405, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:51.953481', 'step': 1405, 'epoch': 2} +{'type': 'loss', 'content': 0.010497097857296467, 'timestamp': '2025-09-30 22:11:51.959746', 'step': 1406, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:52.015068', 'step': 1406, 'epoch': 2} +{'type': 'loss', 'content': 0.002651577116921544, 'timestamp': '2025-09-30 22:11:52.019159', 'step': 1407, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:52.072606', 'step': 1407, 'epoch': 2} +{'type': 'loss', 'content': 0.03266320377588272, 'timestamp': '2025-09-30 22:11:52.085621', 'step': 1408, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:52.141617', 'step': 1408, 'epoch': 2} +{'type': 'loss', 'content': 0.01602781191468239, 'timestamp': '2025-09-30 22:11:52.146954', 'step': 1409, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:52.203737', 'step': 1409, 'epoch': 2} +{'type': 'loss', 'content': 0.002372512361034751, 'timestamp': '2025-09-30 22:11:52.207368', 'step': 1410, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:52.262628', 'step': 1410, 'epoch': 2} +{'type': 'loss', 'content': 0.014318128116428852, 'timestamp': '2025-09-30 22:11:52.266816', 'step': 1411, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:52.323101', 'step': 1411, 'epoch': 2} +{'type': 'loss', 'content': 0.00335204997099936, 'timestamp': '2025-09-30 22:11:52.333507', 'step': 1412, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:52.399815', 'step': 1412, 'epoch': 2} +{'type': 'loss', 'content': 0.006724204868078232, 'timestamp': '2025-09-30 22:11:52.405354', 'step': 1413, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:52.460557', 'step': 1413, 'epoch': 2} +{'type': 'loss', 'content': 0.008304744958877563, 'timestamp': '2025-09-30 22:11:52.470446', 'step': 1414, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:52.532792', 'step': 1414, 'epoch': 2} +{'type': 'loss', 'content': 0.013678102754056454, 'timestamp': '2025-09-30 22:11:52.535675', 'step': 1415, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:52.590913', 'step': 1415, 'epoch': 2} +{'type': 'loss', 'content': 0.00877212081104517, 'timestamp': '2025-09-30 22:11:52.596849', 'step': 1416, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:52.649257', 'step': 1416, 'epoch': 2} +{'type': 'loss', 'content': 0.0023188970517367125, 'timestamp': '2025-09-30 22:11:52.651657', 'step': 1417, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:52.706530', 'step': 1417, 'epoch': 2} +{'type': 'loss', 'content': 0.002683415310457349, 'timestamp': '2025-09-30 22:11:52.709269', 'step': 1418, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:52.772989', 'step': 1418, 'epoch': 2} +{'type': 'loss', 'content': 0.006297766696661711, 'timestamp': '2025-09-30 22:11:52.775729', 'step': 1419, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:52.830863', 'step': 1419, 'epoch': 2} +{'type': 'loss', 'content': 0.018091067671775818, 'timestamp': '2025-09-30 22:11:52.838563', 'step': 1420, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:52.892921', 'step': 1420, 'epoch': 2} +{'type': 'loss', 'content': 0.015514843165874481, 'timestamp': '2025-09-30 22:11:52.895667', 'step': 1421, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:52.949542', 'step': 1421, 'epoch': 2} +{'type': 'loss', 'content': 0.0016685453010722995, 'timestamp': '2025-09-30 22:11:52.959052', 'step': 1422, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:53.015770', 'step': 1422, 'epoch': 2} +{'type': 'loss', 'content': 0.04028955474495888, 'timestamp': '2025-09-30 22:11:53.018535', 'step': 1423, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:53.074396', 'step': 1423, 'epoch': 2} +{'type': 'loss', 'content': 0.018086543306708336, 'timestamp': '2025-09-30 22:11:53.081604', 'step': 1424, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:53.137519', 'step': 1424, 'epoch': 2} +{'type': 'loss', 'content': 0.03144695982336998, 'timestamp': '2025-09-30 22:11:53.145979', 'step': 1425, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:54.385788', 'step': 1425, 'epoch': 2} +{'type': 'pplx', 'content': 32876480.431024157, 'timestamp': '2025-09-30 22:11:54.388368', 'step': 1425, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:54.441452', 'step': 1425, 'epoch': 2} +{'type': 'loss', 'content': 0.0008492738706991076, 'timestamp': '2025-09-30 22:11:54.444996', 'step': 1426, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:54.497804', 'step': 1426, 'epoch': 2} +{'type': 'loss', 'content': 0.006496089976280928, 'timestamp': '2025-09-30 22:11:54.500486', 'step': 1427, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:54.554857', 'step': 1427, 'epoch': 2} +{'type': 'loss', 'content': 0.046025075018405914, 'timestamp': '2025-09-30 22:11:54.561183', 'step': 1428, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:54.620775', 'step': 1428, 'epoch': 2} +{'type': 'loss', 'content': 0.004891328979283571, 'timestamp': '2025-09-30 22:11:54.623529', 'step': 1429, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:54.678675', 'step': 1429, 'epoch': 2} +{'type': 'loss', 'content': 0.006063086446374655, 'timestamp': '2025-09-30 22:11:54.689551', 'step': 1430, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:54.744569', 'step': 1430, 'epoch': 2} +{'type': 'loss', 'content': 0.015636909753084183, 'timestamp': '2025-09-30 22:11:54.746833', 'step': 1431, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:54.801259', 'step': 1431, 'epoch': 2} +{'type': 'loss', 'content': 0.009860769845545292, 'timestamp': '2025-09-30 22:11:54.812696', 'step': 1432, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:54.867412', 'step': 1432, 'epoch': 2} +{'type': 'loss', 'content': 0.021920878440141678, 'timestamp': '2025-09-30 22:11:54.873928', 'step': 1433, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:54.933680', 'step': 1433, 'epoch': 2} +{'type': 'loss', 'content': 0.03438463434576988, 'timestamp': '2025-09-30 22:11:54.935946', 'step': 1434, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:54.990181', 'step': 1434, 'epoch': 2} +{'type': 'loss', 'content': 0.010635143145918846, 'timestamp': '2025-09-30 22:11:54.992804', 'step': 1435, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:55.055330', 'step': 1435, 'epoch': 2} +{'type': 'loss', 'content': 0.00384154194034636, 'timestamp': '2025-09-30 22:11:55.062260', 'step': 1436, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:55.116261', 'step': 1436, 'epoch': 2} +{'type': 'loss', 'content': 0.01956865005195141, 'timestamp': '2025-09-30 22:11:55.126079', 'step': 1437, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:55.184259', 'step': 1437, 'epoch': 2} +{'type': 'loss', 'content': 0.011690896935760975, 'timestamp': '2025-09-30 22:11:55.187051', 'step': 1438, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:55.240826', 'step': 1438, 'epoch': 2} +{'type': 'loss', 'content': 0.0043997629545629025, 'timestamp': '2025-09-30 22:11:55.242792', 'step': 1439, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:55.295661', 'step': 1439, 'epoch': 2} +{'type': 'loss', 'content': 0.0012786614242941141, 'timestamp': '2025-09-30 22:11:55.301411', 'step': 1440, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:55.353838', 'step': 1440, 'epoch': 2} +{'type': 'loss', 'content': 0.001398958032950759, 'timestamp': '2025-09-30 22:11:55.356227', 'step': 1441, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:55.412017', 'step': 1441, 'epoch': 2} +{'type': 'loss', 'content': 0.0023240819573402405, 'timestamp': '2025-09-30 22:11:55.417275', 'step': 1442, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:55.470340', 'step': 1442, 'epoch': 2} +{'type': 'loss', 'content': 0.014607422053813934, 'timestamp': '2025-09-30 22:11:55.475088', 'step': 1443, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:55.528302', 'step': 1443, 'epoch': 2} +{'type': 'loss', 'content': 0.001739148749038577, 'timestamp': '2025-09-30 22:11:55.534596', 'step': 1444, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:55.591866', 'step': 1444, 'epoch': 2} +{'type': 'loss', 'content': 0.011579596437513828, 'timestamp': '2025-09-30 22:11:55.595434', 'step': 1445, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:55.649148', 'step': 1445, 'epoch': 2} +{'type': 'loss', 'content': 0.01198033057153225, 'timestamp': '2025-09-30 22:11:55.651642', 'step': 1446, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:55.705180', 'step': 1446, 'epoch': 2} +{'type': 'loss', 'content': 0.010683462955057621, 'timestamp': '2025-09-30 22:11:55.707304', 'step': 1447, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:55.762754', 'step': 1447, 'epoch': 2} +{'type': 'loss', 'content': 0.010873474180698395, 'timestamp': '2025-09-30 22:11:55.768493', 'step': 1448, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:55.821216', 'step': 1448, 'epoch': 2} +{'type': 'loss', 'content': 0.016803989186882973, 'timestamp': '2025-09-30 22:11:55.824294', 'step': 1449, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:55.879996', 'step': 1449, 'epoch': 2} +{'type': 'loss', 'content': 0.024432357400655746, 'timestamp': '2025-09-30 22:11:55.883382', 'step': 1450, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:55.937573', 'step': 1450, 'epoch': 2} +{'type': 'loss', 'content': 0.01734604313969612, 'timestamp': '2025-09-30 22:11:55.941133', 'step': 1451, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:55.994416', 'step': 1451, 'epoch': 2} +{'type': 'loss', 'content': 0.002315593184903264, 'timestamp': '2025-09-30 22:11:56.001288', 'step': 1452, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:56.055517', 'step': 1452, 'epoch': 2} +{'type': 'loss', 'content': 0.0013011764967814088, 'timestamp': '2025-09-30 22:11:56.057536', 'step': 1453, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:56.111279', 'step': 1453, 'epoch': 2} +{'type': 'loss', 'content': 0.014267000369727612, 'timestamp': '2025-09-30 22:11:56.115214', 'step': 1454, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:56.177103', 'step': 1454, 'epoch': 2} +{'type': 'loss', 'content': 0.01388590969145298, 'timestamp': '2025-09-30 22:11:56.179331', 'step': 1455, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:56.231796', 'step': 1455, 'epoch': 2} +{'type': 'loss', 'content': 0.03596239164471626, 'timestamp': '2025-09-30 22:11:56.238002', 'step': 1456, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:56.291186', 'step': 1456, 'epoch': 2} +{'type': 'loss', 'content': 0.008110507391393185, 'timestamp': '2025-09-30 22:11:56.293601', 'step': 1457, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:56.346799', 'step': 1457, 'epoch': 2} +{'type': 'loss', 'content': 0.005775760859251022, 'timestamp': '2025-09-30 22:11:56.348855', 'step': 1458, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:56.401731', 'step': 1458, 'epoch': 2} +{'type': 'loss', 'content': 0.02051071636378765, 'timestamp': '2025-09-30 22:11:56.403960', 'step': 1459, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:56.457070', 'step': 1459, 'epoch': 2} +{'type': 'loss', 'content': 0.003823460778221488, 'timestamp': '2025-09-30 22:11:56.463416', 'step': 1460, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:56.516251', 'step': 1460, 'epoch': 2} +{'type': 'loss', 'content': 0.05588651821017265, 'timestamp': '2025-09-30 22:11:56.519240', 'step': 1461, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:56.572359', 'step': 1461, 'epoch': 2} +{'type': 'loss', 'content': 0.011291184462606907, 'timestamp': '2025-09-30 22:11:56.574855', 'step': 1462, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:56.638559', 'step': 1462, 'epoch': 2} +{'type': 'loss', 'content': 0.02610655501484871, 'timestamp': '2025-09-30 22:11:56.640892', 'step': 1463, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:56.694525', 'step': 1463, 'epoch': 2} +{'type': 'loss', 'content': 0.025093723088502884, 'timestamp': '2025-09-30 22:11:56.702308', 'step': 1464, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:56.754189', 'step': 1464, 'epoch': 2} +{'type': 'loss', 'content': 0.018963143229484558, 'timestamp': '2025-09-30 22:11:56.762237', 'step': 1465, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:56.815551', 'step': 1465, 'epoch': 2} +{'type': 'loss', 'content': 0.02996927499771118, 'timestamp': '2025-09-30 22:11:56.818951', 'step': 1466, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:56.871563', 'step': 1466, 'epoch': 2} +{'type': 'loss', 'content': 0.031889282166957855, 'timestamp': '2025-09-30 22:11:56.874567', 'step': 1467, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:56.929697', 'step': 1467, 'epoch': 2} +{'type': 'loss', 'content': 0.013061118312180042, 'timestamp': '2025-09-30 22:11:56.938671', 'step': 1468, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:56.992670', 'step': 1468, 'epoch': 2} +{'type': 'loss', 'content': 0.021829981356859207, 'timestamp': '2025-09-30 22:11:56.995200', 'step': 1469, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:57.048959', 'step': 1469, 'epoch': 2} +{'type': 'loss', 'content': 0.048463720828294754, 'timestamp': '2025-09-30 22:11:57.051096', 'step': 1470, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:57.105679', 'step': 1470, 'epoch': 2} +{'type': 'loss', 'content': 0.0270835030823946, 'timestamp': '2025-09-30 22:11:57.107848', 'step': 1471, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:57.160647', 'step': 1471, 'epoch': 2} +{'type': 'loss', 'content': 0.007283608429133892, 'timestamp': '2025-09-30 22:11:57.166422', 'step': 1472, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:57.221551', 'step': 1472, 'epoch': 2} +{'type': 'loss', 'content': 0.014574305154383183, 'timestamp': '2025-09-30 22:11:57.223680', 'step': 1473, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:57.280308', 'step': 1473, 'epoch': 2} +{'type': 'loss', 'content': 0.014809882268309593, 'timestamp': '2025-09-30 22:11:57.282233', 'step': 1474, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:57.335185', 'step': 1474, 'epoch': 2} +{'type': 'loss', 'content': 0.010421657003462315, 'timestamp': '2025-09-30 22:11:57.337900', 'step': 1475, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:57.391394', 'step': 1475, 'epoch': 2} +{'type': 'loss', 'content': 0.025253277271986008, 'timestamp': '2025-09-30 22:11:57.399377', 'step': 1476, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:57.452018', 'step': 1476, 'epoch': 2} +{'type': 'loss', 'content': 0.005706873722374439, 'timestamp': '2025-09-30 22:11:57.454324', 'step': 1477, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:57.507110', 'step': 1477, 'epoch': 2} +{'type': 'loss', 'content': 0.007750978227704763, 'timestamp': '2025-09-30 22:11:57.510333', 'step': 1478, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:57.563595', 'step': 1478, 'epoch': 2} +{'type': 'loss', 'content': 0.0170641727745533, 'timestamp': '2025-09-30 22:11:57.565865', 'step': 1479, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:57.618452', 'step': 1479, 'epoch': 2} +{'type': 'loss', 'content': 0.010642343200743198, 'timestamp': '2025-09-30 22:11:57.624447', 'step': 1480, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:57.677415', 'step': 1480, 'epoch': 2} +{'type': 'loss', 'content': 0.022640392184257507, 'timestamp': '2025-09-30 22:11:57.680487', 'step': 1481, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:57.733786', 'step': 1481, 'epoch': 2} +{'type': 'loss', 'content': 0.012234336696565151, 'timestamp': '2025-09-30 22:11:57.736389', 'step': 1482, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:11:58.929466', 'step': 1482, 'epoch': 2} +{'type': 'pplx', 'content': 31872365.783038512, 'timestamp': '2025-09-30 22:11:58.932319', 'step': 1482, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:58.984024', 'step': 1482, 'epoch': 2} +{'type': 'loss', 'content': 0.016802014783024788, 'timestamp': '2025-09-30 22:11:58.986699', 'step': 1483, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:59.041050', 'step': 1483, 'epoch': 2} +{'type': 'loss', 'content': 0.010017321445047855, 'timestamp': '2025-09-30 22:11:59.047373', 'step': 1484, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:11:59.103047', 'step': 1484, 'epoch': 2} +{'type': 'loss', 'content': 0.01991168037056923, 'timestamp': '2025-09-30 22:11:59.106592', 'step': 1485, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:59.161064', 'step': 1485, 'epoch': 2} +{'type': 'loss', 'content': 0.02465466596186161, 'timestamp': '2025-09-30 22:11:59.163538', 'step': 1486, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:59.218736', 'step': 1486, 'epoch': 2} +{'type': 'loss', 'content': 0.011366413906216621, 'timestamp': '2025-09-30 22:11:59.221346', 'step': 1487, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:59.275277', 'step': 1487, 'epoch': 2} +{'type': 'loss', 'content': 0.009791248477995396, 'timestamp': '2025-09-30 22:11:59.282523', 'step': 1488, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:59.335043', 'step': 1488, 'epoch': 2} +{'type': 'loss', 'content': 0.012804691679775715, 'timestamp': '2025-09-30 22:11:59.337565', 'step': 1489, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:59.391128', 'step': 1489, 'epoch': 2} +{'type': 'loss', 'content': 0.008853507228195667, 'timestamp': '2025-09-30 22:11:59.394664', 'step': 1490, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:59.448734', 'step': 1490, 'epoch': 2} +{'type': 'loss', 'content': 0.007088639307767153, 'timestamp': '2025-09-30 22:11:59.451173', 'step': 1491, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:59.504600', 'step': 1491, 'epoch': 2} +{'type': 'loss', 'content': 0.017957476899027824, 'timestamp': '2025-09-30 22:11:59.511239', 'step': 1492, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:59.564635', 'step': 1492, 'epoch': 2} +{'type': 'loss', 'content': 0.011476176790893078, 'timestamp': '2025-09-30 22:11:59.567245', 'step': 1493, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:11:59.622806', 'step': 1493, 'epoch': 2} +{'type': 'loss', 'content': 0.014955559745430946, 'timestamp': '2025-09-30 22:11:59.625363', 'step': 1494, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:11:59.678610', 'step': 1494, 'epoch': 2} +{'type': 'loss', 'content': 0.020908983424305916, 'timestamp': '2025-09-30 22:11:59.681901', 'step': 1495, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:59.737652', 'step': 1495, 'epoch': 2} +{'type': 'loss', 'content': 0.007830632850527763, 'timestamp': '2025-09-30 22:11:59.743947', 'step': 1496, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:59.797627', 'step': 1496, 'epoch': 2} +{'type': 'loss', 'content': 0.007764595095068216, 'timestamp': '2025-09-30 22:11:59.800420', 'step': 1497, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:59.855257', 'step': 1497, 'epoch': 2} +{'type': 'loss', 'content': 0.01050377357751131, 'timestamp': '2025-09-30 22:11:59.857838', 'step': 1498, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:11:59.913101', 'step': 1498, 'epoch': 2} +{'type': 'loss', 'content': 0.008915500715374947, 'timestamp': '2025-09-30 22:11:59.915703', 'step': 1499, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:11:59.971399', 'step': 1499, 'epoch': 2} +{'type': 'loss', 'content': 0.015954632312059402, 'timestamp': '2025-09-30 22:11:59.977693', 'step': 1500, 'epoch': 2} +{'type': 'info', 'content': 'Checkpoint saved at step 1500', 'timestamp': '2025-09-30 22:12:00.388871', 'step': 1500, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:00.447231', 'step': 1500, 'epoch': 2} +{'type': 'loss', 'content': 0.011480267159640789, 'timestamp': '2025-09-30 22:12:00.449372', 'step': 1501, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:00.503242', 'step': 1501, 'epoch': 2} +{'type': 'loss', 'content': 0.010660061612725258, 'timestamp': '2025-09-30 22:12:00.505412', 'step': 1502, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:00.559660', 'step': 1502, 'epoch': 2} +{'type': 'loss', 'content': 0.00754969660192728, 'timestamp': '2025-09-30 22:12:00.562034', 'step': 1503, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:00.619455', 'step': 1503, 'epoch': 2} +{'type': 'loss', 'content': 0.006606790237128735, 'timestamp': '2025-09-30 22:12:00.625333', 'step': 1504, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:00.684398', 'step': 1504, 'epoch': 2} +{'type': 'loss', 'content': 0.024877065792679787, 'timestamp': '2025-09-30 22:12:00.686471', 'step': 1505, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:00.740071', 'step': 1505, 'epoch': 2} +{'type': 'loss', 'content': 0.018199782818555832, 'timestamp': '2025-09-30 22:12:00.742556', 'step': 1506, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:00.798637', 'step': 1506, 'epoch': 2} +{'type': 'loss', 'content': 0.024665992707014084, 'timestamp': '2025-09-30 22:12:00.800950', 'step': 1507, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-30 22:12:00.860320', 'step': 1507, 'epoch': 2} +{'type': 'loss', 'content': 0.010990189388394356, 'timestamp': '2025-09-30 22:12:00.866243', 'step': 1508, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:00.919912', 'step': 1508, 'epoch': 2} +{'type': 'loss', 'content': 0.011042303405702114, 'timestamp': '2025-09-30 22:12:00.922231', 'step': 1509, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:00.976152', 'step': 1509, 'epoch': 2} +{'type': 'loss', 'content': 0.016723833978176117, 'timestamp': '2025-09-30 22:12:00.978834', 'step': 1510, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:01.033142', 'step': 1510, 'epoch': 2} +{'type': 'loss', 'content': 0.0033279487397521734, 'timestamp': '2025-09-30 22:12:01.035168', 'step': 1511, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:01.088589', 'step': 1511, 'epoch': 2} +{'type': 'loss', 'content': 0.012400150299072266, 'timestamp': '2025-09-30 22:12:01.094521', 'step': 1512, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:01.150517', 'step': 1512, 'epoch': 2} +{'type': 'loss', 'content': 0.011185402050614357, 'timestamp': '2025-09-30 22:12:01.153351', 'step': 1513, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:01.207943', 'step': 1513, 'epoch': 2} +{'type': 'loss', 'content': 0.01125926524400711, 'timestamp': '2025-09-30 22:12:01.210391', 'step': 1514, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:01.265310', 'step': 1514, 'epoch': 2} +{'type': 'loss', 'content': 0.009529463946819305, 'timestamp': '2025-09-30 22:12:01.267633', 'step': 1515, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:01.329492', 'step': 1515, 'epoch': 2} +{'type': 'loss', 'content': 0.03866703063249588, 'timestamp': '2025-09-30 22:12:01.335633', 'step': 1516, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:01.393980', 'step': 1516, 'epoch': 2} +{'type': 'loss', 'content': 0.009177983738481998, 'timestamp': '2025-09-30 22:12:01.396395', 'step': 1517, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:01.453397', 'step': 1517, 'epoch': 2} +{'type': 'loss', 'content': 0.02175569161772728, 'timestamp': '2025-09-30 22:12:01.455615', 'step': 1518, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:01.508806', 'step': 1518, 'epoch': 2} +{'type': 'loss', 'content': 0.012183894403278828, 'timestamp': '2025-09-30 22:12:01.510982', 'step': 1519, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:01.566324', 'step': 1519, 'epoch': 2} +{'type': 'loss', 'content': 0.008869107812643051, 'timestamp': '2025-09-30 22:12:01.574362', 'step': 1520, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:01.628660', 'step': 1520, 'epoch': 2} +{'type': 'loss', 'content': 0.021661145612597466, 'timestamp': '2025-09-30 22:12:01.631165', 'step': 1521, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:01.691533', 'step': 1521, 'epoch': 2} +{'type': 'loss', 'content': 0.019203439354896545, 'timestamp': '2025-09-30 22:12:01.695020', 'step': 1522, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:01.749280', 'step': 1522, 'epoch': 2} +{'type': 'loss', 'content': 0.010725765489041805, 'timestamp': '2025-09-30 22:12:01.751519', 'step': 1523, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:01.806614', 'step': 1523, 'epoch': 2} +{'type': 'loss', 'content': 0.009876924566924572, 'timestamp': '2025-09-30 22:12:01.812145', 'step': 1524, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:01.867986', 'step': 1524, 'epoch': 2} +{'type': 'loss', 'content': 0.011706030927598476, 'timestamp': '2025-09-30 22:12:01.870018', 'step': 1525, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:01.923199', 'step': 1525, 'epoch': 2} +{'type': 'loss', 'content': 0.011448273435235023, 'timestamp': '2025-09-30 22:12:01.925038', 'step': 1526, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:01.977941', 'step': 1526, 'epoch': 2} +{'type': 'loss', 'content': 0.009272503666579723, 'timestamp': '2025-09-30 22:12:01.979995', 'step': 1527, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:02.033727', 'step': 1527, 'epoch': 2} +{'type': 'loss', 'content': 0.05378647893667221, 'timestamp': '2025-09-30 22:12:02.039275', 'step': 1528, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:02.091647', 'step': 1528, 'epoch': 2} +{'type': 'loss', 'content': 0.024520421400666237, 'timestamp': '2025-09-30 22:12:02.093742', 'step': 1529, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:02.146154', 'step': 1529, 'epoch': 2} +{'type': 'loss', 'content': 0.024694928899407387, 'timestamp': '2025-09-30 22:12:02.148676', 'step': 1530, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:02.203002', 'step': 1530, 'epoch': 2} +{'type': 'loss', 'content': 0.007078372407704592, 'timestamp': '2025-09-30 22:12:02.205450', 'step': 1531, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:02.259443', 'step': 1531, 'epoch': 2} +{'type': 'loss', 'content': 0.030642852187156677, 'timestamp': '2025-09-30 22:12:02.265928', 'step': 1532, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:02.319607', 'step': 1532, 'epoch': 2} +{'type': 'loss', 'content': 0.016413046047091484, 'timestamp': '2025-09-30 22:12:02.321885', 'step': 1533, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:02.377357', 'step': 1533, 'epoch': 2} +{'type': 'loss', 'content': 0.026809586212038994, 'timestamp': '2025-09-30 22:12:02.379738', 'step': 1534, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:02.434324', 'step': 1534, 'epoch': 2} +{'type': 'loss', 'content': 0.02859729900956154, 'timestamp': '2025-09-30 22:12:02.437108', 'step': 1535, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:02.492084', 'step': 1535, 'epoch': 2} +{'type': 'loss', 'content': 0.013830293901264668, 'timestamp': '2025-09-30 22:12:02.497710', 'step': 1536, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:02.550441', 'step': 1536, 'epoch': 2} +{'type': 'loss', 'content': 0.01072558481246233, 'timestamp': '2025-09-30 22:12:02.553119', 'step': 1537, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:02.612546', 'step': 1537, 'epoch': 2} +{'type': 'loss', 'content': 0.03165407106280327, 'timestamp': '2025-09-30 22:12:02.614540', 'step': 1538, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:02.669484', 'step': 1538, 'epoch': 2} +{'type': 'loss', 'content': 0.006038249935954809, 'timestamp': '2025-09-30 22:12:02.671470', 'step': 1539, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:12:03.885059', 'step': 1539, 'epoch': 2} +{'type': 'pplx', 'content': 35072222.03899579, 'timestamp': '2025-09-30 22:12:03.886596', 'step': 1539, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:03.938357', 'step': 1539, 'epoch': 2} +{'type': 'loss', 'content': 0.019516736268997192, 'timestamp': '2025-09-30 22:12:03.944029', 'step': 1540, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:03.997583', 'step': 1540, 'epoch': 2} +{'type': 'loss', 'content': 0.01919463463127613, 'timestamp': '2025-09-30 22:12:03.999837', 'step': 1541, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.053989', 'step': 1541, 'epoch': 2} +{'type': 'loss', 'content': 0.02023499831557274, 'timestamp': '2025-09-30 22:12:04.057050', 'step': 1542, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.109535', 'step': 1542, 'epoch': 2} +{'type': 'loss', 'content': 0.0022135619074106216, 'timestamp': '2025-09-30 22:12:04.111673', 'step': 1543, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.164050', 'step': 1543, 'epoch': 2} +{'type': 'loss', 'content': 0.008789089508354664, 'timestamp': '2025-09-30 22:12:04.170203', 'step': 1544, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.224306', 'step': 1544, 'epoch': 2} +{'type': 'loss', 'content': 0.013264109380543232, 'timestamp': '2025-09-30 22:12:04.231236', 'step': 1545, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.286705', 'step': 1545, 'epoch': 2} +{'type': 'loss', 'content': 0.035324085503816605, 'timestamp': '2025-09-30 22:12:04.291376', 'step': 1546, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.347304', 'step': 1546, 'epoch': 2} +{'type': 'loss', 'content': 0.028284629806876183, 'timestamp': '2025-09-30 22:12:04.349678', 'step': 1547, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.405061', 'step': 1547, 'epoch': 2} +{'type': 'loss', 'content': 0.007114126812666655, 'timestamp': '2025-09-30 22:12:04.410655', 'step': 1548, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:04.464055', 'step': 1548, 'epoch': 2} +{'type': 'loss', 'content': 0.0037361420691013336, 'timestamp': '2025-09-30 22:12:04.466154', 'step': 1549, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.519128', 'step': 1549, 'epoch': 2} +{'type': 'loss', 'content': 0.00416518421843648, 'timestamp': '2025-09-30 22:12:04.521220', 'step': 1550, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.574047', 'step': 1550, 'epoch': 2} +{'type': 'loss', 'content': 0.012675133533775806, 'timestamp': '2025-09-30 22:12:04.576126', 'step': 1551, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:04.629683', 'step': 1551, 'epoch': 2} +{'type': 'loss', 'content': 0.018537061288952827, 'timestamp': '2025-09-30 22:12:04.635243', 'step': 1552, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.688065', 'step': 1552, 'epoch': 2} +{'type': 'loss', 'content': 0.010581501759588718, 'timestamp': '2025-09-30 22:12:04.690083', 'step': 1553, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:04.743054', 'step': 1553, 'epoch': 2} +{'type': 'loss', 'content': 0.0007142522372305393, 'timestamp': '2025-09-30 22:12:04.745197', 'step': 1554, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.798016', 'step': 1554, 'epoch': 2} +{'type': 'loss', 'content': 0.03923966363072395, 'timestamp': '2025-09-30 22:12:04.800205', 'step': 1555, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.854036', 'step': 1555, 'epoch': 2} +{'type': 'loss', 'content': 0.02381220832467079, 'timestamp': '2025-09-30 22:12:04.859829', 'step': 1556, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.912029', 'step': 1556, 'epoch': 2} +{'type': 'loss', 'content': 0.002656190888956189, 'timestamp': '2025-09-30 22:12:04.914059', 'step': 1557, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:04.969018', 'step': 1557, 'epoch': 2} +{'type': 'loss', 'content': 0.009249404072761536, 'timestamp': '2025-09-30 22:12:04.971667', 'step': 1558, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:05.026246', 'step': 1558, 'epoch': 2} +{'type': 'loss', 'content': 0.013429306447505951, 'timestamp': '2025-09-30 22:12:05.028394', 'step': 1559, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:05.081243', 'step': 1559, 'epoch': 2} +{'type': 'loss', 'content': 0.0022556984331458807, 'timestamp': '2025-09-30 22:12:05.086891', 'step': 1560, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:05.139568', 'step': 1560, 'epoch': 2} +{'type': 'loss', 'content': 0.039768513292074203, 'timestamp': '2025-09-30 22:12:05.141838', 'step': 1561, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:05.194858', 'step': 1561, 'epoch': 2} +{'type': 'loss', 'content': 0.002185255754739046, 'timestamp': '2025-09-30 22:12:05.197796', 'step': 1562, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:05.250587', 'step': 1562, 'epoch': 2} +{'type': 'loss', 'content': 0.03695518150925636, 'timestamp': '2025-09-30 22:12:05.253676', 'step': 1563, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:05.309805', 'step': 1563, 'epoch': 2} +{'type': 'loss', 'content': 0.006958664394915104, 'timestamp': '2025-09-30 22:12:05.315287', 'step': 1564, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:05.367430', 'step': 1564, 'epoch': 2} +{'type': 'loss', 'content': 0.001130191725678742, 'timestamp': '2025-09-30 22:12:05.370117', 'step': 1565, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:05.423414', 'step': 1565, 'epoch': 2} +{'type': 'loss', 'content': 0.0016752462834119797, 'timestamp': '2025-09-30 22:12:05.425530', 'step': 1566, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:05.477789', 'step': 1566, 'epoch': 2} +{'type': 'loss', 'content': 0.0018236959585919976, 'timestamp': '2025-09-30 22:12:05.480120', 'step': 1567, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:05.532586', 'step': 1567, 'epoch': 2} +{'type': 'loss', 'content': 0.022834938019514084, 'timestamp': '2025-09-30 22:12:05.538287', 'step': 1568, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:05.590319', 'step': 1568, 'epoch': 2} +{'type': 'loss', 'content': 0.006333382334560156, 'timestamp': '2025-09-30 22:12:05.592495', 'step': 1569, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:05.644948', 'step': 1569, 'epoch': 2} +{'type': 'loss', 'content': 0.012706826440989971, 'timestamp': '2025-09-30 22:12:05.647109', 'step': 1570, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:05.699654', 'step': 1570, 'epoch': 2} +{'type': 'loss', 'content': 0.02377459593117237, 'timestamp': '2025-09-30 22:12:05.701745', 'step': 1571, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:05.754334', 'step': 1571, 'epoch': 2} +{'type': 'loss', 'content': 0.011410296894609928, 'timestamp': '2025-09-30 22:12:05.760192', 'step': 1572, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:05.815069', 'step': 1572, 'epoch': 2} +{'type': 'loss', 'content': 0.004736430011689663, 'timestamp': '2025-09-30 22:12:05.817268', 'step': 1573, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:05.869880', 'step': 1573, 'epoch': 2} +{'type': 'loss', 'content': 0.001076001557521522, 'timestamp': '2025-09-30 22:12:05.872137', 'step': 1574, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:05.925587', 'step': 1574, 'epoch': 2} +{'type': 'loss', 'content': 0.023327454924583435, 'timestamp': '2025-09-30 22:12:05.927859', 'step': 1575, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:05.981165', 'step': 1575, 'epoch': 2} +{'type': 'loss', 'content': 0.0017735332949087024, 'timestamp': '2025-09-30 22:12:05.987084', 'step': 1576, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.039957', 'step': 1576, 'epoch': 2} +{'type': 'loss', 'content': 0.010920924134552479, 'timestamp': '2025-09-30 22:12:06.042225', 'step': 1577, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.096214', 'step': 1577, 'epoch': 2} +{'type': 'loss', 'content': 0.028373869135975838, 'timestamp': '2025-09-30 22:12:06.098273', 'step': 1578, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.152111', 'step': 1578, 'epoch': 2} +{'type': 'loss', 'content': 0.0025909452233463526, 'timestamp': '2025-09-30 22:12:06.154864', 'step': 1579, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.209637', 'step': 1579, 'epoch': 2} +{'type': 'loss', 'content': 0.01281531248241663, 'timestamp': '2025-09-30 22:12:06.216014', 'step': 1580, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:06.269328', 'step': 1580, 'epoch': 2} +{'type': 'loss', 'content': 0.01906949281692505, 'timestamp': '2025-09-30 22:12:06.272333', 'step': 1581, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.326030', 'step': 1581, 'epoch': 2} +{'type': 'loss', 'content': 0.008657933212816715, 'timestamp': '2025-09-30 22:12:06.328046', 'step': 1582, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.380427', 'step': 1582, 'epoch': 2} +{'type': 'loss', 'content': 0.011052996851503849, 'timestamp': '2025-09-30 22:12:06.382745', 'step': 1583, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.436203', 'step': 1583, 'epoch': 2} +{'type': 'loss', 'content': 0.008702821098268032, 'timestamp': '2025-09-30 22:12:06.441965', 'step': 1584, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.493780', 'step': 1584, 'epoch': 2} +{'type': 'loss', 'content': 0.01199591439217329, 'timestamp': '2025-09-30 22:12:06.495960', 'step': 1585, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.548081', 'step': 1585, 'epoch': 2} +{'type': 'loss', 'content': 0.02052219584584236, 'timestamp': '2025-09-30 22:12:06.550557', 'step': 1586, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:06.603416', 'step': 1586, 'epoch': 2} +{'type': 'loss', 'content': 0.029241319745779037, 'timestamp': '2025-09-30 22:12:06.605525', 'step': 1587, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.657839', 'step': 1587, 'epoch': 2} +{'type': 'loss', 'content': 0.004892691969871521, 'timestamp': '2025-09-30 22:12:06.663407', 'step': 1588, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.727359', 'step': 1588, 'epoch': 2} +{'type': 'loss', 'content': 0.021011924371123314, 'timestamp': '2025-09-30 22:12:06.729566', 'step': 1589, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.782007', 'step': 1589, 'epoch': 2} +{'type': 'loss', 'content': 0.01708308607339859, 'timestamp': '2025-09-30 22:12:06.784478', 'step': 1590, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.837235', 'step': 1590, 'epoch': 2} +{'type': 'loss', 'content': 0.035234589129686356, 'timestamp': '2025-09-30 22:12:06.839857', 'step': 1591, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.892589', 'step': 1591, 'epoch': 2} +{'type': 'loss', 'content': 0.047162704169750214, 'timestamp': '2025-09-30 22:12:06.898404', 'step': 1592, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:06.950346', 'step': 1592, 'epoch': 2} +{'type': 'loss', 'content': 0.010256440378725529, 'timestamp': '2025-09-30 22:12:06.952596', 'step': 1593, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:07.005219', 'step': 1593, 'epoch': 2} +{'type': 'loss', 'content': 0.011917630210518837, 'timestamp': '2025-09-30 22:12:07.007233', 'step': 1594, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:07.062589', 'step': 1594, 'epoch': 2} +{'type': 'loss', 'content': 0.019979296252131462, 'timestamp': '2025-09-30 22:12:07.064654', 'step': 1595, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:07.117139', 'step': 1595, 'epoch': 2} +{'type': 'loss', 'content': 0.020958244800567627, 'timestamp': '2025-09-30 22:12:07.122974', 'step': 1596, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:12:08.315114', 'step': 1596, 'epoch': 2} +{'type': 'pplx', 'content': 33021765.354655504, 'timestamp': '2025-09-30 22:12:08.317184', 'step': 1596, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:08.368251', 'step': 1596, 'epoch': 2} +{'type': 'loss', 'content': 0.00980797503143549, 'timestamp': '2025-09-30 22:12:08.370313', 'step': 1597, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:08.433892', 'step': 1597, 'epoch': 2} +{'type': 'loss', 'content': 0.009293398819863796, 'timestamp': '2025-09-30 22:12:08.435957', 'step': 1598, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:08.488754', 'step': 1598, 'epoch': 2} +{'type': 'loss', 'content': 0.015030805952847004, 'timestamp': '2025-09-30 22:12:08.491114', 'step': 1599, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:08.549008', 'step': 1599, 'epoch': 2} +{'type': 'loss', 'content': 0.011158740147948265, 'timestamp': '2025-09-30 22:12:08.554545', 'step': 1600, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:08.610775', 'step': 1600, 'epoch': 2} +{'type': 'loss', 'content': 0.005423234310001135, 'timestamp': '2025-09-30 22:12:08.612759', 'step': 1601, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:08.666779', 'step': 1601, 'epoch': 2} +{'type': 'loss', 'content': 0.02317948266863823, 'timestamp': '2025-09-30 22:12:08.668958', 'step': 1602, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:08.722140', 'step': 1602, 'epoch': 2} +{'type': 'loss', 'content': 0.01775677688419819, 'timestamp': '2025-09-30 22:12:08.724860', 'step': 1603, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:08.791960', 'step': 1603, 'epoch': 2} +{'type': 'loss', 'content': 0.0035264005418866873, 'timestamp': '2025-09-30 22:12:08.798017', 'step': 1604, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:08.851106', 'step': 1604, 'epoch': 2} +{'type': 'loss', 'content': 0.006642151158303022, 'timestamp': '2025-09-30 22:12:08.853289', 'step': 1605, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:08.906804', 'step': 1605, 'epoch': 2} +{'type': 'loss', 'content': 0.026895778253674507, 'timestamp': '2025-09-30 22:12:08.909344', 'step': 1606, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:08.964905', 'step': 1606, 'epoch': 2} +{'type': 'loss', 'content': 0.008761496283113956, 'timestamp': '2025-09-30 22:12:08.967384', 'step': 1607, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:09.020829', 'step': 1607, 'epoch': 2} +{'type': 'loss', 'content': 0.011370251886546612, 'timestamp': '2025-09-30 22:12:09.026859', 'step': 1608, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:09.079870', 'step': 1608, 'epoch': 2} +{'type': 'loss', 'content': 0.007803045213222504, 'timestamp': '2025-09-30 22:12:09.081910', 'step': 1609, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:09.135803', 'step': 1609, 'epoch': 2} +{'type': 'loss', 'content': 0.02594866044819355, 'timestamp': '2025-09-30 22:12:09.138287', 'step': 1610, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:09.191233', 'step': 1610, 'epoch': 2} +{'type': 'loss', 'content': 0.03655938431620598, 'timestamp': '2025-09-30 22:12:09.194673', 'step': 1611, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:09.250578', 'step': 1611, 'epoch': 2} +{'type': 'loss', 'content': 0.005339773837476969, 'timestamp': '2025-09-30 22:12:09.258772', 'step': 1612, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:09.313826', 'step': 1612, 'epoch': 2} +{'type': 'loss', 'content': 0.013894207775592804, 'timestamp': '2025-09-30 22:12:09.316505', 'step': 1613, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:09.370619', 'step': 1613, 'epoch': 2} +{'type': 'loss', 'content': 0.005706585478037596, 'timestamp': '2025-09-30 22:12:09.373401', 'step': 1614, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:09.427451', 'step': 1614, 'epoch': 2} +{'type': 'loss', 'content': 0.005220034625381231, 'timestamp': '2025-09-30 22:12:09.430066', 'step': 1615, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:09.483919', 'step': 1615, 'epoch': 2} +{'type': 'loss', 'content': 0.02249130979180336, 'timestamp': '2025-09-30 22:12:09.490894', 'step': 1616, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:09.545619', 'step': 1616, 'epoch': 2} +{'type': 'loss', 'content': 0.024420803412795067, 'timestamp': '2025-09-30 22:12:09.548516', 'step': 1617, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:09.601846', 'step': 1617, 'epoch': 2} +{'type': 'loss', 'content': 0.006213244050741196, 'timestamp': '2025-09-30 22:12:09.604007', 'step': 1618, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:09.657294', 'step': 1618, 'epoch': 2} +{'type': 'loss', 'content': 0.010518096387386322, 'timestamp': '2025-09-30 22:12:09.660390', 'step': 1619, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:09.715056', 'step': 1619, 'epoch': 2} +{'type': 'loss', 'content': 0.016626330092549324, 'timestamp': '2025-09-30 22:12:09.721021', 'step': 1620, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:09.778310', 'step': 1620, 'epoch': 2} +{'type': 'loss', 'content': 0.010907611809670925, 'timestamp': '2025-09-30 22:12:09.780560', 'step': 1621, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:09.835444', 'step': 1621, 'epoch': 2} +{'type': 'loss', 'content': 0.010932761244475842, 'timestamp': '2025-09-30 22:12:09.838341', 'step': 1622, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:09.891989', 'step': 1622, 'epoch': 2} +{'type': 'loss', 'content': 0.002364618005231023, 'timestamp': '2025-09-30 22:12:09.894614', 'step': 1623, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:09.949021', 'step': 1623, 'epoch': 2} +{'type': 'loss', 'content': 0.0025081944186240435, 'timestamp': '2025-09-30 22:12:09.955528', 'step': 1624, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.012894', 'step': 1624, 'epoch': 2} +{'type': 'loss', 'content': 0.011359497904777527, 'timestamp': '2025-09-30 22:12:10.015406', 'step': 1625, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.068841', 'step': 1625, 'epoch': 2} +{'type': 'loss', 'content': 0.0060988375917077065, 'timestamp': '2025-09-30 22:12:10.071461', 'step': 1626, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.127759', 'step': 1626, 'epoch': 2} +{'type': 'loss', 'content': 0.05257219076156616, 'timestamp': '2025-09-30 22:12:10.130388', 'step': 1627, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:12:10.185216', 'step': 1627, 'epoch': 2} +{'type': 'loss', 'content': 0.012155899778008461, 'timestamp': '2025-09-30 22:12:10.192809', 'step': 1628, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.248137', 'step': 1628, 'epoch': 2} +{'type': 'loss', 'content': 0.01888015680015087, 'timestamp': '2025-09-30 22:12:10.251948', 'step': 1629, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.310545', 'step': 1629, 'epoch': 2} +{'type': 'loss', 'content': 0.011508808471262455, 'timestamp': '2025-09-30 22:12:10.314123', 'step': 1630, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.369979', 'step': 1630, 'epoch': 2} +{'type': 'loss', 'content': 0.009577545337378979, 'timestamp': '2025-09-30 22:12:10.372814', 'step': 1631, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:10.427544', 'step': 1631, 'epoch': 2} +{'type': 'loss', 'content': 0.022420035675168037, 'timestamp': '2025-09-30 22:12:10.433549', 'step': 1632, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.487384', 'step': 1632, 'epoch': 2} +{'type': 'loss', 'content': 0.013017824850976467, 'timestamp': '2025-09-30 22:12:10.490027', 'step': 1633, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.543592', 'step': 1633, 'epoch': 2} +{'type': 'loss', 'content': 0.006547243800014257, 'timestamp': '2025-09-30 22:12:10.545311', 'step': 1634, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.603149', 'step': 1634, 'epoch': 2} +{'type': 'loss', 'content': 0.016644669696688652, 'timestamp': '2025-09-30 22:12:10.604931', 'step': 1635, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.660831', 'step': 1635, 'epoch': 2} +{'type': 'loss', 'content': 0.009076499380171299, 'timestamp': '2025-09-30 22:12:10.666106', 'step': 1636, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:10.719634', 'step': 1636, 'epoch': 2} +{'type': 'loss', 'content': 0.0293984804302454, 'timestamp': '2025-09-30 22:12:10.721835', 'step': 1637, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.776206', 'step': 1637, 'epoch': 2} +{'type': 'loss', 'content': 0.001451183925382793, 'timestamp': '2025-09-30 22:12:10.778315', 'step': 1638, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.832230', 'step': 1638, 'epoch': 2} +{'type': 'loss', 'content': 0.002988268854096532, 'timestamp': '2025-09-30 22:12:10.834016', 'step': 1639, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.888355', 'step': 1639, 'epoch': 2} +{'type': 'loss', 'content': 0.007970958016812801, 'timestamp': '2025-09-30 22:12:10.893965', 'step': 1640, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:10.947408', 'step': 1640, 'epoch': 2} +{'type': 'loss', 'content': 0.026539817452430725, 'timestamp': '2025-09-30 22:12:10.949401', 'step': 1641, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:11.003469', 'step': 1641, 'epoch': 2} +{'type': 'loss', 'content': 0.017281271517276764, 'timestamp': '2025-09-30 22:12:11.005352', 'step': 1642, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:11.067465', 'step': 1642, 'epoch': 2} +{'type': 'loss', 'content': 0.008392428047955036, 'timestamp': '2025-09-30 22:12:11.069643', 'step': 1643, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:11.125032', 'step': 1643, 'epoch': 2} +{'type': 'loss', 'content': 0.01337872352451086, 'timestamp': '2025-09-30 22:12:11.130293', 'step': 1644, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:11.183409', 'step': 1644, 'epoch': 2} +{'type': 'loss', 'content': 0.005623048637062311, 'timestamp': '2025-09-30 22:12:11.186127', 'step': 1645, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:11.241311', 'step': 1645, 'epoch': 2} +{'type': 'loss', 'content': 0.009559988044202328, 'timestamp': '2025-09-30 22:12:11.243866', 'step': 1646, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:11.296841', 'step': 1646, 'epoch': 2} +{'type': 'loss', 'content': 0.010462108068168163, 'timestamp': '2025-09-30 22:12:11.299819', 'step': 1647, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:11.354919', 'step': 1647, 'epoch': 2} +{'type': 'loss', 'content': 0.008628972806036472, 'timestamp': '2025-09-30 22:12:11.360579', 'step': 1648, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:11.413579', 'step': 1648, 'epoch': 2} +{'type': 'loss', 'content': 0.017333444207906723, 'timestamp': '2025-09-30 22:12:11.415784', 'step': 1649, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:11.473420', 'step': 1649, 'epoch': 2} +{'type': 'loss', 'content': 0.007165636867284775, 'timestamp': '2025-09-30 22:12:11.475820', 'step': 1650, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:11.529599', 'step': 1650, 'epoch': 2} +{'type': 'loss', 'content': 0.005024661775678396, 'timestamp': '2025-09-30 22:12:11.531962', 'step': 1651, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:11.586225', 'step': 1651, 'epoch': 2} +{'type': 'loss', 'content': 0.003687667427584529, 'timestamp': '2025-09-30 22:12:11.591545', 'step': 1652, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:11.644033', 'step': 1652, 'epoch': 2} +{'type': 'loss', 'content': 0.01683473400771618, 'timestamp': '2025-09-30 22:12:11.645842', 'step': 1653, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:12:12.861767', 'step': 1653, 'epoch': 2} +{'type': 'pplx', 'content': 31819411.36838668, 'timestamp': '2025-09-30 22:12:12.875645', 'step': 1653, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:12.927653', 'step': 1653, 'epoch': 2} +{'type': 'loss', 'content': 0.019419243559241295, 'timestamp': '2025-09-30 22:12:12.929830', 'step': 1654, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:12.982438', 'step': 1654, 'epoch': 2} +{'type': 'loss', 'content': 0.004655472934246063, 'timestamp': '2025-09-30 22:12:12.984630', 'step': 1655, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:13.038205', 'step': 1655, 'epoch': 2} +{'type': 'loss', 'content': 0.0069176494143903255, 'timestamp': '2025-09-30 22:12:13.044005', 'step': 1656, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:13.096761', 'step': 1656, 'epoch': 2} +{'type': 'loss', 'content': 0.0254069771617651, 'timestamp': '2025-09-30 22:12:13.098612', 'step': 1657, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:13.151554', 'step': 1657, 'epoch': 2} +{'type': 'loss', 'content': 0.02898487262427807, 'timestamp': '2025-09-30 22:12:13.153772', 'step': 1658, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:13.207387', 'step': 1658, 'epoch': 2} +{'type': 'loss', 'content': 0.010060988366603851, 'timestamp': '2025-09-30 22:12:13.210551', 'step': 1659, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:13.264259', 'step': 1659, 'epoch': 2} +{'type': 'loss', 'content': 0.004005379509180784, 'timestamp': '2025-09-30 22:12:13.272363', 'step': 1660, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:13.334659', 'step': 1660, 'epoch': 2} +{'type': 'loss', 'content': 0.043611329048871994, 'timestamp': '2025-09-30 22:12:13.337118', 'step': 1661, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:13.389890', 'step': 1661, 'epoch': 2} +{'type': 'loss', 'content': 0.007646446116268635, 'timestamp': '2025-09-30 22:12:13.391929', 'step': 1662, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:13.445571', 'step': 1662, 'epoch': 2} +{'type': 'loss', 'content': 0.008269752375781536, 'timestamp': '2025-09-30 22:12:13.447687', 'step': 1663, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:13.503140', 'step': 1663, 'epoch': 2} +{'type': 'loss', 'content': 0.003249566303566098, 'timestamp': '2025-09-30 22:12:13.508527', 'step': 1664, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:13.561133', 'step': 1664, 'epoch': 2} +{'type': 'loss', 'content': 0.010090269148349762, 'timestamp': '2025-09-30 22:12:13.562734', 'step': 1665, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:13.615328', 'step': 1665, 'epoch': 2} +{'type': 'loss', 'content': 0.005579036194831133, 'timestamp': '2025-09-30 22:12:13.616959', 'step': 1666, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:13.670719', 'step': 1666, 'epoch': 2} +{'type': 'loss', 'content': 0.023332465440034866, 'timestamp': '2025-09-30 22:12:13.672956', 'step': 1667, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:13.726365', 'step': 1667, 'epoch': 2} +{'type': 'loss', 'content': 0.03554206341505051, 'timestamp': '2025-09-30 22:12:13.732007', 'step': 1668, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:13.785078', 'step': 1668, 'epoch': 2} +{'type': 'loss', 'content': 0.059974852949380875, 'timestamp': '2025-09-30 22:12:13.787293', 'step': 1669, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:13.840026', 'step': 1669, 'epoch': 2} +{'type': 'loss', 'content': 0.00815680343657732, 'timestamp': '2025-09-30 22:12:13.842053', 'step': 1670, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:13.897501', 'step': 1670, 'epoch': 2} +{'type': 'loss', 'content': 0.013160581700503826, 'timestamp': '2025-09-30 22:12:13.899928', 'step': 1671, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:13.953678', 'step': 1671, 'epoch': 2} +{'type': 'loss', 'content': 0.011886881664395332, 'timestamp': '2025-09-30 22:12:13.959315', 'step': 1672, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:14.016600', 'step': 1672, 'epoch': 2} +{'type': 'loss', 'content': 0.011096789501607418, 'timestamp': '2025-09-30 22:12:14.018598', 'step': 1673, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:14.072022', 'step': 1673, 'epoch': 2} +{'type': 'loss', 'content': 0.002368275774642825, 'timestamp': '2025-09-30 22:12:14.074112', 'step': 1674, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:14.128129', 'step': 1674, 'epoch': 2} +{'type': 'loss', 'content': 0.00044797913869842887, 'timestamp': '2025-09-30 22:12:14.130228', 'step': 1675, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:14.184419', 'step': 1675, 'epoch': 2} +{'type': 'loss', 'content': 0.0028583400417119265, 'timestamp': '2025-09-30 22:12:14.190607', 'step': 1676, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:14.244200', 'step': 1676, 'epoch': 2} +{'type': 'loss', 'content': 0.008487415499985218, 'timestamp': '2025-09-30 22:12:14.246938', 'step': 1677, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:14.299946', 'step': 1677, 'epoch': 2} +{'type': 'loss', 'content': 0.00515100359916687, 'timestamp': '2025-09-30 22:12:14.302158', 'step': 1678, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:12:14.358300', 'step': 1678, 'epoch': 2} +{'type': 'loss', 'content': 0.0111812399700284, 'timestamp': '2025-09-30 22:12:14.360428', 'step': 1679, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:14.413857', 'step': 1679, 'epoch': 2} +{'type': 'loss', 'content': 0.018307924270629883, 'timestamp': '2025-09-30 22:12:14.419308', 'step': 1680, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:14.474616', 'step': 1680, 'epoch': 2} +{'type': 'loss', 'content': 0.005157233215868473, 'timestamp': '2025-09-30 22:12:14.476856', 'step': 1681, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:14.533033', 'step': 1681, 'epoch': 2} +{'type': 'loss', 'content': 0.006048670504242182, 'timestamp': '2025-09-30 22:12:14.535241', 'step': 1682, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:14.589464', 'step': 1682, 'epoch': 2} +{'type': 'loss', 'content': 0.005315546877682209, 'timestamp': '2025-09-30 22:12:14.592376', 'step': 1683, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:14.653180', 'step': 1683, 'epoch': 2} +{'type': 'loss', 'content': 0.004861609544605017, 'timestamp': '2025-09-30 22:12:14.658824', 'step': 1684, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:14.711677', 'step': 1684, 'epoch': 2} +{'type': 'loss', 'content': 0.03198011592030525, 'timestamp': '2025-09-30 22:12:14.713847', 'step': 1685, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:14.766204', 'step': 1685, 'epoch': 2} +{'type': 'loss', 'content': 0.006582505535334349, 'timestamp': '2025-09-30 22:12:14.768519', 'step': 1686, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:14.824037', 'step': 1686, 'epoch': 2} +{'type': 'loss', 'content': 0.041787777096033096, 'timestamp': '2025-09-30 22:12:14.826140', 'step': 1687, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:14.884293', 'step': 1687, 'epoch': 2} +{'type': 'loss', 'content': 0.05907066911458969, 'timestamp': '2025-09-30 22:12:14.890039', 'step': 1688, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:14.944929', 'step': 1688, 'epoch': 2} +{'type': 'loss', 'content': 0.001978454412892461, 'timestamp': '2025-09-30 22:12:14.947025', 'step': 1689, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:15.002116', 'step': 1689, 'epoch': 2} +{'type': 'loss', 'content': 0.052878353744745255, 'timestamp': '2025-09-30 22:12:15.004343', 'step': 1690, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:15.059424', 'step': 1690, 'epoch': 2} +{'type': 'loss', 'content': 0.0005954128573648632, 'timestamp': '2025-09-30 22:12:15.061543', 'step': 1691, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:15.114789', 'step': 1691, 'epoch': 2} +{'type': 'loss', 'content': 0.02634171023964882, 'timestamp': '2025-09-30 22:12:15.121913', 'step': 1692, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:15.177756', 'step': 1692, 'epoch': 2} +{'type': 'loss', 'content': 0.013122417032718658, 'timestamp': '2025-09-30 22:12:15.179846', 'step': 1693, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:15.234822', 'step': 1693, 'epoch': 2} +{'type': 'loss', 'content': 0.008947016671299934, 'timestamp': '2025-09-30 22:12:15.238000', 'step': 1694, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:15.292395', 'step': 1694, 'epoch': 2} +{'type': 'loss', 'content': 0.0031593344174325466, 'timestamp': '2025-09-30 22:12:15.295329', 'step': 1695, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:15.350920', 'step': 1695, 'epoch': 2} +{'type': 'loss', 'content': 0.05288988724350929, 'timestamp': '2025-09-30 22:12:15.356438', 'step': 1696, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:15.411357', 'step': 1696, 'epoch': 2} +{'type': 'loss', 'content': 0.005359127651900053, 'timestamp': '2025-09-30 22:12:15.413335', 'step': 1697, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:15.471260', 'step': 1697, 'epoch': 2} +{'type': 'loss', 'content': 0.011889653280377388, 'timestamp': '2025-09-30 22:12:15.473427', 'step': 1698, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:15.532562', 'step': 1698, 'epoch': 2} +{'type': 'loss', 'content': 0.0044033522717654705, 'timestamp': '2025-09-30 22:12:15.534884', 'step': 1699, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:15.598256', 'step': 1699, 'epoch': 2} +{'type': 'loss', 'content': 0.016879115253686905, 'timestamp': '2025-09-30 22:12:15.604579', 'step': 1700, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:15.657992', 'step': 1700, 'epoch': 2} +{'type': 'loss', 'content': 0.02566424012184143, 'timestamp': '2025-09-30 22:12:15.660020', 'step': 1701, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:15.713446', 'step': 1701, 'epoch': 2} +{'type': 'loss', 'content': 0.0031795850954949856, 'timestamp': '2025-09-30 22:12:15.715503', 'step': 1702, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:15.769623', 'step': 1702, 'epoch': 2} +{'type': 'loss', 'content': 0.003098678309470415, 'timestamp': '2025-09-30 22:12:15.771959', 'step': 1703, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:15.828493', 'step': 1703, 'epoch': 2} +{'type': 'loss', 'content': 0.0025204757694154978, 'timestamp': '2025-09-30 22:12:15.833993', 'step': 1704, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:15.890623', 'step': 1704, 'epoch': 2} +{'type': 'loss', 'content': 0.007822325453162193, 'timestamp': '2025-09-30 22:12:15.892874', 'step': 1705, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:15.945573', 'step': 1705, 'epoch': 2} +{'type': 'loss', 'content': 0.017046038061380386, 'timestamp': '2025-09-30 22:12:15.947579', 'step': 1706, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:16.015154', 'step': 1706, 'epoch': 2} +{'type': 'loss', 'content': 0.0030831003095954657, 'timestamp': '2025-09-30 22:12:16.017231', 'step': 1707, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:16.072731', 'step': 1707, 'epoch': 2} +{'type': 'loss', 'content': 0.00840392243117094, 'timestamp': '2025-09-30 22:12:16.078330', 'step': 1708, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:16.130875', 'step': 1708, 'epoch': 2} +{'type': 'loss', 'content': 0.011422915384173393, 'timestamp': '2025-09-30 22:12:16.133009', 'step': 1709, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:16.189790', 'step': 1709, 'epoch': 2} +{'type': 'loss', 'content': 0.004428292624652386, 'timestamp': '2025-09-30 22:12:16.191961', 'step': 1710, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:12:17.490807', 'step': 1710, 'epoch': 2} +{'type': 'pplx', 'content': 28349924.714862473, 'timestamp': '2025-09-30 22:12:17.492915', 'step': 1710, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:17.549103', 'step': 1710, 'epoch': 2} +{'type': 'loss', 'content': 0.024888822808861732, 'timestamp': '2025-09-30 22:12:17.552103', 'step': 1711, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:17.607307', 'step': 1711, 'epoch': 2} +{'type': 'loss', 'content': 0.022045819088816643, 'timestamp': '2025-09-30 22:12:17.613677', 'step': 1712, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:17.671421', 'step': 1712, 'epoch': 2} +{'type': 'loss', 'content': 0.006401827093213797, 'timestamp': '2025-09-30 22:12:17.673761', 'step': 1713, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:17.729271', 'step': 1713, 'epoch': 2} +{'type': 'loss', 'content': 0.007945166900753975, 'timestamp': '2025-09-30 22:12:17.733538', 'step': 1714, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:17.795084', 'step': 1714, 'epoch': 2} +{'type': 'loss', 'content': 0.006832032930105925, 'timestamp': '2025-09-30 22:12:17.797623', 'step': 1715, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:17.854066', 'step': 1715, 'epoch': 2} +{'type': 'loss', 'content': 0.010955681093037128, 'timestamp': '2025-09-30 22:12:17.860042', 'step': 1716, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:17.913039', 'step': 1716, 'epoch': 2} +{'type': 'loss', 'content': 0.025570230558514595, 'timestamp': '2025-09-30 22:12:17.917842', 'step': 1717, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:17.972577', 'step': 1717, 'epoch': 2} +{'type': 'loss', 'content': 0.013870848342776299, 'timestamp': '2025-09-30 22:12:17.977653', 'step': 1718, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:18.031960', 'step': 1718, 'epoch': 2} +{'type': 'loss', 'content': 0.018312016502022743, 'timestamp': '2025-09-30 22:12:18.034566', 'step': 1719, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:18.098990', 'step': 1719, 'epoch': 2} +{'type': 'loss', 'content': 0.016707230359315872, 'timestamp': '2025-09-30 22:12:18.104901', 'step': 1720, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:18.169517', 'step': 1720, 'epoch': 2} +{'type': 'loss', 'content': 0.010144525207579136, 'timestamp': '2025-09-30 22:12:18.171798', 'step': 1721, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:18.233785', 'step': 1721, 'epoch': 2} +{'type': 'loss', 'content': 0.011039652861654758, 'timestamp': '2025-09-30 22:12:18.237053', 'step': 1722, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:18.305230', 'step': 1722, 'epoch': 2} +{'type': 'loss', 'content': 0.01270805113017559, 'timestamp': '2025-09-30 22:12:18.313241', 'step': 1723, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:18.368284', 'step': 1723, 'epoch': 2} +{'type': 'loss', 'content': 0.00563334533944726, 'timestamp': '2025-09-30 22:12:18.375460', 'step': 1724, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:18.432695', 'step': 1724, 'epoch': 2} +{'type': 'loss', 'content': 0.005631479900330305, 'timestamp': '2025-09-30 22:12:18.440294', 'step': 1725, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:18.497729', 'step': 1725, 'epoch': 2} +{'type': 'loss', 'content': 0.009919635951519012, 'timestamp': '2025-09-30 22:12:18.505164', 'step': 1726, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:18.560365', 'step': 1726, 'epoch': 2} +{'type': 'loss', 'content': 0.023495439440011978, 'timestamp': '2025-09-30 22:12:18.564363', 'step': 1727, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:18.620110', 'step': 1727, 'epoch': 2} +{'type': 'loss', 'content': 0.008099724538624287, 'timestamp': '2025-09-30 22:12:18.626786', 'step': 1728, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:18.683795', 'step': 1728, 'epoch': 2} +{'type': 'loss', 'content': 0.009610223583877087, 'timestamp': '2025-09-30 22:12:18.697005', 'step': 1729, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:18.753105', 'step': 1729, 'epoch': 2} +{'type': 'loss', 'content': 0.009814736433327198, 'timestamp': '2025-09-30 22:12:18.758237', 'step': 1730, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:18.821455', 'step': 1730, 'epoch': 2} +{'type': 'loss', 'content': 0.02346763201057911, 'timestamp': '2025-09-30 22:12:18.835026', 'step': 1731, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:18.902723', 'step': 1731, 'epoch': 2} +{'type': 'loss', 'content': 0.0012323985574766994, 'timestamp': '2025-09-30 22:12:18.909790', 'step': 1732, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:18.965608', 'step': 1732, 'epoch': 2} +{'type': 'loss', 'content': 0.010308587923645973, 'timestamp': '2025-09-30 22:12:18.967963', 'step': 1733, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:19.025664', 'step': 1733, 'epoch': 2} +{'type': 'loss', 'content': 0.015463477931916714, 'timestamp': '2025-09-30 22:12:19.028111', 'step': 1734, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:19.082060', 'step': 1734, 'epoch': 2} +{'type': 'loss', 'content': 0.013520815409719944, 'timestamp': '2025-09-30 22:12:19.086364', 'step': 1735, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:19.142929', 'step': 1735, 'epoch': 2} +{'type': 'loss', 'content': 0.007343901786953211, 'timestamp': '2025-09-30 22:12:19.149616', 'step': 1736, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:19.207407', 'step': 1736, 'epoch': 2} +{'type': 'loss', 'content': 0.012418312020599842, 'timestamp': '2025-09-30 22:12:19.209506', 'step': 1737, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:19.267575', 'step': 1737, 'epoch': 2} +{'type': 'loss', 'content': 0.019974172115325928, 'timestamp': '2025-09-30 22:12:19.270043', 'step': 1738, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:19.329102', 'step': 1738, 'epoch': 2} +{'type': 'loss', 'content': 0.005352269392460585, 'timestamp': '2025-09-30 22:12:19.333119', 'step': 1739, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:19.393917', 'step': 1739, 'epoch': 2} +{'type': 'loss', 'content': 0.007415872532874346, 'timestamp': '2025-09-30 22:12:19.404366', 'step': 1740, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:19.465342', 'step': 1740, 'epoch': 2} +{'type': 'loss', 'content': 0.020012961700558662, 'timestamp': '2025-09-30 22:12:19.467678', 'step': 1741, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:19.526733', 'step': 1741, 'epoch': 2} +{'type': 'loss', 'content': 0.011243684217333794, 'timestamp': '2025-09-30 22:12:19.528909', 'step': 1742, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:19.599075', 'step': 1742, 'epoch': 2} +{'type': 'loss', 'content': 0.023677440360188484, 'timestamp': '2025-09-30 22:12:19.601646', 'step': 1743, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:19.655231', 'step': 1743, 'epoch': 2} +{'type': 'loss', 'content': 0.023243535310029984, 'timestamp': '2025-09-30 22:12:19.661376', 'step': 1744, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:19.730048', 'step': 1744, 'epoch': 2} +{'type': 'loss', 'content': 0.009879850782454014, 'timestamp': '2025-09-30 22:12:19.732022', 'step': 1745, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:19.786102', 'step': 1745, 'epoch': 2} +{'type': 'loss', 'content': 0.006921069230884314, 'timestamp': '2025-09-30 22:12:19.788498', 'step': 1746, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:19.853434', 'step': 1746, 'epoch': 2} +{'type': 'loss', 'content': 0.004081842955201864, 'timestamp': '2025-09-30 22:12:19.855443', 'step': 1747, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:19.910691', 'step': 1747, 'epoch': 2} +{'type': 'loss', 'content': 0.004515796434134245, 'timestamp': '2025-09-30 22:12:19.916269', 'step': 1748, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:19.973698', 'step': 1748, 'epoch': 2} +{'type': 'loss', 'content': 0.005187832750380039, 'timestamp': '2025-09-30 22:12:19.975788', 'step': 1749, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:20.030734', 'step': 1749, 'epoch': 2} +{'type': 'loss', 'content': 0.009365570731461048, 'timestamp': '2025-09-30 22:12:20.036962', 'step': 1750, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:20.099843', 'step': 1750, 'epoch': 2} +{'type': 'loss', 'content': 0.014311355538666248, 'timestamp': '2025-09-30 22:12:20.101958', 'step': 1751, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:20.163735', 'step': 1751, 'epoch': 2} +{'type': 'loss', 'content': 0.0038259513676166534, 'timestamp': '2025-09-30 22:12:20.169460', 'step': 1752, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:20.222660', 'step': 1752, 'epoch': 2} +{'type': 'loss', 'content': 0.011391245760023594, 'timestamp': '2025-09-30 22:12:20.224850', 'step': 1753, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:20.282986', 'step': 1753, 'epoch': 2} +{'type': 'loss', 'content': 0.03851420059800148, 'timestamp': '2025-09-30 22:12:20.285579', 'step': 1754, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:20.347897', 'step': 1754, 'epoch': 2} +{'type': 'loss', 'content': 0.008675575256347656, 'timestamp': '2025-09-30 22:12:20.349959', 'step': 1755, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:20.405526', 'step': 1755, 'epoch': 2} +{'type': 'loss', 'content': 0.016350556164979935, 'timestamp': '2025-09-30 22:12:20.411169', 'step': 1756, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:20.465248', 'step': 1756, 'epoch': 2} +{'type': 'loss', 'content': 0.0020395752508193254, 'timestamp': '2025-09-30 22:12:20.468495', 'step': 1757, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:20.527627', 'step': 1757, 'epoch': 2} +{'type': 'loss', 'content': 0.007115581072866917, 'timestamp': '2025-09-30 22:12:20.530931', 'step': 1758, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:20.588081', 'step': 1758, 'epoch': 2} +{'type': 'loss', 'content': 0.008915326558053493, 'timestamp': '2025-09-30 22:12:20.590168', 'step': 1759, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:20.648323', 'step': 1759, 'epoch': 2} +{'type': 'loss', 'content': 0.01385872345417738, 'timestamp': '2025-09-30 22:12:20.653777', 'step': 1760, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:20.710857', 'step': 1760, 'epoch': 2} +{'type': 'loss', 'content': 0.019434064626693726, 'timestamp': '2025-09-30 22:12:20.712841', 'step': 1761, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:20.768129', 'step': 1761, 'epoch': 2} +{'type': 'loss', 'content': 0.001426653703674674, 'timestamp': '2025-09-30 22:12:20.770139', 'step': 1762, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:20.826674', 'step': 1762, 'epoch': 2} +{'type': 'loss', 'content': 0.000667016429360956, 'timestamp': '2025-09-30 22:12:20.828901', 'step': 1763, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:20.884903', 'step': 1763, 'epoch': 2} +{'type': 'loss', 'content': 0.012331242673099041, 'timestamp': '2025-09-30 22:12:20.891604', 'step': 1764, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:20.946165', 'step': 1764, 'epoch': 2} +{'type': 'loss', 'content': 0.008639157749712467, 'timestamp': '2025-09-30 22:12:20.948392', 'step': 1765, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:21.015661', 'step': 1765, 'epoch': 2} +{'type': 'loss', 'content': 0.001175249577499926, 'timestamp': '2025-09-30 22:12:21.017767', 'step': 1766, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:21.074161', 'step': 1766, 'epoch': 2} +{'type': 'loss', 'content': 0.0008622457389719784, 'timestamp': '2025-09-30 22:12:21.076962', 'step': 1767, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:12:22.508631', 'step': 1767, 'epoch': 2} +{'type': 'pplx', 'content': 30314827.130945917, 'timestamp': '2025-09-30 22:12:22.511215', 'step': 1767, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:22.574951', 'step': 1767, 'epoch': 2} +{'type': 'loss', 'content': 0.001509829773567617, 'timestamp': '2025-09-30 22:12:22.581148', 'step': 1768, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:22.653620', 'step': 1768, 'epoch': 2} +{'type': 'loss', 'content': 0.011697669513523579, 'timestamp': '2025-09-30 22:12:22.655679', 'step': 1769, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:22.719592', 'step': 1769, 'epoch': 2} +{'type': 'loss', 'content': 0.0038434225134551525, 'timestamp': '2025-09-30 22:12:22.721767', 'step': 1770, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:22.779999', 'step': 1770, 'epoch': 2} +{'type': 'loss', 'content': 0.010291250422596931, 'timestamp': '2025-09-30 22:12:22.782196', 'step': 1771, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:22.855201', 'step': 1771, 'epoch': 2} +{'type': 'loss', 'content': 0.00038557566585950553, 'timestamp': '2025-09-30 22:12:22.861700', 'step': 1772, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:22.925889', 'step': 1772, 'epoch': 2} +{'type': 'loss', 'content': 0.002135960618034005, 'timestamp': '2025-09-30 22:12:22.928233', 'step': 1773, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:22.983098', 'step': 1773, 'epoch': 2} +{'type': 'loss', 'content': 0.022244064137339592, 'timestamp': '2025-09-30 22:12:22.990669', 'step': 1774, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:23.047551', 'step': 1774, 'epoch': 2} +{'type': 'loss', 'content': 0.021448533982038498, 'timestamp': '2025-09-30 22:12:23.050681', 'step': 1775, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:23.126710', 'step': 1775, 'epoch': 2} +{'type': 'loss', 'content': 0.005585017614066601, 'timestamp': '2025-09-30 22:12:23.132632', 'step': 1776, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:23.209022', 'step': 1776, 'epoch': 2} +{'type': 'loss', 'content': 0.0050450703129172325, 'timestamp': '2025-09-30 22:12:23.214519', 'step': 1777, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:23.281367', 'step': 1777, 'epoch': 2} +{'type': 'loss', 'content': 0.006257961504161358, 'timestamp': '2025-09-30 22:12:23.284211', 'step': 1778, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:23.358645', 'step': 1778, 'epoch': 2} +{'type': 'loss', 'content': 0.0036130433436483145, 'timestamp': '2025-09-30 22:12:23.360773', 'step': 1779, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:23.429914', 'step': 1779, 'epoch': 2} +{'type': 'loss', 'content': 0.0057233721017837524, 'timestamp': '2025-09-30 22:12:23.435998', 'step': 1780, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:23.492874', 'step': 1780, 'epoch': 2} +{'type': 'loss', 'content': 0.001503689563833177, 'timestamp': '2025-09-30 22:12:23.495169', 'step': 1781, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:23.551669', 'step': 1781, 'epoch': 2} +{'type': 'loss', 'content': 0.01165227685123682, 'timestamp': '2025-09-30 22:12:23.571755', 'step': 1782, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:23.661283', 'step': 1782, 'epoch': 2} +{'type': 'loss', 'content': 0.0008390057482756674, 'timestamp': '2025-09-30 22:12:23.678023', 'step': 1783, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:23.740549', 'step': 1783, 'epoch': 2} +{'type': 'loss', 'content': 0.0012339332606643438, 'timestamp': '2025-09-30 22:12:23.756039', 'step': 1784, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:23.820029', 'step': 1784, 'epoch': 2} +{'type': 'loss', 'content': 0.0013973814202472568, 'timestamp': '2025-09-30 22:12:23.828115', 'step': 1785, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:23.902989', 'step': 1785, 'epoch': 2} +{'type': 'loss', 'content': 0.0011909554013982415, 'timestamp': '2025-09-30 22:12:23.907395', 'step': 1786, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:23.984498', 'step': 1786, 'epoch': 2} +{'type': 'loss', 'content': 0.020810788497328758, 'timestamp': '2025-09-30 22:12:23.989340', 'step': 1787, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:24.047666', 'step': 1787, 'epoch': 2} +{'type': 'loss', 'content': 0.003498237580060959, 'timestamp': '2025-09-30 22:12:24.056749', 'step': 1788, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:24.128102', 'step': 1788, 'epoch': 2} +{'type': 'loss', 'content': 0.0002930622431449592, 'timestamp': '2025-09-30 22:12:24.135350', 'step': 1789, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:24.208793', 'step': 1789, 'epoch': 2} +{'type': 'loss', 'content': 0.0038824868388473988, 'timestamp': '2025-09-30 22:12:24.213275', 'step': 1790, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:24.272697', 'step': 1790, 'epoch': 2} +{'type': 'loss', 'content': 0.0029304868075996637, 'timestamp': '2025-09-30 22:12:24.275392', 'step': 1791, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:24.361640', 'step': 1791, 'epoch': 2} +{'type': 'loss', 'content': 0.008040046319365501, 'timestamp': '2025-09-30 22:12:24.373017', 'step': 1792, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:24.447033', 'step': 1792, 'epoch': 2} +{'type': 'loss', 'content': 0.005316526163369417, 'timestamp': '2025-09-30 22:12:24.450440', 'step': 1793, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:24.525425', 'step': 1793, 'epoch': 2} +{'type': 'loss', 'content': 0.0005331950960680842, 'timestamp': '2025-09-30 22:12:24.532931', 'step': 1794, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:24.598646', 'step': 1794, 'epoch': 2} +{'type': 'loss', 'content': 0.0008259565802291036, 'timestamp': '2025-09-30 22:12:24.605028', 'step': 1795, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:24.665867', 'step': 1795, 'epoch': 2} +{'type': 'loss', 'content': 0.005228062160313129, 'timestamp': '2025-09-30 22:12:24.674437', 'step': 1796, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:24.742664', 'step': 1796, 'epoch': 2} +{'type': 'loss', 'content': 0.025880755856633186, 'timestamp': '2025-09-30 22:12:24.748436', 'step': 1797, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:24.806826', 'step': 1797, 'epoch': 2} +{'type': 'loss', 'content': 0.04264013096690178, 'timestamp': '2025-09-30 22:12:24.813400', 'step': 1798, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:24.875741', 'step': 1798, 'epoch': 2} +{'type': 'loss', 'content': 0.03723777085542679, 'timestamp': '2025-09-30 22:12:24.881371', 'step': 1799, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:24.943188', 'step': 1799, 'epoch': 2} +{'type': 'loss', 'content': 0.028541741892695427, 'timestamp': '2025-09-30 22:12:24.950398', 'step': 1800, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:25.009304', 'step': 1800, 'epoch': 2} +{'type': 'loss', 'content': 0.000658934935927391, 'timestamp': '2025-09-30 22:12:25.014125', 'step': 1801, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:25.076617', 'step': 1801, 'epoch': 2} +{'type': 'loss', 'content': 0.00514825526624918, 'timestamp': '2025-09-30 22:12:25.081465', 'step': 1802, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:25.140816', 'step': 1802, 'epoch': 2} +{'type': 'loss', 'content': 0.01359549630433321, 'timestamp': '2025-09-30 22:12:25.144175', 'step': 1803, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:25.209586', 'step': 1803, 'epoch': 2} +{'type': 'loss', 'content': 0.003553016809746623, 'timestamp': '2025-09-30 22:12:25.216248', 'step': 1804, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:25.276635', 'step': 1804, 'epoch': 2} +{'type': 'loss', 'content': 0.01718035340309143, 'timestamp': '2025-09-30 22:12:25.280427', 'step': 1805, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:25.339761', 'step': 1805, 'epoch': 2} +{'type': 'loss', 'content': 0.0026898744981735945, 'timestamp': '2025-09-30 22:12:25.343047', 'step': 1806, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:25.411331', 'step': 1806, 'epoch': 2} +{'type': 'loss', 'content': 0.029336009174585342, 'timestamp': '2025-09-30 22:12:25.414050', 'step': 1807, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:25.476066', 'step': 1807, 'epoch': 2} +{'type': 'loss', 'content': 0.0065011694096028805, 'timestamp': '2025-09-30 22:12:25.485199', 'step': 1808, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:25.550418', 'step': 1808, 'epoch': 2} +{'type': 'loss', 'content': 0.004351469222456217, 'timestamp': '2025-09-30 22:12:25.556302', 'step': 1809, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:25.623912', 'step': 1809, 'epoch': 2} +{'type': 'loss', 'content': 0.009557033888995647, 'timestamp': '2025-09-30 22:12:25.626657', 'step': 1810, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:25.686205', 'step': 1810, 'epoch': 2} +{'type': 'loss', 'content': 0.021093687042593956, 'timestamp': '2025-09-30 22:12:25.692721', 'step': 1811, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:25.751652', 'step': 1811, 'epoch': 2} +{'type': 'loss', 'content': 0.007726198993623257, 'timestamp': '2025-09-30 22:12:25.761738', 'step': 1812, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:25.826331', 'step': 1812, 'epoch': 2} +{'type': 'loss', 'content': 0.006598359905183315, 'timestamp': '2025-09-30 22:12:25.828662', 'step': 1813, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:25.892888', 'step': 1813, 'epoch': 2} +{'type': 'loss', 'content': 0.007870987989008427, 'timestamp': '2025-09-30 22:12:25.896860', 'step': 1814, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:25.952583', 'step': 1814, 'epoch': 2} +{'type': 'loss', 'content': 0.011960902251303196, 'timestamp': '2025-09-30 22:12:25.959745', 'step': 1815, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:26.032617', 'step': 1815, 'epoch': 2} +{'type': 'loss', 'content': 0.002732042223215103, 'timestamp': '2025-09-30 22:12:26.040178', 'step': 1816, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:26.098524', 'step': 1816, 'epoch': 2} +{'type': 'loss', 'content': 0.0035709214862436056, 'timestamp': '2025-09-30 22:12:26.102122', 'step': 1817, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:26.162259', 'step': 1817, 'epoch': 2} +{'type': 'loss', 'content': 0.03033597581088543, 'timestamp': '2025-09-30 22:12:26.168318', 'step': 1818, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:26.234789', 'step': 1818, 'epoch': 2} +{'type': 'loss', 'content': 0.04264497384428978, 'timestamp': '2025-09-30 22:12:26.242009', 'step': 1819, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:26.300287', 'step': 1819, 'epoch': 2} +{'type': 'loss', 'content': 0.008137590251863003, 'timestamp': '2025-09-30 22:12:26.307528', 'step': 1820, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:26.374224', 'step': 1820, 'epoch': 2} +{'type': 'loss', 'content': 0.004997264593839645, 'timestamp': '2025-09-30 22:12:26.384862', 'step': 1821, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:26.448877', 'step': 1821, 'epoch': 2} +{'type': 'loss', 'content': 0.0041570719331502914, 'timestamp': '2025-09-30 22:12:26.453045', 'step': 1822, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:26.526430', 'step': 1822, 'epoch': 2} +{'type': 'loss', 'content': 0.004612022079527378, 'timestamp': '2025-09-30 22:12:26.530133', 'step': 1823, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:26.591520', 'step': 1823, 'epoch': 2} +{'type': 'loss', 'content': 0.024884754791855812, 'timestamp': '2025-09-30 22:12:26.598978', 'step': 1824, 'epoch': 2} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:12:28.095332', 'step': 1824, 'epoch': 2} +{'type': 'pplx', 'content': 30672189.36827617, 'timestamp': '2025-09-30 22:12:28.100771', 'step': 1824, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:28.170601', 'step': 1824, 'epoch': 2} +{'type': 'loss', 'content': 0.00739399716258049, 'timestamp': '2025-09-30 22:12:28.173632', 'step': 1825, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:28.241236', 'step': 1825, 'epoch': 2} +{'type': 'loss', 'content': 0.01240342017263174, 'timestamp': '2025-09-30 22:12:28.251710', 'step': 1826, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:28.317530', 'step': 1826, 'epoch': 2} +{'type': 'loss', 'content': 0.0024557800497859716, 'timestamp': '2025-09-30 22:12:28.328228', 'step': 1827, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:28.386913', 'step': 1827, 'epoch': 2} +{'type': 'loss', 'content': 0.013910613022744656, 'timestamp': '2025-09-30 22:12:28.394292', 'step': 1828, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:28.450865', 'step': 1828, 'epoch': 2} +{'type': 'loss', 'content': 0.001578363822773099, 'timestamp': '2025-09-30 22:12:28.453690', 'step': 1829, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:28.511246', 'step': 1829, 'epoch': 2} +{'type': 'loss', 'content': 0.04201820492744446, 'timestamp': '2025-09-30 22:12:28.517272', 'step': 1830, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:28.576318', 'step': 1830, 'epoch': 2} +{'type': 'loss', 'content': 0.0008280682959593832, 'timestamp': '2025-09-30 22:12:28.578521', 'step': 1831, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:28.649280', 'step': 1831, 'epoch': 2} +{'type': 'loss', 'content': 0.006336410064250231, 'timestamp': '2025-09-30 22:12:28.661730', 'step': 1832, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:28.742433', 'step': 1832, 'epoch': 2} +{'type': 'loss', 'content': 0.005034334491938353, 'timestamp': '2025-09-30 22:12:28.746912', 'step': 1833, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:28.836112', 'step': 1833, 'epoch': 2} +{'type': 'loss', 'content': 0.013774161227047443, 'timestamp': '2025-09-30 22:12:28.839488', 'step': 1834, 'epoch': 2} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:28.905166', 'step': 1834, 'epoch': 3} +{'type': 'loss', 'content': 0.06970737874507904, 'timestamp': '2025-09-30 22:12:28.913773', 'step': 1835, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:28.974737', 'step': 1835, 'epoch': 3} +{'type': 'loss', 'content': 0.053864989429712296, 'timestamp': '2025-09-30 22:12:28.983214', 'step': 1836, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.050549', 'step': 1836, 'epoch': 3} +{'type': 'loss', 'content': 0.0217081718146801, 'timestamp': '2025-09-30 22:12:29.052793', 'step': 1837, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.121315', 'step': 1837, 'epoch': 3} +{'type': 'loss', 'content': 0.013128918595612049, 'timestamp': '2025-09-30 22:12:29.123710', 'step': 1838, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.188599', 'step': 1838, 'epoch': 3} +{'type': 'loss', 'content': 0.057358644902706146, 'timestamp': '2025-09-30 22:12:29.191407', 'step': 1839, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.249536', 'step': 1839, 'epoch': 3} +{'type': 'loss', 'content': 0.03622815012931824, 'timestamp': '2025-09-30 22:12:29.255911', 'step': 1840, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:29.323306', 'step': 1840, 'epoch': 3} +{'type': 'loss', 'content': 0.027150630950927734, 'timestamp': '2025-09-30 22:12:29.325939', 'step': 1841, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.391564', 'step': 1841, 'epoch': 3} +{'type': 'loss', 'content': 0.002653220435604453, 'timestamp': '2025-09-30 22:12:29.398259', 'step': 1842, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.482420', 'step': 1842, 'epoch': 3} +{'type': 'loss', 'content': 0.0013348475331440568, 'timestamp': '2025-09-30 22:12:29.485991', 'step': 1843, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.544185', 'step': 1843, 'epoch': 3} +{'type': 'loss', 'content': 0.029313331469893456, 'timestamp': '2025-09-30 22:12:29.549874', 'step': 1844, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.621496', 'step': 1844, 'epoch': 3} +{'type': 'loss', 'content': 0.007896478287875652, 'timestamp': '2025-09-30 22:12:29.625555', 'step': 1845, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.689604', 'step': 1845, 'epoch': 3} +{'type': 'loss', 'content': 0.0026143400464206934, 'timestamp': '2025-09-30 22:12:29.696347', 'step': 1846, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.758025', 'step': 1846, 'epoch': 3} +{'type': 'loss', 'content': 0.01978653483092785, 'timestamp': '2025-09-30 22:12:29.761214', 'step': 1847, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.827132', 'step': 1847, 'epoch': 3} +{'type': 'loss', 'content': 0.028982680290937424, 'timestamp': '2025-09-30 22:12:29.833733', 'step': 1848, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.893636', 'step': 1848, 'epoch': 3} +{'type': 'loss', 'content': 0.03256614878773689, 'timestamp': '2025-09-30 22:12:29.898393', 'step': 1849, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:29.963923', 'step': 1849, 'epoch': 3} +{'type': 'loss', 'content': 0.010735915042459965, 'timestamp': '2025-09-30 22:12:29.967708', 'step': 1850, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:30.025959', 'step': 1850, 'epoch': 3} +{'type': 'loss', 'content': 0.011695819906890392, 'timestamp': '2025-09-30 22:12:30.029295', 'step': 1851, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:30.084003', 'step': 1851, 'epoch': 3} +{'type': 'loss', 'content': 0.006875795312225819, 'timestamp': '2025-09-30 22:12:30.093480', 'step': 1852, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:30.151820', 'step': 1852, 'epoch': 3} +{'type': 'loss', 'content': 0.014828616753220558, 'timestamp': '2025-09-30 22:12:30.157422', 'step': 1853, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:30.218984', 'step': 1853, 'epoch': 3} +{'type': 'loss', 'content': 0.010935215279459953, 'timestamp': '2025-09-30 22:12:30.222139', 'step': 1854, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:30.280694', 'step': 1854, 'epoch': 3} +{'type': 'loss', 'content': 0.011751067824661732, 'timestamp': '2025-09-30 22:12:30.283823', 'step': 1855, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:30.342542', 'step': 1855, 'epoch': 3} +{'type': 'loss', 'content': 0.03328597545623779, 'timestamp': '2025-09-30 22:12:30.349244', 'step': 1856, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:30.427820', 'step': 1856, 'epoch': 3} +{'type': 'loss', 'content': 0.015032247640192509, 'timestamp': '2025-09-30 22:12:30.431664', 'step': 1857, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:30.487533', 'step': 1857, 'epoch': 3} +{'type': 'loss', 'content': 0.014371916651725769, 'timestamp': '2025-09-30 22:12:30.495788', 'step': 1858, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:30.578734', 'step': 1858, 'epoch': 3} +{'type': 'loss', 'content': 0.010754152201116085, 'timestamp': '2025-09-30 22:12:30.584875', 'step': 1859, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:30.647369', 'step': 1859, 'epoch': 3} +{'type': 'loss', 'content': 0.01473289541900158, 'timestamp': '2025-09-30 22:12:30.654052', 'step': 1860, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:30.712046', 'step': 1860, 'epoch': 3} +{'type': 'loss', 'content': 0.008067458868026733, 'timestamp': '2025-09-30 22:12:30.714880', 'step': 1861, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:30.774679', 'step': 1861, 'epoch': 3} +{'type': 'loss', 'content': 0.02813909575343132, 'timestamp': '2025-09-30 22:12:30.779834', 'step': 1862, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:30.841083', 'step': 1862, 'epoch': 3} +{'type': 'loss', 'content': 0.008334296755492687, 'timestamp': '2025-09-30 22:12:30.852524', 'step': 1863, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:30.913530', 'step': 1863, 'epoch': 3} +{'type': 'loss', 'content': 0.015859607607126236, 'timestamp': '2025-09-30 22:12:30.920050', 'step': 1864, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:30.976590', 'step': 1864, 'epoch': 3} +{'type': 'loss', 'content': 0.017760148271918297, 'timestamp': '2025-09-30 22:12:30.982243', 'step': 1865, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:31.053340', 'step': 1865, 'epoch': 3} +{'type': 'loss', 'content': 0.024972213432192802, 'timestamp': '2025-09-30 22:12:31.056911', 'step': 1866, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:31.125418', 'step': 1866, 'epoch': 3} +{'type': 'loss', 'content': 0.007451011333614588, 'timestamp': '2025-09-30 22:12:31.128354', 'step': 1867, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:31.185780', 'step': 1867, 'epoch': 3} +{'type': 'loss', 'content': 0.023097878322005272, 'timestamp': '2025-09-30 22:12:31.191973', 'step': 1868, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:31.247075', 'step': 1868, 'epoch': 3} +{'type': 'loss', 'content': 0.005390969570726156, 'timestamp': '2025-09-30 22:12:31.250345', 'step': 1869, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:31.307936', 'step': 1869, 'epoch': 3} +{'type': 'loss', 'content': 0.014087321236729622, 'timestamp': '2025-09-30 22:12:31.322415', 'step': 1870, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:31.385612', 'step': 1870, 'epoch': 3} +{'type': 'loss', 'content': 0.018397578969597816, 'timestamp': '2025-09-30 22:12:31.390764', 'step': 1871, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:31.462449', 'step': 1871, 'epoch': 3} +{'type': 'loss', 'content': 0.007025882601737976, 'timestamp': '2025-09-30 22:12:31.480345', 'step': 1872, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:31.536704', 'step': 1872, 'epoch': 3} +{'type': 'loss', 'content': 0.011556877754628658, 'timestamp': '2025-09-30 22:12:31.541459', 'step': 1873, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:31.605457', 'step': 1873, 'epoch': 3} +{'type': 'loss', 'content': 0.01839577779173851, 'timestamp': '2025-09-30 22:12:31.608900', 'step': 1874, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:31.680668', 'step': 1874, 'epoch': 3} +{'type': 'loss', 'content': 0.010417903773486614, 'timestamp': '2025-09-30 22:12:31.690146', 'step': 1875, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:31.744977', 'step': 1875, 'epoch': 3} +{'type': 'loss', 'content': 0.009947366081178188, 'timestamp': '2025-09-30 22:12:31.751963', 'step': 1876, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:31.820158', 'step': 1876, 'epoch': 3} +{'type': 'loss', 'content': 0.012040347792208195, 'timestamp': '2025-09-30 22:12:31.822535', 'step': 1877, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:31.890594', 'step': 1877, 'epoch': 3} +{'type': 'loss', 'content': 0.0012225349200889468, 'timestamp': '2025-09-30 22:12:31.894112', 'step': 1878, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:31.949728', 'step': 1878, 'epoch': 3} +{'type': 'loss', 'content': 0.006086386274546385, 'timestamp': '2025-09-30 22:12:31.953732', 'step': 1879, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:32.009116', 'step': 1879, 'epoch': 3} +{'type': 'loss', 'content': 0.00934376660734415, 'timestamp': '2025-09-30 22:12:32.016702', 'step': 1880, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:32.071527', 'step': 1880, 'epoch': 3} +{'type': 'loss', 'content': 0.009230917319655418, 'timestamp': '2025-09-30 22:12:32.073939', 'step': 1881, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:12:33.529057', 'step': 1881, 'epoch': 3} +{'type': 'pplx', 'content': 29811204.71677049, 'timestamp': '2025-09-30 22:12:33.532880', 'step': 1881, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:33.589718', 'step': 1881, 'epoch': 3} +{'type': 'loss', 'content': 0.002237612148746848, 'timestamp': '2025-09-30 22:12:33.597449', 'step': 1882, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:33.654864', 'step': 1882, 'epoch': 3} +{'type': 'loss', 'content': 0.0021926003973931074, 'timestamp': '2025-09-30 22:12:33.657646', 'step': 1883, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:33.719097', 'step': 1883, 'epoch': 3} +{'type': 'loss', 'content': 0.007021576166152954, 'timestamp': '2025-09-30 22:12:33.725938', 'step': 1884, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:33.787887', 'step': 1884, 'epoch': 3} +{'type': 'loss', 'content': 0.002189788268879056, 'timestamp': '2025-09-30 22:12:33.791754', 'step': 1885, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:33.849419', 'step': 1885, 'epoch': 3} +{'type': 'loss', 'content': 0.0325036458671093, 'timestamp': '2025-09-30 22:12:33.852113', 'step': 1886, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:33.912628', 'step': 1886, 'epoch': 3} +{'type': 'loss', 'content': 0.02236909046769142, 'timestamp': '2025-09-30 22:12:33.915640', 'step': 1887, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:33.973272', 'step': 1887, 'epoch': 3} +{'type': 'loss', 'content': 0.018104365095496178, 'timestamp': '2025-09-30 22:12:33.979563', 'step': 1888, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:34.034551', 'step': 1888, 'epoch': 3} +{'type': 'loss', 'content': 0.01875036023557186, 'timestamp': '2025-09-30 22:12:34.037045', 'step': 1889, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:34.095906', 'step': 1889, 'epoch': 3} +{'type': 'loss', 'content': 0.010548067279160023, 'timestamp': '2025-09-30 22:12:34.098658', 'step': 1890, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:34.155832', 'step': 1890, 'epoch': 3} +{'type': 'loss', 'content': 0.007385232951492071, 'timestamp': '2025-09-30 22:12:34.159492', 'step': 1891, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:34.214578', 'step': 1891, 'epoch': 3} +{'type': 'loss', 'content': 0.00859599094837904, 'timestamp': '2025-09-30 22:12:34.226724', 'step': 1892, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:34.287077', 'step': 1892, 'epoch': 3} +{'type': 'loss', 'content': 0.011528218165040016, 'timestamp': '2025-09-30 22:12:34.289393', 'step': 1893, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:34.344245', 'step': 1893, 'epoch': 3} +{'type': 'loss', 'content': 0.03839213401079178, 'timestamp': '2025-09-30 22:12:34.347664', 'step': 1894, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:34.413440', 'step': 1894, 'epoch': 3} +{'type': 'loss', 'content': 0.004163206089287996, 'timestamp': '2025-09-30 22:12:34.420353', 'step': 1895, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:34.476133', 'step': 1895, 'epoch': 3} +{'type': 'loss', 'content': 0.006831273436546326, 'timestamp': '2025-09-30 22:12:34.482107', 'step': 1896, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:34.550831', 'step': 1896, 'epoch': 3} +{'type': 'loss', 'content': 0.0011001095408573747, 'timestamp': '2025-09-30 22:12:34.553869', 'step': 1897, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:34.613063', 'step': 1897, 'epoch': 3} +{'type': 'loss', 'content': 0.002597188577055931, 'timestamp': '2025-09-30 22:12:34.619835', 'step': 1898, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:34.678929', 'step': 1898, 'epoch': 3} +{'type': 'loss', 'content': 0.003021536162123084, 'timestamp': '2025-09-30 22:12:34.681965', 'step': 1899, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:34.736489', 'step': 1899, 'epoch': 3} +{'type': 'loss', 'content': 0.035559866577386856, 'timestamp': '2025-09-30 22:12:34.747555', 'step': 1900, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:34.808191', 'step': 1900, 'epoch': 3} +{'type': 'loss', 'content': 0.00288590369746089, 'timestamp': '2025-09-30 22:12:34.812074', 'step': 1901, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:34.875165', 'step': 1901, 'epoch': 3} +{'type': 'loss', 'content': 0.010649462230503559, 'timestamp': '2025-09-30 22:12:34.877318', 'step': 1902, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:34.938011', 'step': 1902, 'epoch': 3} +{'type': 'loss', 'content': 0.020891645923256874, 'timestamp': '2025-09-30 22:12:34.946875', 'step': 1903, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.007218', 'step': 1903, 'epoch': 3} +{'type': 'loss', 'content': 0.034685712307691574, 'timestamp': '2025-09-30 22:12:35.019492', 'step': 1904, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.075150', 'step': 1904, 'epoch': 3} +{'type': 'loss', 'content': 0.0015056979609653354, 'timestamp': '2025-09-30 22:12:35.078579', 'step': 1905, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.133854', 'step': 1905, 'epoch': 3} +{'type': 'loss', 'content': 0.036593399941921234, 'timestamp': '2025-09-30 22:12:35.137590', 'step': 1906, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:35.193504', 'step': 1906, 'epoch': 3} +{'type': 'loss', 'content': 0.0018256593029946089, 'timestamp': '2025-09-30 22:12:35.196477', 'step': 1907, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.255154', 'step': 1907, 'epoch': 3} +{'type': 'loss', 'content': 0.04090496152639389, 'timestamp': '2025-09-30 22:12:35.261202', 'step': 1908, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.317711', 'step': 1908, 'epoch': 3} +{'type': 'loss', 'content': 0.014386068098247051, 'timestamp': '2025-09-30 22:12:35.321688', 'step': 1909, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:35.386201', 'step': 1909, 'epoch': 3} +{'type': 'loss', 'content': 0.018922999501228333, 'timestamp': '2025-09-30 22:12:35.390261', 'step': 1910, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.449885', 'step': 1910, 'epoch': 3} +{'type': 'loss', 'content': 0.015430129133164883, 'timestamp': '2025-09-30 22:12:35.456042', 'step': 1911, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.516289', 'step': 1911, 'epoch': 3} +{'type': 'loss', 'content': 0.014453909359872341, 'timestamp': '2025-09-30 22:12:35.522502', 'step': 1912, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.583529', 'step': 1912, 'epoch': 3} +{'type': 'loss', 'content': 0.019513025879859924, 'timestamp': '2025-09-30 22:12:35.586244', 'step': 1913, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:35.644204', 'step': 1913, 'epoch': 3} +{'type': 'loss', 'content': 0.011197819374501705, 'timestamp': '2025-09-30 22:12:35.659235', 'step': 1914, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.718192', 'step': 1914, 'epoch': 3} +{'type': 'loss', 'content': 0.016041845083236694, 'timestamp': '2025-09-30 22:12:35.720928', 'step': 1915, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.776597', 'step': 1915, 'epoch': 3} +{'type': 'loss', 'content': 0.008869746699929237, 'timestamp': '2025-09-30 22:12:35.786518', 'step': 1916, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.843545', 'step': 1916, 'epoch': 3} +{'type': 'loss', 'content': 0.009099365212023258, 'timestamp': '2025-09-30 22:12:35.847455', 'step': 1917, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.904360', 'step': 1917, 'epoch': 3} +{'type': 'loss', 'content': 0.014621244743466377, 'timestamp': '2025-09-30 22:12:35.909624', 'step': 1918, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:35.965569', 'step': 1918, 'epoch': 3} +{'type': 'loss', 'content': 0.009779931046068668, 'timestamp': '2025-09-30 22:12:35.981765', 'step': 1919, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.046589', 'step': 1919, 'epoch': 3} +{'type': 'loss', 'content': 0.016505222767591476, 'timestamp': '2025-09-30 22:12:36.060991', 'step': 1920, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.125986', 'step': 1920, 'epoch': 3} +{'type': 'loss', 'content': 0.006841294001787901, 'timestamp': '2025-09-30 22:12:36.129710', 'step': 1921, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.189066', 'step': 1921, 'epoch': 3} +{'type': 'loss', 'content': 0.009847632609307766, 'timestamp': '2025-09-30 22:12:36.195855', 'step': 1922, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.252944', 'step': 1922, 'epoch': 3} +{'type': 'loss', 'content': 0.02639087848365307, 'timestamp': '2025-09-30 22:12:36.258090', 'step': 1923, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:36.319740', 'step': 1923, 'epoch': 3} +{'type': 'loss', 'content': 0.01638978160917759, 'timestamp': '2025-09-30 22:12:36.326755', 'step': 1924, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.386575', 'step': 1924, 'epoch': 3} +{'type': 'loss', 'content': 0.028748007491230965, 'timestamp': '2025-09-30 22:12:36.390353', 'step': 1925, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:36.444542', 'step': 1925, 'epoch': 3} +{'type': 'loss', 'content': 0.018612733110785484, 'timestamp': '2025-09-30 22:12:36.447012', 'step': 1926, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.501961', 'step': 1926, 'epoch': 3} +{'type': 'loss', 'content': 0.004065474960952997, 'timestamp': '2025-09-30 22:12:36.508877', 'step': 1927, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.565538', 'step': 1927, 'epoch': 3} +{'type': 'loss', 'content': 0.006613335572183132, 'timestamp': '2025-09-30 22:12:36.571650', 'step': 1928, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.625346', 'step': 1928, 'epoch': 3} +{'type': 'loss', 'content': 0.024656997993588448, 'timestamp': '2025-09-30 22:12:36.628060', 'step': 1929, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.686966', 'step': 1929, 'epoch': 3} +{'type': 'loss', 'content': 0.04802418872714043, 'timestamp': '2025-09-30 22:12:36.689252', 'step': 1930, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.742692', 'step': 1930, 'epoch': 3} +{'type': 'loss', 'content': 0.021498506888747215, 'timestamp': '2025-09-30 22:12:36.745190', 'step': 1931, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.799740', 'step': 1931, 'epoch': 3} +{'type': 'loss', 'content': 0.008578136563301086, 'timestamp': '2025-09-30 22:12:36.805549', 'step': 1932, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.861069', 'step': 1932, 'epoch': 3} +{'type': 'loss', 'content': 0.0029331946279853582, 'timestamp': '2025-09-30 22:12:36.863376', 'step': 1933, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.925739', 'step': 1933, 'epoch': 3} +{'type': 'loss', 'content': 0.01277841441333294, 'timestamp': '2025-09-30 22:12:36.928011', 'step': 1934, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:36.989085', 'step': 1934, 'epoch': 3} +{'type': 'loss', 'content': 0.008462684229016304, 'timestamp': '2025-09-30 22:12:36.992617', 'step': 1935, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:37.058882', 'step': 1935, 'epoch': 3} +{'type': 'loss', 'content': 0.00633528595790267, 'timestamp': '2025-09-30 22:12:37.064520', 'step': 1936, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:37.126025', 'step': 1936, 'epoch': 3} +{'type': 'loss', 'content': 0.009705213829874992, 'timestamp': '2025-09-30 22:12:37.128017', 'step': 1937, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:37.183503', 'step': 1937, 'epoch': 3} +{'type': 'loss', 'content': 0.009753524325788021, 'timestamp': '2025-09-30 22:12:37.187696', 'step': 1938, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:12:38.512858', 'step': 1938, 'epoch': 3} +{'type': 'pplx', 'content': 29558369.15336448, 'timestamp': '2025-09-30 22:12:38.514909', 'step': 1938, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:38.567034', 'step': 1938, 'epoch': 3} +{'type': 'loss', 'content': 0.025362467393279076, 'timestamp': '2025-09-30 22:12:38.571280', 'step': 1939, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:38.646596', 'step': 1939, 'epoch': 3} +{'type': 'loss', 'content': 0.004295314662158489, 'timestamp': '2025-09-30 22:12:38.654788', 'step': 1940, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:38.717141', 'step': 1940, 'epoch': 3} +{'type': 'loss', 'content': 0.0038865043316036463, 'timestamp': '2025-09-30 22:12:38.723450', 'step': 1941, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:38.785516', 'step': 1941, 'epoch': 3} +{'type': 'loss', 'content': 0.016797490417957306, 'timestamp': '2025-09-30 22:12:38.790002', 'step': 1942, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:38.845954', 'step': 1942, 'epoch': 3} +{'type': 'loss', 'content': 0.031100988388061523, 'timestamp': '2025-09-30 22:12:38.850849', 'step': 1943, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:12:38.924658', 'step': 1943, 'epoch': 3} +{'type': 'loss', 'content': 0.0393851213157177, 'timestamp': '2025-09-30 22:12:38.935303', 'step': 1944, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:38.996063', 'step': 1944, 'epoch': 3} +{'type': 'loss', 'content': 0.005985604133456945, 'timestamp': '2025-09-30 22:12:39.001238', 'step': 1945, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:39.058690', 'step': 1945, 'epoch': 3} +{'type': 'loss', 'content': 0.008343399502336979, 'timestamp': '2025-09-30 22:12:39.064100', 'step': 1946, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:39.120940', 'step': 1946, 'epoch': 3} +{'type': 'loss', 'content': 0.003788830479606986, 'timestamp': '2025-09-30 22:12:39.126550', 'step': 1947, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:39.185776', 'step': 1947, 'epoch': 3} +{'type': 'loss', 'content': 0.005876143928617239, 'timestamp': '2025-09-30 22:12:39.194044', 'step': 1948, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:39.247405', 'step': 1948, 'epoch': 3} +{'type': 'loss', 'content': 0.009399537928402424, 'timestamp': '2025-09-30 22:12:39.252551', 'step': 1949, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:39.317976', 'step': 1949, 'epoch': 3} +{'type': 'loss', 'content': 0.0053526838310062885, 'timestamp': '2025-09-30 22:12:39.321738', 'step': 1950, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:39.384587', 'step': 1950, 'epoch': 3} +{'type': 'loss', 'content': 0.021824544295668602, 'timestamp': '2025-09-30 22:12:39.389592', 'step': 1951, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:39.448210', 'step': 1951, 'epoch': 3} +{'type': 'loss', 'content': 0.016011234372854233, 'timestamp': '2025-09-30 22:12:39.457427', 'step': 1952, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:39.515827', 'step': 1952, 'epoch': 3} +{'type': 'loss', 'content': 0.0025454445276409388, 'timestamp': '2025-09-30 22:12:39.522667', 'step': 1953, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:39.580991', 'step': 1953, 'epoch': 3} +{'type': 'loss', 'content': 0.003802164224907756, 'timestamp': '2025-09-30 22:12:39.584553', 'step': 1954, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:39.644276', 'step': 1954, 'epoch': 3} +{'type': 'loss', 'content': 0.03186912089586258, 'timestamp': '2025-09-30 22:12:39.646374', 'step': 1955, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:39.704066', 'step': 1955, 'epoch': 3} +{'type': 'loss', 'content': 0.023978808894753456, 'timestamp': '2025-09-30 22:12:39.710510', 'step': 1956, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:39.767903', 'step': 1956, 'epoch': 3} +{'type': 'loss', 'content': 0.01542146410793066, 'timestamp': '2025-09-30 22:12:39.770627', 'step': 1957, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:39.824884', 'step': 1957, 'epoch': 3} +{'type': 'loss', 'content': 0.009438499808311462, 'timestamp': '2025-09-30 22:12:39.830557', 'step': 1958, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:39.900535', 'step': 1958, 'epoch': 3} +{'type': 'loss', 'content': 0.00540934456512332, 'timestamp': '2025-09-30 22:12:39.903028', 'step': 1959, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:39.958455', 'step': 1959, 'epoch': 3} +{'type': 'loss', 'content': 0.034282151609659195, 'timestamp': '2025-09-30 22:12:39.964357', 'step': 1960, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:40.020785', 'step': 1960, 'epoch': 3} +{'type': 'loss', 'content': 0.006182772573083639, 'timestamp': '2025-09-30 22:12:40.022932', 'step': 1961, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:40.077532', 'step': 1961, 'epoch': 3} +{'type': 'loss', 'content': 0.004338286817073822, 'timestamp': '2025-09-30 22:12:40.079805', 'step': 1962, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:40.134069', 'step': 1962, 'epoch': 3} +{'type': 'loss', 'content': 0.012008560821413994, 'timestamp': '2025-09-30 22:12:40.136462', 'step': 1963, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:40.217923', 'step': 1963, 'epoch': 3} +{'type': 'loss', 'content': 0.03428441658616066, 'timestamp': '2025-09-30 22:12:40.223659', 'step': 1964, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:40.278527', 'step': 1964, 'epoch': 3} +{'type': 'loss', 'content': 0.008386987261474133, 'timestamp': '2025-09-30 22:12:40.280482', 'step': 1965, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:40.340067', 'step': 1965, 'epoch': 3} +{'type': 'loss', 'content': 0.0035431894939392805, 'timestamp': '2025-09-30 22:12:40.352999', 'step': 1966, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:40.426650', 'step': 1966, 'epoch': 3} +{'type': 'loss', 'content': 0.005195711273699999, 'timestamp': '2025-09-30 22:12:40.428878', 'step': 1967, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:40.483244', 'step': 1967, 'epoch': 3} +{'type': 'loss', 'content': 0.014768867753446102, 'timestamp': '2025-09-30 22:12:40.489131', 'step': 1968, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:40.541727', 'step': 1968, 'epoch': 3} +{'type': 'loss', 'content': 0.0026658920105546713, 'timestamp': '2025-09-30 22:12:40.544813', 'step': 1969, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:40.605167', 'step': 1969, 'epoch': 3} +{'type': 'loss', 'content': 0.00781507883220911, 'timestamp': '2025-09-30 22:12:40.607386', 'step': 1970, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:40.671135', 'step': 1970, 'epoch': 3} +{'type': 'loss', 'content': 0.012213180772960186, 'timestamp': '2025-09-30 22:12:40.672951', 'step': 1971, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:40.731179', 'step': 1971, 'epoch': 3} +{'type': 'loss', 'content': 0.030378760769963264, 'timestamp': '2025-09-30 22:12:40.739563', 'step': 1972, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:40.796297', 'step': 1972, 'epoch': 3} +{'type': 'loss', 'content': 0.014325362630188465, 'timestamp': '2025-09-30 22:12:40.800560', 'step': 1973, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:40.865717', 'step': 1973, 'epoch': 3} +{'type': 'loss', 'content': 0.006878285203129053, 'timestamp': '2025-09-30 22:12:40.871537', 'step': 1974, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:40.943601', 'step': 1974, 'epoch': 3} +{'type': 'loss', 'content': 0.019338076934218407, 'timestamp': '2025-09-30 22:12:40.960489', 'step': 1975, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:41.020679', 'step': 1975, 'epoch': 3} +{'type': 'loss', 'content': 0.013357887975871563, 'timestamp': '2025-09-30 22:12:41.029453', 'step': 1976, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:41.107755', 'step': 1976, 'epoch': 3} +{'type': 'loss', 'content': 0.00660253269597888, 'timestamp': '2025-09-30 22:12:41.126488', 'step': 1977, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:41.203228', 'step': 1977, 'epoch': 3} +{'type': 'loss', 'content': 0.006124368868768215, 'timestamp': '2025-09-30 22:12:41.221345', 'step': 1978, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:41.292381', 'step': 1978, 'epoch': 3} +{'type': 'loss', 'content': 0.00962238758802414, 'timestamp': '2025-09-30 22:12:41.299224', 'step': 1979, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:41.357369', 'step': 1979, 'epoch': 3} +{'type': 'loss', 'content': 0.002619996899738908, 'timestamp': '2025-09-30 22:12:41.379860', 'step': 1980, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:41.448904', 'step': 1980, 'epoch': 3} +{'type': 'loss', 'content': 0.008656610734760761, 'timestamp': '2025-09-30 22:12:41.452895', 'step': 1981, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:41.515103', 'step': 1981, 'epoch': 3} +{'type': 'loss', 'content': 0.028397956863045692, 'timestamp': '2025-09-30 22:12:41.519895', 'step': 1982, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:41.586905', 'step': 1982, 'epoch': 3} +{'type': 'loss', 'content': 0.02610507234930992, 'timestamp': '2025-09-30 22:12:41.590858', 'step': 1983, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:41.654054', 'step': 1983, 'epoch': 3} +{'type': 'loss', 'content': 0.018599865958094597, 'timestamp': '2025-09-30 22:12:41.671758', 'step': 1984, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:41.737134', 'step': 1984, 'epoch': 3} +{'type': 'loss', 'content': 0.005543197970837355, 'timestamp': '2025-09-30 22:12:41.742183', 'step': 1985, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:41.813581', 'step': 1985, 'epoch': 3} +{'type': 'loss', 'content': 0.002317222999408841, 'timestamp': '2025-09-30 22:12:41.815633', 'step': 1986, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:41.881426', 'step': 1986, 'epoch': 3} +{'type': 'loss', 'content': 0.014996451325714588, 'timestamp': '2025-09-30 22:12:41.883416', 'step': 1987, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:41.937781', 'step': 1987, 'epoch': 3} +{'type': 'loss', 'content': 0.017824586480855942, 'timestamp': '2025-09-30 22:12:41.943617', 'step': 1988, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:42.001541', 'step': 1988, 'epoch': 3} +{'type': 'loss', 'content': 0.026619719341397285, 'timestamp': '2025-09-30 22:12:42.004640', 'step': 1989, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:42.068935', 'step': 1989, 'epoch': 3} +{'type': 'loss', 'content': 0.0035347731318324804, 'timestamp': '2025-09-30 22:12:42.071286', 'step': 1990, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:42.126117', 'step': 1990, 'epoch': 3} +{'type': 'loss', 'content': 0.03799136355519295, 'timestamp': '2025-09-30 22:12:42.128517', 'step': 1991, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:42.184013', 'step': 1991, 'epoch': 3} +{'type': 'loss', 'content': 0.04470697417855263, 'timestamp': '2025-09-30 22:12:42.189722', 'step': 1992, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:42.244769', 'step': 1992, 'epoch': 3} +{'type': 'loss', 'content': 0.05278744548559189, 'timestamp': '2025-09-30 22:12:42.249219', 'step': 1993, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:42.315365', 'step': 1993, 'epoch': 3} +{'type': 'loss', 'content': 0.01737930439412594, 'timestamp': '2025-09-30 22:12:42.318475', 'step': 1994, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:42.383369', 'step': 1994, 'epoch': 3} +{'type': 'loss', 'content': 0.01412777416408062, 'timestamp': '2025-09-30 22:12:42.385937', 'step': 1995, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:12:43.827522', 'step': 1995, 'epoch': 3} +{'type': 'pplx', 'content': 30986920.64344966, 'timestamp': '2025-09-30 22:12:43.836901', 'step': 1995, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:43.896684', 'step': 1995, 'epoch': 3} +{'type': 'loss', 'content': 0.007152962498366833, 'timestamp': '2025-09-30 22:12:43.904023', 'step': 1996, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:43.962205', 'step': 1996, 'epoch': 3} +{'type': 'loss', 'content': 0.008768231607973576, 'timestamp': '2025-09-30 22:12:43.965930', 'step': 1997, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:44.024945', 'step': 1997, 'epoch': 3} +{'type': 'loss', 'content': 0.014288007281720638, 'timestamp': '2025-09-30 22:12:44.028859', 'step': 1998, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:44.086462', 'step': 1998, 'epoch': 3} +{'type': 'loss', 'content': 0.025513725355267525, 'timestamp': '2025-09-30 22:12:44.090556', 'step': 1999, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:44.145220', 'step': 1999, 'epoch': 3} +{'type': 'loss', 'content': 0.022684959694743156, 'timestamp': '2025-09-30 22:12:44.152892', 'step': 2000, 'epoch': 3} +{'type': 'info', 'content': 'Checkpoint saved at step 2000', 'timestamp': '2025-09-30 22:12:44.704758', 'step': 2000, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:44.762348', 'step': 2000, 'epoch': 3} +{'type': 'loss', 'content': 0.01981218159198761, 'timestamp': '2025-09-30 22:12:44.765533', 'step': 2001, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:44.837870', 'step': 2001, 'epoch': 3} +{'type': 'loss', 'content': 0.006580715533345938, 'timestamp': '2025-09-30 22:12:44.840899', 'step': 2002, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:44.910869', 'step': 2002, 'epoch': 3} +{'type': 'loss', 'content': 0.007349673192948103, 'timestamp': '2025-09-30 22:12:44.924598', 'step': 2003, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:44.990101', 'step': 2003, 'epoch': 3} +{'type': 'loss', 'content': 0.03402791544795036, 'timestamp': '2025-09-30 22:12:44.996862', 'step': 2004, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:45.064039', 'step': 2004, 'epoch': 3} +{'type': 'loss', 'content': 0.0037932060658931732, 'timestamp': '2025-09-30 22:12:45.073329', 'step': 2005, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:45.132903', 'step': 2005, 'epoch': 3} +{'type': 'loss', 'content': 0.03237783536314964, 'timestamp': '2025-09-30 22:12:45.135893', 'step': 2006, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:45.204595', 'step': 2006, 'epoch': 3} +{'type': 'loss', 'content': 0.012929347343742847, 'timestamp': '2025-09-30 22:12:45.207303', 'step': 2007, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:45.265577', 'step': 2007, 'epoch': 3} +{'type': 'loss', 'content': 0.01525042299181223, 'timestamp': '2025-09-30 22:12:45.277214', 'step': 2008, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:45.338311', 'step': 2008, 'epoch': 3} +{'type': 'loss', 'content': 0.018445158377289772, 'timestamp': '2025-09-30 22:12:45.342291', 'step': 2009, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:45.406350', 'step': 2009, 'epoch': 3} +{'type': 'loss', 'content': 0.023437367752194405, 'timestamp': '2025-09-30 22:12:45.416485', 'step': 2010, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:45.478091', 'step': 2010, 'epoch': 3} +{'type': 'loss', 'content': 0.022062715142965317, 'timestamp': '2025-09-30 22:12:45.487759', 'step': 2011, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:45.555782', 'step': 2011, 'epoch': 3} +{'type': 'loss', 'content': 0.022687969729304314, 'timestamp': '2025-09-30 22:12:45.562458', 'step': 2012, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:45.629028', 'step': 2012, 'epoch': 3} +{'type': 'loss', 'content': 0.007122944109141827, 'timestamp': '2025-09-30 22:12:45.631721', 'step': 2013, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:45.688904', 'step': 2013, 'epoch': 3} +{'type': 'loss', 'content': 0.007444100920110941, 'timestamp': '2025-09-30 22:12:45.692505', 'step': 2014, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:45.747857', 'step': 2014, 'epoch': 3} +{'type': 'loss', 'content': 0.02668391726911068, 'timestamp': '2025-09-30 22:12:45.757716', 'step': 2015, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:45.824072', 'step': 2015, 'epoch': 3} +{'type': 'loss', 'content': 0.01846177875995636, 'timestamp': '2025-09-30 22:12:45.830050', 'step': 2016, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:45.888760', 'step': 2016, 'epoch': 3} +{'type': 'loss', 'content': 0.013639253564178944, 'timestamp': '2025-09-30 22:12:45.898113', 'step': 2017, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:45.960139', 'step': 2017, 'epoch': 3} +{'type': 'loss', 'content': 0.01387722697108984, 'timestamp': '2025-09-30 22:12:45.968363', 'step': 2018, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:46.026652', 'step': 2018, 'epoch': 3} +{'type': 'loss', 'content': 0.00344693916849792, 'timestamp': '2025-09-30 22:12:46.036303', 'step': 2019, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:46.092962', 'step': 2019, 'epoch': 3} +{'type': 'loss', 'content': 0.010282598435878754, 'timestamp': '2025-09-30 22:12:46.101409', 'step': 2020, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:46.156391', 'step': 2020, 'epoch': 3} +{'type': 'loss', 'content': 0.006298901047557592, 'timestamp': '2025-09-30 22:12:46.159534', 'step': 2021, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:46.213661', 'step': 2021, 'epoch': 3} +{'type': 'loss', 'content': 0.008653373457491398, 'timestamp': '2025-09-30 22:12:46.217113', 'step': 2022, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:46.271623', 'step': 2022, 'epoch': 3} +{'type': 'loss', 'content': 0.0037791277281939983, 'timestamp': '2025-09-30 22:12:46.276546', 'step': 2023, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:46.338579', 'step': 2023, 'epoch': 3} +{'type': 'loss', 'content': 0.014515328221023083, 'timestamp': '2025-09-30 22:12:46.351822', 'step': 2024, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:46.405559', 'step': 2024, 'epoch': 3} +{'type': 'loss', 'content': 0.013266796246170998, 'timestamp': '2025-09-30 22:12:46.416235', 'step': 2025, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:46.474075', 'step': 2025, 'epoch': 3} +{'type': 'loss', 'content': 0.002332469215616584, 'timestamp': '2025-09-30 22:12:46.483229', 'step': 2026, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:46.547920', 'step': 2026, 'epoch': 3} +{'type': 'loss', 'content': 0.00418002950027585, 'timestamp': '2025-09-30 22:12:46.559265', 'step': 2027, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:46.614126', 'step': 2027, 'epoch': 3} +{'type': 'loss', 'content': 0.004884445574134588, 'timestamp': '2025-09-30 22:12:46.621285', 'step': 2028, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:46.683577', 'step': 2028, 'epoch': 3} +{'type': 'loss', 'content': 0.01921374723315239, 'timestamp': '2025-09-30 22:12:46.687247', 'step': 2029, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:46.754441', 'step': 2029, 'epoch': 3} +{'type': 'loss', 'content': 0.01416950486600399, 'timestamp': '2025-09-30 22:12:46.764603', 'step': 2030, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:46.826453', 'step': 2030, 'epoch': 3} +{'type': 'loss', 'content': 0.004991916473954916, 'timestamp': '2025-09-30 22:12:46.829235', 'step': 2031, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:46.891979', 'step': 2031, 'epoch': 3} +{'type': 'loss', 'content': 0.00923225563019514, 'timestamp': '2025-09-30 22:12:46.899939', 'step': 2032, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:46.960726', 'step': 2032, 'epoch': 3} +{'type': 'loss', 'content': 0.029489969834685326, 'timestamp': '2025-09-30 22:12:46.963458', 'step': 2033, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:47.025987', 'step': 2033, 'epoch': 3} +{'type': 'loss', 'content': 0.03119957633316517, 'timestamp': '2025-09-30 22:12:47.036326', 'step': 2034, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:47.092090', 'step': 2034, 'epoch': 3} +{'type': 'loss', 'content': 0.013563781045377254, 'timestamp': '2025-09-30 22:12:47.103675', 'step': 2035, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:47.172647', 'step': 2035, 'epoch': 3} +{'type': 'loss', 'content': 0.013546071946620941, 'timestamp': '2025-09-30 22:12:47.179641', 'step': 2036, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:47.235262', 'step': 2036, 'epoch': 3} +{'type': 'loss', 'content': 0.013057815842330456, 'timestamp': '2025-09-30 22:12:47.237721', 'step': 2037, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:47.304263', 'step': 2037, 'epoch': 3} +{'type': 'loss', 'content': 0.028074579313397408, 'timestamp': '2025-09-30 22:12:47.307359', 'step': 2038, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:47.365018', 'step': 2038, 'epoch': 3} +{'type': 'loss', 'content': 0.0029875219333916903, 'timestamp': '2025-09-30 22:12:47.376522', 'step': 2039, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:47.440639', 'step': 2039, 'epoch': 3} +{'type': 'loss', 'content': 0.002364709507673979, 'timestamp': '2025-09-30 22:12:47.453613', 'step': 2040, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:47.519200', 'step': 2040, 'epoch': 3} +{'type': 'loss', 'content': 0.01649676077067852, 'timestamp': '2025-09-30 22:12:47.528985', 'step': 2041, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:47.591132', 'step': 2041, 'epoch': 3} +{'type': 'loss', 'content': 0.0018646756652742624, 'timestamp': '2025-09-30 22:12:47.594259', 'step': 2042, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:47.657779', 'step': 2042, 'epoch': 3} +{'type': 'loss', 'content': 0.018227288499474525, 'timestamp': '2025-09-30 22:12:47.662542', 'step': 2043, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:47.725112', 'step': 2043, 'epoch': 3} +{'type': 'loss', 'content': 0.02195359393954277, 'timestamp': '2025-09-30 22:12:47.739489', 'step': 2044, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:47.807304', 'step': 2044, 'epoch': 3} +{'type': 'loss', 'content': 0.00557445315644145, 'timestamp': '2025-09-30 22:12:47.811883', 'step': 2045, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:47.868755', 'step': 2045, 'epoch': 3} +{'type': 'loss', 'content': 0.04431688040494919, 'timestamp': '2025-09-30 22:12:47.873901', 'step': 2046, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:47.937423', 'step': 2046, 'epoch': 3} +{'type': 'loss', 'content': 0.018206002190709114, 'timestamp': '2025-09-30 22:12:47.947104', 'step': 2047, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:48.006345', 'step': 2047, 'epoch': 3} +{'type': 'loss', 'content': 0.006048198323696852, 'timestamp': '2025-09-30 22:12:48.024417', 'step': 2048, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:48.092028', 'step': 2048, 'epoch': 3} +{'type': 'loss', 'content': 0.004360709339380264, 'timestamp': '2025-09-30 22:12:48.107431', 'step': 2049, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:48.174773', 'step': 2049, 'epoch': 3} +{'type': 'loss', 'content': 0.011058392934501171, 'timestamp': '2025-09-30 22:12:48.187116', 'step': 2050, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:48.242660', 'step': 2050, 'epoch': 3} +{'type': 'loss', 'content': 0.00571943586692214, 'timestamp': '2025-09-30 22:12:48.249247', 'step': 2051, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:48.310667', 'step': 2051, 'epoch': 3} +{'type': 'loss', 'content': 0.026921484619379044, 'timestamp': '2025-09-30 22:12:48.325873', 'step': 2052, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:12:49.721444', 'step': 2052, 'epoch': 3} +{'type': 'pplx', 'content': 31591669.075829722, 'timestamp': '2025-09-30 22:12:49.725309', 'step': 2052, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:49.784999', 'step': 2052, 'epoch': 3} +{'type': 'loss', 'content': 0.007747288327664137, 'timestamp': '2025-09-30 22:12:49.797589', 'step': 2053, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:49.852580', 'step': 2053, 'epoch': 3} +{'type': 'loss', 'content': 0.0032807444222271442, 'timestamp': '2025-09-30 22:12:49.855656', 'step': 2054, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:49.917143', 'step': 2054, 'epoch': 3} +{'type': 'loss', 'content': 0.013838792219758034, 'timestamp': '2025-09-30 22:12:49.931141', 'step': 2055, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:49.987250', 'step': 2055, 'epoch': 3} +{'type': 'loss', 'content': 0.002472275635227561, 'timestamp': '2025-09-30 22:12:49.995095', 'step': 2056, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:50.050210', 'step': 2056, 'epoch': 3} +{'type': 'loss', 'content': 0.01641083136200905, 'timestamp': '2025-09-30 22:12:50.061765', 'step': 2057, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:50.122990', 'step': 2057, 'epoch': 3} +{'type': 'loss', 'content': 0.00696180434897542, 'timestamp': '2025-09-30 22:12:50.126640', 'step': 2058, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:50.183835', 'step': 2058, 'epoch': 3} +{'type': 'loss', 'content': 0.006303395610302687, 'timestamp': '2025-09-30 22:12:50.196071', 'step': 2059, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:50.252924', 'step': 2059, 'epoch': 3} +{'type': 'loss', 'content': 0.011265805922448635, 'timestamp': '2025-09-30 22:12:50.267874', 'step': 2060, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:50.326364', 'step': 2060, 'epoch': 3} +{'type': 'loss', 'content': 0.01980595663189888, 'timestamp': '2025-09-30 22:12:50.341639', 'step': 2061, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:50.400756', 'step': 2061, 'epoch': 3} +{'type': 'loss', 'content': 0.002819489687681198, 'timestamp': '2025-09-30 22:12:50.406844', 'step': 2062, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:50.463800', 'step': 2062, 'epoch': 3} +{'type': 'loss', 'content': 0.007079659961163998, 'timestamp': '2025-09-30 22:12:50.467330', 'step': 2063, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:50.524689', 'step': 2063, 'epoch': 3} +{'type': 'loss', 'content': 0.009019630961120129, 'timestamp': '2025-09-30 22:12:50.533155', 'step': 2064, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:50.603580', 'step': 2064, 'epoch': 3} +{'type': 'loss', 'content': 0.0046064346097409725, 'timestamp': '2025-09-30 22:12:50.606342', 'step': 2065, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:50.662393', 'step': 2065, 'epoch': 3} +{'type': 'loss', 'content': 0.025710809975862503, 'timestamp': '2025-09-30 22:12:50.675290', 'step': 2066, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:50.739641', 'step': 2066, 'epoch': 3} +{'type': 'loss', 'content': 0.01167394407093525, 'timestamp': '2025-09-30 22:12:50.742662', 'step': 2067, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:50.805429', 'step': 2067, 'epoch': 3} +{'type': 'loss', 'content': 0.021902693435549736, 'timestamp': '2025-09-30 22:12:50.812165', 'step': 2068, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:50.871441', 'step': 2068, 'epoch': 3} +{'type': 'loss', 'content': 0.004060753621160984, 'timestamp': '2025-09-30 22:12:50.874617', 'step': 2069, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:50.929615', 'step': 2069, 'epoch': 3} +{'type': 'loss', 'content': 0.023838216438889503, 'timestamp': '2025-09-30 22:12:50.933346', 'step': 2070, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:50.988296', 'step': 2070, 'epoch': 3} +{'type': 'loss', 'content': 0.000567333830986172, 'timestamp': '2025-09-30 22:12:50.991262', 'step': 2071, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:51.048304', 'step': 2071, 'epoch': 3} +{'type': 'loss', 'content': 0.033882711082696915, 'timestamp': '2025-09-30 22:12:51.054912', 'step': 2072, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:51.125544', 'step': 2072, 'epoch': 3} +{'type': 'loss', 'content': 0.01983877643942833, 'timestamp': '2025-09-30 22:12:51.128731', 'step': 2073, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:51.183987', 'step': 2073, 'epoch': 3} +{'type': 'loss', 'content': 0.012886099517345428, 'timestamp': '2025-09-30 22:12:51.192788', 'step': 2074, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:51.255083', 'step': 2074, 'epoch': 3} +{'type': 'loss', 'content': 0.005958546884357929, 'timestamp': '2025-09-30 22:12:51.257289', 'step': 2075, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:51.325526', 'step': 2075, 'epoch': 3} +{'type': 'loss', 'content': 0.005503936670720577, 'timestamp': '2025-09-30 22:12:51.333228', 'step': 2076, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:51.387290', 'step': 2076, 'epoch': 3} +{'type': 'loss', 'content': 0.0063980803824961185, 'timestamp': '2025-09-30 22:12:51.396594', 'step': 2077, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:51.456724', 'step': 2077, 'epoch': 3} +{'type': 'loss', 'content': 0.012718225829303265, 'timestamp': '2025-09-30 22:12:51.459467', 'step': 2078, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:51.520522', 'step': 2078, 'epoch': 3} +{'type': 'loss', 'content': 0.02810058370232582, 'timestamp': '2025-09-30 22:12:51.523666', 'step': 2079, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:51.578431', 'step': 2079, 'epoch': 3} +{'type': 'loss', 'content': 0.00501110078766942, 'timestamp': '2025-09-30 22:12:51.585017', 'step': 2080, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:51.640545', 'step': 2080, 'epoch': 3} +{'type': 'loss', 'content': 0.015446553006768227, 'timestamp': '2025-09-30 22:12:51.649823', 'step': 2081, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:51.706135', 'step': 2081, 'epoch': 3} +{'type': 'loss', 'content': 0.006646531168371439, 'timestamp': '2025-09-30 22:12:51.709919', 'step': 2082, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:51.770248', 'step': 2082, 'epoch': 3} +{'type': 'loss', 'content': 0.007312596309930086, 'timestamp': '2025-09-30 22:12:51.773296', 'step': 2083, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:51.834524', 'step': 2083, 'epoch': 3} +{'type': 'loss', 'content': 0.001499996636994183, 'timestamp': '2025-09-30 22:12:51.847405', 'step': 2084, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:51.901463', 'step': 2084, 'epoch': 3} +{'type': 'loss', 'content': 0.03770451620221138, 'timestamp': '2025-09-30 22:12:51.904924', 'step': 2085, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:51.961949', 'step': 2085, 'epoch': 3} +{'type': 'loss', 'content': 0.013079524040222168, 'timestamp': '2025-09-30 22:12:51.965128', 'step': 2086, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:52.023658', 'step': 2086, 'epoch': 3} +{'type': 'loss', 'content': 0.007665101904422045, 'timestamp': '2025-09-30 22:12:52.026475', 'step': 2087, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:52.099632', 'step': 2087, 'epoch': 3} +{'type': 'loss', 'content': 0.014709694311022758, 'timestamp': '2025-09-30 22:12:52.114550', 'step': 2088, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:52.174855', 'step': 2088, 'epoch': 3} +{'type': 'loss', 'content': 0.0035318650770932436, 'timestamp': '2025-09-30 22:12:52.178697', 'step': 2089, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:12:52.242420', 'step': 2089, 'epoch': 3} +{'type': 'loss', 'content': 0.0026837512850761414, 'timestamp': '2025-09-30 22:12:52.245902', 'step': 2090, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:52.303214', 'step': 2090, 'epoch': 3} +{'type': 'loss', 'content': 0.003676437307149172, 'timestamp': '2025-09-30 22:12:52.307125', 'step': 2091, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:52.366135', 'step': 2091, 'epoch': 3} +{'type': 'loss', 'content': 0.022839205339550972, 'timestamp': '2025-09-30 22:12:52.375466', 'step': 2092, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:52.432074', 'step': 2092, 'epoch': 3} +{'type': 'loss', 'content': 0.0020653358660638332, 'timestamp': '2025-09-30 22:12:52.435305', 'step': 2093, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:52.496909', 'step': 2093, 'epoch': 3} +{'type': 'loss', 'content': 0.010991484858095646, 'timestamp': '2025-09-30 22:12:52.500189', 'step': 2094, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:52.560748', 'step': 2094, 'epoch': 3} +{'type': 'loss', 'content': 0.005481290630996227, 'timestamp': '2025-09-30 22:12:52.565143', 'step': 2095, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:52.620407', 'step': 2095, 'epoch': 3} +{'type': 'loss', 'content': 0.04029859974980354, 'timestamp': '2025-09-30 22:12:52.627397', 'step': 2096, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:52.684592', 'step': 2096, 'epoch': 3} +{'type': 'loss', 'content': 0.0042072138749063015, 'timestamp': '2025-09-30 22:12:52.689409', 'step': 2097, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:52.747358', 'step': 2097, 'epoch': 3} +{'type': 'loss', 'content': 0.049394670873880386, 'timestamp': '2025-09-30 22:12:52.751278', 'step': 2098, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:52.806129', 'step': 2098, 'epoch': 3} +{'type': 'loss', 'content': 0.020421581342816353, 'timestamp': '2025-09-30 22:12:52.809621', 'step': 2099, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:52.865837', 'step': 2099, 'epoch': 3} +{'type': 'loss', 'content': 0.010594218969345093, 'timestamp': '2025-09-30 22:12:52.872798', 'step': 2100, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:52.926620', 'step': 2100, 'epoch': 3} +{'type': 'loss', 'content': 0.0015280555235221982, 'timestamp': '2025-09-30 22:12:52.935170', 'step': 2101, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:52.990431', 'step': 2101, 'epoch': 3} +{'type': 'loss', 'content': 0.0070367841981351376, 'timestamp': '2025-09-30 22:12:52.993310', 'step': 2102, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:53.050292', 'step': 2102, 'epoch': 3} +{'type': 'loss', 'content': 0.030247675254940987, 'timestamp': '2025-09-30 22:12:53.053333', 'step': 2103, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:53.116406', 'step': 2103, 'epoch': 3} +{'type': 'loss', 'content': 0.0010143903782591224, 'timestamp': '2025-09-30 22:12:53.123065', 'step': 2104, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:53.176416', 'step': 2104, 'epoch': 3} +{'type': 'loss', 'content': 0.0039184377528727055, 'timestamp': '2025-09-30 22:12:53.179306', 'step': 2105, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:53.235365', 'step': 2105, 'epoch': 3} +{'type': 'loss', 'content': 0.007116036955267191, 'timestamp': '2025-09-30 22:12:53.238300', 'step': 2106, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:53.293136', 'step': 2106, 'epoch': 3} +{'type': 'loss', 'content': 0.02538345754146576, 'timestamp': '2025-09-30 22:12:53.296719', 'step': 2107, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:53.358073', 'step': 2107, 'epoch': 3} +{'type': 'loss', 'content': 0.007159593049436808, 'timestamp': '2025-09-30 22:12:53.377858', 'step': 2108, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:53.433335', 'step': 2108, 'epoch': 3} +{'type': 'loss', 'content': 0.012715340591967106, 'timestamp': '2025-09-30 22:12:53.435844', 'step': 2109, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:12:54.807881', 'step': 2109, 'epoch': 3} +{'type': 'pplx', 'content': 31717602.29605612, 'timestamp': '2025-09-30 22:12:54.810448', 'step': 2109, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:54.866666', 'step': 2109, 'epoch': 3} +{'type': 'loss', 'content': 0.0069103785790503025, 'timestamp': '2025-09-30 22:12:54.870626', 'step': 2110, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:54.925774', 'step': 2110, 'epoch': 3} +{'type': 'loss', 'content': 0.0004411868576426059, 'timestamp': '2025-09-30 22:12:54.928538', 'step': 2111, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:54.986256', 'step': 2111, 'epoch': 3} +{'type': 'loss', 'content': 0.02559594437479973, 'timestamp': '2025-09-30 22:12:54.994525', 'step': 2112, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:55.056307', 'step': 2112, 'epoch': 3} +{'type': 'loss', 'content': 0.006536331493407488, 'timestamp': '2025-09-30 22:12:55.061762', 'step': 2113, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:55.119574', 'step': 2113, 'epoch': 3} +{'type': 'loss', 'content': 0.034199222922325134, 'timestamp': '2025-09-30 22:12:55.121736', 'step': 2114, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:55.184055', 'step': 2114, 'epoch': 3} +{'type': 'loss', 'content': 0.0037060887552797794, 'timestamp': '2025-09-30 22:12:55.190340', 'step': 2115, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:55.246418', 'step': 2115, 'epoch': 3} +{'type': 'loss', 'content': 0.010455816984176636, 'timestamp': '2025-09-30 22:12:55.257495', 'step': 2116, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:55.323363', 'step': 2116, 'epoch': 3} +{'type': 'loss', 'content': 0.009131490252912045, 'timestamp': '2025-09-30 22:12:55.329740', 'step': 2117, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:55.385535', 'step': 2117, 'epoch': 3} +{'type': 'loss', 'content': 0.028637733310461044, 'timestamp': '2025-09-30 22:12:55.388119', 'step': 2118, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:55.462086', 'step': 2118, 'epoch': 3} +{'type': 'loss', 'content': 0.0017552494537085295, 'timestamp': '2025-09-30 22:12:55.464428', 'step': 2119, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:55.524997', 'step': 2119, 'epoch': 3} +{'type': 'loss', 'content': 0.0008230188977904618, 'timestamp': '2025-09-30 22:12:55.532668', 'step': 2120, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:12:55.586377', 'step': 2120, 'epoch': 3} +{'type': 'loss', 'content': 0.001255502225831151, 'timestamp': '2025-09-30 22:12:55.599866', 'step': 2121, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:55.687367', 'step': 2121, 'epoch': 3} +{'type': 'loss', 'content': 0.03605801239609718, 'timestamp': '2025-09-30 22:12:55.689607', 'step': 2122, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:55.780220', 'step': 2122, 'epoch': 3} +{'type': 'loss', 'content': 0.02137705311179161, 'timestamp': '2025-09-30 22:12:55.784688', 'step': 2123, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:55.876184', 'step': 2123, 'epoch': 3} +{'type': 'loss', 'content': 0.00452017318457365, 'timestamp': '2025-09-30 22:12:55.885171', 'step': 2124, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:55.977205', 'step': 2124, 'epoch': 3} +{'type': 'loss', 'content': 0.022074688225984573, 'timestamp': '2025-09-30 22:12:55.979429', 'step': 2125, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:56.069986', 'step': 2125, 'epoch': 3} +{'type': 'loss', 'content': 0.01156954187899828, 'timestamp': '2025-09-30 22:12:56.072411', 'step': 2126, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:56.160302', 'step': 2126, 'epoch': 3} +{'type': 'loss', 'content': 0.01876743696630001, 'timestamp': '2025-09-30 22:12:56.162722', 'step': 2127, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:56.258051', 'step': 2127, 'epoch': 3} +{'type': 'loss', 'content': 0.018221214413642883, 'timestamp': '2025-09-30 22:12:56.268989', 'step': 2128, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:56.343166', 'step': 2128, 'epoch': 3} +{'type': 'loss', 'content': 0.011680861935019493, 'timestamp': '2025-09-30 22:12:56.345766', 'step': 2129, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:56.427254', 'step': 2129, 'epoch': 3} +{'type': 'loss', 'content': 0.007095720618963242, 'timestamp': '2025-09-30 22:12:56.431297', 'step': 2130, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:56.498838', 'step': 2130, 'epoch': 3} +{'type': 'loss', 'content': 0.004647306632250547, 'timestamp': '2025-09-30 22:12:56.501718', 'step': 2131, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:56.576126', 'step': 2131, 'epoch': 3} +{'type': 'loss', 'content': 0.012296868488192558, 'timestamp': '2025-09-30 22:12:56.582860', 'step': 2132, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:56.659773', 'step': 2132, 'epoch': 3} +{'type': 'loss', 'content': 0.03337612748146057, 'timestamp': '2025-09-30 22:12:56.663029', 'step': 2133, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:56.744957', 'step': 2133, 'epoch': 3} +{'type': 'loss', 'content': 0.00937966164201498, 'timestamp': '2025-09-30 22:12:56.748135', 'step': 2134, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:56.821695', 'step': 2134, 'epoch': 3} +{'type': 'loss', 'content': 0.002799929352477193, 'timestamp': '2025-09-30 22:12:56.825877', 'step': 2135, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:56.906853', 'step': 2135, 'epoch': 3} +{'type': 'loss', 'content': 0.0039896611124277115, 'timestamp': '2025-09-30 22:12:56.913216', 'step': 2136, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:56.988658', 'step': 2136, 'epoch': 3} +{'type': 'loss', 'content': 0.00480355229228735, 'timestamp': '2025-09-30 22:12:56.998934', 'step': 2137, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.075916', 'step': 2137, 'epoch': 3} +{'type': 'loss', 'content': 0.006306238938122988, 'timestamp': '2025-09-30 22:12:57.078555', 'step': 2138, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.158792', 'step': 2138, 'epoch': 3} +{'type': 'loss', 'content': 0.0029865510296076536, 'timestamp': '2025-09-30 22:12:57.166147', 'step': 2139, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.241108', 'step': 2139, 'epoch': 3} +{'type': 'loss', 'content': 0.00501589709892869, 'timestamp': '2025-09-30 22:12:57.252081', 'step': 2140, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.309098', 'step': 2140, 'epoch': 3} +{'type': 'loss', 'content': 0.0062422240152955055, 'timestamp': '2025-09-30 22:12:57.311877', 'step': 2141, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.370622', 'step': 2141, 'epoch': 3} +{'type': 'loss', 'content': 0.014008880592882633, 'timestamp': '2025-09-30 22:12:57.373707', 'step': 2142, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.446870', 'step': 2142, 'epoch': 3} +{'type': 'loss', 'content': 0.0016020223265513778, 'timestamp': '2025-09-30 22:12:57.450732', 'step': 2143, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.506223', 'step': 2143, 'epoch': 3} +{'type': 'loss', 'content': 0.007073293440043926, 'timestamp': '2025-09-30 22:12:57.520776', 'step': 2144, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.581482', 'step': 2144, 'epoch': 3} +{'type': 'loss', 'content': 0.006945033557713032, 'timestamp': '2025-09-30 22:12:57.585496', 'step': 2145, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.640324', 'step': 2145, 'epoch': 3} +{'type': 'loss', 'content': 0.0017107333987951279, 'timestamp': '2025-09-30 22:12:57.642964', 'step': 2146, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.697675', 'step': 2146, 'epoch': 3} +{'type': 'loss', 'content': 0.020304743200540543, 'timestamp': '2025-09-30 22:12:57.701181', 'step': 2147, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.755963', 'step': 2147, 'epoch': 3} +{'type': 'loss', 'content': 0.004924300592392683, 'timestamp': '2025-09-30 22:12:57.762052', 'step': 2148, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.825982', 'step': 2148, 'epoch': 3} +{'type': 'loss', 'content': 0.008847953751683235, 'timestamp': '2025-09-30 22:12:57.828336', 'step': 2149, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.895714', 'step': 2149, 'epoch': 3} +{'type': 'loss', 'content': 0.0031908315140753984, 'timestamp': '2025-09-30 22:12:57.898745', 'step': 2150, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:57.954645', 'step': 2150, 'epoch': 3} +{'type': 'loss', 'content': 0.009887597523629665, 'timestamp': '2025-09-30 22:12:57.957892', 'step': 2151, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:58.016821', 'step': 2151, 'epoch': 3} +{'type': 'loss', 'content': 0.016261015087366104, 'timestamp': '2025-09-30 22:12:58.023858', 'step': 2152, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:58.082714', 'step': 2152, 'epoch': 3} +{'type': 'loss', 'content': 0.011811012402176857, 'timestamp': '2025-09-30 22:12:58.086341', 'step': 2153, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:58.143616', 'step': 2153, 'epoch': 3} +{'type': 'loss', 'content': 0.006635370198637247, 'timestamp': '2025-09-30 22:12:58.146935', 'step': 2154, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:58.203868', 'step': 2154, 'epoch': 3} +{'type': 'loss', 'content': 0.009925504215061665, 'timestamp': '2025-09-30 22:12:58.206731', 'step': 2155, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:58.276892', 'step': 2155, 'epoch': 3} +{'type': 'loss', 'content': 0.01277694571763277, 'timestamp': '2025-09-30 22:12:58.284433', 'step': 2156, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:58.341811', 'step': 2156, 'epoch': 3} +{'type': 'loss', 'content': 0.019755104556679726, 'timestamp': '2025-09-30 22:12:58.345051', 'step': 2157, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:58.401533', 'step': 2157, 'epoch': 3} +{'type': 'loss', 'content': 0.005314360372722149, 'timestamp': '2025-09-30 22:12:58.404741', 'step': 2158, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:58.462362', 'step': 2158, 'epoch': 3} +{'type': 'loss', 'content': 0.008464450016617775, 'timestamp': '2025-09-30 22:12:58.465423', 'step': 2159, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:58.536232', 'step': 2159, 'epoch': 3} +{'type': 'loss', 'content': 0.02357994019985199, 'timestamp': '2025-09-30 22:12:58.543507', 'step': 2160, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:58.602367', 'step': 2160, 'epoch': 3} +{'type': 'loss', 'content': 0.015320166014134884, 'timestamp': '2025-09-30 22:12:58.609775', 'step': 2161, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:12:58.664326', 'step': 2161, 'epoch': 3} +{'type': 'loss', 'content': 0.026179542765021324, 'timestamp': '2025-09-30 22:12:58.667705', 'step': 2162, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:12:58.728789', 'step': 2162, 'epoch': 3} +{'type': 'loss', 'content': 0.021009227260947227, 'timestamp': '2025-09-30 22:12:58.732799', 'step': 2163, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:58.795597', 'step': 2163, 'epoch': 3} +{'type': 'loss', 'content': 0.019618257880210876, 'timestamp': '2025-09-30 22:12:58.801753', 'step': 2164, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:12:58.859468', 'step': 2164, 'epoch': 3} +{'type': 'loss', 'content': 0.0046376558020710945, 'timestamp': '2025-09-30 22:12:58.863573', 'step': 2165, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:12:58.919648', 'step': 2165, 'epoch': 3} +{'type': 'loss', 'content': 0.013165593147277832, 'timestamp': '2025-09-30 22:12:58.934748', 'step': 2166, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:13:00.344251', 'step': 2166, 'epoch': 3} +{'type': 'pplx', 'content': 28392377.416443832, 'timestamp': '2025-09-30 22:13:00.348523', 'step': 2166, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:00.405249', 'step': 2166, 'epoch': 3} +{'type': 'loss', 'content': 0.01954605057835579, 'timestamp': '2025-09-30 22:13:00.413299', 'step': 2167, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:00.470031', 'step': 2167, 'epoch': 3} +{'type': 'loss', 'content': 0.022829841822385788, 'timestamp': '2025-09-30 22:13:00.484098', 'step': 2168, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:00.539409', 'step': 2168, 'epoch': 3} +{'type': 'loss', 'content': 0.007014808710664511, 'timestamp': '2025-09-30 22:13:00.541868', 'step': 2169, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:13:00.603810', 'step': 2169, 'epoch': 3} +{'type': 'loss', 'content': 0.0007217395468614995, 'timestamp': '2025-09-30 22:13:00.609050', 'step': 2170, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:00.668024', 'step': 2170, 'epoch': 3} +{'type': 'loss', 'content': 0.0036041310522705317, 'timestamp': '2025-09-30 22:13:00.671095', 'step': 2171, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:00.732930', 'step': 2171, 'epoch': 3} +{'type': 'loss', 'content': 0.0017051756149157882, 'timestamp': '2025-09-30 22:13:00.740229', 'step': 2172, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:00.802077', 'step': 2172, 'epoch': 3} +{'type': 'loss', 'content': 0.01299409568309784, 'timestamp': '2025-09-30 22:13:00.806365', 'step': 2173, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:00.860598', 'step': 2173, 'epoch': 3} +{'type': 'loss', 'content': 0.008998853154480457, 'timestamp': '2025-09-30 22:13:00.868540', 'step': 2174, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:00.923574', 'step': 2174, 'epoch': 3} +{'type': 'loss', 'content': 0.022543568164110184, 'timestamp': '2025-09-30 22:13:00.933624', 'step': 2175, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:00.995139', 'step': 2175, 'epoch': 3} +{'type': 'loss', 'content': 0.00293608452193439, 'timestamp': '2025-09-30 22:13:01.001532', 'step': 2176, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:01.063423', 'step': 2176, 'epoch': 3} +{'type': 'loss', 'content': 0.007718019187450409, 'timestamp': '2025-09-30 22:13:01.077359', 'step': 2177, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:01.139374', 'step': 2177, 'epoch': 3} +{'type': 'loss', 'content': 0.05061681196093559, 'timestamp': '2025-09-30 22:13:01.142240', 'step': 2178, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:01.205104', 'step': 2178, 'epoch': 3} +{'type': 'loss', 'content': 0.0036929224152117968, 'timestamp': '2025-09-30 22:13:01.209922', 'step': 2179, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:01.274566', 'step': 2179, 'epoch': 3} +{'type': 'loss', 'content': 0.010988332331180573, 'timestamp': '2025-09-30 22:13:01.282766', 'step': 2180, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:01.339141', 'step': 2180, 'epoch': 3} +{'type': 'loss', 'content': 0.006843825336545706, 'timestamp': '2025-09-30 22:13:01.345966', 'step': 2181, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:01.404084', 'step': 2181, 'epoch': 3} +{'type': 'loss', 'content': 0.0031122539658099413, 'timestamp': '2025-09-30 22:13:01.410275', 'step': 2182, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:01.467236', 'step': 2182, 'epoch': 3} +{'type': 'loss', 'content': 0.012426759116351604, 'timestamp': '2025-09-30 22:13:01.470013', 'step': 2183, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:01.527816', 'step': 2183, 'epoch': 3} +{'type': 'loss', 'content': 0.011462002992630005, 'timestamp': '2025-09-30 22:13:01.537702', 'step': 2184, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:01.603326', 'step': 2184, 'epoch': 3} +{'type': 'loss', 'content': 0.013611538335680962, 'timestamp': '2025-09-30 22:13:01.605682', 'step': 2185, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:01.662218', 'step': 2185, 'epoch': 3} +{'type': 'loss', 'content': 0.00627383217215538, 'timestamp': '2025-09-30 22:13:01.666345', 'step': 2186, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:01.723862', 'step': 2186, 'epoch': 3} +{'type': 'loss', 'content': 0.016363272443413734, 'timestamp': '2025-09-30 22:13:01.728741', 'step': 2187, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:01.787447', 'step': 2187, 'epoch': 3} +{'type': 'loss', 'content': 0.01059199869632721, 'timestamp': '2025-09-30 22:13:01.795897', 'step': 2188, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:01.851716', 'step': 2188, 'epoch': 3} +{'type': 'loss', 'content': 0.007782274391502142, 'timestamp': '2025-09-30 22:13:01.854373', 'step': 2189, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:01.909906', 'step': 2189, 'epoch': 3} +{'type': 'loss', 'content': 0.004535790532827377, 'timestamp': '2025-09-30 22:13:01.919337', 'step': 2190, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:01.983918', 'step': 2190, 'epoch': 3} +{'type': 'loss', 'content': 0.028639093041419983, 'timestamp': '2025-09-30 22:13:01.992130', 'step': 2191, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:02.047436', 'step': 2191, 'epoch': 3} +{'type': 'loss', 'content': 0.0020749655086547136, 'timestamp': '2025-09-30 22:13:02.053529', 'step': 2192, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:02.114527', 'step': 2192, 'epoch': 3} +{'type': 'loss', 'content': 0.004414039198309183, 'timestamp': '2025-09-30 22:13:02.120142', 'step': 2193, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:02.181647', 'step': 2193, 'epoch': 3} +{'type': 'loss', 'content': 0.022147908806800842, 'timestamp': '2025-09-30 22:13:02.193109', 'step': 2194, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:02.261747', 'step': 2194, 'epoch': 3} +{'type': 'loss', 'content': 0.0016150318551808596, 'timestamp': '2025-09-30 22:13:02.274714', 'step': 2195, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:02.342578', 'step': 2195, 'epoch': 3} +{'type': 'loss', 'content': 0.04595841094851494, 'timestamp': '2025-09-30 22:13:02.361835', 'step': 2196, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:02.421979', 'step': 2196, 'epoch': 3} +{'type': 'loss', 'content': 0.02536730282008648, 'timestamp': '2025-09-30 22:13:02.426233', 'step': 2197, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:02.480360', 'step': 2197, 'epoch': 3} +{'type': 'loss', 'content': 0.0070744892582297325, 'timestamp': '2025-09-30 22:13:02.489001', 'step': 2198, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:02.544743', 'step': 2198, 'epoch': 3} +{'type': 'loss', 'content': 0.005618877708911896, 'timestamp': '2025-09-30 22:13:02.547146', 'step': 2199, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:02.606597', 'step': 2199, 'epoch': 3} +{'type': 'loss', 'content': 0.0013747888151556253, 'timestamp': '2025-09-30 22:13:02.615649', 'step': 2200, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:13:02.670306', 'step': 2200, 'epoch': 3} +{'type': 'loss', 'content': 0.00427739042788744, 'timestamp': '2025-09-30 22:13:02.672784', 'step': 2201, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:02.749819', 'step': 2201, 'epoch': 3} +{'type': 'loss', 'content': 0.012922806665301323, 'timestamp': '2025-09-30 22:13:02.752972', 'step': 2202, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:02.811834', 'step': 2202, 'epoch': 3} +{'type': 'loss', 'content': 0.0016593519831076264, 'timestamp': '2025-09-30 22:13:02.817260', 'step': 2203, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:02.872153', 'step': 2203, 'epoch': 3} +{'type': 'loss', 'content': 0.003395694075152278, 'timestamp': '2025-09-30 22:13:02.881138', 'step': 2204, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:02.939609', 'step': 2204, 'epoch': 3} +{'type': 'loss', 'content': 0.0013927072286605835, 'timestamp': '2025-09-30 22:13:02.942410', 'step': 2205, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:02.998936', 'step': 2205, 'epoch': 3} +{'type': 'loss', 'content': 0.017614727839827538, 'timestamp': '2025-09-30 22:13:03.008428', 'step': 2206, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:03.080214', 'step': 2206, 'epoch': 3} +{'type': 'loss', 'content': 0.0028198177460581064, 'timestamp': '2025-09-30 22:13:03.083580', 'step': 2207, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:03.148948', 'step': 2207, 'epoch': 3} +{'type': 'loss', 'content': 0.02004883624613285, 'timestamp': '2025-09-30 22:13:03.156280', 'step': 2208, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:03.215571', 'step': 2208, 'epoch': 3} +{'type': 'loss', 'content': 0.010212014429271221, 'timestamp': '2025-09-30 22:13:03.221426', 'step': 2209, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:03.276640', 'step': 2209, 'epoch': 3} +{'type': 'loss', 'content': 0.002033967524766922, 'timestamp': '2025-09-30 22:13:03.283455', 'step': 2210, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:03.343725', 'step': 2210, 'epoch': 3} +{'type': 'loss', 'content': 0.0013138767099007964, 'timestamp': '2025-09-30 22:13:03.347390', 'step': 2211, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:03.413671', 'step': 2211, 'epoch': 3} +{'type': 'loss', 'content': 0.0013939599739387631, 'timestamp': '2025-09-30 22:13:03.432049', 'step': 2212, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:03.486732', 'step': 2212, 'epoch': 3} +{'type': 'loss', 'content': 0.024434858933091164, 'timestamp': '2025-09-30 22:13:03.489344', 'step': 2213, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:03.553859', 'step': 2213, 'epoch': 3} +{'type': 'loss', 'content': 0.008440472185611725, 'timestamp': '2025-09-30 22:13:03.563935', 'step': 2214, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:03.624857', 'step': 2214, 'epoch': 3} +{'type': 'loss', 'content': 0.01175626926124096, 'timestamp': '2025-09-30 22:13:03.628668', 'step': 2215, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:03.696209', 'step': 2215, 'epoch': 3} +{'type': 'loss', 'content': 0.018027642741799355, 'timestamp': '2025-09-30 22:13:03.706990', 'step': 2216, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:03.764429', 'step': 2216, 'epoch': 3} +{'type': 'loss', 'content': 0.008521536365151405, 'timestamp': '2025-09-30 22:13:03.772218', 'step': 2217, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:03.836575', 'step': 2217, 'epoch': 3} +{'type': 'loss', 'content': 0.008888277225196362, 'timestamp': '2025-09-30 22:13:03.840283', 'step': 2218, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:03.902344', 'step': 2218, 'epoch': 3} +{'type': 'loss', 'content': 0.0012383210705593228, 'timestamp': '2025-09-30 22:13:03.910926', 'step': 2219, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:03.977533', 'step': 2219, 'epoch': 3} +{'type': 'loss', 'content': 0.0042413403280079365, 'timestamp': '2025-09-30 22:13:03.984542', 'step': 2220, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:04.040130', 'step': 2220, 'epoch': 3} +{'type': 'loss', 'content': 0.0010714359814301133, 'timestamp': '2025-09-30 22:13:04.042791', 'step': 2221, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:04.107674', 'step': 2221, 'epoch': 3} +{'type': 'loss', 'content': 0.006664037238806486, 'timestamp': '2025-09-30 22:13:04.111000', 'step': 2222, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:04.170928', 'step': 2222, 'epoch': 3} +{'type': 'loss', 'content': 0.0031583583913743496, 'timestamp': '2025-09-30 22:13:04.173623', 'step': 2223, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:13:05.490048', 'step': 2223, 'epoch': 3} +{'type': 'pplx', 'content': 29344370.172395393, 'timestamp': '2025-09-30 22:13:05.492733', 'step': 2223, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:05.546368', 'step': 2223, 'epoch': 3} +{'type': 'loss', 'content': 0.006295633502304554, 'timestamp': '2025-09-30 22:13:05.552730', 'step': 2224, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:05.607930', 'step': 2224, 'epoch': 3} +{'type': 'loss', 'content': 0.003405241295695305, 'timestamp': '2025-09-30 22:13:05.610697', 'step': 2225, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:05.667829', 'step': 2225, 'epoch': 3} +{'type': 'loss', 'content': 0.002446610014885664, 'timestamp': '2025-09-30 22:13:05.673988', 'step': 2226, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:05.729752', 'step': 2226, 'epoch': 3} +{'type': 'loss', 'content': 0.017070619389414787, 'timestamp': '2025-09-30 22:13:05.733291', 'step': 2227, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:13:05.789448', 'step': 2227, 'epoch': 3} +{'type': 'loss', 'content': 0.01944059133529663, 'timestamp': '2025-09-30 22:13:05.796101', 'step': 2228, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:05.852039', 'step': 2228, 'epoch': 3} +{'type': 'loss', 'content': 0.0016683695139363408, 'timestamp': '2025-09-30 22:13:05.857473', 'step': 2229, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:05.925067', 'step': 2229, 'epoch': 3} +{'type': 'loss', 'content': 0.0019555925391614437, 'timestamp': '2025-09-30 22:13:05.929845', 'step': 2230, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:05.989548', 'step': 2230, 'epoch': 3} +{'type': 'loss', 'content': 0.00015620069461874664, 'timestamp': '2025-09-30 22:13:05.991628', 'step': 2231, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:06.052663', 'step': 2231, 'epoch': 3} +{'type': 'loss', 'content': 0.01360904611647129, 'timestamp': '2025-09-30 22:13:06.063942', 'step': 2232, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:06.127300', 'step': 2232, 'epoch': 3} +{'type': 'loss', 'content': 0.0014074391219764948, 'timestamp': '2025-09-30 22:13:06.132756', 'step': 2233, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:06.196856', 'step': 2233, 'epoch': 3} +{'type': 'loss', 'content': 0.002081622602418065, 'timestamp': '2025-09-30 22:13:06.204891', 'step': 2234, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:06.262721', 'step': 2234, 'epoch': 3} +{'type': 'loss', 'content': 0.0013570208102464676, 'timestamp': '2025-09-30 22:13:06.273367', 'step': 2235, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:06.341281', 'step': 2235, 'epoch': 3} +{'type': 'loss', 'content': 0.0063094040378928185, 'timestamp': '2025-09-30 22:13:06.353285', 'step': 2236, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:06.425103', 'step': 2236, 'epoch': 3} +{'type': 'loss', 'content': 0.0011793047888204455, 'timestamp': '2025-09-30 22:13:06.432018', 'step': 2237, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:06.494162', 'step': 2237, 'epoch': 3} +{'type': 'loss', 'content': 0.024860413745045662, 'timestamp': '2025-09-30 22:13:06.499047', 'step': 2238, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:06.571763', 'step': 2238, 'epoch': 3} +{'type': 'loss', 'content': 0.029842732474207878, 'timestamp': '2025-09-30 22:13:06.581059', 'step': 2239, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:06.652497', 'step': 2239, 'epoch': 3} +{'type': 'loss', 'content': 0.0033206476364284754, 'timestamp': '2025-09-30 22:13:06.664027', 'step': 2240, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:06.725036', 'step': 2240, 'epoch': 3} +{'type': 'loss', 'content': 0.0032124409917742014, 'timestamp': '2025-09-30 22:13:06.729256', 'step': 2241, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:06.797711', 'step': 2241, 'epoch': 3} +{'type': 'loss', 'content': 0.0014608422061428428, 'timestamp': '2025-09-30 22:13:06.820254', 'step': 2242, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:06.893327', 'step': 2242, 'epoch': 3} +{'type': 'loss', 'content': 0.0015411525964736938, 'timestamp': '2025-09-30 22:13:06.909198', 'step': 2243, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:06.972246', 'step': 2243, 'epoch': 3} +{'type': 'loss', 'content': 0.00187637226190418, 'timestamp': '2025-09-30 22:13:06.983547', 'step': 2244, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:07.038825', 'step': 2244, 'epoch': 3} +{'type': 'loss', 'content': 0.0047594718635082245, 'timestamp': '2025-09-30 22:13:07.045563', 'step': 2245, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:07.105710', 'step': 2245, 'epoch': 3} +{'type': 'loss', 'content': 0.003375373315066099, 'timestamp': '2025-09-30 22:13:07.108166', 'step': 2246, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:07.162366', 'step': 2246, 'epoch': 3} +{'type': 'loss', 'content': 0.0013589292066171765, 'timestamp': '2025-09-30 22:13:07.165331', 'step': 2247, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:07.220888', 'step': 2247, 'epoch': 3} +{'type': 'loss', 'content': 0.016323648393154144, 'timestamp': '2025-09-30 22:13:07.229246', 'step': 2248, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:07.283426', 'step': 2248, 'epoch': 3} +{'type': 'loss', 'content': 0.006245364900678396, 'timestamp': '2025-09-30 22:13:07.292940', 'step': 2249, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:07.349128', 'step': 2249, 'epoch': 3} +{'type': 'loss', 'content': 0.0017451593885198236, 'timestamp': '2025-09-30 22:13:07.352875', 'step': 2250, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:07.412427', 'step': 2250, 'epoch': 3} +{'type': 'loss', 'content': 0.007775729056447744, 'timestamp': '2025-09-30 22:13:07.423227', 'step': 2251, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:07.480814', 'step': 2251, 'epoch': 3} +{'type': 'loss', 'content': 0.02229670248925686, 'timestamp': '2025-09-30 22:13:07.491348', 'step': 2252, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:07.553274', 'step': 2252, 'epoch': 3} +{'type': 'loss', 'content': 0.0005842237151227891, 'timestamp': '2025-09-30 22:13:07.555760', 'step': 2253, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:07.624606', 'step': 2253, 'epoch': 3} +{'type': 'loss', 'content': 0.0024967549834400415, 'timestamp': '2025-09-30 22:13:07.627473', 'step': 2254, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:07.682185', 'step': 2254, 'epoch': 3} +{'type': 'loss', 'content': 0.00876485276967287, 'timestamp': '2025-09-30 22:13:07.685725', 'step': 2255, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:07.748598', 'step': 2255, 'epoch': 3} +{'type': 'loss', 'content': 0.00036741181975230575, 'timestamp': '2025-09-30 22:13:07.754726', 'step': 2256, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:07.814528', 'step': 2256, 'epoch': 3} +{'type': 'loss', 'content': 0.0012112419353798032, 'timestamp': '2025-09-30 22:13:07.818293', 'step': 2257, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:07.892341', 'step': 2257, 'epoch': 3} +{'type': 'loss', 'content': 0.0117741534486413, 'timestamp': '2025-09-30 22:13:07.898564', 'step': 2258, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:07.956914', 'step': 2258, 'epoch': 3} +{'type': 'loss', 'content': 0.001920665497891605, 'timestamp': '2025-09-30 22:13:07.968074', 'step': 2259, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:08.024271', 'step': 2259, 'epoch': 3} +{'type': 'loss', 'content': 0.0017800560453906655, 'timestamp': '2025-09-30 22:13:08.032844', 'step': 2260, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:08.087032', 'step': 2260, 'epoch': 3} +{'type': 'loss', 'content': 0.007017010357230902, 'timestamp': '2025-09-30 22:13:08.089549', 'step': 2261, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:08.146410', 'step': 2261, 'epoch': 3} +{'type': 'loss', 'content': 0.0003600542258936912, 'timestamp': '2025-09-30 22:13:08.154264', 'step': 2262, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:08.211686', 'step': 2262, 'epoch': 3} +{'type': 'loss', 'content': 0.009401796385645866, 'timestamp': '2025-09-30 22:13:08.214506', 'step': 2263, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:08.270432', 'step': 2263, 'epoch': 3} +{'type': 'loss', 'content': 0.004206747282296419, 'timestamp': '2025-09-30 22:13:08.281054', 'step': 2264, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:08.339524', 'step': 2264, 'epoch': 3} +{'type': 'loss', 'content': 0.00021178685710765421, 'timestamp': '2025-09-30 22:13:08.341888', 'step': 2265, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:08.397581', 'step': 2265, 'epoch': 3} +{'type': 'loss', 'content': 0.00048541155410930514, 'timestamp': '2025-09-30 22:13:08.402602', 'step': 2266, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:08.458086', 'step': 2266, 'epoch': 3} +{'type': 'loss', 'content': 0.003917259629815817, 'timestamp': '2025-09-30 22:13:08.464337', 'step': 2267, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:13:08.523381', 'step': 2267, 'epoch': 3} +{'type': 'loss', 'content': 0.003979508299380541, 'timestamp': '2025-09-30 22:13:08.530734', 'step': 2268, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:08.597171', 'step': 2268, 'epoch': 3} +{'type': 'loss', 'content': 0.0003099239547736943, 'timestamp': '2025-09-30 22:13:08.599264', 'step': 2269, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:08.652680', 'step': 2269, 'epoch': 3} +{'type': 'loss', 'content': 0.000424488156568259, 'timestamp': '2025-09-30 22:13:08.662711', 'step': 2270, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:08.715873', 'step': 2270, 'epoch': 3} +{'type': 'loss', 'content': 0.007150634657591581, 'timestamp': '2025-09-30 22:13:08.718812', 'step': 2271, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:08.783543', 'step': 2271, 'epoch': 3} +{'type': 'loss', 'content': 0.0024335901252925396, 'timestamp': '2025-09-30 22:13:08.790618', 'step': 2272, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:08.846206', 'step': 2272, 'epoch': 3} +{'type': 'loss', 'content': 0.01159227080643177, 'timestamp': '2025-09-30 22:13:08.849451', 'step': 2273, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:08.917310', 'step': 2273, 'epoch': 3} +{'type': 'loss', 'content': 0.005326881073415279, 'timestamp': '2025-09-30 22:13:08.926533', 'step': 2274, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:08.983661', 'step': 2274, 'epoch': 3} +{'type': 'loss', 'content': 0.0012480862205848098, 'timestamp': '2025-09-30 22:13:08.992832', 'step': 2275, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:09.050359', 'step': 2275, 'epoch': 3} +{'type': 'loss', 'content': 0.008051712065935135, 'timestamp': '2025-09-30 22:13:09.056973', 'step': 2276, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:09.120010', 'step': 2276, 'epoch': 3} +{'type': 'loss', 'content': 0.0012699085054919124, 'timestamp': '2025-09-30 22:13:09.125755', 'step': 2277, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:09.180682', 'step': 2277, 'epoch': 3} +{'type': 'loss', 'content': 0.006048406939953566, 'timestamp': '2025-09-30 22:13:09.183726', 'step': 2278, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:09.250575', 'step': 2278, 'epoch': 3} +{'type': 'loss', 'content': 0.004453370813280344, 'timestamp': '2025-09-30 22:13:09.253346', 'step': 2279, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:09.308955', 'step': 2279, 'epoch': 3} +{'type': 'loss', 'content': 0.002166403690353036, 'timestamp': '2025-09-30 22:13:09.315558', 'step': 2280, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:13:10.738278', 'step': 2280, 'epoch': 3} +{'type': 'pplx', 'content': 31190960.103936635, 'timestamp': '2025-09-30 22:13:10.740643', 'step': 2280, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:10.792576', 'step': 2280, 'epoch': 3} +{'type': 'loss', 'content': 0.00041843479266390204, 'timestamp': '2025-09-30 22:13:10.795117', 'step': 2281, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:10.859013', 'step': 2281, 'epoch': 3} +{'type': 'loss', 'content': 0.006251118145883083, 'timestamp': '2025-09-30 22:13:10.862684', 'step': 2282, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:10.926799', 'step': 2282, 'epoch': 3} +{'type': 'loss', 'content': 0.0009974894346669316, 'timestamp': '2025-09-30 22:13:10.929266', 'step': 2283, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:10.989116', 'step': 2283, 'epoch': 3} +{'type': 'loss', 'content': 0.0032588716130703688, 'timestamp': '2025-09-30 22:13:10.997818', 'step': 2284, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:11.061570', 'step': 2284, 'epoch': 3} +{'type': 'loss', 'content': 0.005364909302443266, 'timestamp': '2025-09-30 22:13:11.064019', 'step': 2285, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:11.122069', 'step': 2285, 'epoch': 3} +{'type': 'loss', 'content': 8.350759890163317e-05, 'timestamp': '2025-09-30 22:13:11.124572', 'step': 2286, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:11.190995', 'step': 2286, 'epoch': 3} +{'type': 'loss', 'content': 0.0043342201970517635, 'timestamp': '2025-09-30 22:13:11.195044', 'step': 2287, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:13:11.268671', 'step': 2287, 'epoch': 3} +{'type': 'loss', 'content': 0.0011432368773967028, 'timestamp': '2025-09-30 22:13:11.275236', 'step': 2288, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:11.339399', 'step': 2288, 'epoch': 3} +{'type': 'loss', 'content': 0.0010339574655517936, 'timestamp': '2025-09-30 22:13:11.342362', 'step': 2289, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:11.403531', 'step': 2289, 'epoch': 3} +{'type': 'loss', 'content': 0.0022677092347294092, 'timestamp': '2025-09-30 22:13:11.409800', 'step': 2290, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:11.472241', 'step': 2290, 'epoch': 3} +{'type': 'loss', 'content': 0.004064720589667559, 'timestamp': '2025-09-30 22:13:11.474953', 'step': 2291, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:11.536884', 'step': 2291, 'epoch': 3} +{'type': 'loss', 'content': 0.027135172858834267, 'timestamp': '2025-09-30 22:13:11.543184', 'step': 2292, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:11.602293', 'step': 2292, 'epoch': 3} +{'type': 'loss', 'content': 0.0008837772184051573, 'timestamp': '2025-09-30 22:13:11.608386', 'step': 2293, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:11.665675', 'step': 2293, 'epoch': 3} +{'type': 'loss', 'content': 0.011004587635397911, 'timestamp': '2025-09-30 22:13:11.668035', 'step': 2294, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:11.727743', 'step': 2294, 'epoch': 3} +{'type': 'loss', 'content': 0.004693236667662859, 'timestamp': '2025-09-30 22:13:11.730020', 'step': 2295, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:11.788422', 'step': 2295, 'epoch': 3} +{'type': 'loss', 'content': 0.0006029639625921845, 'timestamp': '2025-09-30 22:13:11.798640', 'step': 2296, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:11.854358', 'step': 2296, 'epoch': 3} +{'type': 'loss', 'content': 0.001073610968887806, 'timestamp': '2025-09-30 22:13:11.856714', 'step': 2297, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:11.915935', 'step': 2297, 'epoch': 3} +{'type': 'loss', 'content': 0.0027264878153800964, 'timestamp': '2025-09-30 22:13:11.920656', 'step': 2298, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:11.980816', 'step': 2298, 'epoch': 3} +{'type': 'loss', 'content': 0.0007334630936384201, 'timestamp': '2025-09-30 22:13:11.985247', 'step': 2299, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:12.059254', 'step': 2299, 'epoch': 3} +{'type': 'loss', 'content': 0.0011301154736429453, 'timestamp': '2025-09-30 22:13:12.065752', 'step': 2300, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:12.123890', 'step': 2300, 'epoch': 3} +{'type': 'loss', 'content': 0.00015225273091346025, 'timestamp': '2025-09-30 22:13:12.128224', 'step': 2301, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:12.196012', 'step': 2301, 'epoch': 3} +{'type': 'loss', 'content': 0.000535293947905302, 'timestamp': '2025-09-30 22:13:12.198299', 'step': 2302, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:12.252848', 'step': 2302, 'epoch': 3} +{'type': 'loss', 'content': 0.004955708514899015, 'timestamp': '2025-09-30 22:13:12.260839', 'step': 2303, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:12.319753', 'step': 2303, 'epoch': 3} +{'type': 'loss', 'content': 0.00846810918301344, 'timestamp': '2025-09-30 22:13:12.328509', 'step': 2304, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:12.383083', 'step': 2304, 'epoch': 3} +{'type': 'loss', 'content': 6.449552165577188e-05, 'timestamp': '2025-09-30 22:13:12.388975', 'step': 2305, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:12.465214', 'step': 2305, 'epoch': 3} +{'type': 'loss', 'content': 0.004094495438039303, 'timestamp': '2025-09-30 22:13:12.468338', 'step': 2306, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:12.524076', 'step': 2306, 'epoch': 3} +{'type': 'loss', 'content': 0.0043173558078706264, 'timestamp': '2025-09-30 22:13:12.531143', 'step': 2307, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:12.586832', 'step': 2307, 'epoch': 3} +{'type': 'loss', 'content': 0.013621537014842033, 'timestamp': '2025-09-30 22:13:12.593105', 'step': 2308, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:12.647208', 'step': 2308, 'epoch': 3} +{'type': 'loss', 'content': 0.00045176432467997074, 'timestamp': '2025-09-30 22:13:12.651462', 'step': 2309, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:12.706835', 'step': 2309, 'epoch': 3} +{'type': 'loss', 'content': 0.00153114995919168, 'timestamp': '2025-09-30 22:13:12.709825', 'step': 2310, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:12.768571', 'step': 2310, 'epoch': 3} +{'type': 'loss', 'content': 0.0002749506966210902, 'timestamp': '2025-09-30 22:13:12.773973', 'step': 2311, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:12.830788', 'step': 2311, 'epoch': 3} +{'type': 'loss', 'content': 0.0001482378429500386, 'timestamp': '2025-09-30 22:13:12.836862', 'step': 2312, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:12.899423', 'step': 2312, 'epoch': 3} +{'type': 'loss', 'content': 0.0001635663356864825, 'timestamp': '2025-09-30 22:13:12.905845', 'step': 2313, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:12.972178', 'step': 2313, 'epoch': 3} +{'type': 'loss', 'content': 2.553769627411384e-05, 'timestamp': '2025-09-30 22:13:12.974513', 'step': 2314, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:13.030917', 'step': 2314, 'epoch': 3} +{'type': 'loss', 'content': 0.014685734175145626, 'timestamp': '2025-09-30 22:13:13.033978', 'step': 2315, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:13.092800', 'step': 2315, 'epoch': 3} +{'type': 'loss', 'content': 7.750854274490848e-05, 'timestamp': '2025-09-30 22:13:13.105173', 'step': 2316, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:13.160613', 'step': 2316, 'epoch': 3} +{'type': 'loss', 'content': 0.0010388302616775036, 'timestamp': '2025-09-30 22:13:13.164054', 'step': 2317, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:13.231936', 'step': 2317, 'epoch': 3} +{'type': 'loss', 'content': 0.0024059859570115805, 'timestamp': '2025-09-30 22:13:13.239512', 'step': 2318, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:13.295025', 'step': 2318, 'epoch': 3} +{'type': 'loss', 'content': 0.00040224703843705356, 'timestamp': '2025-09-30 22:13:13.304151', 'step': 2319, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:13.366702', 'step': 2319, 'epoch': 3} +{'type': 'loss', 'content': 0.000600070517975837, 'timestamp': '2025-09-30 22:13:13.372841', 'step': 2320, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:13.428888', 'step': 2320, 'epoch': 3} +{'type': 'loss', 'content': 0.043553676456213, 'timestamp': '2025-09-30 22:13:13.432384', 'step': 2321, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:13.495898', 'step': 2321, 'epoch': 3} +{'type': 'loss', 'content': 0.01901986077427864, 'timestamp': '2025-09-30 22:13:13.498854', 'step': 2322, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:13.566548', 'step': 2322, 'epoch': 3} +{'type': 'loss', 'content': 0.020047681406140327, 'timestamp': '2025-09-30 22:13:13.569703', 'step': 2323, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:13.625179', 'step': 2323, 'epoch': 3} +{'type': 'loss', 'content': 0.00016043984214775264, 'timestamp': '2025-09-30 22:13:13.632855', 'step': 2324, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:13.696114', 'step': 2324, 'epoch': 3} +{'type': 'loss', 'content': 0.012856409884989262, 'timestamp': '2025-09-30 22:13:13.699713', 'step': 2325, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:13.762802', 'step': 2325, 'epoch': 3} +{'type': 'loss', 'content': 0.004349694121629, 'timestamp': '2025-09-30 22:13:13.765900', 'step': 2326, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:13.825731', 'step': 2326, 'epoch': 3} +{'type': 'loss', 'content': 0.0007128869765438139, 'timestamp': '2025-09-30 22:13:13.829485', 'step': 2327, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:13.886792', 'step': 2327, 'epoch': 3} +{'type': 'loss', 'content': 0.006236379034817219, 'timestamp': '2025-09-30 22:13:13.901117', 'step': 2328, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:13.960939', 'step': 2328, 'epoch': 3} +{'type': 'loss', 'content': 0.0003488468355499208, 'timestamp': '2025-09-30 22:13:13.964455', 'step': 2329, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:14.036143', 'step': 2329, 'epoch': 3} +{'type': 'loss', 'content': 0.00010532321175560355, 'timestamp': '2025-09-30 22:13:14.039724', 'step': 2330, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:14.099722', 'step': 2330, 'epoch': 3} +{'type': 'loss', 'content': 0.03490881994366646, 'timestamp': '2025-09-30 22:13:14.103491', 'step': 2331, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:14.168611', 'step': 2331, 'epoch': 3} +{'type': 'loss', 'content': 0.013101531192660332, 'timestamp': '2025-09-30 22:13:14.179820', 'step': 2332, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:14.240568', 'step': 2332, 'epoch': 3} +{'type': 'loss', 'content': 0.0005494463839568198, 'timestamp': '2025-09-30 22:13:14.244365', 'step': 2333, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:14.304060', 'step': 2333, 'epoch': 3} +{'type': 'loss', 'content': 0.0002779501664917916, 'timestamp': '2025-09-30 22:13:14.315947', 'step': 2334, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:14.375221', 'step': 2334, 'epoch': 3} +{'type': 'loss', 'content': 0.0017243318725377321, 'timestamp': '2025-09-30 22:13:14.378038', 'step': 2335, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:14.440264', 'step': 2335, 'epoch': 3} +{'type': 'loss', 'content': 0.0026360393967479467, 'timestamp': '2025-09-30 22:13:14.452840', 'step': 2336, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:14.513305', 'step': 2336, 'epoch': 3} +{'type': 'loss', 'content': 0.0022121057845652103, 'timestamp': '2025-09-30 22:13:14.521054', 'step': 2337, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:13:15.906861', 'step': 2337, 'epoch': 3} +{'type': 'pplx', 'content': 35685924.762230136, 'timestamp': '2025-09-30 22:13:15.911272', 'step': 2337, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:15.969547', 'step': 2337, 'epoch': 3} +{'type': 'loss', 'content': 0.0001224317093146965, 'timestamp': '2025-09-30 22:13:15.973320', 'step': 2338, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:16.041128', 'step': 2338, 'epoch': 3} +{'type': 'loss', 'content': 0.0006064993212930858, 'timestamp': '2025-09-30 22:13:16.044973', 'step': 2339, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:16.100730', 'step': 2339, 'epoch': 3} +{'type': 'loss', 'content': 0.003981334622949362, 'timestamp': '2025-09-30 22:13:16.107765', 'step': 2340, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:16.166753', 'step': 2340, 'epoch': 3} +{'type': 'loss', 'content': 0.00029052264289930463, 'timestamp': '2025-09-30 22:13:16.171192', 'step': 2341, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:16.226872', 'step': 2341, 'epoch': 3} +{'type': 'loss', 'content': 0.010386792942881584, 'timestamp': '2025-09-30 22:13:16.230245', 'step': 2342, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:16.294054', 'step': 2342, 'epoch': 3} +{'type': 'loss', 'content': 5.578704076469876e-05, 'timestamp': '2025-09-30 22:13:16.297024', 'step': 2343, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:16.356213', 'step': 2343, 'epoch': 3} +{'type': 'loss', 'content': 0.0007381403702311218, 'timestamp': '2025-09-30 22:13:16.369322', 'step': 2344, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:16.426141', 'step': 2344, 'epoch': 3} +{'type': 'loss', 'content': 0.03852478787302971, 'timestamp': '2025-09-30 22:13:16.430172', 'step': 2345, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:16.489012', 'step': 2345, 'epoch': 3} +{'type': 'loss', 'content': 0.0006424127495847642, 'timestamp': '2025-09-30 22:13:16.499829', 'step': 2346, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:16.556480', 'step': 2346, 'epoch': 3} +{'type': 'loss', 'content': 9.567866800352931e-05, 'timestamp': '2025-09-30 22:13:16.560414', 'step': 2347, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:16.615496', 'step': 2347, 'epoch': 3} +{'type': 'loss', 'content': 0.009399794973433018, 'timestamp': '2025-09-30 22:13:16.630284', 'step': 2348, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:16.693055', 'step': 2348, 'epoch': 3} +{'type': 'loss', 'content': 0.0003211422299500555, 'timestamp': '2025-09-30 22:13:16.697092', 'step': 2349, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:16.757223', 'step': 2349, 'epoch': 3} +{'type': 'loss', 'content': 0.0036444012075662613, 'timestamp': '2025-09-30 22:13:16.762122', 'step': 2350, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:16.821943', 'step': 2350, 'epoch': 3} +{'type': 'loss', 'content': 0.026028577238321304, 'timestamp': '2025-09-30 22:13:16.832804', 'step': 2351, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:16.901109', 'step': 2351, 'epoch': 3} +{'type': 'loss', 'content': 0.0048171780072152615, 'timestamp': '2025-09-30 22:13:16.908059', 'step': 2352, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:16.962841', 'step': 2352, 'epoch': 3} +{'type': 'loss', 'content': 3.750595715246163e-05, 'timestamp': '2025-09-30 22:13:16.965327', 'step': 2353, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:17.025383', 'step': 2353, 'epoch': 3} +{'type': 'loss', 'content': 0.008849648758769035, 'timestamp': '2025-09-30 22:13:17.028464', 'step': 2354, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:17.092237', 'step': 2354, 'epoch': 3} +{'type': 'loss', 'content': 6.525561184389517e-05, 'timestamp': '2025-09-30 22:13:17.100739', 'step': 2355, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:17.163544', 'step': 2355, 'epoch': 3} +{'type': 'loss', 'content': 9.634840534999967e-05, 'timestamp': '2025-09-30 22:13:17.171199', 'step': 2356, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:17.228897', 'step': 2356, 'epoch': 3} +{'type': 'loss', 'content': 0.0001659039407968521, 'timestamp': '2025-09-30 22:13:17.233707', 'step': 2357, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:17.301413', 'step': 2357, 'epoch': 3} +{'type': 'loss', 'content': 8.300376066472381e-05, 'timestamp': '2025-09-30 22:13:17.306270', 'step': 2358, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:17.365873', 'step': 2358, 'epoch': 3} +{'type': 'loss', 'content': 0.0006306396098807454, 'timestamp': '2025-09-30 22:13:17.369232', 'step': 2359, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:17.436726', 'step': 2359, 'epoch': 3} +{'type': 'loss', 'content': 0.012602239847183228, 'timestamp': '2025-09-30 22:13:17.443521', 'step': 2360, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:17.504255', 'step': 2360, 'epoch': 3} +{'type': 'loss', 'content': 6.385560845956206e-05, 'timestamp': '2025-09-30 22:13:17.513534', 'step': 2361, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:17.570422', 'step': 2361, 'epoch': 3} +{'type': 'loss', 'content': 0.0011304274667054415, 'timestamp': '2025-09-30 22:13:17.573485', 'step': 2362, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:17.631234', 'step': 2362, 'epoch': 3} +{'type': 'loss', 'content': 0.0038645591121166945, 'timestamp': '2025-09-30 22:13:17.634878', 'step': 2363, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:17.691433', 'step': 2363, 'epoch': 3} +{'type': 'loss', 'content': 0.007124242372810841, 'timestamp': '2025-09-30 22:13:17.704027', 'step': 2364, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:17.764565', 'step': 2364, 'epoch': 3} +{'type': 'loss', 'content': 0.0018359085079282522, 'timestamp': '2025-09-30 22:13:17.773024', 'step': 2365, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:17.833422', 'step': 2365, 'epoch': 3} +{'type': 'loss', 'content': 0.014947568997740746, 'timestamp': '2025-09-30 22:13:17.835913', 'step': 2366, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:17.893406', 'step': 2366, 'epoch': 3} +{'type': 'loss', 'content': 0.00046386828762479126, 'timestamp': '2025-09-30 22:13:17.896906', 'step': 2367, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:17.960708', 'step': 2367, 'epoch': 3} +{'type': 'loss', 'content': 0.008881064131855965, 'timestamp': '2025-09-30 22:13:17.968071', 'step': 2368, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:18.023659', 'step': 2368, 'epoch': 3} +{'type': 'loss', 'content': 0.0001027476173476316, 'timestamp': '2025-09-30 22:13:18.026536', 'step': 2369, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:18.086101', 'step': 2369, 'epoch': 3} +{'type': 'loss', 'content': 5.726577728637494e-05, 'timestamp': '2025-09-30 22:13:18.089037', 'step': 2370, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:18.152428', 'step': 2370, 'epoch': 3} +{'type': 'loss', 'content': 0.0024566694628447294, 'timestamp': '2025-09-30 22:13:18.155698', 'step': 2371, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:18.215358', 'step': 2371, 'epoch': 3} +{'type': 'loss', 'content': 0.002534597646445036, 'timestamp': '2025-09-30 22:13:18.222732', 'step': 2372, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:18.287719', 'step': 2372, 'epoch': 3} +{'type': 'loss', 'content': 0.04354690760374069, 'timestamp': '2025-09-30 22:13:18.292228', 'step': 2373, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:18.348467', 'step': 2373, 'epoch': 3} +{'type': 'loss', 'content': 0.017217475920915604, 'timestamp': '2025-09-30 22:13:18.352618', 'step': 2374, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:18.411457', 'step': 2374, 'epoch': 3} +{'type': 'loss', 'content': 0.007565335836261511, 'timestamp': '2025-09-30 22:13:18.415699', 'step': 2375, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:18.472300', 'step': 2375, 'epoch': 3} +{'type': 'loss', 'content': 0.026072219014167786, 'timestamp': '2025-09-30 22:13:18.480814', 'step': 2376, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:18.548341', 'step': 2376, 'epoch': 3} +{'type': 'loss', 'content': 0.0003426198090892285, 'timestamp': '2025-09-30 22:13:18.552600', 'step': 2377, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:18.609365', 'step': 2377, 'epoch': 3} +{'type': 'loss', 'content': 0.07469062507152557, 'timestamp': '2025-09-30 22:13:18.613435', 'step': 2378, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:18.677596', 'step': 2378, 'epoch': 3} +{'type': 'loss', 'content': 0.023098181933164597, 'timestamp': '2025-09-30 22:13:18.680446', 'step': 2379, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:18.742608', 'step': 2379, 'epoch': 3} +{'type': 'loss', 'content': 0.0065413895063102245, 'timestamp': '2025-09-30 22:13:18.749044', 'step': 2380, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:18.804305', 'step': 2380, 'epoch': 3} +{'type': 'loss', 'content': 0.009016537107527256, 'timestamp': '2025-09-30 22:13:18.808806', 'step': 2381, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:18.872537', 'step': 2381, 'epoch': 3} +{'type': 'loss', 'content': 0.005769102368503809, 'timestamp': '2025-09-30 22:13:18.883286', 'step': 2382, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:18.939108', 'step': 2382, 'epoch': 3} +{'type': 'loss', 'content': 0.03586021065711975, 'timestamp': '2025-09-30 22:13:18.943050', 'step': 2383, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:18.999422', 'step': 2383, 'epoch': 3} +{'type': 'loss', 'content': 0.032269734889268875, 'timestamp': '2025-09-30 22:13:19.006351', 'step': 2384, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:19.069159', 'step': 2384, 'epoch': 3} +{'type': 'loss', 'content': 0.00165548047516495, 'timestamp': '2025-09-30 22:13:19.072039', 'step': 2385, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:19.131862', 'step': 2385, 'epoch': 3} +{'type': 'loss', 'content': 0.0024231227580457926, 'timestamp': '2025-09-30 22:13:19.134492', 'step': 2386, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:19.198543', 'step': 2386, 'epoch': 3} +{'type': 'loss', 'content': 0.04199283942580223, 'timestamp': '2025-09-30 22:13:19.201655', 'step': 2387, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:19.257463', 'step': 2387, 'epoch': 3} +{'type': 'loss', 'content': 0.03284038230776787, 'timestamp': '2025-09-30 22:13:19.273136', 'step': 2388, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:19.330743', 'step': 2388, 'epoch': 3} +{'type': 'loss', 'content': 0.0017338943434879184, 'timestamp': '2025-09-30 22:13:19.333819', 'step': 2389, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:19.390251', 'step': 2389, 'epoch': 3} +{'type': 'loss', 'content': 0.004067179746925831, 'timestamp': '2025-09-30 22:13:19.407477', 'step': 2390, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:19.473602', 'step': 2390, 'epoch': 3} +{'type': 'loss', 'content': 0.002591915661469102, 'timestamp': '2025-09-30 22:13:19.482602', 'step': 2391, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:19.543242', 'step': 2391, 'epoch': 3} +{'type': 'loss', 'content': 0.006193013396114111, 'timestamp': '2025-09-30 22:13:19.550256', 'step': 2392, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:19.613204', 'step': 2392, 'epoch': 3} +{'type': 'loss', 'content': 0.018228931352496147, 'timestamp': '2025-09-30 22:13:19.616517', 'step': 2393, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:19.673910', 'step': 2393, 'epoch': 3} +{'type': 'loss', 'content': 0.0011682414915412664, 'timestamp': '2025-09-30 22:13:19.676994', 'step': 2394, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:13:21.124634', 'step': 2394, 'epoch': 3} +{'type': 'pplx', 'content': 32188811.96573708, 'timestamp': '2025-09-30 22:13:21.128270', 'step': 2394, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:21.183488', 'step': 2394, 'epoch': 3} +{'type': 'loss', 'content': 0.005388319492340088, 'timestamp': '2025-09-30 22:13:21.187245', 'step': 2395, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:21.247593', 'step': 2395, 'epoch': 3} +{'type': 'loss', 'content': 0.013586322776973248, 'timestamp': '2025-09-30 22:13:21.254124', 'step': 2396, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:21.312845', 'step': 2396, 'epoch': 3} +{'type': 'loss', 'content': 0.006401594262570143, 'timestamp': '2025-09-30 22:13:21.321760', 'step': 2397, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:21.379109', 'step': 2397, 'epoch': 3} +{'type': 'loss', 'content': 0.009359225630760193, 'timestamp': '2025-09-30 22:13:21.381831', 'step': 2398, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:21.444723', 'step': 2398, 'epoch': 3} +{'type': 'loss', 'content': 0.021020669490098953, 'timestamp': '2025-09-30 22:13:21.448756', 'step': 2399, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:21.512288', 'step': 2399, 'epoch': 3} +{'type': 'loss', 'content': 0.012792134657502174, 'timestamp': '2025-09-30 22:13:21.518632', 'step': 2400, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:21.573224', 'step': 2400, 'epoch': 3} +{'type': 'loss', 'content': 0.004996354691684246, 'timestamp': '2025-09-30 22:13:21.575642', 'step': 2401, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:13:21.637750', 'step': 2401, 'epoch': 3} +{'type': 'loss', 'content': 0.029606416821479797, 'timestamp': '2025-09-30 22:13:21.642603', 'step': 2402, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:21.700319', 'step': 2402, 'epoch': 3} +{'type': 'loss', 'content': 0.006963818334043026, 'timestamp': '2025-09-30 22:13:21.703901', 'step': 2403, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:21.761290', 'step': 2403, 'epoch': 3} +{'type': 'loss', 'content': 0.0031231886241585016, 'timestamp': '2025-09-30 22:13:21.768141', 'step': 2404, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:21.832875', 'step': 2404, 'epoch': 3} +{'type': 'loss', 'content': 0.005416753236204386, 'timestamp': '2025-09-30 22:13:21.836985', 'step': 2405, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:21.901336', 'step': 2405, 'epoch': 3} +{'type': 'loss', 'content': 0.007782486267387867, 'timestamp': '2025-09-30 22:13:21.914547', 'step': 2406, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:21.973355', 'step': 2406, 'epoch': 3} +{'type': 'loss', 'content': 0.013428935781121254, 'timestamp': '2025-09-30 22:13:21.976497', 'step': 2407, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:22.032826', 'step': 2407, 'epoch': 3} +{'type': 'loss', 'content': 0.016297975555062294, 'timestamp': '2025-09-30 22:13:22.040439', 'step': 2408, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:22.100144', 'step': 2408, 'epoch': 3} +{'type': 'loss', 'content': 0.00910912174731493, 'timestamp': '2025-09-30 22:13:22.105263', 'step': 2409, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:22.160966', 'step': 2409, 'epoch': 3} +{'type': 'loss', 'content': 0.011663010343909264, 'timestamp': '2025-09-30 22:13:22.170743', 'step': 2410, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:22.228090', 'step': 2410, 'epoch': 3} +{'type': 'loss', 'content': 0.010669506154954433, 'timestamp': '2025-09-30 22:13:22.230710', 'step': 2411, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:13:22.289260', 'step': 2411, 'epoch': 3} +{'type': 'loss', 'content': 0.009635014459490776, 'timestamp': '2025-09-30 22:13:22.295350', 'step': 2412, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:22.355178', 'step': 2412, 'epoch': 3} +{'type': 'loss', 'content': 0.006291415076702833, 'timestamp': '2025-09-30 22:13:22.358455', 'step': 2413, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:22.416008', 'step': 2413, 'epoch': 3} +{'type': 'loss', 'content': 0.005919110961258411, 'timestamp': '2025-09-30 22:13:22.424105', 'step': 2414, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:22.480075', 'step': 2414, 'epoch': 3} +{'type': 'loss', 'content': 0.009971419349312782, 'timestamp': '2025-09-30 22:13:22.482487', 'step': 2415, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:22.538731', 'step': 2415, 'epoch': 3} +{'type': 'loss', 'content': 0.006832667160779238, 'timestamp': '2025-09-30 22:13:22.545680', 'step': 2416, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:22.601841', 'step': 2416, 'epoch': 3} +{'type': 'loss', 'content': 0.012653900310397148, 'timestamp': '2025-09-30 22:13:22.604658', 'step': 2417, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:22.663734', 'step': 2417, 'epoch': 3} +{'type': 'loss', 'content': 0.009062431752681732, 'timestamp': '2025-09-30 22:13:22.667671', 'step': 2418, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:22.732330', 'step': 2418, 'epoch': 3} +{'type': 'loss', 'content': 0.0064752777107059956, 'timestamp': '2025-09-30 22:13:22.737635', 'step': 2419, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:22.798336', 'step': 2419, 'epoch': 3} +{'type': 'loss', 'content': 0.003598024370148778, 'timestamp': '2025-09-30 22:13:22.806812', 'step': 2420, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:22.865289', 'step': 2420, 'epoch': 3} +{'type': 'loss', 'content': 0.002874630270525813, 'timestamp': '2025-09-30 22:13:22.869382', 'step': 2421, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:22.925654', 'step': 2421, 'epoch': 3} +{'type': 'loss', 'content': 0.03282652422785759, 'timestamp': '2025-09-30 22:13:22.929496', 'step': 2422, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:22.984177', 'step': 2422, 'epoch': 3} +{'type': 'loss', 'content': 0.013241315260529518, 'timestamp': '2025-09-30 22:13:22.987735', 'step': 2423, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:23.051284', 'step': 2423, 'epoch': 3} +{'type': 'loss', 'content': 0.014345109462738037, 'timestamp': '2025-09-30 22:13:23.059332', 'step': 2424, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-30 22:13:23.121038', 'step': 2424, 'epoch': 3} +{'type': 'loss', 'content': 0.015767786651849747, 'timestamp': '2025-09-30 22:13:23.123336', 'step': 2425, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:23.184027', 'step': 2425, 'epoch': 3} +{'type': 'loss', 'content': 0.004914857912808657, 'timestamp': '2025-09-30 22:13:23.190312', 'step': 2426, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:23.247883', 'step': 2426, 'epoch': 3} +{'type': 'loss', 'content': 0.021198097616434097, 'timestamp': '2025-09-30 22:13:23.252020', 'step': 2427, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:23.314039', 'step': 2427, 'epoch': 3} +{'type': 'loss', 'content': 0.000634256808552891, 'timestamp': '2025-09-30 22:13:23.320233', 'step': 2428, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:23.373727', 'step': 2428, 'epoch': 3} +{'type': 'loss', 'content': 0.010591315105557442, 'timestamp': '2025-09-30 22:13:23.376300', 'step': 2429, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:23.434026', 'step': 2429, 'epoch': 3} +{'type': 'loss', 'content': 0.019355354830622673, 'timestamp': '2025-09-30 22:13:23.443163', 'step': 2430, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:23.505531', 'step': 2430, 'epoch': 3} +{'type': 'loss', 'content': 0.01241142489016056, 'timestamp': '2025-09-30 22:13:23.511100', 'step': 2431, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:23.567662', 'step': 2431, 'epoch': 3} +{'type': 'loss', 'content': 0.009565332904458046, 'timestamp': '2025-09-30 22:13:23.575405', 'step': 2432, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:23.632837', 'step': 2432, 'epoch': 3} +{'type': 'loss', 'content': 0.025015641003847122, 'timestamp': '2025-09-30 22:13:23.635530', 'step': 2433, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:23.697422', 'step': 2433, 'epoch': 3} +{'type': 'loss', 'content': 0.006486161146312952, 'timestamp': '2025-09-30 22:13:23.709455', 'step': 2434, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:23.771333', 'step': 2434, 'epoch': 3} +{'type': 'loss', 'content': 0.011576437391340733, 'timestamp': '2025-09-30 22:13:23.776736', 'step': 2435, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:23.837294', 'step': 2435, 'epoch': 3} +{'type': 'loss', 'content': 0.015482393093407154, 'timestamp': '2025-09-30 22:13:23.844023', 'step': 2436, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:23.899133', 'step': 2436, 'epoch': 3} +{'type': 'loss', 'content': 0.003182504326105118, 'timestamp': '2025-09-30 22:13:23.904724', 'step': 2437, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:23.964847', 'step': 2437, 'epoch': 3} +{'type': 'loss', 'content': 0.011627678759396076, 'timestamp': '2025-09-30 22:13:23.968295', 'step': 2438, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:24.025865', 'step': 2438, 'epoch': 3} +{'type': 'loss', 'content': 0.0038793303538113832, 'timestamp': '2025-09-30 22:13:24.032278', 'step': 2439, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:24.096569', 'step': 2439, 'epoch': 3} +{'type': 'loss', 'content': 0.005496421363204718, 'timestamp': '2025-09-30 22:13:24.108928', 'step': 2440, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:24.169991', 'step': 2440, 'epoch': 3} +{'type': 'loss', 'content': 0.012969347648322582, 'timestamp': '2025-09-30 22:13:24.176292', 'step': 2441, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:24.237921', 'step': 2441, 'epoch': 3} +{'type': 'loss', 'content': 0.005602761637419462, 'timestamp': '2025-09-30 22:13:24.242046', 'step': 2442, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:24.305589', 'step': 2442, 'epoch': 3} +{'type': 'loss', 'content': 0.00627124821767211, 'timestamp': '2025-09-30 22:13:24.309441', 'step': 2443, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:24.369126', 'step': 2443, 'epoch': 3} +{'type': 'loss', 'content': 0.01086896751075983, 'timestamp': '2025-09-30 22:13:24.377066', 'step': 2444, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:24.430661', 'step': 2444, 'epoch': 3} +{'type': 'loss', 'content': 0.026062259450554848, 'timestamp': '2025-09-30 22:13:24.434410', 'step': 2445, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:24.496901', 'step': 2445, 'epoch': 3} +{'type': 'loss', 'content': 0.012748646549880505, 'timestamp': '2025-09-30 22:13:24.499871', 'step': 2446, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:24.560400', 'step': 2446, 'epoch': 3} +{'type': 'loss', 'content': 0.018161652609705925, 'timestamp': '2025-09-30 22:13:24.567688', 'step': 2447, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:24.621315', 'step': 2447, 'epoch': 3} +{'type': 'loss', 'content': 0.006955179385840893, 'timestamp': '2025-09-30 22:13:24.627375', 'step': 2448, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:24.684306', 'step': 2448, 'epoch': 3} +{'type': 'loss', 'content': 0.011628582142293453, 'timestamp': '2025-09-30 22:13:24.688144', 'step': 2449, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:24.748192', 'step': 2449, 'epoch': 3} +{'type': 'loss', 'content': 0.00847693346440792, 'timestamp': '2025-09-30 22:13:24.751297', 'step': 2450, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:24.811240', 'step': 2450, 'epoch': 3} +{'type': 'loss', 'content': 0.017132606357336044, 'timestamp': '2025-09-30 22:13:24.814566', 'step': 2451, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:13:26.264249', 'step': 2451, 'epoch': 3} +{'type': 'pplx', 'content': 28315239.44436738, 'timestamp': '2025-09-30 22:13:26.268190', 'step': 2451, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:26.323933', 'step': 2451, 'epoch': 3} +{'type': 'loss', 'content': 0.03608303517103195, 'timestamp': '2025-09-30 22:13:26.329858', 'step': 2452, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:26.390834', 'step': 2452, 'epoch': 3} +{'type': 'loss', 'content': 0.005534134339541197, 'timestamp': '2025-09-30 22:13:26.397219', 'step': 2453, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:26.465956', 'step': 2453, 'epoch': 3} +{'type': 'loss', 'content': 0.003311963053420186, 'timestamp': '2025-09-30 22:13:26.470711', 'step': 2454, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:26.534839', 'step': 2454, 'epoch': 3} +{'type': 'loss', 'content': 0.03324703872203827, 'timestamp': '2025-09-30 22:13:26.539042', 'step': 2455, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:26.598664', 'step': 2455, 'epoch': 3} +{'type': 'loss', 'content': 0.0027091566007584333, 'timestamp': '2025-09-30 22:13:26.606372', 'step': 2456, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:26.666515', 'step': 2456, 'epoch': 3} +{'type': 'loss', 'content': 0.012049831449985504, 'timestamp': '2025-09-30 22:13:26.669406', 'step': 2457, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:26.725248', 'step': 2457, 'epoch': 3} +{'type': 'loss', 'content': 0.007934476248919964, 'timestamp': '2025-09-30 22:13:26.728122', 'step': 2458, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:26.784509', 'step': 2458, 'epoch': 3} +{'type': 'loss', 'content': 0.0067762285470962524, 'timestamp': '2025-09-30 22:13:26.787065', 'step': 2459, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:26.846731', 'step': 2459, 'epoch': 3} +{'type': 'loss', 'content': 0.0006125275394879282, 'timestamp': '2025-09-30 22:13:26.852534', 'step': 2460, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:26.909867', 'step': 2460, 'epoch': 3} +{'type': 'loss', 'content': 0.001969063188880682, 'timestamp': '2025-09-30 22:13:26.919777', 'step': 2461, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:26.982301', 'step': 2461, 'epoch': 3} +{'type': 'loss', 'content': 0.01936742290854454, 'timestamp': '2025-09-30 22:13:26.985867', 'step': 2462, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:27.056042', 'step': 2462, 'epoch': 3} +{'type': 'loss', 'content': 0.02352396585047245, 'timestamp': '2025-09-30 22:13:27.062482', 'step': 2463, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:27.118380', 'step': 2463, 'epoch': 3} +{'type': 'loss', 'content': 0.021059229969978333, 'timestamp': '2025-09-30 22:13:27.124270', 'step': 2464, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:27.185631', 'step': 2464, 'epoch': 3} +{'type': 'loss', 'content': 0.0014644553884863853, 'timestamp': '2025-09-30 22:13:27.191249', 'step': 2465, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:27.261864', 'step': 2465, 'epoch': 3} +{'type': 'loss', 'content': 0.002741077449172735, 'timestamp': '2025-09-30 22:13:27.264795', 'step': 2466, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:27.320924', 'step': 2466, 'epoch': 3} +{'type': 'loss', 'content': 0.003315993817523122, 'timestamp': '2025-09-30 22:13:27.324832', 'step': 2467, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:27.380192', 'step': 2467, 'epoch': 3} +{'type': 'loss', 'content': 0.016506491228938103, 'timestamp': '2025-09-30 22:13:27.387391', 'step': 2468, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:27.450382', 'step': 2468, 'epoch': 3} +{'type': 'loss', 'content': 0.005784686654806137, 'timestamp': '2025-09-30 22:13:27.453645', 'step': 2469, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:27.515525', 'step': 2469, 'epoch': 3} +{'type': 'loss', 'content': 0.024664176627993584, 'timestamp': '2025-09-30 22:13:27.518250', 'step': 2470, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:27.572542', 'step': 2470, 'epoch': 3} +{'type': 'loss', 'content': 0.00011582062143133953, 'timestamp': '2025-09-30 22:13:27.578850', 'step': 2471, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:27.637005', 'step': 2471, 'epoch': 3} +{'type': 'loss', 'content': 0.007763525936752558, 'timestamp': '2025-09-30 22:13:27.646004', 'step': 2472, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:27.702172', 'step': 2472, 'epoch': 3} +{'type': 'loss', 'content': 0.010659711435437202, 'timestamp': '2025-09-30 22:13:27.706063', 'step': 2473, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:27.777540', 'step': 2473, 'epoch': 3} +{'type': 'loss', 'content': 0.0018466237233951688, 'timestamp': '2025-09-30 22:13:27.779959', 'step': 2474, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:27.852539', 'step': 2474, 'epoch': 3} +{'type': 'loss', 'content': 0.0035447957925498486, 'timestamp': '2025-09-30 22:13:27.859813', 'step': 2475, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:27.920238', 'step': 2475, 'epoch': 3} +{'type': 'loss', 'content': 0.005354622844606638, 'timestamp': '2025-09-30 22:13:27.926963', 'step': 2476, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:27.989674', 'step': 2476, 'epoch': 3} +{'type': 'loss', 'content': 0.0005044917925260961, 'timestamp': '2025-09-30 22:13:27.992747', 'step': 2477, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:28.048142', 'step': 2477, 'epoch': 3} +{'type': 'loss', 'content': 0.02057606168091297, 'timestamp': '2025-09-30 22:13:28.051060', 'step': 2478, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:28.108531', 'step': 2478, 'epoch': 3} +{'type': 'loss', 'content': 0.00041592912748456, 'timestamp': '2025-09-30 22:13:28.110908', 'step': 2479, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:28.173240', 'step': 2479, 'epoch': 3} +{'type': 'loss', 'content': 0.02707737870514393, 'timestamp': '2025-09-30 22:13:28.179409', 'step': 2480, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:28.234239', 'step': 2480, 'epoch': 3} +{'type': 'loss', 'content': 0.008141888305544853, 'timestamp': '2025-09-30 22:13:28.237098', 'step': 2481, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:28.294134', 'step': 2481, 'epoch': 3} +{'type': 'loss', 'content': 0.0005826229462400079, 'timestamp': '2025-09-30 22:13:28.297607', 'step': 2482, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:28.354318', 'step': 2482, 'epoch': 3} +{'type': 'loss', 'content': 0.0010274297092109919, 'timestamp': '2025-09-30 22:13:28.358929', 'step': 2483, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:28.416658', 'step': 2483, 'epoch': 3} +{'type': 'loss', 'content': 0.00012387447350192815, 'timestamp': '2025-09-30 22:13:28.424114', 'step': 2484, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:28.486895', 'step': 2484, 'epoch': 3} +{'type': 'loss', 'content': 0.0458819679915905, 'timestamp': '2025-09-30 22:13:28.489480', 'step': 2485, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:28.545121', 'step': 2485, 'epoch': 3} +{'type': 'loss', 'content': 0.0024083128664642572, 'timestamp': '2025-09-30 22:13:28.549784', 'step': 2486, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:28.612063', 'step': 2486, 'epoch': 3} +{'type': 'loss', 'content': 0.018058612942695618, 'timestamp': '2025-09-30 22:13:28.615950', 'step': 2487, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:28.677326', 'step': 2487, 'epoch': 3} +{'type': 'loss', 'content': 0.016298463568091393, 'timestamp': '2025-09-30 22:13:28.683260', 'step': 2488, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:28.739919', 'step': 2488, 'epoch': 3} +{'type': 'loss', 'content': 0.007573925890028477, 'timestamp': '2025-09-30 22:13:28.742847', 'step': 2489, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:28.802419', 'step': 2489, 'epoch': 3} +{'type': 'loss', 'content': 0.00014498885138891637, 'timestamp': '2025-09-30 22:13:28.807170', 'step': 2490, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:28.872060', 'step': 2490, 'epoch': 3} +{'type': 'loss', 'content': 6.842377479188144e-05, 'timestamp': '2025-09-30 22:13:28.879991', 'step': 2491, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:28.951699', 'step': 2491, 'epoch': 3} +{'type': 'loss', 'content': 0.027848348021507263, 'timestamp': '2025-09-30 22:13:28.958119', 'step': 2492, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:29.023111', 'step': 2492, 'epoch': 3} +{'type': 'loss', 'content': 0.0003194676246494055, 'timestamp': '2025-09-30 22:13:29.026405', 'step': 2493, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:29.085635', 'step': 2493, 'epoch': 3} +{'type': 'loss', 'content': 0.004121209029108286, 'timestamp': '2025-09-30 22:13:29.091479', 'step': 2494, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:29.151939', 'step': 2494, 'epoch': 3} +{'type': 'loss', 'content': 0.030184591189026833, 'timestamp': '2025-09-30 22:13:29.154364', 'step': 2495, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:29.213774', 'step': 2495, 'epoch': 3} +{'type': 'loss', 'content': 0.00010497510811546817, 'timestamp': '2025-09-30 22:13:29.225978', 'step': 2496, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:29.288120', 'step': 2496, 'epoch': 3} +{'type': 'loss', 'content': 0.006152989808470011, 'timestamp': '2025-09-30 22:13:29.298523', 'step': 2497, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:29.366801', 'step': 2497, 'epoch': 3} +{'type': 'loss', 'content': 0.010256282053887844, 'timestamp': '2025-09-30 22:13:29.378777', 'step': 2498, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:29.443430', 'step': 2498, 'epoch': 3} +{'type': 'loss', 'content': 0.0030539052095264196, 'timestamp': '2025-09-30 22:13:29.445695', 'step': 2499, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:29.502248', 'step': 2499, 'epoch': 3} +{'type': 'loss', 'content': 0.005430816672742367, 'timestamp': '2025-09-30 22:13:29.512673', 'step': 2500, 'epoch': 3} +{'type': 'info', 'content': 'Checkpoint saved at step 2500', 'timestamp': '2025-09-30 22:13:29.992354', 'step': 2500, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:30.056711', 'step': 2500, 'epoch': 3} +{'type': 'loss', 'content': 0.0053467415273189545, 'timestamp': '2025-09-30 22:13:30.068444', 'step': 2501, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:30.125796', 'step': 2501, 'epoch': 3} +{'type': 'loss', 'content': 0.011339363642036915, 'timestamp': '2025-09-30 22:13:30.131339', 'step': 2502, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:30.192817', 'step': 2502, 'epoch': 3} +{'type': 'loss', 'content': 0.010253089480102062, 'timestamp': '2025-09-30 22:13:30.202343', 'step': 2503, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:30.266346', 'step': 2503, 'epoch': 3} +{'type': 'loss', 'content': 0.04441189020872116, 'timestamp': '2025-09-30 22:13:30.284633', 'step': 2504, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:30.343260', 'step': 2504, 'epoch': 3} +{'type': 'loss', 'content': 0.0009698430076241493, 'timestamp': '2025-09-30 22:13:30.346379', 'step': 2505, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:30.415313', 'step': 2505, 'epoch': 3} +{'type': 'loss', 'content': 0.008782203309237957, 'timestamp': '2025-09-30 22:13:30.421399', 'step': 2506, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:30.488357', 'step': 2506, 'epoch': 3} +{'type': 'loss', 'content': 0.0034844086039811373, 'timestamp': '2025-09-30 22:13:30.495408', 'step': 2507, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:30.561594', 'step': 2507, 'epoch': 3} +{'type': 'loss', 'content': 0.02779925987124443, 'timestamp': '2025-09-30 22:13:30.570543', 'step': 2508, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:13:32.146436', 'step': 2508, 'epoch': 3} +{'type': 'pplx', 'content': 28484982.005562108, 'timestamp': '2025-09-30 22:13:32.149120', 'step': 2508, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:32.201869', 'step': 2508, 'epoch': 3} +{'type': 'loss', 'content': 0.04372706264257431, 'timestamp': '2025-09-30 22:13:32.204873', 'step': 2509, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:32.277804', 'step': 2509, 'epoch': 3} +{'type': 'loss', 'content': 0.006275986321270466, 'timestamp': '2025-09-30 22:13:32.294723', 'step': 2510, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:32.372737', 'step': 2510, 'epoch': 3} +{'type': 'loss', 'content': 0.007544786669313908, 'timestamp': '2025-09-30 22:13:32.395531', 'step': 2511, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:32.472031', 'step': 2511, 'epoch': 3} +{'type': 'loss', 'content': 0.030385946854948997, 'timestamp': '2025-09-30 22:13:32.497664', 'step': 2512, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:32.576892', 'step': 2512, 'epoch': 3} +{'type': 'loss', 'content': 0.02495487593114376, 'timestamp': '2025-09-30 22:13:32.608326', 'step': 2513, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:32.686036', 'step': 2513, 'epoch': 3} +{'type': 'loss', 'content': 0.005949577782303095, 'timestamp': '2025-09-30 22:13:32.709754', 'step': 2514, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:32.779864', 'step': 2514, 'epoch': 3} +{'type': 'loss', 'content': 0.016375141218304634, 'timestamp': '2025-09-30 22:13:32.800113', 'step': 2515, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:32.881676', 'step': 2515, 'epoch': 3} +{'type': 'loss', 'content': 0.03323351964354515, 'timestamp': '2025-09-30 22:13:32.895106', 'step': 2516, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:32.968779', 'step': 2516, 'epoch': 3} +{'type': 'loss', 'content': 0.010078134015202522, 'timestamp': '2025-09-30 22:13:32.983705', 'step': 2517, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:33.056435', 'step': 2517, 'epoch': 3} +{'type': 'loss', 'content': 0.005874228663742542, 'timestamp': '2025-09-30 22:13:33.068804', 'step': 2518, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:33.139380', 'step': 2518, 'epoch': 3} +{'type': 'loss', 'content': 0.026764482259750366, 'timestamp': '2025-09-30 22:13:33.150956', 'step': 2519, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:33.215944', 'step': 2519, 'epoch': 3} +{'type': 'loss', 'content': 0.007020972203463316, 'timestamp': '2025-09-30 22:13:33.230128', 'step': 2520, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:33.306129', 'step': 2520, 'epoch': 3} +{'type': 'loss', 'content': 0.002482133684679866, 'timestamp': '2025-09-30 22:13:33.321965', 'step': 2521, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:33.407397', 'step': 2521, 'epoch': 3} +{'type': 'loss', 'content': 0.0031678727827966213, 'timestamp': '2025-09-30 22:13:33.422133', 'step': 2522, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:33.491337', 'step': 2522, 'epoch': 3} +{'type': 'loss', 'content': 0.03531847894191742, 'timestamp': '2025-09-30 22:13:33.512412', 'step': 2523, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:33.578179', 'step': 2523, 'epoch': 3} +{'type': 'loss', 'content': 0.012394100427627563, 'timestamp': '2025-09-30 22:13:33.600136', 'step': 2524, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:33.666404', 'step': 2524, 'epoch': 3} +{'type': 'loss', 'content': 0.011437847279012203, 'timestamp': '2025-09-30 22:13:33.682021', 'step': 2525, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:33.771138', 'step': 2525, 'epoch': 3} +{'type': 'loss', 'content': 0.002483924152329564, 'timestamp': '2025-09-30 22:13:33.791110', 'step': 2526, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:33.863813', 'step': 2526, 'epoch': 3} +{'type': 'loss', 'content': 0.01781102456152439, 'timestamp': '2025-09-30 22:13:33.879117', 'step': 2527, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:33.948914', 'step': 2527, 'epoch': 3} +{'type': 'loss', 'content': 0.02498701587319374, 'timestamp': '2025-09-30 22:13:33.974229', 'step': 2528, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:34.047825', 'step': 2528, 'epoch': 3} +{'type': 'loss', 'content': 0.004967492539435625, 'timestamp': '2025-09-30 22:13:34.057256', 'step': 2529, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:34.120092', 'step': 2529, 'epoch': 3} +{'type': 'loss', 'content': 0.008416562341153622, 'timestamp': '2025-09-30 22:13:34.124286', 'step': 2530, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:34.180050', 'step': 2530, 'epoch': 3} +{'type': 'loss', 'content': 0.004394114948809147, 'timestamp': '2025-09-30 22:13:34.188245', 'step': 2531, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:34.255255', 'step': 2531, 'epoch': 3} +{'type': 'loss', 'content': 0.0030704778619110584, 'timestamp': '2025-09-30 22:13:34.262524', 'step': 2532, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:34.322691', 'step': 2532, 'epoch': 3} +{'type': 'loss', 'content': 0.021440336480736732, 'timestamp': '2025-09-30 22:13:34.325981', 'step': 2533, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:34.381080', 'step': 2533, 'epoch': 3} +{'type': 'loss', 'content': 0.013712027110159397, 'timestamp': '2025-09-30 22:13:34.389556', 'step': 2534, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:34.449231', 'step': 2534, 'epoch': 3} +{'type': 'loss', 'content': 0.006967604160308838, 'timestamp': '2025-09-30 22:13:34.457892', 'step': 2535, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:34.518202', 'step': 2535, 'epoch': 3} +{'type': 'loss', 'content': 0.003231785027310252, 'timestamp': '2025-09-30 22:13:34.529400', 'step': 2536, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:34.584782', 'step': 2536, 'epoch': 3} +{'type': 'loss', 'content': 0.018372103571891785, 'timestamp': '2025-09-30 22:13:34.594720', 'step': 2537, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:34.652122', 'step': 2537, 'epoch': 3} +{'type': 'loss', 'content': 0.005035504698753357, 'timestamp': '2025-09-30 22:13:34.660471', 'step': 2538, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:34.717692', 'step': 2538, 'epoch': 3} +{'type': 'loss', 'content': 0.008455309085547924, 'timestamp': '2025-09-30 22:13:34.720813', 'step': 2539, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:34.778433', 'step': 2539, 'epoch': 3} +{'type': 'loss', 'content': 0.000451569736469537, 'timestamp': '2025-09-30 22:13:34.785623', 'step': 2540, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:34.857888', 'step': 2540, 'epoch': 3} +{'type': 'loss', 'content': 0.0005161632434464991, 'timestamp': '2025-09-30 22:13:34.861478', 'step': 2541, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:34.929400', 'step': 2541, 'epoch': 3} +{'type': 'loss', 'content': 0.004660214763134718, 'timestamp': '2025-09-30 22:13:34.931747', 'step': 2542, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.002296', 'step': 2542, 'epoch': 3} +{'type': 'loss', 'content': 0.005621172953397036, 'timestamp': '2025-09-30 22:13:35.004815', 'step': 2543, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.062321', 'step': 2543, 'epoch': 3} +{'type': 'loss', 'content': 0.02957882173359394, 'timestamp': '2025-09-30 22:13:35.069169', 'step': 2544, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:13:35.140995', 'step': 2544, 'epoch': 3} +{'type': 'loss', 'content': 0.009165803901851177, 'timestamp': '2025-09-30 22:13:35.143954', 'step': 2545, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.201122', 'step': 2545, 'epoch': 3} +{'type': 'loss', 'content': 0.02128801867365837, 'timestamp': '2025-09-30 22:13:35.208203', 'step': 2546, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.268216', 'step': 2546, 'epoch': 3} +{'type': 'loss', 'content': 0.005340006668120623, 'timestamp': '2025-09-30 22:13:35.276293', 'step': 2547, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.340625', 'step': 2547, 'epoch': 3} +{'type': 'loss', 'content': 0.007607621140778065, 'timestamp': '2025-09-30 22:13:35.348466', 'step': 2548, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:35.415112', 'step': 2548, 'epoch': 3} +{'type': 'loss', 'content': 0.021063433960080147, 'timestamp': '2025-09-30 22:13:35.418212', 'step': 2549, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.482643', 'step': 2549, 'epoch': 3} +{'type': 'loss', 'content': 0.011848625726997852, 'timestamp': '2025-09-30 22:13:35.486294', 'step': 2550, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.553913', 'step': 2550, 'epoch': 3} +{'type': 'loss', 'content': 0.010470031760632992, 'timestamp': '2025-09-30 22:13:35.557896', 'step': 2551, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.614322', 'step': 2551, 'epoch': 3} +{'type': 'loss', 'content': 0.007319572381675243, 'timestamp': '2025-09-30 22:13:35.625177', 'step': 2552, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.683824', 'step': 2552, 'epoch': 3} +{'type': 'loss', 'content': 0.002631863346323371, 'timestamp': '2025-09-30 22:13:35.686736', 'step': 2553, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:35.743155', 'step': 2553, 'epoch': 3} +{'type': 'loss', 'content': 0.026114294305443764, 'timestamp': '2025-09-30 22:13:35.745574', 'step': 2554, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.799953', 'step': 2554, 'epoch': 3} +{'type': 'loss', 'content': 0.0004028195107821375, 'timestamp': '2025-09-30 22:13:35.802048', 'step': 2555, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.861107', 'step': 2555, 'epoch': 3} +{'type': 'loss', 'content': 0.0005155097460374236, 'timestamp': '2025-09-30 22:13:35.868717', 'step': 2556, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.931861', 'step': 2556, 'epoch': 3} +{'type': 'loss', 'content': 0.014193429611623287, 'timestamp': '2025-09-30 22:13:35.935249', 'step': 2557, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:35.991284', 'step': 2557, 'epoch': 3} +{'type': 'loss', 'content': 0.023700231686234474, 'timestamp': '2025-09-30 22:13:35.997758', 'step': 2558, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:36.057147', 'step': 2558, 'epoch': 3} +{'type': 'loss', 'content': 0.013391217216849327, 'timestamp': '2025-09-30 22:13:36.060664', 'step': 2559, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:36.126611', 'step': 2559, 'epoch': 3} +{'type': 'loss', 'content': 0.0035434234887361526, 'timestamp': '2025-09-30 22:13:36.132365', 'step': 2560, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:36.191606', 'step': 2560, 'epoch': 3} +{'type': 'loss', 'content': 0.02454542927443981, 'timestamp': '2025-09-30 22:13:36.199936', 'step': 2561, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:36.279053', 'step': 2561, 'epoch': 3} +{'type': 'loss', 'content': 0.0029066719580441713, 'timestamp': '2025-09-30 22:13:36.287710', 'step': 2562, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:36.365339', 'step': 2562, 'epoch': 3} +{'type': 'loss', 'content': 0.006104097701609135, 'timestamp': '2025-09-30 22:13:36.368124', 'step': 2563, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:36.428303', 'step': 2563, 'epoch': 3} +{'type': 'loss', 'content': 0.010215085931122303, 'timestamp': '2025-09-30 22:13:36.437744', 'step': 2564, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:36.503906', 'step': 2564, 'epoch': 3} +{'type': 'loss', 'content': 0.005062797572463751, 'timestamp': '2025-09-30 22:13:36.509952', 'step': 2565, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:13:37.982199', 'step': 2565, 'epoch': 3} +{'type': 'pplx', 'content': 28095844.67479086, 'timestamp': '2025-09-30 22:13:37.984464', 'step': 2565, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:38.037632', 'step': 2565, 'epoch': 3} +{'type': 'loss', 'content': 0.010721017606556416, 'timestamp': '2025-09-30 22:13:38.039899', 'step': 2566, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:38.108144', 'step': 2566, 'epoch': 3} +{'type': 'loss', 'content': 0.003516018856316805, 'timestamp': '2025-09-30 22:13:38.110577', 'step': 2567, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:38.167050', 'step': 2567, 'epoch': 3} +{'type': 'loss', 'content': 0.0029169064946472645, 'timestamp': '2025-09-30 22:13:38.174014', 'step': 2568, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:38.237774', 'step': 2568, 'epoch': 3} +{'type': 'loss', 'content': 0.0035148721653968096, 'timestamp': '2025-09-30 22:13:38.240537', 'step': 2569, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:38.296057', 'step': 2569, 'epoch': 3} +{'type': 'loss', 'content': 0.033282749354839325, 'timestamp': '2025-09-30 22:13:38.300103', 'step': 2570, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:38.361289', 'step': 2570, 'epoch': 3} +{'type': 'loss', 'content': 0.010799924843013287, 'timestamp': '2025-09-30 22:13:38.363894', 'step': 2571, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:38.428150', 'step': 2571, 'epoch': 3} +{'type': 'loss', 'content': 0.005024234298616648, 'timestamp': '2025-09-30 22:13:38.434615', 'step': 2572, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:38.494982', 'step': 2572, 'epoch': 3} +{'type': 'loss', 'content': 0.0024888780899345875, 'timestamp': '2025-09-30 22:13:38.497928', 'step': 2573, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:38.564411', 'step': 2573, 'epoch': 3} +{'type': 'loss', 'content': 0.014496888034045696, 'timestamp': '2025-09-30 22:13:38.573061', 'step': 2574, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:38.629771', 'step': 2574, 'epoch': 3} +{'type': 'loss', 'content': 0.006988645065575838, 'timestamp': '2025-09-30 22:13:38.637345', 'step': 2575, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:38.697004', 'step': 2575, 'epoch': 3} +{'type': 'loss', 'content': 0.01132346410304308, 'timestamp': '2025-09-30 22:13:38.703263', 'step': 2576, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:38.764133', 'step': 2576, 'epoch': 3} +{'type': 'loss', 'content': 0.0035836149472743273, 'timestamp': '2025-09-30 22:13:38.767094', 'step': 2577, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:38.830451', 'step': 2577, 'epoch': 3} +{'type': 'loss', 'content': 0.013794556260108948, 'timestamp': '2025-09-30 22:13:38.833653', 'step': 2578, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:38.892206', 'step': 2578, 'epoch': 3} +{'type': 'loss', 'content': 0.004456694237887859, 'timestamp': '2025-09-30 22:13:38.895467', 'step': 2579, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:38.958609', 'step': 2579, 'epoch': 3} +{'type': 'loss', 'content': 0.006100859493017197, 'timestamp': '2025-09-30 22:13:38.965114', 'step': 2580, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:39.029425', 'step': 2580, 'epoch': 3} +{'type': 'loss', 'content': 0.0033564306795597076, 'timestamp': '2025-09-30 22:13:39.031433', 'step': 2581, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:39.088665', 'step': 2581, 'epoch': 3} +{'type': 'loss', 'content': 0.0037206211127340794, 'timestamp': '2025-09-30 22:13:39.095796', 'step': 2582, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:39.153293', 'step': 2582, 'epoch': 3} +{'type': 'loss', 'content': 0.006556731648743153, 'timestamp': '2025-09-30 22:13:39.155960', 'step': 2583, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:39.210529', 'step': 2583, 'epoch': 3} +{'type': 'loss', 'content': 0.01759198307991028, 'timestamp': '2025-09-30 22:13:39.216788', 'step': 2584, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:39.271308', 'step': 2584, 'epoch': 3} +{'type': 'loss', 'content': 0.012082146480679512, 'timestamp': '2025-09-30 22:13:39.273747', 'step': 2585, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:39.328828', 'step': 2585, 'epoch': 3} +{'type': 'loss', 'content': 0.034108925610780716, 'timestamp': '2025-09-30 22:13:39.331233', 'step': 2586, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:39.389845', 'step': 2586, 'epoch': 3} +{'type': 'loss', 'content': 0.003907191567122936, 'timestamp': '2025-09-30 22:13:39.403544', 'step': 2587, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:39.465603', 'step': 2587, 'epoch': 3} +{'type': 'loss', 'content': 0.009326784871518612, 'timestamp': '2025-09-30 22:13:39.472112', 'step': 2588, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:39.531850', 'step': 2588, 'epoch': 3} +{'type': 'loss', 'content': 0.008430427871644497, 'timestamp': '2025-09-30 22:13:39.535223', 'step': 2589, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:39.599643', 'step': 2589, 'epoch': 3} +{'type': 'loss', 'content': 0.005537763237953186, 'timestamp': '2025-09-30 22:13:39.602815', 'step': 2590, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:39.659746', 'step': 2590, 'epoch': 3} +{'type': 'loss', 'content': 0.00847475416958332, 'timestamp': '2025-09-30 22:13:39.664121', 'step': 2591, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:39.726143', 'step': 2591, 'epoch': 3} +{'type': 'loss', 'content': 0.0008981380960904062, 'timestamp': '2025-09-30 22:13:39.736910', 'step': 2592, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:39.801757', 'step': 2592, 'epoch': 3} +{'type': 'loss', 'content': 0.0013419969473034143, 'timestamp': '2025-09-30 22:13:39.805646', 'step': 2593, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:39.860221', 'step': 2593, 'epoch': 3} +{'type': 'loss', 'content': 0.0038829329423606396, 'timestamp': '2025-09-30 22:13:39.870646', 'step': 2594, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:39.925607', 'step': 2594, 'epoch': 3} +{'type': 'loss', 'content': 0.0077403089962899685, 'timestamp': '2025-09-30 22:13:39.928827', 'step': 2595, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-30 22:13:39.984471', 'step': 2595, 'epoch': 3} +{'type': 'loss', 'content': 0.013570351526141167, 'timestamp': '2025-09-30 22:13:39.990851', 'step': 2596, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:40.054183', 'step': 2596, 'epoch': 3} +{'type': 'loss', 'content': 0.005910519044846296, 'timestamp': '2025-09-30 22:13:40.057403', 'step': 2597, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:40.115868', 'step': 2597, 'epoch': 3} +{'type': 'loss', 'content': 0.0021806147415190935, 'timestamp': '2025-09-30 22:13:40.119422', 'step': 2598, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:40.176335', 'step': 2598, 'epoch': 3} +{'type': 'loss', 'content': 0.002977479714900255, 'timestamp': '2025-09-30 22:13:40.184658', 'step': 2599, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:40.248724', 'step': 2599, 'epoch': 3} +{'type': 'loss', 'content': 0.0011497695231810212, 'timestamp': '2025-09-30 22:13:40.259275', 'step': 2600, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:40.316010', 'step': 2600, 'epoch': 3} +{'type': 'loss', 'content': 0.0021200214978307486, 'timestamp': '2025-09-30 22:13:40.322723', 'step': 2601, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:40.384991', 'step': 2601, 'epoch': 3} +{'type': 'loss', 'content': 0.003689644392579794, 'timestamp': '2025-09-30 22:13:40.388971', 'step': 2602, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:40.451268', 'step': 2602, 'epoch': 3} +{'type': 'loss', 'content': 0.002164714504033327, 'timestamp': '2025-09-30 22:13:40.454859', 'step': 2603, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:40.516437', 'step': 2603, 'epoch': 3} +{'type': 'loss', 'content': 0.024041039869189262, 'timestamp': '2025-09-30 22:13:40.523219', 'step': 2604, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:40.581524', 'step': 2604, 'epoch': 3} +{'type': 'loss', 'content': 0.053784094750881195, 'timestamp': '2025-09-30 22:13:40.587466', 'step': 2605, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:40.643075', 'step': 2605, 'epoch': 3} +{'type': 'loss', 'content': 0.00025220931274816394, 'timestamp': '2025-09-30 22:13:40.648320', 'step': 2606, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:40.706291', 'step': 2606, 'epoch': 3} +{'type': 'loss', 'content': 0.0412961021065712, 'timestamp': '2025-09-30 22:13:40.709924', 'step': 2607, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:40.768686', 'step': 2607, 'epoch': 3} +{'type': 'loss', 'content': 0.0008691218099556863, 'timestamp': '2025-09-30 22:13:40.775025', 'step': 2608, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:40.829933', 'step': 2608, 'epoch': 3} +{'type': 'loss', 'content': 0.012250705622136593, 'timestamp': '2025-09-30 22:13:40.840134', 'step': 2609, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:40.896800', 'step': 2609, 'epoch': 3} +{'type': 'loss', 'content': 0.007466053124517202, 'timestamp': '2025-09-30 22:13:40.899879', 'step': 2610, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:40.962887', 'step': 2610, 'epoch': 3} +{'type': 'loss', 'content': 0.0008426569984294474, 'timestamp': '2025-09-30 22:13:40.975062', 'step': 2611, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:41.033291', 'step': 2611, 'epoch': 3} +{'type': 'loss', 'content': 0.001527618383988738, 'timestamp': '2025-09-30 22:13:41.040007', 'step': 2612, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:41.094083', 'step': 2612, 'epoch': 3} +{'type': 'loss', 'content': 0.05261802300810814, 'timestamp': '2025-09-30 22:13:41.098284', 'step': 2613, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:41.154590', 'step': 2613, 'epoch': 3} +{'type': 'loss', 'content': 0.00033618827001191676, 'timestamp': '2025-09-30 22:13:41.159766', 'step': 2614, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:41.219906', 'step': 2614, 'epoch': 3} +{'type': 'loss', 'content': 0.013994032517075539, 'timestamp': '2025-09-30 22:13:41.222645', 'step': 2615, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:41.277574', 'step': 2615, 'epoch': 3} +{'type': 'loss', 'content': 0.001805710606276989, 'timestamp': '2025-09-30 22:13:41.283924', 'step': 2616, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:41.338699', 'step': 2616, 'epoch': 3} +{'type': 'loss', 'content': 0.01090067345649004, 'timestamp': '2025-09-30 22:13:41.341341', 'step': 2617, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:41.395509', 'step': 2617, 'epoch': 3} +{'type': 'loss', 'content': 0.010763258673250675, 'timestamp': '2025-09-30 22:13:41.398233', 'step': 2618, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:41.456411', 'step': 2618, 'epoch': 3} +{'type': 'loss', 'content': 0.00023230792430695146, 'timestamp': '2025-09-30 22:13:41.459052', 'step': 2619, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:41.517705', 'step': 2619, 'epoch': 3} +{'type': 'loss', 'content': 0.00010520854266360402, 'timestamp': '2025-09-30 22:13:41.526262', 'step': 2620, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:41.592677', 'step': 2620, 'epoch': 3} +{'type': 'loss', 'content': 8.937703387346119e-05, 'timestamp': '2025-09-30 22:13:41.596741', 'step': 2621, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:41.651606', 'step': 2621, 'epoch': 3} +{'type': 'loss', 'content': 0.003324234625324607, 'timestamp': '2025-09-30 22:13:41.656269', 'step': 2622, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:13:42.991390', 'step': 2622, 'epoch': 3} +{'type': 'pplx', 'content': 31437886.455709014, 'timestamp': '2025-09-30 22:13:42.996628', 'step': 2622, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:43.052565', 'step': 2622, 'epoch': 3} +{'type': 'loss', 'content': 0.005293331108987331, 'timestamp': '2025-09-30 22:13:43.059695', 'step': 2623, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:43.124834', 'step': 2623, 'epoch': 3} +{'type': 'loss', 'content': 9.892590605886653e-05, 'timestamp': '2025-09-30 22:13:43.132164', 'step': 2624, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:43.192440', 'step': 2624, 'epoch': 3} +{'type': 'loss', 'content': 0.033769041299819946, 'timestamp': '2025-09-30 22:13:43.201469', 'step': 2625, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:43.266274', 'step': 2625, 'epoch': 3} +{'type': 'loss', 'content': 0.036175686866045, 'timestamp': '2025-09-30 22:13:43.269492', 'step': 2626, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:43.334734', 'step': 2626, 'epoch': 3} +{'type': 'loss', 'content': 0.0016300681745633483, 'timestamp': '2025-09-30 22:13:43.349674', 'step': 2627, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:43.405168', 'step': 2627, 'epoch': 3} +{'type': 'loss', 'content': 0.052250683307647705, 'timestamp': '2025-09-30 22:13:43.412369', 'step': 2628, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:43.477284', 'step': 2628, 'epoch': 3} +{'type': 'loss', 'content': 0.003264912636950612, 'timestamp': '2025-09-30 22:13:43.480096', 'step': 2629, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:43.539111', 'step': 2629, 'epoch': 3} +{'type': 'loss', 'content': 0.0006807534955441952, 'timestamp': '2025-09-30 22:13:43.544956', 'step': 2630, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:43.601818', 'step': 2630, 'epoch': 3} +{'type': 'loss', 'content': 0.0006088690715841949, 'timestamp': '2025-09-30 22:13:43.610272', 'step': 2631, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:43.673117', 'step': 2631, 'epoch': 3} +{'type': 'loss', 'content': 0.006588431540876627, 'timestamp': '2025-09-30 22:13:43.691680', 'step': 2632, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:43.747030', 'step': 2632, 'epoch': 3} +{'type': 'loss', 'content': 0.0006200451171025634, 'timestamp': '2025-09-30 22:13:43.750018', 'step': 2633, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:43.811484', 'step': 2633, 'epoch': 3} +{'type': 'loss', 'content': 0.01981678046286106, 'timestamp': '2025-09-30 22:13:43.814840', 'step': 2634, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:43.873252', 'step': 2634, 'epoch': 3} +{'type': 'loss', 'content': 0.003335049608722329, 'timestamp': '2025-09-30 22:13:43.875994', 'step': 2635, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:43.939080', 'step': 2635, 'epoch': 3} +{'type': 'loss', 'content': 0.02097000740468502, 'timestamp': '2025-09-30 22:13:43.951340', 'step': 2636, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:44.008281', 'step': 2636, 'epoch': 3} +{'type': 'loss', 'content': 0.007635528687387705, 'timestamp': '2025-09-30 22:13:44.014140', 'step': 2637, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:44.072624', 'step': 2637, 'epoch': 3} +{'type': 'loss', 'content': 0.0021020916756242514, 'timestamp': '2025-09-30 22:13:44.078675', 'step': 2638, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:44.142189', 'step': 2638, 'epoch': 3} +{'type': 'loss', 'content': 0.00463180523365736, 'timestamp': '2025-09-30 22:13:44.145158', 'step': 2639, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:44.207719', 'step': 2639, 'epoch': 3} +{'type': 'loss', 'content': 0.00478312699124217, 'timestamp': '2025-09-30 22:13:44.215952', 'step': 2640, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:44.272292', 'step': 2640, 'epoch': 3} +{'type': 'loss', 'content': 0.0008612548117525876, 'timestamp': '2025-09-30 22:13:44.276660', 'step': 2641, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:44.344905', 'step': 2641, 'epoch': 3} +{'type': 'loss', 'content': 0.0024010143242776394, 'timestamp': '2025-09-30 22:13:44.357248', 'step': 2642, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:44.423670', 'step': 2642, 'epoch': 3} +{'type': 'loss', 'content': 0.003124420763924718, 'timestamp': '2025-09-30 22:13:44.427994', 'step': 2643, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:44.495267', 'step': 2643, 'epoch': 3} +{'type': 'loss', 'content': 0.013605816289782524, 'timestamp': '2025-09-30 22:13:44.502640', 'step': 2644, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:44.560412', 'step': 2644, 'epoch': 3} +{'type': 'loss', 'content': 0.0009547712397761643, 'timestamp': '2025-09-30 22:13:44.572377', 'step': 2645, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:44.628288', 'step': 2645, 'epoch': 3} +{'type': 'loss', 'content': 0.0024611575063318014, 'timestamp': '2025-09-30 22:13:44.632917', 'step': 2646, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:44.693593', 'step': 2646, 'epoch': 3} +{'type': 'loss', 'content': 0.010525861755013466, 'timestamp': '2025-09-30 22:13:44.701639', 'step': 2647, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:44.761423', 'step': 2647, 'epoch': 3} +{'type': 'loss', 'content': 0.014524830505251884, 'timestamp': '2025-09-30 22:13:44.768375', 'step': 2648, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:44.830634', 'step': 2648, 'epoch': 3} +{'type': 'loss', 'content': 0.0008129668422043324, 'timestamp': '2025-09-30 22:13:44.836483', 'step': 2649, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:44.893886', 'step': 2649, 'epoch': 3} +{'type': 'loss', 'content': 0.004618306644260883, 'timestamp': '2025-09-30 22:13:44.896386', 'step': 2650, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:44.953263', 'step': 2650, 'epoch': 3} +{'type': 'loss', 'content': 0.0023160020355135202, 'timestamp': '2025-09-30 22:13:44.955676', 'step': 2651, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:45.012135', 'step': 2651, 'epoch': 3} +{'type': 'loss', 'content': 0.0021287633571773767, 'timestamp': '2025-09-30 22:13:45.020394', 'step': 2652, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:45.075628', 'step': 2652, 'epoch': 3} +{'type': 'loss', 'content': 0.015038742683827877, 'timestamp': '2025-09-30 22:13:45.082391', 'step': 2653, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:45.137061', 'step': 2653, 'epoch': 3} +{'type': 'loss', 'content': 0.0270217452198267, 'timestamp': '2025-09-30 22:13:45.142322', 'step': 2654, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:45.202368', 'step': 2654, 'epoch': 3} +{'type': 'loss', 'content': 0.05145685747265816, 'timestamp': '2025-09-30 22:13:45.206286', 'step': 2655, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:45.265867', 'step': 2655, 'epoch': 3} +{'type': 'loss', 'content': 0.008141874335706234, 'timestamp': '2025-09-30 22:13:45.272859', 'step': 2656, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:45.327084', 'step': 2656, 'epoch': 3} +{'type': 'loss', 'content': 0.002790646394714713, 'timestamp': '2025-09-30 22:13:45.335543', 'step': 2657, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:45.394029', 'step': 2657, 'epoch': 3} +{'type': 'loss', 'content': 0.008883380331099033, 'timestamp': '2025-09-30 22:13:45.396873', 'step': 2658, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:45.459598', 'step': 2658, 'epoch': 3} +{'type': 'loss', 'content': 0.006654282100498676, 'timestamp': '2025-09-30 22:13:45.466734', 'step': 2659, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:45.533117', 'step': 2659, 'epoch': 3} +{'type': 'loss', 'content': 0.01170284952968359, 'timestamp': '2025-09-30 22:13:45.539352', 'step': 2660, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:45.593881', 'step': 2660, 'epoch': 3} +{'type': 'loss', 'content': 0.007521068211644888, 'timestamp': '2025-09-30 22:13:45.597392', 'step': 2661, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:45.658090', 'step': 2661, 'epoch': 3} +{'type': 'loss', 'content': 0.007094702683389187, 'timestamp': '2025-09-30 22:13:45.661569', 'step': 2662, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:45.716779', 'step': 2662, 'epoch': 3} +{'type': 'loss', 'content': 0.008204026147723198, 'timestamp': '2025-09-30 22:13:45.719813', 'step': 2663, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:45.776614', 'step': 2663, 'epoch': 3} +{'type': 'loss', 'content': 0.011364804580807686, 'timestamp': '2025-09-30 22:13:45.783795', 'step': 2664, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:45.840029', 'step': 2664, 'epoch': 3} +{'type': 'loss', 'content': 0.015917399898171425, 'timestamp': '2025-09-30 22:13:45.842921', 'step': 2665, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:45.905060', 'step': 2665, 'epoch': 3} +{'type': 'loss', 'content': 0.006572159472852945, 'timestamp': '2025-09-30 22:13:45.914197', 'step': 2666, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:45.975171', 'step': 2666, 'epoch': 3} +{'type': 'loss', 'content': 0.0034901797771453857, 'timestamp': '2025-09-30 22:13:45.977763', 'step': 2667, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:46.035916', 'step': 2667, 'epoch': 3} +{'type': 'loss', 'content': 0.014389263466000557, 'timestamp': '2025-09-30 22:13:46.042903', 'step': 2668, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:46.099722', 'step': 2668, 'epoch': 3} +{'type': 'loss', 'content': 0.002424341393634677, 'timestamp': '2025-09-30 22:13:46.106840', 'step': 2669, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:46.166856', 'step': 2669, 'epoch': 3} +{'type': 'loss', 'content': 0.01114154327660799, 'timestamp': '2025-09-30 22:13:46.169241', 'step': 2670, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:46.223757', 'step': 2670, 'epoch': 3} +{'type': 'loss', 'content': 0.023684168234467506, 'timestamp': '2025-09-30 22:13:46.226315', 'step': 2671, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:46.283561', 'step': 2671, 'epoch': 3} +{'type': 'loss', 'content': 0.002832164289429784, 'timestamp': '2025-09-30 22:13:46.299391', 'step': 2672, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:46.362243', 'step': 2672, 'epoch': 3} +{'type': 'loss', 'content': 0.008885924704372883, 'timestamp': '2025-09-30 22:13:46.367794', 'step': 2673, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:46.430281', 'step': 2673, 'epoch': 3} +{'type': 'loss', 'content': 0.0018890589708462358, 'timestamp': '2025-09-30 22:13:46.433980', 'step': 2674, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:46.491992', 'step': 2674, 'epoch': 3} +{'type': 'loss', 'content': 0.004877452738583088, 'timestamp': '2025-09-30 22:13:46.503349', 'step': 2675, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:46.572081', 'step': 2675, 'epoch': 3} +{'type': 'loss', 'content': 0.004309082869440317, 'timestamp': '2025-09-30 22:13:46.578554', 'step': 2676, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:46.639190', 'step': 2676, 'epoch': 3} +{'type': 'loss', 'content': 0.009656992740929127, 'timestamp': '2025-09-30 22:13:46.642116', 'step': 2677, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:46.699896', 'step': 2677, 'epoch': 3} +{'type': 'loss', 'content': 0.020586643368005753, 'timestamp': '2025-09-30 22:13:46.702389', 'step': 2678, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:46.761163', 'step': 2678, 'epoch': 3} +{'type': 'loss', 'content': 0.0008080621482804418, 'timestamp': '2025-09-30 22:13:46.770580', 'step': 2679, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:13:48.223899', 'step': 2679, 'epoch': 3} +{'type': 'pplx', 'content': 27226157.57321593, 'timestamp': '2025-09-30 22:13:48.229459', 'step': 2679, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:48.285279', 'step': 2679, 'epoch': 3} +{'type': 'loss', 'content': 0.0005241225007921457, 'timestamp': '2025-09-30 22:13:48.291639', 'step': 2680, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:48.347738', 'step': 2680, 'epoch': 3} +{'type': 'loss', 'content': 0.00306515721604228, 'timestamp': '2025-09-30 22:13:48.350274', 'step': 2681, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:48.408822', 'step': 2681, 'epoch': 3} +{'type': 'loss', 'content': 0.00246833311393857, 'timestamp': '2025-09-30 22:13:48.411670', 'step': 2682, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:48.465693', 'step': 2682, 'epoch': 3} +{'type': 'loss', 'content': 0.0012627762043848634, 'timestamp': '2025-09-30 22:13:48.468124', 'step': 2683, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:48.525025', 'step': 2683, 'epoch': 3} +{'type': 'loss', 'content': 0.00040524639189243317, 'timestamp': '2025-09-30 22:13:48.530951', 'step': 2684, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:48.596407', 'step': 2684, 'epoch': 3} +{'type': 'loss', 'content': 0.0009502816246822476, 'timestamp': '2025-09-30 22:13:48.606005', 'step': 2685, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:48.671814', 'step': 2685, 'epoch': 3} +{'type': 'loss', 'content': 0.007616551127284765, 'timestamp': '2025-09-30 22:13:48.675296', 'step': 2686, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:48.732914', 'step': 2686, 'epoch': 3} +{'type': 'loss', 'content': 0.005208977032452822, 'timestamp': '2025-09-30 22:13:48.736853', 'step': 2687, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:48.806110', 'step': 2687, 'epoch': 3} +{'type': 'loss', 'content': 0.0008302520145662129, 'timestamp': '2025-09-30 22:13:48.823721', 'step': 2688, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:48.883982', 'step': 2688, 'epoch': 3} +{'type': 'loss', 'content': 0.0001742185850162059, 'timestamp': '2025-09-30 22:13:48.887570', 'step': 2689, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:48.956348', 'step': 2689, 'epoch': 3} +{'type': 'loss', 'content': 0.0018354164203628898, 'timestamp': '2025-09-30 22:13:48.959330', 'step': 2690, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:49.020243', 'step': 2690, 'epoch': 3} +{'type': 'loss', 'content': 0.006067172158509493, 'timestamp': '2025-09-30 22:13:49.023125', 'step': 2691, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:49.083643', 'step': 2691, 'epoch': 3} +{'type': 'loss', 'content': 0.023693010210990906, 'timestamp': '2025-09-30 22:13:49.095187', 'step': 2692, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:49.161575', 'step': 2692, 'epoch': 3} +{'type': 'loss', 'content': 0.0013803176116198301, 'timestamp': '2025-09-30 22:13:49.168354', 'step': 2693, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:49.230112', 'step': 2693, 'epoch': 3} +{'type': 'loss', 'content': 0.001813818933442235, 'timestamp': '2025-09-30 22:13:49.233718', 'step': 2694, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:49.293224', 'step': 2694, 'epoch': 3} +{'type': 'loss', 'content': 0.0011404008837416768, 'timestamp': '2025-09-30 22:13:49.302174', 'step': 2695, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:49.359140', 'step': 2695, 'epoch': 3} +{'type': 'loss', 'content': 0.007178679574280977, 'timestamp': '2025-09-30 22:13:49.370459', 'step': 2696, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:49.429837', 'step': 2696, 'epoch': 3} +{'type': 'loss', 'content': 0.0011296061566099524, 'timestamp': '2025-09-30 22:13:49.438620', 'step': 2697, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:49.496637', 'step': 2697, 'epoch': 3} +{'type': 'loss', 'content': 0.0011266040382906795, 'timestamp': '2025-09-30 22:13:49.502242', 'step': 2698, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:49.561787', 'step': 2698, 'epoch': 3} +{'type': 'loss', 'content': 0.0006445986800827086, 'timestamp': '2025-09-30 22:13:49.567761', 'step': 2699, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:49.624521', 'step': 2699, 'epoch': 3} +{'type': 'loss', 'content': 0.0005764567176811397, 'timestamp': '2025-09-30 22:13:49.630654', 'step': 2700, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:49.696603', 'step': 2700, 'epoch': 3} +{'type': 'loss', 'content': 0.0010664776200428605, 'timestamp': '2025-09-30 22:13:49.700446', 'step': 2701, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:49.756793', 'step': 2701, 'epoch': 3} +{'type': 'loss', 'content': 0.00033894533407874405, 'timestamp': '2025-09-30 22:13:49.759941', 'step': 2702, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:49.820434', 'step': 2702, 'epoch': 3} +{'type': 'loss', 'content': 0.001302658929489553, 'timestamp': '2025-09-30 22:13:49.824764', 'step': 2703, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:49.879323', 'step': 2703, 'epoch': 3} +{'type': 'loss', 'content': 0.003532660659402609, 'timestamp': '2025-09-30 22:13:49.889824', 'step': 2704, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:49.945843', 'step': 2704, 'epoch': 3} +{'type': 'loss', 'content': 0.0034972601570189, 'timestamp': '2025-09-30 22:13:49.951608', 'step': 2705, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:50.016545', 'step': 2705, 'epoch': 3} +{'type': 'loss', 'content': 0.00021062916493974626, 'timestamp': '2025-09-30 22:13:50.020864', 'step': 2706, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:50.078562', 'step': 2706, 'epoch': 3} +{'type': 'loss', 'content': 0.000698353978805244, 'timestamp': '2025-09-30 22:13:50.081409', 'step': 2707, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:50.144600', 'step': 2707, 'epoch': 3} +{'type': 'loss', 'content': 0.014168920926749706, 'timestamp': '2025-09-30 22:13:50.154442', 'step': 2708, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:50.212646', 'step': 2708, 'epoch': 3} +{'type': 'loss', 'content': 0.0037853510584682226, 'timestamp': '2025-09-30 22:13:50.220630', 'step': 2709, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:50.283443', 'step': 2709, 'epoch': 3} +{'type': 'loss', 'content': 0.010242769494652748, 'timestamp': '2025-09-30 22:13:50.290377', 'step': 2710, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:50.344434', 'step': 2710, 'epoch': 3} +{'type': 'loss', 'content': 0.0002763148513622582, 'timestamp': '2025-09-30 22:13:50.347449', 'step': 2711, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:50.411418', 'step': 2711, 'epoch': 3} +{'type': 'loss', 'content': 0.0003090985701419413, 'timestamp': '2025-09-30 22:13:50.428813', 'step': 2712, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:50.489893', 'step': 2712, 'epoch': 3} +{'type': 'loss', 'content': 0.003153016325086355, 'timestamp': '2025-09-30 22:13:50.493106', 'step': 2713, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:50.551946', 'step': 2713, 'epoch': 3} +{'type': 'loss', 'content': 0.014958287589251995, 'timestamp': '2025-09-30 22:13:50.555908', 'step': 2714, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:50.626790', 'step': 2714, 'epoch': 3} +{'type': 'loss', 'content': 0.03713773563504219, 'timestamp': '2025-09-30 22:13:50.630948', 'step': 2715, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:50.690432', 'step': 2715, 'epoch': 3} +{'type': 'loss', 'content': 0.0021324113477021456, 'timestamp': '2025-09-30 22:13:50.699332', 'step': 2716, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:50.755210', 'step': 2716, 'epoch': 3} +{'type': 'loss', 'content': 0.010403887368738651, 'timestamp': '2025-09-30 22:13:50.758629', 'step': 2717, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:50.822734', 'step': 2717, 'epoch': 3} +{'type': 'loss', 'content': 9.978410525945947e-05, 'timestamp': '2025-09-30 22:13:50.831743', 'step': 2718, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:50.892991', 'step': 2718, 'epoch': 3} +{'type': 'loss', 'content': 0.0008979692356660962, 'timestamp': '2025-09-30 22:13:50.896480', 'step': 2719, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:50.952988', 'step': 2719, 'epoch': 3} +{'type': 'loss', 'content': 0.012770486064255238, 'timestamp': '2025-09-30 22:13:50.965793', 'step': 2720, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:51.027607', 'step': 2720, 'epoch': 3} +{'type': 'loss', 'content': 0.0006667285342700779, 'timestamp': '2025-09-30 22:13:51.031206', 'step': 2721, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:51.089950', 'step': 2721, 'epoch': 3} +{'type': 'loss', 'content': 0.001039764960296452, 'timestamp': '2025-09-30 22:13:51.092464', 'step': 2722, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:51.157150', 'step': 2722, 'epoch': 3} +{'type': 'loss', 'content': 0.007044041994959116, 'timestamp': '2025-09-30 22:13:51.166520', 'step': 2723, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:51.225414', 'step': 2723, 'epoch': 3} +{'type': 'loss', 'content': 0.006314881145954132, 'timestamp': '2025-09-30 22:13:51.231890', 'step': 2724, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:51.293636', 'step': 2724, 'epoch': 3} +{'type': 'loss', 'content': 0.012850704602897167, 'timestamp': '2025-09-30 22:13:51.297398', 'step': 2725, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:51.352110', 'step': 2725, 'epoch': 3} +{'type': 'loss', 'content': 0.0020042883697897196, 'timestamp': '2025-09-30 22:13:51.355893', 'step': 2726, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:51.410769', 'step': 2726, 'epoch': 3} +{'type': 'loss', 'content': 0.0015756689244881272, 'timestamp': '2025-09-30 22:13:51.414151', 'step': 2727, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:51.472208', 'step': 2727, 'epoch': 3} +{'type': 'loss', 'content': 0.0008090141927823424, 'timestamp': '2025-09-30 22:13:51.480117', 'step': 2728, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:51.543520', 'step': 2728, 'epoch': 3} +{'type': 'loss', 'content': 0.0029392943251878023, 'timestamp': '2025-09-30 22:13:51.548126', 'step': 2729, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:51.613147', 'step': 2729, 'epoch': 3} +{'type': 'loss', 'content': 0.0013291776413097978, 'timestamp': '2025-09-30 22:13:51.616109', 'step': 2730, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:51.672856', 'step': 2730, 'epoch': 3} +{'type': 'loss', 'content': 0.001892697880975902, 'timestamp': '2025-09-30 22:13:51.684947', 'step': 2731, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:51.755930', 'step': 2731, 'epoch': 3} +{'type': 'loss', 'content': 0.0166336540132761, 'timestamp': '2025-09-30 22:13:51.769304', 'step': 2732, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:51.826448', 'step': 2732, 'epoch': 3} +{'type': 'loss', 'content': 0.0006099882302805781, 'timestamp': '2025-09-30 22:13:51.829660', 'step': 2733, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:51.889560', 'step': 2733, 'epoch': 3} +{'type': 'loss', 'content': 0.00124065310228616, 'timestamp': '2025-09-30 22:13:51.892903', 'step': 2734, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:51.953941', 'step': 2734, 'epoch': 3} +{'type': 'loss', 'content': 0.008868148550391197, 'timestamp': '2025-09-30 22:13:51.966422', 'step': 2735, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:52.031390', 'step': 2735, 'epoch': 3} +{'type': 'loss', 'content': 0.031378235667943954, 'timestamp': '2025-09-30 22:13:52.038860', 'step': 2736, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:13:53.450275', 'step': 2736, 'epoch': 3} +{'type': 'pplx', 'content': 27265120.538364556, 'timestamp': '2025-09-30 22:13:53.453548', 'step': 2736, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:53.506520', 'step': 2736, 'epoch': 3} +{'type': 'loss', 'content': 0.004425609949976206, 'timestamp': '2025-09-30 22:13:53.510006', 'step': 2737, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:53.564344', 'step': 2737, 'epoch': 3} +{'type': 'loss', 'content': 0.00230988091789186, 'timestamp': '2025-09-30 22:13:53.567973', 'step': 2738, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:53.629941', 'step': 2738, 'epoch': 3} +{'type': 'loss', 'content': 0.012407422065734863, 'timestamp': '2025-09-30 22:13:53.632681', 'step': 2739, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:53.698644', 'step': 2739, 'epoch': 3} +{'type': 'loss', 'content': 0.0033368500880897045, 'timestamp': '2025-09-30 22:13:53.705763', 'step': 2740, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:53.761332', 'step': 2740, 'epoch': 3} +{'type': 'loss', 'content': 0.001278606941923499, 'timestamp': '2025-09-30 22:13:53.764488', 'step': 2741, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:53.832321', 'step': 2741, 'epoch': 3} +{'type': 'loss', 'content': 0.0054036402143538, 'timestamp': '2025-09-30 22:13:53.835475', 'step': 2742, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:53.891626', 'step': 2742, 'epoch': 3} +{'type': 'loss', 'content': 0.02728707529604435, 'timestamp': '2025-09-30 22:13:53.895367', 'step': 2743, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:53.954983', 'step': 2743, 'epoch': 3} +{'type': 'loss', 'content': 0.005847959313541651, 'timestamp': '2025-09-30 22:13:53.961864', 'step': 2744, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:54.025521', 'step': 2744, 'epoch': 3} +{'type': 'loss', 'content': 0.006271410267800093, 'timestamp': '2025-09-30 22:13:54.028753', 'step': 2745, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:54.089180', 'step': 2745, 'epoch': 3} +{'type': 'loss', 'content': 0.006004456430673599, 'timestamp': '2025-09-30 22:13:54.093167', 'step': 2746, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:54.169048', 'step': 2746, 'epoch': 3} +{'type': 'loss', 'content': 0.019149092957377434, 'timestamp': '2025-09-30 22:13:54.171857', 'step': 2747, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 2240013665728.0}, 'timestamp': '2025-09-30 22:13:54.246050', 'step': 2747, 'epoch': 3} +{'type': 'loss', 'content': 0.0032882175873965025, 'timestamp': '2025-09-30 22:13:54.264454', 'step': 2748, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:54.320406', 'step': 2748, 'epoch': 3} +{'type': 'loss', 'content': 0.024987680837512016, 'timestamp': '2025-09-30 22:13:54.325559', 'step': 2749, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-30 22:13:54.390676', 'step': 2749, 'epoch': 3} +{'type': 'loss', 'content': 0.043651703745126724, 'timestamp': '2025-09-30 22:13:54.394730', 'step': 2750, 'epoch': 3} +{'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-30 22:13:54.466019', 'step': 2750, 'epoch': 3} +{'type': 'loss', 'content': 0.03387341648340225, 'timestamp': '2025-09-30 22:13:54.468879', 'step': 2751, 'epoch': 3} +{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1596914505344}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1916297392896}], 'timestamp': '2025-09-30 22:13:55.951133', 'step': 2751, 'epoch': 3} +{'type': 'pplx', 'content': 29126071.55555798, 'timestamp': '2025-09-30 22:13:55.953357', 'step': 2751, 'epoch': 3} +{'type': 'best_pplx', 'content': 27226157.57321593, 'timestamp': '2025-09-30 22:13:55.955439', 'step': 2751, 'epoch': 3} +{'type': 'best_step', 'content': 2679, 'timestamp': '2025-09-30 22:13:55.957537', 'step': 2751, 'epoch': 3} +{'type': 'total_pplx_flops', 'content': 5062218940038400, 'timestamp': '2025-09-30 22:13:55.959569', 'step': 2751, 'epoch': 3} +{'type': 'total_train_flops', 'content': 7174123736893632.0, 'timestamp': '2025-09-30 22:13:55.961809', 'step': 2751, 'epoch': 3}