Twinkles01 committed on Mar 30

Commit

2d9ea87

verified ·

1 Parent(s): 1bc2779

Upload Main_200023

Browse files

Files changed (19) hide show

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_005000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_010000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_015000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_020000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_025000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_030000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_035000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_040000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_045000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_050000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_055000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_060000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_065000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_070000.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/cfg.txt +55 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/tensorboard/events.out.tfevents.1769351836.brev-5x9knwe1p.567811.0 +3 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/timemoe_base.py +126 -0
Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/training_log_20260125143706.log +141 -0

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_005000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36b417727a837208b7afc1b6f1b7b28152b65f6b6998139fe838d61d414256ca
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_010000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:373a6861c5f0cf289a535d46e97016214171f1499224ec02c3c4f9d9d24dd7f2
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_015000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b0b65f98cc92c267baf4f5d11311138b979367e8e1b7393b2ac9b314f4e2a2a
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_020000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e1159d04877f1f28faa606e8acdce15d21f99de023ce82a02db41393eabaaf2
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_025000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac86ab63b6b5aecec121d6c5f6f8e99f5981554e6f6c9c5f53b9cea47c547ffe
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_030000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cffe798636b334c38552b8df80450a51811e015aefc68e11ae9bd0399657cddb
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_035000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0d81cce64ca6aa25bda439570c698b65072dfc7986280d6478cd811c37ecba58
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_040000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1338e511646da180f8aad17104fd157ad096b7fb1a9a0a35b1d0d7a95de3b2f2
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_045000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a903c9503352c4cbe5c23c19b68c12df86b80d7a1592d9b0a7e640f518035271
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_050000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:828240d1bb6627f59b9a40b5fbf6c434fb7555759b1cf59753e53cbb621c7b91
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_055000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a956e899318a451540a7ccfeb0127c9beec5ad8d96969f0947f8cd2fb23a60ba
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_060000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9eadbfd3785883dbeac262d05596b09aacf53c3d960462d1176af2a2062b839
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_065000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:69d73ca48e7e7a60a7c96b67a774bf0e10ce499d3dfa38cc3c85b53f82ad1def
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_070000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:14245b70ae224284463ed9844a31f79eacdfd984a8314c0ecf46d6263064ba9b
+size 279689720

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3290700ae0dce2d946e28c1c0688c86481064ed8b2097c5581207eb87b1e2c09
+size 279695083

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/cfg.txt ADDED Viewed

	@@ -0,0 +1,55 @@

+DESCRIPTION: TimeMoE Base
+DEVICE: gpu
+DEVICE_NUM: 3
+RUNNER: <class 'baselines.TimeMoE4.runner.runner.TimeMoERunner'>
+MODEL:
+  NAME: TimeMoE4
+  ARCH: <class 'baselines.TimeMoE4.arch.timemoe.TimeMoE4'>
+  PARAM:
+    model_id: baselines/TimeMoE/ckpt/TimeMoE-50M
+    from_pretrained: False
+    context_length: 4079
+    trust_remote_code: True
+  DTYPE: bfloat16
+METRICS:
+  FUNCS:
+TRAIN:
+  COMPILE_MODEL: True
+  NUM_ITERATIONS: 200023
+  CKPT_SAVE_DIR: checkpoints/TimeMoE4/Main_200023
+  CKPT_SAVE_STRATEGY: 5000
+  LOSS: fake_loss
+  OPTIM:
+    TYPE: AdamW
+    PARAM:
+      lr: 0.001
+      betas: (0.9, 0.95)
+      fused: True
+  LR_SCHEDULER:
+    TYPE: CosineWarmup
+    PARAM:
+      num_warmup_steps: 10000
+      num_training_steps: 200023
+  CLIP_GRAD_PARAM:
+    max_norm: 1.0
+  DATA:
+    BATCH_SIZE: 85
+    SHUFFLE: True
+    PIN_MEMORY: True
+    PREFETCH: True
+  GRAD_ACCUMULATION_STEPS: 1
+VAL:
+  INTERVAL: 5000
+  DATA:
+    BATCH_SIZE: 170
+EVAL:
+  USE_GPU: True
+DATASET:
+  NAME: Main
+  TYPE: <class 'baselines.TimeMoE4.data.mix_dataset_v2.MixedSourceDataset_v2'>
+  PARAM:
+    num_valid_samples: 1000
+INFERENCE:
+  GENERATION_PARAMS:
+    normalize: True
+MD5: 45c1cfa01e2cd6780a55aeb5959cfff0

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/tensorboard/events.out.tfevents.1769351836.brev-5x9knwe1p.567811.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8a996f6fa9080e095de97d2b1218f66305be9c83fd8a82a0d1eeaaa26c72d18
+size 14813348

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/timemoe_base.py ADDED Viewed

	@@ -0,0 +1,126 @@

+# Sampling probability change (original author's note). This script builds the EasyDict `CFG` describing a TimeMoE4 training run.
+import os
+import sys
+from easydict import EasyDict
+sys.path.append(os.path.abspath(__file__ + '/../../..'))  # add the directory two levels above this file's folder to sys.path
+from ..arch import TimeMoE4
+from ..data import MixedSourceDataset_v2
+from ..runner import TimeMoERunner
+from ..loss import fake_loss
+############################## Hot Parameters ##############################
+# Dataset & Metrics configuration
+# Model architecture and parameters
+pretrained = False  # Whether to use a pretrained model; also drives 'from_pretrained' and the inference 'normalize' flag below
+MODEL_ARCH = TimeMoE4
+MODEL_PARAM = {
+    'model_id': "baselines/TimeMoE/ckpt/TimeMoE-50M",
+    'from_pretrained': pretrained,
+    'context_length': 4079,
+    'trust_remote_code': True,
+}
+DATA_NAME = "Main"
+# N = 20_000_000
+# batch size = 16*8
+# 20_000_000 / 16 / 8 = 156250 iterations
+# 20_000_000 * 4096 / 16 / 8 / 4096 = 156_250
+NUM_ITERATIONS = 200_023 # Total number of iterations. NOTE(review): the arithmetic recorded here (20_000_000 * 4096 / 16 / 4 / 4096 = 312,500) does not match this value - confirm how 200_023 was chosen
+VAL_ITERATION_INTERVAL = 5_000 # Run validation once every VAL_ITERATION_INTERVAL iterations
+############################## General Configuration ##############################
+CFG = EasyDict()
+# General settings
+CFG.DESCRIPTION = 'TimeMoE Base'
+CFG.DEVICE = 'gpu'
+CFG.DEVICE_NUM = 3
+# Runner
+CFG.RUNNER = TimeMoERunner
+############################## Model Configuration ################################
+CFG.MODEL = EasyDict()
+CFG.MODEL.NAME = MODEL_ARCH.__name__
+CFG.MODEL.ARCH = MODEL_ARCH
+CFG.MODEL.PARAM = MODEL_PARAM
+CFG.MODEL.DTYPE= 'bfloat16'
+# CFG.MODEL.DTYPE= 'float32'
+############################## Metrics Configuration ##############################
+CFG.METRICS = EasyDict()
+# Metrics settings (no metric functions are registered: FUNCS is an empty EasyDict)
+CFG.METRICS.FUNCS = EasyDict({})
+############################## Training Configuration ##############################
+CFG.TRAIN = EasyDict()
+CFG.TRAIN.COMPILE_MODEL = True
+CFG.TRAIN.NUM_ITERATIONS = NUM_ITERATIONS
+CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
+    'checkpoints',
+    MODEL_ARCH.__name__,
+    '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_ITERATIONS)])
+)  # i.e. checkpoints/<model class name>/<DATA_NAME>_<NUM_ITERATIONS>
+CFG.TRAIN.CKPT_SAVE_STRATEGY = VAL_ITERATION_INTERVAL * 1 # Checkpoint save strategy: with multiplier 1 this equals VAL_ITERATION_INTERVAL. NOTE(review): the earlier comment said "* 5", which does not match the code
+CFG.TRAIN.LOSS = fake_loss
+# Optimizer settings
+CFG.TRAIN.OPTIM = EasyDict()
+CFG.TRAIN.OPTIM.TYPE = "AdamW"
+CFG.TRAIN.OPTIM.PARAM = {
+    "lr": 1e-3,
+    "betas": (0.9, 0.95),
+    # "betas": (0.9, 0.98),
+    "fused": True,
+    # "weight_decay": 1e-1,  (disabled: no weight_decay is passed in this dict)
+}
+# Learning rate scheduler settings
+CFG.TRAIN.LR_SCHEDULER = EasyDict()
+CFG.TRAIN.LR_SCHEDULER.TYPE = "CosineWarmup"
+CFG.TRAIN.LR_SCHEDULER.PARAM = {
+    'num_warmup_steps': 10_000, # 10k
+    'num_training_steps': NUM_ITERATIONS,
+}
+CFG.TRAIN.CLIP_GRAD_PARAM = {
+    'max_norm': 1.0
+}
+# Train data loader settings
+CFG.TRAIN.DATA = EasyDict()
+CFG.TRAIN.DATA.BATCH_SIZE = 85  # NOTE(review): the earlier note read "16 / 4", which does not match 85 - confirm the intended batch size
+CFG.TRAIN.DATA.SHUFFLE = True # NOTE(review): the earlier note said "has to be False", which contradicts this value - confirm
+CFG.TRAIN.DATA.PIN_MEMORY = True
+CFG.TRAIN.DATA.PREFETCH = True
+CFG.TRAIN.GRAD_ACCUMULATION_STEPS = 1
+# CFG.TRAIN.DATA.NUM_WORKERS = 4
+############################## Validation Configuration ##############################
+CFG.VAL = EasyDict()
+CFG.VAL.INTERVAL = VAL_ITERATION_INTERVAL
+CFG.VAL.DATA = EasyDict()
+CFG.VAL.DATA.BATCH_SIZE = 170  # NOTE(review): the earlier note read "32 / 8", which does not match 170 - confirm the intended batch size
+############################## Evaluation Configuration ##############################
+CFG.EVAL = EasyDict()
+# Evaluation parameters
+CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True
+############################## Dataset Configuration ##############################
+CFG.DATASET = EasyDict()
+# Dataset settings
+CFG.DATASET.NAME = DATA_NAME
+CFG.DATASET.TYPE = MixedSourceDataset_v2
+CFG.DATASET.PARAM = EasyDict({
+    'num_valid_samples': 1000
+})
+############################## Inference Configuration ##############################
+CFG.INFERENCE = EasyDict()
+CFG.INFERENCE.GENERATION_PARAMS = EasyDict({
+    'normalize': not pretrained  # True here because pretrained is False
+})

Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/training_log_20260125143706.log ADDED Viewed

	@@ -0,0 +1,141 @@

+2026-01-25 14:37:06,651 - easytorch-training - INFO - Initializing training.
+2026-01-25 14:37:06,651 - easytorch-training - INFO - Set clip grad, param: {'max_norm': 1.0}
+2026-01-25 14:37:06,652 - easytorch-training - INFO - Building training data loader.
+2026-01-25 14:37:16,395 - easytorch-training - INFO - MixedSourceDataset initialized for 'train' mode.
+2026-01-25 14:37:16,395 - easytorch-training - INFO -   - real: 3201174 samples
+2026-01-25 14:37:16,396 - easytorch-training - INFO -   - synth: 2000000 samples
+2026-01-25 14:37:16,396 - easytorch-training - INFO - Train dataset length: 3201174
+2026-01-25 14:37:16,398 - easytorch-training - INFO - Set optim: AdamW (
+Parameter Group 0
+    amsgrad: False
+    betas: (0.9, 0.95)
+    capturable: False
+    differentiable: False
+    eps: 1e-08
+    foreach: None
+    fused: True
+    lr: 0.001
+    maximize: False
+    weight_decay: 0.01
+)
+2026-01-25 14:37:16,398 - easytorch-training - INFO - Set lr_scheduler: <basicts.runners.optim.lr_schedulers.CosineWarmup object at 0x7d96bc528dd0>
+2026-01-25 14:37:16,405 - easytorch-training - INFO - Initializing validation.
+2026-01-25 14:37:16,405 - easytorch-training - INFO - Building val data loader.
+2026-01-25 14:37:18,367 - easytorch-training - INFO - Worker 0 initialized for cauker_univariate.
+2026-01-25 14:37:42,611 - easytorch-training - INFO - MixedSourceDataset initialized for 'valid' mode.
+2026-01-25 14:37:42,611 - easytorch-training - INFO -   - real: 1000 samples
+2026-01-25 14:37:42,611 - easytorch-training - INFO - Valid dataset length: 1000
+2026-01-25 14:37:42,612 - easytorch-training - INFO - Number of parameters: 23286528
+2026-01-25 14:37:42,612 - easytorch-training - INFO - Training with 3 GPUs, batch size per GPUs: 85, grad_accumulation_steps: 1
+2026-01-25 14:37:42,612 - easytorch-training - INFO - Effective batch size: 255
+2026-01-25 16:10:58,500 - easytorch-training - INFO - Iteration 5000 / 200023
+2026-01-25 16:11:04,857 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.12 (s), train/lr: 2.50e-04, train/loss: 3.3771, train/grad_norm: 5.9030, train/amp_scale: 1.0000]
+2026-01-25 16:11:04,858 - easytorch-training - INFO - Start validation.
+2026-01-25 16:11:28,463 - easytorch-training - INFO - Result <val>: [val/time: 23.43 (s), val/loss: 3.2646]
+2026-01-25 16:11:28,680 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt saved
+2026-01-25 16:11:28,893 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_005000.pt saved
+2026-01-25 16:11:28,894 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:08:59
+2026-01-25 17:46:14,190 - easytorch-training - INFO - Iteration 10000 / 200023
+2026-01-25 17:46:15,073 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.14 (s), train/lr: 7.50e-04, train/loss: 2.5894, train/grad_norm: 2.9620, train/amp_scale: 1.0000]
+2026-01-25 17:46:15,073 - easytorch-training - INFO - Start validation.
+2026-01-25 17:46:27,668 - easytorch-training - INFO - Result <val>: [val/time: 12.42 (s), val/loss: 3.2085]
+2026-01-25 17:46:27,919 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt saved
+2026-01-25 17:46:28,134 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_010000.pt saved
+2026-01-25 17:46:28,135 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:33:19
+2026-01-25 19:20:57,100 - easytorch-training - INFO - Iteration 15000 / 200023
+2026-01-25 19:20:57,975 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.13 (s), train/lr: 9.99e-04, train/loss: 2.4816, train/grad_norm: 2.6214, train/amp_scale: 1.0000]
+2026-01-25 19:20:57,976 - easytorch-training - INFO - Start validation.
+2026-01-25 19:21:10,565 - easytorch-training - INFO - Result <val>: [val/time: 12.41 (s), val/loss: 3.1724]
+2026-01-25 19:21:10,794 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt saved
+2026-01-25 19:21:11,006 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_015000.pt saved
+2026-01-25 19:21:11,008 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:37:47
+2026-01-25 20:55:08,973 - easytorch-training - INFO - Iteration 20000 / 200023
+2026-01-25 20:55:09,849 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.13 (s), train/lr: 9.96e-04, train/loss: 2.4323, train/grad_norm: 2.5592, train/amp_scale: 1.0000]
+2026-01-25 20:55:09,850 - easytorch-training - INFO - Start validation.
+2026-01-25 20:55:22,426 - easytorch-training - INFO - Result <val>: [val/time: 12.40 (s), val/loss: 3.1459]
+2026-01-25 20:55:22,637 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt saved
+2026-01-25 20:55:22,829 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_020000.pt saved
+2026-01-25 20:55:22,830 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:34:50
+2026-01-25 22:29:53,401 - easytorch-training - INFO - Iteration 25000 / 200023
+2026-01-25 22:29:54,310 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.13 (s), train/lr: 9.89e-04, train/loss: 2.4081, train/grad_norm: 2.5595, train/amp_scale: 1.0000]
+2026-01-25 22:29:54,311 - easytorch-training - INFO - Start validation.
+2026-01-25 22:30:06,865 - easytorch-training - INFO - Result <val>: [val/time: 12.38 (s), val/loss: 3.1363]
+2026-01-25 22:30:07,101 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt saved
+2026-01-25 22:30:07,321 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_025000.pt saved
+2026-01-25 22:30:07,323 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:37:26
+2026-01-26 00:04:20,948 - easytorch-training - INFO - Iteration 30000 / 200023
+2026-01-26 00:04:21,827 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.13 (s), train/lr: 9.79e-04, train/loss: 2.3853, train/grad_norm: 2.4820, train/amp_scale: 1.0000]
+2026-01-26 00:04:21,828 - easytorch-training - INFO - Start validation.
+2026-01-26 00:04:34,405 - easytorch-training - INFO - Result <val>: [val/time: 12.40 (s), val/loss: 3.1428]
+2026-01-26 00:04:34,606 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_030000.pt saved
+2026-01-26 00:04:34,607 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:37:15
+2026-01-26 01:38:13,439 - easytorch-training - INFO - Iteration 35000 / 200023
+2026-01-26 01:38:14,321 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.12 (s), train/lr: 9.66e-04, train/loss: 2.3841, train/grad_norm: 2.5523, train/amp_scale: 1.0000]
+2026-01-26 01:38:14,323 - easytorch-training - INFO - Start validation.
+2026-01-26 01:38:26,919 - easytorch-training - INFO - Result <val>: [val/time: 12.42 (s), val/loss: 3.1327]
+2026-01-26 01:38:27,150 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt saved
+2026-01-26 01:38:27,362 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_035000.pt saved
+2026-01-26 01:38:27,364 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:33:50
+2026-01-26 03:12:54,459 - easytorch-training - INFO - Iteration 40000 / 200023
+2026-01-26 03:12:55,337 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.13 (s), train/lr: 9.49e-04, train/loss: 2.3749, train/grad_norm: 2.5387, train/amp_scale: 1.0000]
+2026-01-26 03:12:55,338 - easytorch-training - INFO - Start validation.
+2026-01-26 03:13:07,942 - easytorch-training - INFO - Result <val>: [val/time: 12.43 (s), val/loss: 3.1183]
+2026-01-26 03:13:08,172 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt saved
+2026-01-26 03:13:08,383 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_040000.pt saved
+2026-01-26 03:13:08,384 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:35:17
+2026-01-26 04:47:25,927 - easytorch-training - INFO - Iteration 45000 / 200023
+2026-01-26 04:47:26,803 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.13 (s), train/lr: 9.29e-04, train/loss: 2.3690, train/grad_norm: 2.5732, train/amp_scale: 1.0000]
+2026-01-26 04:47:26,804 - easytorch-training - INFO - Start validation.
+2026-01-26 04:47:39,402 - easytorch-training - INFO - Result <val>: [val/time: 12.42 (s), val/loss: 3.1171]
+2026-01-26 04:47:39,629 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt saved
+2026-01-26 04:47:39,841 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_045000.pt saved
+2026-01-26 04:47:39,843 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:35:43
+2026-01-26 06:21:30,776 - easytorch-training - INFO - Iteration 50000 / 200023
+2026-01-26 06:21:31,649 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.12 (s), train/lr: 9.07e-04, train/loss: 2.3566, train/grad_norm: 2.5341, train/amp_scale: 1.0000]
+2026-01-26 06:21:31,650 - easytorch-training - INFO - Start validation.
+2026-01-26 06:21:44,468 - easytorch-training - INFO - Result <val>: [val/time: 12.64 (s), val/loss: 3.1159]
+2026-01-26 06:21:44,694 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt saved
+2026-01-26 06:21:44,906 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_050000.pt saved
+2026-01-26 06:21:44,907 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:34:17
+2026-01-26 07:55:30,272 - easytorch-training - INFO - Iteration 55000 / 200023
+2026-01-26 07:55:31,144 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.12 (s), train/lr: 8.81e-04, train/loss: 2.3585, train/grad_norm: 2.5608, train/amp_scale: 1.0000]
+2026-01-26 07:55:31,144 - easytorch-training - INFO - Start validation.
+2026-01-26 07:55:43,720 - easytorch-training - INFO - Result <val>: [val/time: 12.40 (s), val/loss: 3.1132]
+2026-01-26 07:55:43,950 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt saved
+2026-01-26 07:55:44,164 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_055000.pt saved
+2026-01-26 07:55:44,165 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:32:47
+2026-01-26 09:28:36,432 - easytorch-training - INFO - Iteration 60000 / 200023
+2026-01-26 09:28:37,312 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.11 (s), train/lr: 8.53e-04, train/loss: 2.3459, train/grad_norm: 2.4894, train/amp_scale: 1.0000]
+2026-01-26 09:28:37,314 - easytorch-training - INFO - Start validation.
+2026-01-26 09:28:49,922 - easytorch-training - INFO - Result <val>: [val/time: 12.43 (s), val/loss: 3.1111]
+2026-01-26 09:28:50,153 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt saved
+2026-01-26 09:28:50,364 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_060000.pt saved
+2026-01-26 09:28:50,365 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:28:34
+2026-01-26 11:01:52,176 - easytorch-training - INFO - Iteration 65000 / 200023
+2026-01-26 11:01:53,062 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.12 (s), train/lr: 8.23e-04, train/loss: 2.3397, train/grad_norm: 2.5233, train/amp_scale: 1.0000]
+2026-01-26 11:01:53,064 - easytorch-training - INFO - Start validation.
+2026-01-26 11:02:08,477 - easytorch-training - INFO - Result <val>: [val/time: 15.24 (s), val/loss: 3.1028]
+2026-01-26 11:02:08,695 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_best_val_loss.pt saved
+2026-01-26 11:02:08,895 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_065000.pt saved
+2026-01-26 11:02:08,897 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 05:25:38
+2026-01-26 12:50:04,231 - easytorch-training - INFO - Iteration 70000 / 200023
+2026-01-26 12:50:05,102 - easytorch-training - INFO - Result <train>: [train/iter_time: 1.29 (s), train/lr: 7.91e-04, train/loss: 2.3634, train/grad_norm: 2.5798, train/amp_scale: 1.0000]
+2026-01-26 12:50:05,103 - easytorch-training - INFO - Start validation.
+2026-01-26 12:50:17,672 - easytorch-training - INFO - Result <val>: [val/time: 12.39 (s), val/loss: 3.2073]
+2026-01-26 12:50:17,889 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200023/45c1cfa01e2cd6780a55aeb5959cfff0/TimeMoE4_070000.pt saved
+2026-01-26 12:50:17,889 - easytorch-training - INFO - The estimated training finish time is 2026-01-28 06:05:32
+2026-01-26 13:31:38,832 - easytorch-training - ERROR - Traceback (most recent call last):
+  File "/home/nvidia/miniconda3/envs/zxx/lib/python3.11/site-packages/easytorch/launcher/launcher.py", line 31, in training_func
+    runner.train(cfg)
+  File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_iteration_runner.py", line 200, in train
+    self.train_iters(iteration=iteration, dataloader=self.train_data_loader)
+  File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 281, in train_iters
+    self.backward(loss, accumulating=accumulating)
+  File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 337, in backward
+    grad_norm = sum(
+                ^^^^
+  File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 338, in <genexpr>
+    param.grad.data.norm(2).item() ** 2 for param in self.model.parameters() if param.grad is not None
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+KeyboardInterrupt