Twinkles01 committed on Mar 30

Commit

2c4a1f0

verified ·

1 Parent(s): cbc5729

Upload Main_200011

Browse files

Files changed (19) hide show

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_005000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_010000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_015000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_020000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_025000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_030000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_035000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_040000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_045000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_050000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_055000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_060000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_065000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_070000.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_best_val_loss.pt +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/cfg.txt +55 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/tensorboard/events.out.tfevents.1768408052.brev-5x9knwe1p.565000.0 +3 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/timemoe_base.py +125 -0
Main_200011/f1be674d3eb4ef701f64bdb43b46084c/training_log_20260114162720.log +139 -0

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_005000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bcae71886cabee4674af5f1a70b44ffe72f2ed318b1652b2221852cc4dc55fba
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_010000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13b7c862f4eb0e3f65e8910f96fad5d4a774682c77a9428b27d6610fea3283e5
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_015000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3db48d9d3e5369ac7d53630f30e552494d3b14ef9fc01f6084ddc9727bd906ad
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_020000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:496e51eb629c137074c057f4716c326e1676b398a04b4d9df480bb7723854082
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_025000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ddabbb650db8d2bba3dd338f456ccee6a7ed6afa9efca6dce21cb94d9b4037a5
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_030000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8aea6404b1bc9cbd690a36e579ef601bba0857b2ac6dd69b554f3c339897d716
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_035000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:52269a1f0209064e090246e1872ec32c7ef3a280bbf383aed8c431b632afbdf5
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_040000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ceae457cac2fd89be79ba6b7bc0621e485c4bdd7b1d891f95aa80305ec53ba4
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_045000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fbbb5442145ef3a1bfdc318afb95a9938d4ae8a3d650a62625045e2350ba030a
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_050000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a517aa5141dbe00ff2e317b53586c99a955889c605ae8067b9a1449abfc0740a
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_055000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80c6d4d7c5488c8b7f3382c2beb391389272cedb15bc25c55bb4e05d733acbf0
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_060000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2bf26dac74cdb2a99c45ba82e918365d504a13cd15ed7e3ebc99d923cd89ca00
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_065000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2033158e950f01b443554918a2606ba555f787a645c5e6dfeacc4868c102c2c1
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_070000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:878b0a7db8fe86f08d65cd503d4d29035baebdd7b436087983ad024d365d587f
+size 160253400

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_best_val_loss.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c13f6c2f496a71bb973dbd06628aa9a3070a4e0479d09215d2f7bd3f91c04ed
+size 160256411

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/cfg.txt ADDED Viewed

	@@ -0,0 +1,55 @@

+DESCRIPTION: TimeMoE Base
+DEVICE: gpu
+DEVICE_NUM: 3
+RUNNER: <class 'baselines.TimeMoE4.runner.runner.TimeMoERunner'>
+MODEL:
+  NAME: TimeMoE4
+  ARCH: <class 'baselines.TimeMoE4.arch.timemoe.TimeMoE4'>
+  PARAM:
+    model_id: baselines/TimeMoE/ckpt/TimeMoE-50M
+    from_pretrained: False
+    context_length: 4079
+    trust_remote_code: True
+  DTYPE: bfloat16
+METRICS:
+  FUNCS:
+TRAIN:
+  COMPILE_MODEL: True
+  NUM_ITERATIONS: 200011
+  CKPT_SAVE_DIR: checkpoints/TimeMoE4/Main_200011
+  CKPT_SAVE_STRATEGY: 5000
+  LOSS: fake_loss
+  OPTIM:
+    TYPE: AdamW
+    PARAM:
+      lr: 0.001
+      betas: (0.9, 0.95)
+      fused: True
+  LR_SCHEDULER:
+    TYPE: CosineWarmup
+    PARAM:
+      num_warmup_steps: 10000
+      num_training_steps: 200011
+  CLIP_GRAD_PARAM:
+    max_norm: 1.0
+  DATA:
+    BATCH_SIZE: 85
+    SHUFFLE: True
+    PIN_MEMORY: True
+    PREFETCH: True
+  GRAD_ACCUMULATION_STEPS: 1
+VAL:
+  INTERVAL: 5000
+  DATA:
+    BATCH_SIZE: 170
+EVAL:
+  USE_GPU: True
+DATASET:
+  NAME: Main
+  TYPE: <class 'baselines.TimeMoE4.data.mix_dataset_v2.MixedSourceDataset_v2'>
+  PARAM:
+    num_valid_samples: 1000
+INFERENCE:
+  GENERATION_PARAMS:
+    normalize: True
+MD5: f1be674d3eb4ef701f64bdb43b46084c

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/tensorboard/events.out.tfevents.1768408052.brev-5x9knwe1p.565000.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:627c6d28eca3708d77a5353580b489be31fe0622835016354ac7e3a8e0a31b76
+size 15018436

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/timemoe_base.py ADDED Viewed

	@@ -0,0 +1,125 @@

+# Sampling-probability variation
+import os
+import sys
+from easydict import EasyDict
+sys.path.append(os.path.abspath(__file__ + '/../../..'))
+from ..arch import TimeMoE4
+from ..data import MixedSourceDataset_v2
+from ..runner import TimeMoERunner
+from ..loss import fake_loss
+############################## Hot Parameters ##############################
+# Dataset & Metrics configuration
+# Model architecture and parameters
+pretrained = False  # Whether to use a pretrained model
+MODEL_ARCH = TimeMoE4
+MODEL_PARAM = {
+    'model_id': "baselines/TimeMoE/ckpt/TimeMoE-50M",
+    'from_pretrained': pretrained,
+    'context_length': 4079,
+    'trust_remote_code': True,
+}
+DATA_NAME = "Main"
+# N = 20_000_000
+# batch size = 16*8
+# 20_000_000 / 16 / 8 = 156250 iterations
+# 20_000_000 * 4096 / 16 / 8 / 4096 = 156_250
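+NUM_ITERATIONS = 200_011 # total training iterations; reference estimate 20_000_000 * 4096 / 16 / 4 / 4096 = 312_500 (NOTE(review): does not match the value used here)
+VAL_ITERATION_INTERVAL = 5_000 # run validation once every VAL_ITERATION_INTERVAL iterations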
+############################## General Configuration ##############################
+CFG = EasyDict()
+# General settings
+CFG.DESCRIPTION = 'TimeMoE Base'
+CFG.DEVICE = 'gpu'
+CFG.DEVICE_NUM = 3
+# Runner
+CFG.RUNNER = TimeMoERunner
+############################## Model Configuration ################################
+CFG.MODEL = EasyDict()
+CFG.MODEL.NAME = MODEL_ARCH.__name__
+CFG.MODEL.ARCH = MODEL_ARCH
+CFG.MODEL.PARAM = MODEL_PARAM
+CFG.MODEL.DTYPE= 'bfloat16'
+# CFG.MODEL.DTYPE= 'float32'
+############################## Metrics Configuration ##############################
+CFG.METRICS = EasyDict()
+# Metrics settings
+CFG.METRICS.FUNCS = EasyDict({})
+############################## Training Configuration ##############################
+CFG.TRAIN = EasyDict()
+CFG.TRAIN.COMPILE_MODEL = True
+CFG.TRAIN.NUM_ITERATIONS = NUM_ITERATIONS
+CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
+    'checkpoints',
+    MODEL_ARCH.__name__,
+    '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_ITERATIONS)])
+)
+CFG.TRAIN.CKPT_SAVE_STRATEGY = VAL_ITERATION_INTERVAL * 1 # save strategy: save a checkpoint every VAL_ITERATION_INTERVAL * 1 iterations
+CFG.TRAIN.LOSS = fake_loss
+# Optimizer settings
+CFG.TRAIN.OPTIM = EasyDict()
+CFG.TRAIN.OPTIM.TYPE = "AdamW"
+CFG.TRAIN.OPTIM.PARAM = {
+    "lr": 1e-3,
+    "betas": (0.9, 0.95),
+    "fused": True,
+    # "weight_decay": 1e-1,
+}
+# Learning rate scheduler settings
+CFG.TRAIN.LR_SCHEDULER = EasyDict()
+CFG.TRAIN.LR_SCHEDULER.TYPE = "CosineWarmup"
+CFG.TRAIN.LR_SCHEDULER.PARAM = {
+    'num_warmup_steps': int(NUM_ITERATIONS / 100 * 5), # warm up over the first 5% of the training iterations
+    'num_training_steps': NUM_ITERATIONS,
+}
+CFG.TRAIN.CLIP_GRAD_PARAM = {
+    'max_norm': 1.0
+}
+# Train data loader settings
+CFG.TRAIN.DATA = EasyDict()
+CFG.TRAIN.DATA.BATCH_SIZE = 85  # per-GPU batch size (effective batch = 85 * DEVICE_NUM * GRAD_ACCUMULATION_STEPS)
+CFG.TRAIN.DATA.SHUFFLE = True # NOTE(review): an earlier note here said this "has to be False" -- confirm the intended value
+CFG.TRAIN.DATA.PIN_MEMORY = True
+CFG.TRAIN.DATA.PREFETCH = True
+CFG.TRAIN.GRAD_ACCUMULATION_STEPS = 1
+# CFG.TRAIN.DATA.NUM_WORKERS = 4
+############################## Validation Configuration ##############################
+CFG.VAL = EasyDict()
+CFG.VAL.INTERVAL = VAL_ITERATION_INTERVAL
+CFG.VAL.DATA = EasyDict()
+CFG.VAL.DATA.BATCH_SIZE = 170  # validation batch size (2x the training batch size of 85)
+############################## Evaluation Configuration ##############################
+CFG.EVAL = EasyDict()
+# Evaluation parameters
+CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True
+############################## Dataset Configuration ##############################
+CFG.DATASET = EasyDict()
+# Dataset settings
+CFG.DATASET.NAME = DATA_NAME
+CFG.DATASET.TYPE = MixedSourceDataset_v2
+CFG.DATASET.PARAM = EasyDict({
+    'num_valid_samples': 1000
+})
+############################## Inference Configuration ##############################
+CFG.INFERENCE = EasyDict()
+CFG.INFERENCE.GENERATION_PARAMS = EasyDict({
+    'normalize': not pretrained
+})

Main_200011/f1be674d3eb4ef701f64bdb43b46084c/training_log_20260114162720.log ADDED Viewed

	@@ -0,0 +1,139 @@

+2026-01-14 16:27:20,094 - easytorch-training - INFO - Initializing training.
+2026-01-14 16:27:20,096 - easytorch-training - INFO - Set clip grad, param: {'max_norm': 1.0}
+2026-01-14 16:27:20,097 - easytorch-training - INFO - Building training data loader.
+2026-01-14 16:27:32,154 - easytorch-training - INFO - MixedSourceDataset initialized for 'train' mode.
+2026-01-14 16:27:32,154 - easytorch-training - INFO -   - real: 3201174 samples
+2026-01-14 16:27:32,154 - easytorch-training - INFO -   - synth: 2000000 samples
+2026-01-14 16:27:32,154 - easytorch-training - INFO - Train dataset length: 3201174
+2026-01-14 16:27:32,156 - easytorch-training - INFO - Set optim: AdamW (
+Parameter Group 0
+    amsgrad: False
+    betas: (0.9, 0.95)
+    capturable: False
+    differentiable: False
+    eps: 1e-08
+    foreach: None
+    fused: True
+    lr: 0.001
+    maximize: False
+    weight_decay: 0.01
+)
+2026-01-14 16:27:32,157 - easytorch-training - INFO - Set lr_scheduler: <basicts.runners.optim.lr_schedulers.CosineWarmup object at 0x70b44373bd50>
+2026-01-14 16:27:32,158 - easytorch-training - INFO - Initializing validation.
+2026-01-14 16:27:32,158 - easytorch-training - INFO - Building val data loader.
+2026-01-14 16:27:32,445 - easytorch-training - INFO - Worker 0 initialized for cauker_univariate.
+2026-01-14 16:27:59,545 - easytorch-training - INFO - MixedSourceDataset initialized for 'valid' mode.
+2026-01-14 16:27:59,545 - easytorch-training - INFO -   - real: 1000 samples
+2026-01-14 16:27:59,545 - easytorch-training - INFO - Valid dataset length: 1000
+2026-01-14 16:27:59,545 - easytorch-training - INFO - Number of parameters: 13342848
+2026-01-14 16:27:59,546 - easytorch-training - INFO - Training with 3 GPUs, batch size per GPUs: 85, grad_accumulation_steps: 1
+2026-01-14 16:27:59,546 - easytorch-training - INFO - Effective batch size: 255
+2026-01-14 17:31:14,322 - easytorch-training - INFO - Iteration 5000 / 200011
+2026-01-14 17:31:14,858 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.76 (s), train/lr: 2.50e-04, train/loss: 3.8542, train/grad_norm: 6.5245, train/amp_scale: 1.0000]
+2026-01-14 17:31:14,858 - easytorch-training - INFO - Start validation.
+2026-01-14 17:31:27,354 - easytorch-training - INFO - Result <val>: [val/time: 12.27 (s), val/loss: 3.6271]
+2026-01-14 17:31:27,468 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_best_val_loss.pt saved
+2026-01-14 17:31:27,579 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_005000.pt saved
+2026-01-14 17:31:27,580 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 10:46:49
+2026-01-14 18:36:15,727 - easytorch-training - INFO - Iteration 10000 / 200011
+2026-01-14 18:36:16,263 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.78 (s), train/lr: 7.50e-04, train/loss: 2.9735, train/grad_norm: 2.7797, train/amp_scale: 1.0000]
+2026-01-14 18:36:16,264 - easytorch-training - INFO - Start validation.
+2026-01-14 18:36:23,184 - easytorch-training - INFO - Result <val>: [val/time: 6.70 (s), val/loss: 3.6075]
+2026-01-14 18:36:23,310 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_best_val_loss.pt saved
+2026-01-14 18:36:23,422 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_010000.pt saved
+2026-01-14 18:36:23,423 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:16:05
+2026-01-14 19:40:42,061 - easytorch-training - INFO - Iteration 15000 / 200011
+2026-01-14 19:40:42,606 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.77 (s), train/lr: 9.99e-04, train/loss: 2.8267, train/grad_norm: 2.5325, train/amp_scale: 1.0000]
+2026-01-14 19:40:42,607 - easytorch-training - INFO - Start validation.
+2026-01-14 19:40:49,528 - easytorch-training - INFO - Result <val>: [val/time: 6.70 (s), val/loss: 3.5225]
+2026-01-14 19:40:49,649 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_best_val_loss.pt saved
+2026-01-14 19:40:49,760 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_015000.pt saved
+2026-01-14 19:40:49,761 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:19:17
+2026-01-14 20:44:59,894 - easytorch-training - INFO - Iteration 20000 / 200011
+2026-01-14 20:45:00,427 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.77 (s), train/lr: 9.96e-04, train/loss: 2.7619, train/grad_norm: 2.3554, train/amp_scale: 1.0000]
+2026-01-14 20:45:00,427 - easytorch-training - INFO - Start validation.
+2026-01-14 20:45:07,353 - easytorch-training - INFO - Result <val>: [val/time: 6.70 (s), val/loss: 3.5194]
+2026-01-14 20:45:07,475 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_best_val_loss.pt saved
+2026-01-14 20:45:07,585 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_020000.pt saved
+2026-01-14 20:45:07,587 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:19:28
+2026-01-14 21:49:25,050 - easytorch-training - INFO - Iteration 25000 / 200011
+2026-01-14 21:49:25,588 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.77 (s), train/lr: 9.89e-04, train/loss: 2.7355, train/grad_norm: 2.3592, train/amp_scale: 1.0000]
+2026-01-14 21:49:25,589 - easytorch-training - INFO - Start validation.
+2026-01-14 21:49:32,516 - easytorch-training - INFO - Result <val>: [val/time: 6.71 (s), val/loss: 3.5044]
+2026-01-14 21:49:32,640 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_best_val_loss.pt saved
+2026-01-14 21:49:32,752 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_025000.pt saved
+2026-01-14 21:49:32,753 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:20:33
+2026-01-14 22:53:34,883 - easytorch-training - INFO - Iteration 30000 / 200011
+2026-01-14 22:53:35,417 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.77 (s), train/lr: 9.79e-04, train/loss: 2.7129, train/grad_norm: 2.3732, train/amp_scale: 1.0000]
+2026-01-14 22:53:35,418 - easytorch-training - INFO - Start validation.
+2026-01-14 22:53:42,342 - easytorch-training - INFO - Result <val>: [val/time: 6.70 (s), val/loss: 3.4967]
+2026-01-14 22:53:42,467 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_best_val_loss.pt saved
+2026-01-14 22:53:42,577 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_030000.pt saved
+2026-01-14 22:53:42,578 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:19:34
+2026-01-14 23:57:28,343 - easytorch-training - INFO - Iteration 35000 / 200011
+2026-01-14 23:57:28,877 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.76 (s), train/lr: 9.66e-04, train/loss: 2.7032, train/grad_norm: 2.3791, train/amp_scale: 1.0000]
+2026-01-14 23:57:28,878 - easytorch-training - INFO - Start validation.
+2026-01-14 23:57:35,791 - easytorch-training - INFO - Result <val>: [val/time: 6.69 (s), val/loss: 3.4896]
+2026-01-14 23:57:35,914 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_best_val_loss.pt saved
+2026-01-14 23:57:36,027 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_035000.pt saved
+2026-01-14 23:57:36,027 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:17:19
+2026-01-15 01:00:31,669 - easytorch-training - INFO - Iteration 40000 / 200011
+2026-01-15 01:00:32,340 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.75 (s), train/lr: 9.49e-04, train/loss: 2.6969, train/grad_norm: 2.2865, train/amp_scale: 1.0000]
+2026-01-15 01:00:32,341 - easytorch-training - INFO - Start validation.
+2026-01-15 01:00:39,272 - easytorch-training - INFO - Result <val>: [val/time: 6.71 (s), val/loss: 3.4980]
+2026-01-15 01:00:39,401 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_040000.pt saved
+2026-01-15 01:00:39,402 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:11:27
+2026-01-15 02:04:31,136 - easytorch-training - INFO - Iteration 45000 / 200011
+2026-01-15 02:04:31,671 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.77 (s), train/lr: 9.29e-04, train/loss: 2.6883, train/grad_norm: 2.2660, train/amp_scale: 1.0000]
+2026-01-15 02:04:31,672 - easytorch-training - INFO - Start validation.
+2026-01-15 02:04:38,600 - easytorch-training - INFO - Result <val>: [val/time: 6.71 (s), val/loss: 3.4953]
+2026-01-15 02:04:38,716 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_045000.pt saved
+2026-01-15 02:04:38,717 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:11:02
+2026-01-15 03:08:32,841 - easytorch-training - INFO - Iteration 50000 / 200011
+2026-01-15 03:08:33,374 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.77 (s), train/lr: 9.07e-04, train/loss: 2.6821, train/grad_norm: 2.2420, train/amp_scale: 1.0000]
+2026-01-15 03:08:33,375 - easytorch-training - INFO - Start validation.
+2026-01-15 03:08:40,302 - easytorch-training - INFO - Result <val>: [val/time: 6.71 (s), val/loss: 3.4875]
+2026-01-15 03:08:40,437 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_best_val_loss.pt saved
+2026-01-15 03:08:40,559 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_050000.pt saved
+2026-01-15 03:08:40,561 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:10:52
+2026-01-15 04:12:37,353 - easytorch-training - INFO - Iteration 55000 / 200011
+2026-01-15 04:12:37,890 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.77 (s), train/lr: 8.81e-04, train/loss: 2.6890, train/grad_norm: 2.2385, train/amp_scale: 1.0000]
+2026-01-15 04:12:37,891 - easytorch-training - INFO - Start validation.
+2026-01-15 04:12:44,817 - easytorch-training - INFO - Result <val>: [val/time: 6.70 (s), val/loss: 3.4844]
+2026-01-15 04:12:44,940 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_best_val_loss.pt saved
+2026-01-15 04:12:45,052 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_055000.pt saved
+2026-01-15 04:12:45,054 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:10:53
+2026-01-15 05:16:08,307 - easytorch-training - INFO - Iteration 60000 / 200011
+2026-01-15 05:16:08,842 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.76 (s), train/lr: 8.53e-04, train/loss: 2.6833, train/grad_norm: 2.1877, train/amp_scale: 1.0000]
+2026-01-15 05:16:08,842 - easytorch-training - INFO - Start validation.
+2026-01-15 05:16:15,748 - easytorch-training - INFO - Result <val>: [val/time: 6.69 (s), val/loss: 3.4787]
+2026-01-15 05:16:15,873 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_best_val_loss.pt saved
+2026-01-15 05:16:15,987 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_060000.pt saved
+2026-01-15 05:16:15,988 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:09:02
+2026-01-15 06:19:20,036 - easytorch-training - INFO - Iteration 65000 / 200011
+2026-01-15 06:19:20,567 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.76 (s), train/lr: 8.23e-04, train/loss: 3.3071, train/grad_norm: 296.8033, train/amp_scale: 1.0000]
+2026-01-15 06:19:20,567 - easytorch-training - INFO - Start validation.
+2026-01-15 06:19:27,441 - easytorch-training - INFO - Result <val>: [val/time: 6.65 (s), val/loss: 5.4669]
+2026-01-15 06:19:27,555 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_065000.pt saved
+2026-01-15 06:19:27,556 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:06:29
+2026-01-15 07:23:04,632 - easytorch-training - INFO - Iteration 70000 / 200011
+2026-01-15 07:23:05,164 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.76 (s), train/lr: 7.90e-04, train/loss: 5.5967, train/grad_norm: 6903292.9458, train/amp_scale: 1.0000]
+2026-01-15 07:23:05,165 - easytorch-training - INFO - Start validation.
+2026-01-15 07:23:12,055 - easytorch-training - INFO - Result <val>: [val/time: 6.67 (s), val/loss: 5.2645]
+2026-01-15 07:23:12,180 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200011/f1be674d3eb4ef701f64bdb43b46084c/TimeMoE4_070000.pt saved
+2026-01-15 07:23:12,180 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 11:05:52
+2026-01-15 07:55:41,123 - easytorch-training - ERROR - Traceback (most recent call last):
+  File "/home/nvidia/miniconda3/envs/zxx/lib/python3.11/site-packages/easytorch/launcher/launcher.py", line 31, in training_func
+    runner.train(cfg)
+  File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_iteration_runner.py", line 200, in train
+    self.train_iters(iteration=iteration, dataloader=self.train_data_loader)
+  File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 281, in train_iters
+    self.backward(loss, accumulating=accumulating)
+  File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 337, in backward
+    grad_norm = sum(
+                ^^^^
+  File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 338, in <genexpr>
+    param.grad.data.norm(2).item() ** 2 for param in self.model.parameters() if param.grad is not None
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+KeyboardInterrupt