Twinkles01 committed on
Commit
9ec5f30
·
verified ·
1 Parent(s): 3d12987

Upload Main_200014

Browse files
Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_005000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4ed177201c2e3c434e8e2fa72de8652904410576e53ec295ae68d9207bde09b
3
+ size 151685888
Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_010000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1511b7cf6e742d9a9746df22aa738efcf3641245a45ec6b0bcbdb18e0e892cee
3
+ size 151685888
Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_015000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79094b4ed8e8a15eafdfab9310dc2179d050c951cfcf8c039418879536dd02c9
3
+ size 151685888
Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_020000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:934f3de40b880d2a5c3f39e4653c1f19c66621de722b9937c4c7ea75a62814e1
3
+ size 151685888
Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_025000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3b1571ca796987c8ddd72371478a2c7263c50f935e3c41adee8a7ae66887db8
3
+ size 151685888
Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_030000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d997dad6cbd7e56b5a3b856e76d71ade438c342d4701586bbef5254993dfb70c
3
+ size 151685888
Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_035000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa4e413639241176033a719569c4efaa7a4fd8f4c00b0c96fa4acf2c8ee806db
3
+ size 151685888
Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_040000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c40c4a3cdcfe53fb2dfb754a5499fae7b58281fae491d59c0460e4280e0c637
3
+ size 151685888
Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_045000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17bc2f9f1d5550a48a45bac7ec6a3eb3c91308671ebbb4205029ae7dbb72ddae
3
+ size 151685888
Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_050000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6b83b6261b8327b56d9d692ea535ac90d0e8f988a68a8c0c44f7ffce1108ff1
3
+ size 151685888
Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_best_val_loss.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70f12059fa4c52931b265aae8b72a1aa1b93e17c3c16dcbddd357fc2ac88c11d
3
+ size 151688871
Main_200014/50a47da570c7010fe8e0a1c34a78d536/cfg.txt ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DESCRIPTION: TimeMoE Base
2
+ DEVICE: gpu
3
+ DEVICE_NUM: 3
4
+ RUNNER: <class 'baselines.TimeMoE4.runner.runner.TimeMoERunner'>
5
+ MODEL:
6
+ NAME: TimeMoE4
7
+ ARCH: <class 'baselines.TimeMoE4.arch.timemoe.TimeMoE4'>
8
+ PARAM:
9
+ model_id: baselines/TimeMoE/ckpt/TimeMoE-50M
10
+ from_pretrained: False
11
+ context_length: 4079
12
+ trust_remote_code: True
13
+ DTYPE: bfloat16
14
+ METRICS:
15
+ FUNCS:
16
+ TRAIN:
17
+ COMPILE_MODEL: True
18
+ NUM_ITERATIONS: 200014
19
+ CKPT_SAVE_DIR: checkpoints/TimeMoE4/Main_200014
20
+ CKPT_SAVE_STRATEGY: 5000
21
+ LOSS: fake_loss
22
+ OPTIM:
23
+ TYPE: AdamW
24
+ PARAM:
25
+ lr: 0.001
26
+ betas: (0.9, 0.95)
27
+ fused: True
28
+ LR_SCHEDULER:
29
+ TYPE: CosineWarmup
30
+ PARAM:
31
+ num_warmup_steps: 10000
32
+ num_training_steps: 200014
33
+ CLIP_GRAD_PARAM:
34
+ max_norm: 1.0
35
+ DATA:
36
+ BATCH_SIZE: 85
37
+ SHUFFLE: True
38
+ PIN_MEMORY: True
39
+ PREFETCH: True
40
+ GRAD_ACCUMULATION_STEPS: 1
41
+ VAL:
42
+ INTERVAL: 5000
43
+ DATA:
44
+ BATCH_SIZE: 170
45
+ EVAL:
46
+ USE_GPU: True
47
+ DATASET:
48
+ NAME: Main
49
+ TYPE: <class 'baselines.TimeMoE4.data.mix_dataset_v2.MixedSourceDataset_v2'>
50
+ PARAM:
51
+ num_valid_samples: 1000
52
+ INFERENCE:
53
+ GENERATION_PARAMS:
54
+ normalize: True
55
+ MD5: 50a47da570c7010fe8e0a1c34a78d536
Main_200014/50a47da570c7010fe8e0a1c34a78d536/tensorboard/events.out.tfevents.1768807194.brev-5x9knwe1p.3708293.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40697a2fea2646111cad3d0310712e4422d6e6d9f69d51c3c65912b2a7002091
3
+ size 11234316
Main_200014/50a47da570c7010fe8e0a1c34a78d536/timemoe_base.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sampling probability changes
2
+
3
+ import os
4
+ import sys
5
+ from easydict import EasyDict
6
+ sys.path.append(os.path.abspath(__file__ + '/../../..'))
7
+
8
+ from ..arch import TimeMoE4
9
+ from ..data import MixedSourceDataset_v2
10
+ from ..runner import TimeMoERunner
11
+ from ..loss import fake_loss
12
+
13
+
14
+ ############################## Hot Parameters ##############################
15
+ # Dataset & Metrics configuration
16
+ # Model architecture and parameters
17
+
18
+ pretrained = False # Whether to use a pretrained model
19
+
20
+ MODEL_ARCH = TimeMoE4
21
+
22
+ MODEL_PARAM = {
23
+ 'model_id': "baselines/TimeMoE/ckpt/TimeMoE-50M",
24
+ 'from_pretrained': pretrained,
25
+ 'context_length': 4079,
26
+ 'trust_remote_code': True,
27
+ }
28
+ DATA_NAME = "Main"
29
+
30
+ # N = 20_000_000
31
+ # batch size = 16*8
32
+ # 20_000_000 / 16 / 8 = 156250 iterations
33
+ # 20_000_000 * 4096 / 16 / 8 / 4096 = 156_250
34
+
35
+ NUM_ITERATIONS = 200_014 # Total number of training iterations. NOTE(review): the earlier estimate 20_000_000 * 4096 / 16 / 4 / 4096 = 312,500 does not match this value — confirm which is intended
36
+ VAL_ITERATION_INTERVAL = 5_000 # Run validation once every VAL_ITERATION_INTERVAL iterations
37
+
38
+ ############################## General Configuration ##############################
39
+ CFG = EasyDict()
40
+ # General settings
41
+ CFG.DESCRIPTION = 'TimeMoE Base'
42
+ CFG.DEVICE = 'gpu'
43
+ CFG.DEVICE_NUM = 3
44
+ # Runner
45
+ CFG.RUNNER = TimeMoERunner
46
+
47
+ ############################## Model Configuration ################################
48
+ CFG.MODEL = EasyDict()
49
+ CFG.MODEL.NAME = MODEL_ARCH.__name__
50
+ CFG.MODEL.ARCH = MODEL_ARCH
51
+ CFG.MODEL.PARAM = MODEL_PARAM
52
+ CFG.MODEL.DTYPE= 'bfloat16'
53
+ # CFG.MODEL.DTYPE= 'float32'
54
+
55
+ ############################## Metrics Configuration ##############################
56
+ CFG.METRICS = EasyDict()
57
+ # Metrics settings
58
+ CFG.METRICS.FUNCS = EasyDict({})
59
+
60
+ ############################## Training Configuration ##############################
61
+ CFG.TRAIN = EasyDict()
62
+ CFG.TRAIN.COMPILE_MODEL = True
63
+ CFG.TRAIN.NUM_ITERATIONS = NUM_ITERATIONS
64
+ CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
65
+ 'checkpoints',
66
+ MODEL_ARCH.__name__,
67
+ '_'.join([DATA_NAME, str(CFG.TRAIN.NUM_ITERATIONS)])
68
+ )
69
+ CFG.TRAIN.CKPT_SAVE_STRATEGY = VAL_ITERATION_INTERVAL * 1 # Checkpoint save strategy: save the model every VAL_ITERATION_INTERVAL * 1 iterations (i.e. at every validation step)
70
+ CFG.TRAIN.LOSS = fake_loss
71
+ # Optimizer settings
72
+ CFG.TRAIN.OPTIM = EasyDict()
73
+ CFG.TRAIN.OPTIM.TYPE = "AdamW"
74
+ CFG.TRAIN.OPTIM.PARAM = {
75
+ "lr": 1e-3,
76
+ "betas": (0.9, 0.95),
77
+ # "betas": (0.9, 0.98),
78
+ "fused": True,
79
+ # "weight_decay": 1e-1,
80
+ }
81
+ # Learning rate scheduler settings
82
+ CFG.TRAIN.LR_SCHEDULER = EasyDict()
83
+ CFG.TRAIN.LR_SCHEDULER.TYPE = "CosineWarmup"
84
+ CFG.TRAIN.LR_SCHEDULER.PARAM = {
85
+ 'num_warmup_steps': 10_000, # 10k
86
+ 'num_training_steps': NUM_ITERATIONS,
87
+ }
88
+ CFG.TRAIN.CLIP_GRAD_PARAM = {
89
+ 'max_norm': 1.0
90
+ }
91
+ # Train data loader settings
92
+ CFG.TRAIN.DATA = EasyDict()
93
+ CFG.TRAIN.DATA.BATCH_SIZE = 85 # Per-GPU batch size; with DEVICE_NUM = 3 and GRAD_ACCUMULATION_STEPS = 1 the effective batch size is 255
94
+ CFG.TRAIN.DATA.SHUFFLE = True # NOTE(review): a previous comment here said this "has to be False", yet the value is True — confirm which is intended
95
+ CFG.TRAIN.DATA.PIN_MEMORY = True
96
+ CFG.TRAIN.DATA.PREFETCH = True
97
+ CFG.TRAIN.GRAD_ACCUMULATION_STEPS = 1
98
+ # CFG.TRAIN.DATA.NUM_WORKERS = 4
99
+
100
+ ############################## Validation Configuration ##############################
101
+ CFG.VAL = EasyDict()
102
+ CFG.VAL.INTERVAL = VAL_ITERATION_INTERVAL
103
+ CFG.VAL.DATA = EasyDict()
104
+ CFG.VAL.DATA.BATCH_SIZE = 170 # Validation batch size (twice the training batch size of 85)
105
+
106
+ ############################## Evaluation Configuration ##############################
107
+
108
+ CFG.EVAL = EasyDict()
109
+ # Evaluation parameters
110
+ CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True
111
+
112
+ ############################## Dataset Configuration ##############################
113
+ CFG.DATASET = EasyDict()
114
+ # Dataset settings
115
+ CFG.DATASET.NAME = DATA_NAME
116
+ CFG.DATASET.TYPE = MixedSourceDataset_v2
117
+ CFG.DATASET.PARAM = EasyDict({
118
+ 'num_valid_samples': 1000
119
+ })
120
+
121
+ ############################## Inference Configuration ##############################
122
+ CFG.INFERENCE = EasyDict()
123
+ CFG.INFERENCE.GENERATION_PARAMS = EasyDict({
124
+ 'normalize': not pretrained
125
+ })
126
+
Main_200014/50a47da570c7010fe8e0a1c34a78d536/training_log_20260119071944.log ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-19 07:19:44,428 - easytorch-training - INFO - Initializing training.
2
+ 2026-01-19 07:19:44,429 - easytorch-training - INFO - Set clip grad, param: {'max_norm': 1.0}
3
+ 2026-01-19 07:19:44,429 - easytorch-training - INFO - Building training data loader.
4
+ 2026-01-19 07:19:54,179 - easytorch-training - INFO - MixedSourceDataset initialized for 'train' mode.
5
+ 2026-01-19 07:19:54,179 - easytorch-training - INFO - - real: 3201174 samples
6
+ 2026-01-19 07:19:54,179 - easytorch-training - INFO - - synth: 2000000 samples
7
+ 2026-01-19 07:19:54,180 - easytorch-training - INFO - Train dataset length: 3201174
8
+ 2026-01-19 07:19:54,182 - easytorch-training - INFO - Set optim: AdamW (
9
+ Parameter Group 0
10
+ amsgrad: False
11
+ betas: (0.9, 0.95)
12
+ capturable: False
13
+ differentiable: False
14
+ eps: 1e-08
15
+ foreach: None
16
+ fused: True
17
+ lr: 0.001
18
+ maximize: False
19
+ weight_decay: 0.01
20
+ )
21
+ 2026-01-19 07:19:54,182 - easytorch-training - INFO - Set lr_scheduler: <basicts.runners.optim.lr_schedulers.CosineWarmup object at 0x753064a97dd0>
22
+ 2026-01-19 07:19:54,184 - easytorch-training - INFO - Initializing validation.
23
+ 2026-01-19 07:19:54,184 - easytorch-training - INFO - Building val data loader.
24
+ 2026-01-19 07:19:54,463 - easytorch-training - INFO - Worker 0 initialized for cauker_univariate.
25
+ 2026-01-19 07:20:20,862 - easytorch-training - INFO - MixedSourceDataset initialized for 'valid' mode.
26
+ 2026-01-19 07:20:20,862 - easytorch-training - INFO - - real: 1000 samples
27
+ 2026-01-19 07:20:20,862 - easytorch-training - INFO - Valid dataset length: 1000
28
+ 2026-01-19 07:20:20,863 - easytorch-training - INFO - Number of parameters: 12628992
29
+ 2026-01-19 07:20:20,863 - easytorch-training - INFO - Training with 3 GPUs, batch size per GPUs: 85, grad_accumulation_steps: 1
30
+ 2026-01-19 07:20:20,863 - easytorch-training - INFO - Effective batch size: 255
31
+ 2026-01-19 08:26:13,305 - easytorch-training - INFO - Iteration 5000 / 200014
32
+ 2026-01-19 08:26:13,725 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.79 (s), train/lr: 2.50e-04, train/loss: 3.9592, train/grad_norm: 6.1824, train/amp_scale: 1.0000]
33
+ 2026-01-19 08:26:13,725 - easytorch-training - INFO - Start validation.
34
+ 2026-01-19 08:26:27,598 - easytorch-training - INFO - Result <val>: [val/time: 13.70 (s), val/loss: 3.3753]
35
+ 2026-01-19 08:26:27,704 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_best_val_loss.pt saved
36
+ 2026-01-19 08:26:27,815 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_005000.pt saved
37
+ 2026-01-19 08:26:27,816 - easytorch-training - INFO - The estimated training finish time is 2026-01-21 03:25:10
38
+ 2026-01-19 09:20:49,426 - easytorch-training - INFO - Iteration 10000 / 200014
39
+ 2026-01-19 09:20:49,846 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.65 (s), train/lr: 7.50e-04, train/loss: 3.2398, train/grad_norm: 2.2217, train/amp_scale: 1.0000]
40
+ 2026-01-19 09:20:49,846 - easytorch-training - INFO - Start validation.
41
+ 2026-01-19 09:20:56,465 - easytorch-training - INFO - Result <val>: [val/time: 6.44 (s), val/loss: 3.3116]
42
+ 2026-01-19 09:20:56,596 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_best_val_loss.pt saved
43
+ 2026-01-19 09:20:56,713 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_010000.pt saved
44
+ 2026-01-19 09:20:56,714 - easytorch-training - INFO - The estimated training finish time is 2026-01-20 23:32:28
45
+ 2026-01-19 10:15:17,743 - easytorch-training - INFO - Iteration 15000 / 200014
46
+ 2026-01-19 10:15:18,161 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.65 (s), train/lr: 9.99e-04, train/loss: 3.1413, train/grad_norm: 1.9954, train/amp_scale: 1.0000]
47
+ 2026-01-19 10:15:18,161 - easytorch-training - INFO - Start validation.
48
+ 2026-01-19 10:15:24,759 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.2741]
49
+ 2026-01-19 10:15:24,887 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_best_val_loss.pt saved
50
+ 2026-01-19 10:15:25,004 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_015000.pt saved
51
+ 2026-01-19 10:15:25,005 - easytorch-training - INFO - The estimated training finish time is 2026-01-20 22:14:45
52
+ 2026-01-19 11:08:50,142 - easytorch-training - INFO - Iteration 20000 / 200014
53
+ 2026-01-19 11:08:50,559 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.64 (s), train/lr: 9.96e-04, train/loss: 3.0921, train/grad_norm: 2.0023, train/amp_scale: 1.0000]
54
+ 2026-01-19 11:08:50,559 - easytorch-training - INFO - Start validation.
55
+ 2026-01-19 11:08:57,149 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.2675]
56
+ 2026-01-19 11:08:57,264 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_best_val_loss.pt saved
57
+ 2026-01-19 11:08:57,374 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_020000.pt saved
58
+ 2026-01-19 11:08:57,375 - easytorch-training - INFO - The estimated training finish time is 2026-01-20 21:26:35
59
+ 2026-01-19 12:01:38,196 - easytorch-training - INFO - Iteration 25000 / 200014
60
+ 2026-01-19 12:01:38,617 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.63 (s), train/lr: 9.89e-04, train/loss: 3.0706, train/grad_norm: 1.9924, train/amp_scale: 1.0000]
61
+ 2026-01-19 12:01:38,617 - easytorch-training - INFO - Start validation.
62
+ 2026-01-19 12:01:45,244 - easytorch-training - INFO - Result <val>: [val/time: 6.45 (s), val/loss: 3.2473]
63
+ 2026-01-19 12:01:45,377 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_best_val_loss.pt saved
64
+ 2026-01-19 12:01:45,497 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_025000.pt saved
65
+ 2026-01-19 12:01:45,497 - easytorch-training - INFO - The estimated training finish time is 2026-01-20 20:51:47
66
+ 2026-01-19 12:55:14,438 - easytorch-training - INFO - Iteration 30000 / 200014
67
+ 2026-01-19 12:55:14,860 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.64 (s), train/lr: 9.79e-04, train/loss: 3.0554, train/grad_norm: 2.0358, train/amp_scale: 1.0000]
68
+ 2026-01-19 12:55:14,861 - easytorch-training - INFO - Start validation.
69
+ 2026-01-19 12:55:21,487 - easytorch-training - INFO - Result <val>: [val/time: 6.45 (s), val/loss: 3.2401]
70
+ 2026-01-19 12:55:21,614 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_best_val_loss.pt saved
71
+ 2026-01-19 12:55:21,732 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_030000.pt saved
72
+ 2026-01-19 12:55:21,733 - easytorch-training - INFO - The estimated training finish time is 2026-01-20 20:33:56
73
+ 2026-01-19 13:49:00,480 - easytorch-training - INFO - Iteration 35000 / 200014
74
+ 2026-01-19 13:49:00,899 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.64 (s), train/lr: 9.66e-04, train/loss: 3.0499, train/grad_norm: 2.0126, train/amp_scale: 1.0000]
75
+ 2026-01-19 13:49:00,900 - easytorch-training - INFO - Start validation.
76
+ 2026-01-19 13:49:07,719 - easytorch-training - INFO - Result <val>: [val/time: 6.64 (s), val/loss: 3.2354]
77
+ 2026-01-19 13:49:07,840 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_best_val_loss.pt saved
78
+ 2026-01-19 13:49:07,951 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_035000.pt saved
79
+ 2026-01-19 13:49:07,952 - easytorch-training - INFO - The estimated training finish time is 2026-01-20 20:22:07
80
+ 2026-01-19 14:43:01,179 - easytorch-training - INFO - Iteration 40000 / 200014
81
+ 2026-01-19 14:43:01,597 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.65 (s), train/lr: 9.49e-04, train/loss: 3.0318, train/grad_norm: 2.0432, train/amp_scale: 1.0000]
82
+ 2026-01-19 14:43:01,597 - easytorch-training - INFO - Start validation.
83
+ 2026-01-19 14:43:08,189 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.2270]
84
+ 2026-01-19 14:43:08,316 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_best_val_loss.pt saved
85
+ 2026-01-19 14:43:08,434 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_040000.pt saved
86
+ 2026-01-19 14:43:08,434 - easytorch-training - INFO - The estimated training finish time is 2026-01-20 20:14:28
87
+ 2026-01-19 15:36:00,962 - easytorch-training - INFO - Iteration 45000 / 200014
88
+ 2026-01-19 15:36:01,379 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.63 (s), train/lr: 9.29e-04, train/loss: 3.0320, train/grad_norm: 2.0299, train/amp_scale: 1.0000]
89
+ 2026-01-19 15:36:01,380 - easytorch-training - INFO - Start validation.
90
+ 2026-01-19 15:36:07,990 - easytorch-training - INFO - Result <val>: [val/time: 6.44 (s), val/loss: 3.2236]
91
+ 2026-01-19 15:36:08,124 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_best_val_loss.pt saved
92
+ 2026-01-19 15:36:08,245 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_045000.pt saved
93
+ 2026-01-19 15:36:08,246 - easytorch-training - INFO - The estimated training finish time is 2026-01-20 20:04:00
94
+ 2026-01-19 16:29:58,457 - easytorch-training - INFO - Iteration 50000 / 200014
95
+ 2026-01-19 16:29:58,876 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.64 (s), train/lr: 9.07e-04, train/loss: 3.0149, train/grad_norm: 2.0357, train/amp_scale: 1.0000]
96
+ 2026-01-19 16:29:58,876 - easytorch-training - INFO - Start validation.
97
+ 2026-01-19 16:30:05,501 - easytorch-training - INFO - Result <val>: [val/time: 6.45 (s), val/loss: 3.2239]
98
+ 2026-01-19 16:30:05,621 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200014/50a47da570c7010fe8e0a1c34a78d536/TimeMoE4_050000.pt saved
99
+ 2026-01-19 16:30:05,622 - easytorch-training - INFO - The estimated training finish time is 2026-01-20 19:59:29
100
+ 2026-01-19 17:17:11,237 - easytorch-training - ERROR - Traceback (most recent call last):
101
+ File "/home/nvidia/miniconda3/envs/zxx/lib/python3.11/site-packages/easytorch/launcher/launcher.py", line 31, in training_func
102
+ runner.train(cfg)
103
+ File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_iteration_runner.py", line 200, in train
104
+ self.train_iters(iteration=iteration, dataloader=self.train_data_loader)
105
+ File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 281, in train_iters
106
+ self.backward(loss, accumulating=accumulating)
107
+ File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 337, in backward
108
+ grad_norm = sum(
109
+ ^^^^
110
+ File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 338, in <genexpr>
111
+ param.grad.data.norm(2).item() ** 2 for param in self.model.parameters() if param.grad is not None
112
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
113
+ KeyboardInterrupt
114
+