Upload Main_100001
Browse files- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_005000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_010000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_015000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_020000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_025000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_030000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_035000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_040000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_045000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_050000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_055000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_060000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_065000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_070000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_075000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_080000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_085000.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/cfg.txt +55 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/tensorboard/events.out.tfevents.1768319790.brev-5x9knwe1p.2215689.0 +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/tensorboard/events.out.tfevents.1768397977.brev-5x9knwe1p.401834.0 +3 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/timemoe_base.py +125 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/training_log_20260113155620.log +163 -0
- Main_100001/f4e0658d7311189963757513e6bcc722/training_log_20260114133926.log +62 -0
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_005000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6d6dc407fb0be0fb6ad80b6967fbbdfd1e2989379a9c7bf88ea88ea36362f93b
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_010000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ccfc52e60d75c23a6f5fd5e483b076ca4fc6ff4b4df1eaa07af93e45f3a718c1
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_015000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a1faee34994214a0a1c8e84c00a466cead6382a97f3952b841e407e0749a53c0
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_020000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c11b8d01092d7eb57bc613c0fdab50f6a749ad7cbb4c0ed4d69c9d732dd6d55
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_025000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:acb4e53c565e40342293ce0184271a3f33c806b93a0a85be7f31db9d2cf5bfe3
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_030000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:180f4939d9f72d89e101e4a9a7dad84f7cf14679f50790278707c5997d4eaa1b
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_035000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8fcda71dfd9f6d9898446e030d2033ddeb1c22cf25959dc1f41a5f1f7e1cdb40
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_040000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9998bb515231ebe70dc8613cd8e6408b19c17a781f114027da94e529136aa140
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_045000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10d637c6af13b00a90683b158e0d1fb58e3aadada14781d4b94f7c3a509562fe
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_050000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:730d6c2ce372c2e81e1bd71b135a7d6ffc5060daab85d04f3aa0302ee8ead9c8
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_055000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:282698c88f8df090b3d83f211ea3775d9067ecbb187d9223a522a13f5c08bdcf
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_060000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c22566e25e1fa0d5aef5fdf6008526fd7c8b4445161e64a665daf0756da29c6
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_065000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:115db5a3d4c9a8812cf1eab7ed86557e1f561aec5d7f8b5a7dbc883fd3eb10fb
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_070000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d99023add7360245e48df81f65f86fadc19b14fad5978f8978f3d176be5548c6
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_075000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c8948293c5efa1fb600e0fc59e3df9b8b5797a30451dda73a76509ff8f2c0cb0
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_080000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:253dc5996b47129d4c8cbd6bcb32027dba7c4948ca3d27fb8ae668ad768c3df3
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_085000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1af20dd7e982b7183f68db504e0a37ca98a423fda0b59534200cc0ed0903bc6b
|
| 3 |
+
size 151982040
|
Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3b1223a08275e261d156d5e1d20f9bd8709412f90e1c2fb9c3ac54d5d15ada7
|
| 3 |
+
size 151985051
|
Main_100001/f4e0658d7311189963757513e6bcc722/cfg.txt
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DESCRIPTION: TimeMoE Base
|
| 2 |
+
DEVICE: gpu
|
| 3 |
+
DEVICE_NUM: 3
|
| 4 |
+
RUNNER: <class 'baselines.TimeMoE4.runner.runner.TimeMoERunner'>
|
| 5 |
+
MODEL:
|
| 6 |
+
NAME: TimeMoE4
|
| 7 |
+
ARCH: <class 'baselines.TimeMoE4.arch.timemoe.TimeMoE4'>
|
| 8 |
+
PARAM:
|
| 9 |
+
model_id: baselines/TimeMoE/ckpt/TimeMoE-50M
|
| 10 |
+
from_pretrained: False
|
| 11 |
+
context_length: 4079
|
| 12 |
+
trust_remote_code: True
|
| 13 |
+
DTYPE: bfloat16
|
| 14 |
+
METRICS:
|
| 15 |
+
FUNCS:
|
| 16 |
+
TRAIN:
|
| 17 |
+
COMPILE_MODEL: True
|
| 18 |
+
NUM_ITERATIONS: 100001
|
| 19 |
+
CKPT_SAVE_DIR: checkpoints/TimeMoE4/Main_100001
|
| 20 |
+
CKPT_SAVE_STRATEGY: 5000
|
| 21 |
+
LOSS: fake_loss
|
| 22 |
+
OPTIM:
|
| 23 |
+
TYPE: AdamW
|
| 24 |
+
PARAM:
|
| 25 |
+
lr: 0.001
|
| 26 |
+
betas: (0.9, 0.95)
|
| 27 |
+
fused: True
|
| 28 |
+
LR_SCHEDULER:
|
| 29 |
+
TYPE: CosineWarmup
|
| 30 |
+
PARAM:
|
| 31 |
+
num_warmup_steps: 5000
|
| 32 |
+
num_training_steps: 100001
|
| 33 |
+
CLIP_GRAD_PARAM:
|
| 34 |
+
max_norm: 1.0
|
| 35 |
+
DATA:
|
| 36 |
+
BATCH_SIZE: 85
|
| 37 |
+
SHUFFLE: True
|
| 38 |
+
PIN_MEMORY: True
|
| 39 |
+
PREFETCH: True
|
| 40 |
+
GRAD_ACCUMULATION_STEPS: 1
|
| 41 |
+
VAL:
|
| 42 |
+
INTERVAL: 5000
|
| 43 |
+
DATA:
|
| 44 |
+
BATCH_SIZE: 170
|
| 45 |
+
EVAL:
|
| 46 |
+
USE_GPU: True
|
| 47 |
+
DATASET:
|
| 48 |
+
NAME: Main
|
| 49 |
+
TYPE: <class 'baselines.TimeMoE4.data.mix_dataset_v2.MixedSourceDataset_v2'>
|
| 50 |
+
PARAM:
|
| 51 |
+
num_valid_samples: 1000
|
| 52 |
+
INFERENCE:
|
| 53 |
+
GENERATION_PARAMS:
|
| 54 |
+
normalize: True
|
| 55 |
+
MD5: f4e0658d7311189963757513e6bcc722
|
Main_100001/f4e0658d7311189963757513e6bcc722/tensorboard/events.out.tfevents.1768319790.brev-5x9knwe1p.2215689.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:31494f5acb14baf5a46dae9d222553cc68b743fbbe12d4e26debe6cc8788e3a5
|
| 3 |
+
size 20735176
|
Main_100001/f4e0658d7311189963757513e6bcc722/tensorboard/events.out.tfevents.1768397977.brev-5x9knwe1p.401834.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0da9945fd96d3b0c7ad32357fc19a8700b6d07b72fb39f57e2982359239958fc
|
| 3 |
+
size 2849025
|
Main_100001/f4e0658d7311189963757513e6bcc722/timemoe_base.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sampling probability changes
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from easydict import EasyDict
|
| 6 |
+
sys.path.append(os.path.abspath(__file__ + '/../../..'))
|
| 7 |
+
|
| 8 |
+
from ..arch import TimeMoE4
|
| 9 |
+
from ..data import MixedSourceDataset_v2
|
| 10 |
+
from ..runner import TimeMoERunner
|
| 11 |
+
from ..loss import fake_loss
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
############################## Hot Parameters ##############################
|
| 15 |
+
# Dataset & Metrics configuration
|
| 16 |
+
# Model architecture and parameters
|
| 17 |
+
|
| 18 |
+
pretrained = False # Whether to use a pretrained model
|
| 19 |
+
|
| 20 |
+
MODEL_ARCH = TimeMoE4
|
| 21 |
+
|
| 22 |
+
MODEL_PARAM = {
|
| 23 |
+
'model_id': "baselines/TimeMoE/ckpt/TimeMoE-50M",
|
| 24 |
+
'from_pretrained': pretrained,
|
| 25 |
+
'context_length': 4079,
|
| 26 |
+
'trust_remote_code': True,
|
| 27 |
+
}
|
| 28 |
+
DATA_NAME = "Main"
|
| 29 |
+
|
| 30 |
+
# N = 20_000_000
|
| 31 |
+
# batch size = 16*8
|
| 32 |
+
# 20_000_000 / 16 / 8 = 156250 iterations
|
| 33 |
+
# 20_000_000 * 4096 / 16 / 8 / 4096 = 156_250
|
| 34 |
+
|
| 35 |
+
NUM_ITERATIONS = 100_001 # Total number of training iterations. NOTE(review): the earlier estimate "20_000_000 * 4096 / 16 / 4 / 4096 = 312,500" does not match this value - confirm it is stale.
|
| 36 |
+
VAL_ITERATION_INTERVAL = 5_000 # Run validation once every VAL_ITERATION_INTERVAL iterations
|
| 37 |
+
|
| 38 |
+
############################## General Configuration ##############################
|
| 39 |
+
CFG = EasyDict()
|
| 40 |
+
# General settings
|
| 41 |
+
CFG.DESCRIPTION = 'TimeMoE Base'
|
| 42 |
+
CFG.DEVICE = 'gpu'
|
| 43 |
+
CFG.DEVICE_NUM = 3
|
| 44 |
+
# Runner
|
| 45 |
+
CFG.RUNNER = TimeMoERunner
|
| 46 |
+
|
| 47 |
+
############################## Model Configuration ################################
|
| 48 |
+
CFG.MODEL = EasyDict()
|
| 49 |
+
CFG.MODEL.NAME = MODEL_ARCH.__name__
|
| 50 |
+
CFG.MODEL.ARCH = MODEL_ARCH
|
| 51 |
+
CFG.MODEL.PARAM = MODEL_PARAM
|
| 52 |
+
CFG.MODEL.DTYPE= 'bfloat16'
|
| 53 |
+
# CFG.MODEL.DTYPE= 'float32'
|
| 54 |
+
|
| 55 |
+
############################## Metrics Configuration ##############################
|
| 56 |
+
CFG.METRICS = EasyDict()
|
| 57 |
+
# Metrics settings
|
| 58 |
+
CFG.METRICS.FUNCS = EasyDict({})
|
| 59 |
+
|
| 60 |
+
############################## Training Configuration ##############################
|
| 61 |
+
CFG.TRAIN = EasyDict()
|
| 62 |
+
CFG.TRAIN.COMPILE_MODEL = True
|
| 63 |
+
CFG.TRAIN.NUM_ITERATIONS = NUM_ITERATIONS
|
| 64 |
+
CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
|
| 65 |
+
'checkpoints',
|
| 66 |
+
MODEL_ARCH.__name__,
|
| 67 |
+
'_'.join([DATA_NAME, str(CFG.TRAIN.NUM_ITERATIONS)])
|
| 68 |
+
)
|
| 69 |
+
CFG.TRAIN.CKPT_SAVE_STRATEGY = VAL_ITERATION_INTERVAL * 1 # Checkpoint save strategy: save the model every VAL_ITERATION_INTERVAL * 1 iterations
|
| 70 |
+
CFG.TRAIN.LOSS = fake_loss
|
| 71 |
+
# Optimizer settings
|
| 72 |
+
CFG.TRAIN.OPTIM = EasyDict()
|
| 73 |
+
CFG.TRAIN.OPTIM.TYPE = "AdamW"
|
| 74 |
+
CFG.TRAIN.OPTIM.PARAM = {
|
| 75 |
+
"lr": 1e-3,
|
| 76 |
+
"betas": (0.9, 0.95),
|
| 77 |
+
"fused": True,
|
| 78 |
+
# "weight_decay": 1e-1,
|
| 79 |
+
}
|
| 80 |
+
# Learning rate scheduler settings
|
| 81 |
+
CFG.TRAIN.LR_SCHEDULER = EasyDict()
|
| 82 |
+
CFG.TRAIN.LR_SCHEDULER.TYPE = "CosineWarmup"
|
| 83 |
+
CFG.TRAIN.LR_SCHEDULER.PARAM = {
|
| 84 |
+
'num_warmup_steps': int(NUM_ITERATIONS / 100 * 5), # 5%的warmup启动比例
|
| 85 |
+
'num_training_steps': NUM_ITERATIONS,
|
| 86 |
+
}
|
| 87 |
+
CFG.TRAIN.CLIP_GRAD_PARAM = {
|
| 88 |
+
'max_norm': 1.0
|
| 89 |
+
}
|
| 90 |
+
# Train data loader settings
|
| 91 |
+
CFG.TRAIN.DATA = EasyDict()
|
| 92 |
+
CFG.TRAIN.DATA.BATCH_SIZE = 85 # Per-GPU training batch size. NOTE(review): the old "16 / 4" note no longer matches this value.
|
| 93 |
+
CFG.TRAIN.DATA.SHUFFLE = True # NOTE(review): the previous comment said "has to be False" but the value is True - confirm which is intended
|
| 94 |
+
CFG.TRAIN.DATA.PIN_MEMORY = True
|
| 95 |
+
CFG.TRAIN.DATA.PREFETCH = True
|
| 96 |
+
CFG.TRAIN.GRAD_ACCUMULATION_STEPS = 1
|
| 97 |
+
# CFG.TRAIN.DATA.NUM_WORKERS = 4
|
| 98 |
+
|
| 99 |
+
############################## Validation Configuration ##############################
|
| 100 |
+
CFG.VAL = EasyDict()
|
| 101 |
+
CFG.VAL.INTERVAL = VAL_ITERATION_INTERVAL
|
| 102 |
+
CFG.VAL.DATA = EasyDict()
|
| 103 |
+
CFG.VAL.DATA.BATCH_SIZE = 170 # NOTE(review): the old "32 / 8" note no longer matches this value - confirm how 170 was chosen
|
| 104 |
+
|
| 105 |
+
############################## Evaluation Configuration ##############################
|
| 106 |
+
|
| 107 |
+
CFG.EVAL = EasyDict()
|
| 108 |
+
# Evaluation parameters
|
| 109 |
+
CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True
|
| 110 |
+
|
| 111 |
+
############################## Dataset Configuration ##############################
|
| 112 |
+
CFG.DATASET = EasyDict()
|
| 113 |
+
# Dataset settings
|
| 114 |
+
CFG.DATASET.NAME = DATA_NAME
|
| 115 |
+
CFG.DATASET.TYPE = MixedSourceDataset_v2
|
| 116 |
+
CFG.DATASET.PARAM = EasyDict({
|
| 117 |
+
'num_valid_samples': 1000
|
| 118 |
+
})
|
| 119 |
+
|
| 120 |
+
############################## Inference Configuration ##############################
|
| 121 |
+
CFG.INFERENCE = EasyDict()
|
| 122 |
+
CFG.INFERENCE.GENERATION_PARAMS = EasyDict({
|
| 123 |
+
'normalize': not pretrained
|
| 124 |
+
})
|
| 125 |
+
|
Main_100001/f4e0658d7311189963757513e6bcc722/training_log_20260113155620.log
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 15:56:20,825 - easytorch-training - INFO - Initializing training.
|
| 2 |
+
2026-01-13 15:56:20,826 - easytorch-training - INFO - Set clip grad, param: {'max_norm': 1.0}
|
| 3 |
+
2026-01-13 15:56:20,826 - easytorch-training - INFO - Building training data loader.
|
| 4 |
+
2026-01-13 15:56:30,885 - easytorch-training - INFO - MixedSourceDataset initialized for 'train' mode.
|
| 5 |
+
2026-01-13 15:56:30,885 - easytorch-training - INFO - - real: 3201174 samples
|
| 6 |
+
2026-01-13 15:56:30,885 - easytorch-training - INFO - - synth: 2000000 samples
|
| 7 |
+
2026-01-13 15:56:30,886 - easytorch-training - INFO - Train dataset length: 3201174
|
| 8 |
+
2026-01-13 15:56:30,887 - easytorch-training - INFO - Set optim: AdamW (
|
| 9 |
+
Parameter Group 0
|
| 10 |
+
amsgrad: False
|
| 11 |
+
betas: (0.9, 0.95)
|
| 12 |
+
capturable: False
|
| 13 |
+
differentiable: False
|
| 14 |
+
eps: 1e-08
|
| 15 |
+
foreach: None
|
| 16 |
+
fused: True
|
| 17 |
+
lr: 0.001
|
| 18 |
+
maximize: False
|
| 19 |
+
weight_decay: 0.01
|
| 20 |
+
)
|
| 21 |
+
2026-01-13 15:56:30,887 - easytorch-training - INFO - Set lr_scheduler: <basicts.runners.optim.lr_schedulers.CosineWarmup object at 0x7e4ca95b4810>
|
| 22 |
+
2026-01-13 15:56:30,889 - easytorch-training - INFO - Initializing validation.
|
| 23 |
+
2026-01-13 15:56:30,889 - easytorch-training - INFO - Building val data loader.
|
| 24 |
+
2026-01-13 15:56:32,802 - easytorch-training - INFO - Worker 0 initialized for cauker_univariate.
|
| 25 |
+
2026-01-13 15:56:56,807 - easytorch-training - INFO - MixedSourceDataset initialized for 'valid' mode.
|
| 26 |
+
2026-01-13 15:56:56,807 - easytorch-training - INFO - - real: 1000 samples
|
| 27 |
+
2026-01-13 15:56:56,808 - easytorch-training - INFO - Valid dataset length: 1000
|
| 28 |
+
2026-01-13 15:56:56,808 - easytorch-training - INFO - Number of parameters: 12653568
|
| 29 |
+
2026-01-13 15:56:56,809 - easytorch-training - INFO - Training with 3 GPUs, batch size per GPUs: 85, grad_accumulation_steps: 1
|
| 30 |
+
2026-01-13 15:56:56,809 - easytorch-training - INFO - Effective batch size: 255
|
| 31 |
+
2026-01-13 16:55:21,464 - easytorch-training - INFO - Iteration 5000 / 100001
|
| 32 |
+
2026-01-13 16:55:21,949 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 5.00e-04, train/loss: 3.3183, train/grad_norm: 4.4124, train/amp_scale: 1.0000]
|
| 33 |
+
2026-01-13 16:55:21,950 - easytorch-training - INFO - Start validation.
|
| 34 |
+
2026-01-13 16:55:34,453 - easytorch-training - INFO - Result <val>: [val/time: 12.33 (s), val/loss: 3.3034]
|
| 35 |
+
2026-01-13 16:55:34,564 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt saved
|
| 36 |
+
2026-01-13 16:55:34,673 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_005000.pt saved
|
| 37 |
+
2026-01-13 16:55:34,674 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:29:34
|
| 38 |
+
2026-01-13 17:55:16,579 - easytorch-training - INFO - Iteration 10000 / 100001
|
| 39 |
+
2026-01-13 17:55:17,066 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.72 (s), train/lr: 9.98e-04, train/loss: 2.6317, train/grad_norm: 2.6717, train/amp_scale: 1.0000]
|
| 40 |
+
2026-01-13 17:55:17,067 - easytorch-training - INFO - Start validation.
|
| 41 |
+
2026-01-13 17:55:23,706 - easytorch-training - INFO - Result <val>: [val/time: 6.46 (s), val/loss: 3.2396]
|
| 42 |
+
2026-01-13 17:55:23,838 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt saved
|
| 43 |
+
2026-01-13 17:55:23,956 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_010000.pt saved
|
| 44 |
+
2026-01-13 17:55:23,957 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:41:28
|
| 45 |
+
2026-01-13 18:56:03,469 - easytorch-training - INFO - Iteration 15000 / 100001
|
| 46 |
+
2026-01-13 18:56:03,953 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.73 (s), train/lr: 9.84e-04, train/loss: 2.5394, train/grad_norm: 2.6858, train/amp_scale: 1.0000]
|
| 47 |
+
2026-01-13 18:56:03,954 - easytorch-training - INFO - Start validation.
|
| 48 |
+
2026-01-13 18:56:10,552 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.2419]
|
| 49 |
+
2026-01-13 18:56:10,673 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_015000.pt saved
|
| 50 |
+
2026-01-13 18:56:10,674 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:51:49
|
| 51 |
+
2026-01-13 19:55:15,509 - easytorch-training - INFO - Iteration 20000 / 100001
|
| 52 |
+
2026-01-13 19:55:15,996 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 9.57e-04, train/loss: 2.5056, train/grad_norm: 2.7168, train/amp_scale: 1.0000]
|
| 53 |
+
2026-01-13 19:55:15,997 - easytorch-training - INFO - Start validation.
|
| 54 |
+
2026-01-13 19:55:22,643 - easytorch-training - INFO - Result <val>: [val/time: 6.47 (s), val/loss: 3.2219]
|
| 55 |
+
2026-01-13 19:55:22,777 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt saved
|
| 56 |
+
2026-01-13 19:55:22,903 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_020000.pt saved
|
| 57 |
+
2026-01-13 19:55:22,904 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:49:08
|
| 58 |
+
2026-01-13 20:55:11,702 - easytorch-training - INFO - Iteration 25000 / 100001
|
| 59 |
+
2026-01-13 20:55:12,182 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.72 (s), train/lr: 9.18e-04, train/loss: 2.4844, train/grad_norm: 2.6872, train/amp_scale: 1.0000]
|
| 60 |
+
2026-01-13 20:55:12,183 - easytorch-training - INFO - Start validation.
|
| 61 |
+
2026-01-13 20:55:18,779 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.2071]
|
| 62 |
+
2026-01-13 20:55:18,894 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt saved
|
| 63 |
+
2026-01-13 20:55:19,001 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_025000.pt saved
|
| 64 |
+
2026-01-13 20:55:19,002 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:50:26
|
| 65 |
+
2026-01-13 21:55:44,837 - easytorch-training - INFO - Iteration 30000 / 100001
|
| 66 |
+
2026-01-13 21:55:45,325 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.72 (s), train/lr: 8.67e-04, train/loss: 2.4747, train/grad_norm: 2.6947, train/amp_scale: 1.0000]
|
| 67 |
+
2026-01-13 21:55:45,326 - easytorch-training - INFO - Start validation.
|
| 68 |
+
2026-01-13 21:55:51,975 - easytorch-training - INFO - Result <val>: [val/time: 6.47 (s), val/loss: 3.1960]
|
| 69 |
+
2026-01-13 21:55:52,103 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt saved
|
| 70 |
+
2026-01-13 21:55:52,219 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_030000.pt saved
|
| 71 |
+
2026-01-13 21:55:52,220 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:53:22
|
| 72 |
+
2026-01-13 22:55:25,149 - easytorch-training - INFO - Iteration 35000 / 100001
|
| 73 |
+
2026-01-13 22:55:25,641 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 8.07e-04, train/loss: 2.4636, train/grad_norm: 2.7098, train/amp_scale: 1.0000]
|
| 74 |
+
2026-01-13 22:55:25,641 - easytorch-training - INFO - Start validation.
|
| 75 |
+
2026-01-13 22:55:32,309 - easytorch-training - INFO - Result <val>: [val/time: 6.49 (s), val/loss: 3.1850]
|
| 76 |
+
2026-01-13 22:55:32,426 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt saved
|
| 77 |
+
2026-01-13 22:55:32,538 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_035000.pt saved
|
| 78 |
+
2026-01-13 22:55:32,540 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:52:56
|
| 79 |
+
2026-01-13 23:55:16,092 - easytorch-training - INFO - Iteration 40000 / 100001
|
| 80 |
+
2026-01-13 23:55:16,574 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.72 (s), train/lr: 7.38e-04, train/loss: 2.4577, train/grad_norm: 2.7154, train/amp_scale: 1.0000]
|
| 81 |
+
2026-01-13 23:55:16,575 - easytorch-training - INFO - Start validation.
|
| 82 |
+
2026-01-13 23:55:23,191 - easytorch-training - INFO - Result <val>: [val/time: 6.44 (s), val/loss: 3.1824]
|
| 83 |
+
2026-01-13 23:55:23,307 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt saved
|
| 84 |
+
2026-01-13 23:55:23,414 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_040000.pt saved
|
| 85 |
+
2026-01-13 23:55:23,414 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:53:04
|
| 86 |
+
2026-01-14 00:55:01,159 - easytorch-training - INFO - Iteration 45000 / 100001
|
| 87 |
+
2026-01-14 00:55:01,646 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 6.62e-04, train/loss: 2.4492, train/grad_norm: 2.6647, train/amp_scale: 1.0000]
|
| 88 |
+
2026-01-14 00:55:01,647 - easytorch-training - INFO - Start validation.
|
| 89 |
+
2026-01-14 00:55:08,277 - easytorch-training - INFO - Result <val>: [val/time: 6.45 (s), val/loss: 3.1904]
|
| 90 |
+
2026-01-14 00:55:08,397 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_045000.pt saved
|
| 91 |
+
2026-01-14 00:55:08,397 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:52:56
|
| 92 |
+
2026-01-14 01:57:16,027 - easytorch-training - INFO - Iteration 50000 / 100001
|
| 93 |
+
2026-01-14 01:57:16,524 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.74 (s), train/lr: 5.82e-04, train/loss: 2.4400, train/grad_norm: 2.6267, train/amp_scale: 1.0000]
|
| 94 |
+
2026-01-14 01:57:16,525 - easytorch-training - INFO - Start validation.
|
| 95 |
+
2026-01-14 01:57:23,209 - easytorch-training - INFO - Result <val>: [val/time: 6.51 (s), val/loss: 3.1826]
|
| 96 |
+
2026-01-14 01:57:23,320 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_050000.pt saved
|
| 97 |
+
2026-01-14 01:57:23,323 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:57:50
|
| 98 |
+
2026-01-14 02:57:01,069 - easytorch-training - INFO - Iteration 55000 / 100001
|
| 99 |
+
2026-01-14 02:57:01,554 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 5.00e-04, train/loss: 2.4324, train/grad_norm: 2.5657, train/amp_scale: 1.0000]
|
| 100 |
+
2026-01-14 02:57:01,554 - easytorch-training - INFO - Start validation.
|
| 101 |
+
2026-01-14 02:57:08,184 - easytorch-training - INFO - Result <val>: [val/time: 6.45 (s), val/loss: 3.1777]
|
| 102 |
+
2026-01-14 02:57:08,300 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt saved
|
| 103 |
+
2026-01-14 02:57:08,408 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_055000.pt saved
|
| 104 |
+
2026-01-14 02:57:08,408 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:57:18
|
| 105 |
+
2026-01-14 03:56:18,192 - easytorch-training - INFO - Iteration 60000 / 100001
|
| 106 |
+
2026-01-14 03:56:18,677 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 4.18e-04, train/loss: 2.4162, train/grad_norm: 2.5870, train/amp_scale: 1.0000]
|
| 107 |
+
2026-01-14 03:56:18,678 - easytorch-training - INFO - Start validation.
|
| 108 |
+
2026-01-14 03:56:25,306 - easytorch-training - INFO - Result <val>: [val/time: 6.45 (s), val/loss: 3.1696]
|
| 109 |
+
2026-01-14 03:56:25,424 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt saved
|
| 110 |
+
2026-01-14 03:56:25,530 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_060000.pt saved
|
| 111 |
+
2026-01-14 03:56:25,533 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:56:05
|
| 112 |
+
2026-01-14 04:54:51,389 - easytorch-training - INFO - Iteration 65000 / 100001
|
| 113 |
+
2026-01-14 04:54:51,875 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 3.38e-04, train/loss: 2.4222, train/grad_norm: 2.4241, train/amp_scale: 1.0000]
|
| 114 |
+
2026-01-14 04:54:51,876 - easytorch-training - INFO - Start validation.
|
| 115 |
+
2026-01-14 04:54:58,524 - easytorch-training - INFO - Result <val>: [val/time: 6.50 (s), val/loss: 3.1691]
|
| 116 |
+
2026-01-14 04:54:58,711 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt saved
|
| 117 |
+
2026-01-14 04:54:58,840 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_065000.pt saved
|
| 118 |
+
2026-01-14 04:54:58,841 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:53:56
|
| 119 |
+
2026-01-14 05:53:59,132 - easytorch-training - INFO - Iteration 70000 / 100001
|
| 120 |
+
2026-01-14 05:53:59,615 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 2.62e-04, train/loss: 2.4089, train/grad_norm: 2.4843, train/amp_scale: 1.0000]
|
| 121 |
+
2026-01-14 05:53:59,615 - easytorch-training - INFO - Start validation.
|
| 122 |
+
2026-01-14 05:54:06,248 - easytorch-training - INFO - Result <val>: [val/time: 6.46 (s), val/loss: 3.1671]
|
| 123 |
+
2026-01-14 05:54:06,364 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt saved
|
| 124 |
+
2026-01-14 05:54:06,470 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_070000.pt saved
|
| 125 |
+
2026-01-14 05:54:06,471 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:52:54
|
| 126 |
+
2026-01-14 06:53:01,984 - easytorch-training - INFO - Iteration 75000 / 100001
|
| 127 |
+
2026-01-14 06:53:02,471 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 1.93e-04, train/loss: 2.4079, train/grad_norm: 2.4212, train/amp_scale: 1.0000]
|
| 128 |
+
2026-01-14 06:53:02,472 - easytorch-training - INFO - Start validation.
|
| 129 |
+
2026-01-14 06:53:09,113 - easytorch-training - INFO - Result <val>: [val/time: 6.46 (s), val/loss: 3.1626]
|
| 130 |
+
2026-01-14 06:53:09,228 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_best_val_loss.pt saved
|
| 131 |
+
2026-01-14 06:53:09,333 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_075000.pt saved
|
| 132 |
+
2026-01-14 06:53:09,334 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:51:54
|
| 133 |
+
2026-01-14 07:51:47,926 - easytorch-training - INFO - Iteration 80000 / 100001
|
| 134 |
+
2026-01-14 07:51:48,403 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 1.33e-04, train/loss: 2.5898, train/grad_norm: 49.2251, train/amp_scale: 1.0000]
|
| 135 |
+
2026-01-14 07:51:48,403 - easytorch-training - INFO - Start validation.
|
| 136 |
+
2026-01-14 07:52:05,464 - easytorch-training - INFO - Result <val>: [val/time: 16.88 (s), val/loss: 3.4465]
|
| 137 |
+
2026-01-14 07:52:05,573 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_080000.pt saved
|
| 138 |
+
2026-01-14 07:52:05,574 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 11:50:53
|
| 139 |
+
2026-01-14 09:04:36,456 - easytorch-training - INFO - Iteration 85000 / 100001
|
| 140 |
+
2026-01-14 09:04:36,935 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.87 (s), train/lr: 8.19e-05, train/loss: 3.1473, train/grad_norm: 1094.6103, train/amp_scale: 1.0000]
|
| 141 |
+
2026-01-14 09:04:36,935 - easytorch-training - INFO - Start validation.
|
| 142 |
+
2026-01-14 09:04:43,513 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.5284]
|
| 143 |
+
2026-01-14 09:04:43,632 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_085000.pt saved
|
| 144 |
+
2026-01-14 09:04:43,633 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 12:06:06
|
| 145 |
+
2026-01-14 10:03:41,575 - easytorch-training - INFO - Iteration 90000 / 100001
|
| 146 |
+
2026-01-14 10:03:42,060 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 4.26e-05, train/loss: 3.4897, train/grad_norm: 4883.0374, train/amp_scale: 1.0000]
|
| 147 |
+
2026-01-14 10:03:42,060 - easytorch-training - INFO - Start validation.
|
| 148 |
+
2026-01-14 10:03:48,647 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 3.6891]
|
| 149 |
+
2026-01-14 10:03:48,760 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_090000.pt saved
|
| 150 |
+
2026-01-14 10:03:48,761 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 12:04:35
|
| 151 |
+
2026-01-14 11:01:50,633 - easytorch-training - INFO - Iteration 95000 / 100001
|
| 152 |
+
2026-01-14 11:01:51,113 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 1.59e-05, train/loss: 3.7558, train/grad_norm: 11587.0733, train/amp_scale: 1.0000]
|
| 153 |
+
2026-01-14 11:01:51,114 - easytorch-training - INFO - Start validation.
|
| 154 |
+
2026-01-14 11:01:57,704 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.6953]
|
| 155 |
+
2026-01-14 11:01:57,824 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_095000.pt saved
|
| 156 |
+
2026-01-14 11:01:57,824 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 12:02:14
|
| 157 |
+
2026-01-14 12:00:36,193 - easytorch-training - INFO - Iteration 100000 / 100001
|
| 158 |
+
2026-01-14 12:00:36,675 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 2.28e-06, train/loss: 3.8332, train/grad_norm: 15573.3141, train/amp_scale: 1.0000]
|
| 159 |
+
2026-01-14 12:00:36,675 - easytorch-training - INFO - Start validation.
|
| 160 |
+
2026-01-14 12:00:43,257 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 3.7929]
|
| 161 |
+
2026-01-14 12:00:43,365 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_100000.pt saved
|
| 162 |
+
2026-01-14 12:00:43,366 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 12:00:44
|
| 163 |
+
2026-01-14 12:00:43,850 - easytorch-training - INFO - The training finished at 2026-01-14 12:00:43
|
Main_100001/f4e0658d7311189963757513e6bcc722/training_log_20260114133926.log
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-14 13:39:26,798 - easytorch-training - INFO - Initializing training.
|
| 2 |
+
2026-01-14 13:39:26,800 - easytorch-training - INFO - Set clip grad, param: {'max_norm': 1.0}
|
| 3 |
+
2026-01-14 13:39:26,801 - easytorch-training - INFO - Building training data loader.
|
| 4 |
+
2026-01-14 13:39:37,188 - easytorch-training - INFO - MixedSourceDataset initialized for 'train' mode.
|
| 5 |
+
2026-01-14 13:39:37,188 - easytorch-training - INFO - - real: 3201174 samples
|
| 6 |
+
2026-01-14 13:39:37,189 - easytorch-training - INFO - - synth: 2000000 samples
|
| 7 |
+
2026-01-14 13:39:37,189 - easytorch-training - INFO - Train dataset length: 3201174
|
| 8 |
+
2026-01-14 13:39:37,191 - easytorch-training - INFO - Set optim: AdamW (
|
| 9 |
+
Parameter Group 0
|
| 10 |
+
amsgrad: False
|
| 11 |
+
betas: (0.9, 0.95)
|
| 12 |
+
capturable: False
|
| 13 |
+
differentiable: False
|
| 14 |
+
eps: 1e-08
|
| 15 |
+
foreach: None
|
| 16 |
+
fused: True
|
| 17 |
+
lr: 0.001
|
| 18 |
+
maximize: False
|
| 19 |
+
weight_decay: 0.01
|
| 20 |
+
)
|
| 21 |
+
2026-01-14 13:39:37,191 - easytorch-training - INFO - Set lr_scheduler: <basicts.runners.optim.lr_schedulers.CosineWarmup object at 0x70966873c790>
|
| 22 |
+
2026-01-14 13:39:37,192 - easytorch-training - INFO - Loading Checkpoint from 'checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_075000.pt'
|
| 23 |
+
2026-01-14 13:39:37,434 - easytorch-training - INFO - Worker 0 initialized for cauker_univariate.
|
| 24 |
+
2026-01-14 13:39:37,502 - easytorch-training - INFO - Resume training
|
| 25 |
+
2026-01-14 13:39:37,504 - easytorch-training - INFO - Initializing validation.
|
| 26 |
+
2026-01-14 13:39:37,505 - easytorch-training - INFO - Building val data loader.
|
| 27 |
+
2026-01-14 13:40:04,611 - easytorch-training - INFO - MixedSourceDataset initialized for 'valid' mode.
|
| 28 |
+
2026-01-14 13:40:04,611 - easytorch-training - INFO - - real: 1000 samples
|
| 29 |
+
2026-01-14 13:40:04,612 - easytorch-training - INFO - Valid dataset length: 1000
|
| 30 |
+
2026-01-14 13:40:04,612 - easytorch-training - INFO - Number of parameters: 12653568
|
| 31 |
+
2026-01-14 13:40:04,613 - easytorch-training - INFO - Training with 3 GPUs, batch size per GPUs: 85, grad_accumulation_steps: 1
|
| 32 |
+
2026-01-14 13:40:04,613 - easytorch-training - INFO - Effective batch size: 255
|
| 33 |
+
2026-01-14 14:41:48,824 - easytorch-training - INFO - Iteration 80000 / 100001
|
| 34 |
+
2026-01-14 14:41:49,306 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.74 (s), train/lr: 1.33e-04, train/loss: 2.5827, train/grad_norm: 50.4850, train/amp_scale: 1.0000]
|
| 35 |
+
2026-01-14 14:41:49,307 - easytorch-training - INFO - Start validation.
|
| 36 |
+
2026-01-14 14:42:02,743 - easytorch-training - INFO - Result <val>: [val/time: 13.26 (s), val/loss: 3.3568]
|
| 37 |
+
2026-01-14 14:42:02,870 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_080000.pt saved
|
| 38 |
+
2026-01-14 14:42:02,871 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 18:49:56
|
| 39 |
+
2026-01-14 15:40:51,989 - easytorch-training - INFO - Iteration 85000 / 100001
|
| 40 |
+
2026-01-14 15:40:52,470 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 8.19e-05, train/loss: 3.1628, train/grad_norm: 1088.2516, train/amp_scale: 1.0000]
|
| 41 |
+
2026-01-14 15:40:52,470 - easytorch-training - INFO - Start validation.
|
| 42 |
+
2026-01-14 15:41:03,185 - easytorch-training - INFO - Result <val>: [val/time: 10.54 (s), val/loss: 3.4917]
|
| 43 |
+
2026-01-14 15:41:03,291 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100001/f4e0658d7311189963757513e6bcc722/TimeMoE4_085000.pt saved
|
| 44 |
+
2026-01-14 15:41:03,292 - easytorch-training - INFO - The estimated training finish time is 2026-01-14 18:42:32
|
| 45 |
+
2026-01-14 16:24:36,407 - easytorch-training - ERROR - Traceback (most recent call last):
|
| 46 |
+
File "/home/nvidia/miniconda3/envs/zxx/lib/python3.11/site-packages/easytorch/launcher/launcher.py", line 31, in training_func
|
| 47 |
+
runner.train(cfg)
|
| 48 |
+
File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_iteration_runner.py", line 200, in train
|
| 49 |
+
self.train_iters(iteration=iteration, dataloader=self.train_data_loader)
|
| 50 |
+
File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 281, in train_iters
|
| 51 |
+
self.backward(loss, accumulating=accumulating)
|
| 52 |
+
File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 335, in backward
|
| 53 |
+
self.amp_scaler.scale(loss).backward()
|
| 54 |
+
File "/home/nvidia/miniconda3/envs/zxx/lib/python3.11/site-packages/torch/_tensor.py", line 581, in backward
|
| 55 |
+
torch.autograd.backward(
|
| 56 |
+
File "/home/nvidia/miniconda3/envs/zxx/lib/python3.11/site-packages/torch/autograd/__init__.py", line 347, in backward
|
| 57 |
+
_engine_run_backward(
|
| 58 |
+
File "/home/nvidia/miniconda3/envs/zxx/lib/python3.11/site-packages/torch/autograd/graph.py", line 825, in _engine_run_backward
|
| 59 |
+
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
|
| 60 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 61 |
+
KeyboardInterrupt
|
| 62 |
+
|