Upload Main_200018
Browse files- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_005000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_010000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_015000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_020000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_025000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_030000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_035000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_040000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_045000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_050000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_055000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_060000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_065000.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_best_val_loss.pt +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/cfg.txt +55 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/tensorboard/events.out.tfevents.1769020087.brev-5x9knwe1p.1065478.0 +3 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/timemoe_base.py +126 -0
- Main_200018/acf6c6763f53a27cd3a36171d821d82b/training_log_20260121182753.log +133 -0
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_005000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b055ed485964dd7741a91478de3d666d5784444e905eec925e708bc7611c628e
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_010000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:45294f9060bdcc0433996972f79a0884265da9fcefa18d6f101507f94cad5da1
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_015000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66f030c32bcb78d2acf5e9dc05efdd50397d7001ec8ada525dc2a74304187317
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_020000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:510313ae30e72fed0e966055a60e7b6663660186c8fadebe2c1ce37fb86c3f7a
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_025000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5be20c252bcfc0fe453c74692c31a8942b23d31c326d56a295617de3f945d35f
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_030000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d4ba08df6d02ebff18b5304647183942c4012dc88205960f18de73ac3d734b7e
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_035000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebea5a9258288de570c5f3bc2ac63a86bb5cd9128bebcaa1f08fadf57a330ce7
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_040000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b7c2aef7d8015a9f29b9597fbb427cf29ed442c5e9728ffc28606dd8d41173bf
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_045000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:07b38c7612f2a4409848d5de9e09121b1fce804595cb870a44ac9b8f09fe05fe
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_050000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ac775094798867cf365b9f41803a370f510257e21b84e97d2f63095d7d2c5882
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_055000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4081d53371f745c00a3391ab933d05fd524ddc972ec7fa067b6d21346c90e7c
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_060000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca8143a722696641ebbd74c2f3858f01005cdac0715d6c9ecdcdd6304524f935
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_065000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:922faafa533c8f44222d10cfaa7d431349d708de17e10b3a80ed445d1d6668c7
|
| 3 |
+
size 151982040
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_best_val_loss.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b98768df656f42ce4aa9dcf307a49e494b2124276eeb3322be0f41180e379879
|
| 3 |
+
size 151985051
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/cfg.txt
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DESCRIPTION: TimeMoE Base
|
| 2 |
+
DEVICE: gpu
|
| 3 |
+
DEVICE_NUM: 3
|
| 4 |
+
RUNNER: <class 'baselines.TimeMoE4.runner.runner.TimeMoERunner'>
|
| 5 |
+
MODEL:
|
| 6 |
+
NAME: TimeMoE4
|
| 7 |
+
ARCH: <class 'baselines.TimeMoE4.arch.timemoe.TimeMoE4'>
|
| 8 |
+
PARAM:
|
| 9 |
+
model_id: baselines/TimeMoE/ckpt/TimeMoE-50M
|
| 10 |
+
from_pretrained: False
|
| 11 |
+
context_length: 4079
|
| 12 |
+
trust_remote_code: True
|
| 13 |
+
DTYPE: bfloat16
|
| 14 |
+
METRICS:
|
| 15 |
+
FUNCS:
|
| 16 |
+
TRAIN:
|
| 17 |
+
COMPILE_MODEL: True
|
| 18 |
+
NUM_ITERATIONS: 200018
|
| 19 |
+
CKPT_SAVE_DIR: checkpoints/TimeMoE4/Main_200018
|
| 20 |
+
CKPT_SAVE_STRATEGY: 5000
|
| 21 |
+
LOSS: fake_loss
|
| 22 |
+
OPTIM:
|
| 23 |
+
TYPE: AdamW
|
| 24 |
+
PARAM:
|
| 25 |
+
lr: 0.001
|
| 26 |
+
betas: (0.9, 0.95)
|
| 27 |
+
fused: True
|
| 28 |
+
LR_SCHEDULER:
|
| 29 |
+
TYPE: CosineWarmup
|
| 30 |
+
PARAM:
|
| 31 |
+
num_warmup_steps: 10000
|
| 32 |
+
num_training_steps: 200018
|
| 33 |
+
CLIP_GRAD_PARAM:
|
| 34 |
+
max_norm: 1.0
|
| 35 |
+
DATA:
|
| 36 |
+
BATCH_SIZE: 85
|
| 37 |
+
SHUFFLE: True
|
| 38 |
+
PIN_MEMORY: True
|
| 39 |
+
PREFETCH: True
|
| 40 |
+
GRAD_ACCUMULATION_STEPS: 1
|
| 41 |
+
VAL:
|
| 42 |
+
INTERVAL: 5000
|
| 43 |
+
DATA:
|
| 44 |
+
BATCH_SIZE: 170
|
| 45 |
+
EVAL:
|
| 46 |
+
USE_GPU: True
|
| 47 |
+
DATASET:
|
| 48 |
+
NAME: Main
|
| 49 |
+
TYPE: <class 'baselines.TimeMoE4.data.mix_dataset_v2.MixedSourceDataset_v2'>
|
| 50 |
+
PARAM:
|
| 51 |
+
num_valid_samples: 1000
|
| 52 |
+
INFERENCE:
|
| 53 |
+
GENERATION_PARAMS:
|
| 54 |
+
normalize: True
|
| 55 |
+
MD5: acf6c6763f53a27cd3a36171d821d82b
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/tensorboard/events.out.tfevents.1769020087.brev-5x9knwe1p.1065478.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:870c82056b4f0d6bcb55cd2f75d258c14f442be3c69cba996c35e46e17068299
|
| 3 |
+
size 13634150
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/timemoe_base.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 采样概率变化
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from easydict import EasyDict
|
| 6 |
+
sys.path.append(os.path.abspath(__file__ + '/../../..'))
|
| 7 |
+
|
| 8 |
+
from ..arch import TimeMoE4
|
| 9 |
+
from ..data import MixedSourceDataset_v2
|
| 10 |
+
from ..runner import TimeMoERunner
|
| 11 |
+
from ..loss import fake_loss
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
############################## Hot Parameters ##############################
|
| 15 |
+
# Dataset & Metrics configuration
|
| 16 |
+
# Model architecture and parameters
|
| 17 |
+
|
| 18 |
+
pretrained = False # Whether to use a pretrained model
|
| 19 |
+
|
| 20 |
+
MODEL_ARCH = TimeMoE4
|
| 21 |
+
|
| 22 |
+
MODEL_PARAM = {
|
| 23 |
+
'model_id': "baselines/TimeMoE/ckpt/TimeMoE-50M",
|
| 24 |
+
'from_pretrained': pretrained,
|
| 25 |
+
'context_length': 4079,
|
| 26 |
+
'trust_remote_code': True,
|
| 27 |
+
}
|
| 28 |
+
DATA_NAME = "Main"
|
| 29 |
+
|
| 30 |
+
# N = 20_000_000
|
| 31 |
+
# batch size = 16*8
|
| 32 |
+
# 20_000_000 / 16 / 8 = 156250 iterations
|
| 33 |
+
# 20_000_000 * 4096 / 16 / 8 / 4096 = 156_250
|
| 34 |
+
|
| 35 |
+
NUM_ITERATIONS = 200_018 # 总轮数 20_000_000 * 4096 / 16 / 4 / 4096 = 312,500
|
| 36 |
+
VAL_ITERATION_INTERVAL = 5_000 # 每VAL_ITERATION_INTERVAL执行一次验证
|
| 37 |
+
|
| 38 |
+
############################## General Configuration ##############################
|
| 39 |
+
CFG = EasyDict()
|
| 40 |
+
# General settings
|
| 41 |
+
CFG.DESCRIPTION = 'TimeMoE Base'
|
| 42 |
+
CFG.DEVICE = 'gpu'
|
| 43 |
+
CFG.DEVICE_NUM = 3
|
| 44 |
+
# Runner
|
| 45 |
+
CFG.RUNNER = TimeMoERunner
|
| 46 |
+
|
| 47 |
+
############################## Model Configuration ################################
|
| 48 |
+
CFG.MODEL = EasyDict()
|
| 49 |
+
CFG.MODEL.NAME = MODEL_ARCH.__name__
|
| 50 |
+
CFG.MODEL.ARCH = MODEL_ARCH
|
| 51 |
+
CFG.MODEL.PARAM = MODEL_PARAM
|
| 52 |
+
CFG.MODEL.DTYPE= 'bfloat16'
|
| 53 |
+
# CFG.MODEL.DTYPE= 'float32'
|
| 54 |
+
|
| 55 |
+
############################## Metrics Configuration ##############################
|
| 56 |
+
CFG.METRICS = EasyDict()
|
| 57 |
+
# Metrics settings
|
| 58 |
+
CFG.METRICS.FUNCS = EasyDict({})
|
| 59 |
+
|
| 60 |
+
############################## Training Configuration ##############################
|
| 61 |
+
CFG.TRAIN = EasyDict()
|
| 62 |
+
CFG.TRAIN.COMPILE_MODEL = True
|
| 63 |
+
CFG.TRAIN.NUM_ITERATIONS = NUM_ITERATIONS
|
| 64 |
+
CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
|
| 65 |
+
'checkpoints',
|
| 66 |
+
MODEL_ARCH.__name__,
|
| 67 |
+
'_'.join([DATA_NAME, str(CFG.TRAIN.NUM_ITERATIONS)])
|
| 68 |
+
)
|
| 69 |
+
CFG.TRAIN.CKPT_SAVE_STRATEGY = VAL_ITERATION_INTERVAL * 1 # 保存策略,每VAL_ITERATION_INTERVAL * 5保存一次模型
|
| 70 |
+
CFG.TRAIN.LOSS = fake_loss
|
| 71 |
+
# Optimizer settings
|
| 72 |
+
CFG.TRAIN.OPTIM = EasyDict()
|
| 73 |
+
CFG.TRAIN.OPTIM.TYPE = "AdamW"
|
| 74 |
+
CFG.TRAIN.OPTIM.PARAM = {
|
| 75 |
+
"lr": 1e-3,
|
| 76 |
+
"betas": (0.9, 0.95),
|
| 77 |
+
# "betas": (0.9, 0.98),
|
| 78 |
+
"fused": True,
|
| 79 |
+
# "weight_decay": 1e-1,
|
| 80 |
+
}
|
| 81 |
+
# Learning rate scheduler settings
|
| 82 |
+
CFG.TRAIN.LR_SCHEDULER = EasyDict()
|
| 83 |
+
CFG.TRAIN.LR_SCHEDULER.TYPE = "CosineWarmup"
|
| 84 |
+
CFG.TRAIN.LR_SCHEDULER.PARAM = {
|
| 85 |
+
'num_warmup_steps': 10_000, # 10k
|
| 86 |
+
'num_training_steps': NUM_ITERATIONS,
|
| 87 |
+
}
|
| 88 |
+
CFG.TRAIN.CLIP_GRAD_PARAM = {
|
| 89 |
+
'max_norm': 1.0
|
| 90 |
+
}
|
| 91 |
+
# Train data loader settings
|
| 92 |
+
CFG.TRAIN.DATA = EasyDict()
|
| 93 |
+
CFG.TRAIN.DATA.BATCH_SIZE = 85 # 16 / 4
|
| 94 |
+
CFG.TRAIN.DATA.SHUFFLE = True # has to be False
|
| 95 |
+
CFG.TRAIN.DATA.PIN_MEMORY = True
|
| 96 |
+
CFG.TRAIN.DATA.PREFETCH = True
|
| 97 |
+
CFG.TRAIN.GRAD_ACCUMULATION_STEPS = 1
|
| 98 |
+
# CFG.TRAIN.DATA.NUM_WORKERS = 4
|
| 99 |
+
|
| 100 |
+
############################## Validation Configuration ##############################
|
| 101 |
+
CFG.VAL = EasyDict()
|
| 102 |
+
CFG.VAL.INTERVAL = VAL_ITERATION_INTERVAL
|
| 103 |
+
CFG.VAL.DATA = EasyDict()
|
| 104 |
+
CFG.VAL.DATA.BATCH_SIZE = 170 # 32 / 8
|
| 105 |
+
|
| 106 |
+
############################## Evaluation Configuration ##############################
|
| 107 |
+
|
| 108 |
+
CFG.EVAL = EasyDict()
|
| 109 |
+
# Evaluation parameters
|
| 110 |
+
CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True
|
| 111 |
+
|
| 112 |
+
############################## Dataset Configuration ##############################
|
| 113 |
+
CFG.DATASET = EasyDict()
|
| 114 |
+
# Dataset settings
|
| 115 |
+
CFG.DATASET.NAME = DATA_NAME
|
| 116 |
+
CFG.DATASET.TYPE = MixedSourceDataset_v2
|
| 117 |
+
CFG.DATASET.PARAM = EasyDict({
|
| 118 |
+
'num_valid_samples': 1000
|
| 119 |
+
})
|
| 120 |
+
|
| 121 |
+
############################## Inference Configuration ##############################
|
| 122 |
+
CFG.INFERENCE = EasyDict()
|
| 123 |
+
CFG.INFERENCE.GENERATION_PARAMS = EasyDict({
|
| 124 |
+
'normalize': not pretrained
|
| 125 |
+
})
|
| 126 |
+
|
Main_200018/acf6c6763f53a27cd3a36171d821d82b/training_log_20260121182753.log
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-21 18:27:53,225 - easytorch-training - INFO - Initializing training.
|
| 2 |
+
2026-01-21 18:27:53,225 - easytorch-training - INFO - Set clip grad, param: {'max_norm': 1.0}
|
| 3 |
+
2026-01-21 18:27:53,226 - easytorch-training - INFO - Building training data loader.
|
| 4 |
+
2026-01-21 18:28:07,863 - easytorch-training - INFO - MixedSourceDataset initialized for 'train' mode.
|
| 5 |
+
2026-01-21 18:28:07,864 - easytorch-training - INFO - - real: 3201174 samples
|
| 6 |
+
2026-01-21 18:28:07,864 - easytorch-training - INFO - - synth: 2000000 samples
|
| 7 |
+
2026-01-21 18:28:07,864 - easytorch-training - INFO - Train dataset length: 3201174
|
| 8 |
+
2026-01-21 18:28:07,867 - easytorch-training - INFO - Set optim: AdamW (
|
| 9 |
+
Parameter Group 0
|
| 10 |
+
amsgrad: False
|
| 11 |
+
betas: (0.9, 0.95)
|
| 12 |
+
capturable: False
|
| 13 |
+
differentiable: False
|
| 14 |
+
eps: 1e-08
|
| 15 |
+
foreach: None
|
| 16 |
+
fused: True
|
| 17 |
+
lr: 0.001
|
| 18 |
+
maximize: False
|
| 19 |
+
weight_decay: 0.01
|
| 20 |
+
)
|
| 21 |
+
2026-01-21 18:28:07,867 - easytorch-training - INFO - Set lr_scheduler: <basicts.runners.optim.lr_schedulers.CosineWarmup object at 0x7446b8174950>
|
| 22 |
+
2026-01-21 18:28:07,868 - easytorch-training - INFO - Initializing validation.
|
| 23 |
+
2026-01-21 18:28:07,869 - easytorch-training - INFO - Building val data loader.
|
| 24 |
+
2026-01-21 18:28:11,063 - easytorch-training - INFO - Worker 0 initialized for cauker_univariate.
|
| 25 |
+
2026-01-21 18:28:34,427 - easytorch-training - INFO - MixedSourceDataset initialized for 'valid' mode.
|
| 26 |
+
2026-01-21 18:28:34,427 - easytorch-training - INFO - - real: 1000 samples
|
| 27 |
+
2026-01-21 18:28:34,427 - easytorch-training - INFO - Valid dataset length: 1000
|
| 28 |
+
2026-01-21 18:28:34,428 - easytorch-training - INFO - Number of parameters: 12653568
|
| 29 |
+
2026-01-21 18:28:34,428 - easytorch-training - INFO - Training with 3 GPUs, batch size per GPUs: 85, grad_accumulation_steps: 1
|
| 30 |
+
2026-01-21 18:28:34,428 - easytorch-training - INFO - Effective batch size: 255
|
| 31 |
+
2026-01-21 19:34:46,353 - easytorch-training - INFO - Iteration 5000 / 200018
|
| 32 |
+
2026-01-21 19:34:46,845 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.79 (s), train/lr: 2.50e-04, train/loss: 3.5735, train/grad_norm: 6.2758, train/amp_scale: 1.0000]
|
| 33 |
+
2026-01-21 19:34:46,846 - easytorch-training - INFO - Start validation.
|
| 34 |
+
2026-01-21 19:35:01,878 - easytorch-training - INFO - Result <val>: [val/time: 14.85 (s), val/loss: 3.3217]
|
| 35 |
+
2026-01-21 19:35:01,989 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_best_val_loss.pt saved
|
| 36 |
+
2026-01-21 19:35:02,095 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_005000.pt saved
|
| 37 |
+
2026-01-21 19:35:02,096 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 14:47:15
|
| 38 |
+
2026-01-21 20:35:05,639 - easytorch-training - INFO - Iteration 10000 / 200018
|
| 39 |
+
2026-01-21 20:35:06,130 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.72 (s), train/lr: 7.50e-04, train/loss: 2.7588, train/grad_norm: 2.9374, train/amp_scale: 1.0000]
|
| 40 |
+
2026-01-21 20:35:06,131 - easytorch-training - INFO - Start validation.
|
| 41 |
+
2026-01-21 20:35:12,849 - easytorch-training - INFO - Result <val>: [val/time: 6.54 (s), val/loss: 3.2390]
|
| 42 |
+
2026-01-21 20:35:12,983 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_best_val_loss.pt saved
|
| 43 |
+
2026-01-21 20:35:13,103 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_010000.pt saved
|
| 44 |
+
2026-01-21 20:35:13,104 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 12:41:41
|
| 45 |
+
2026-01-21 21:33:44,220 - easytorch-training - INFO - Iteration 15000 / 200018
|
| 46 |
+
2026-01-21 21:33:44,711 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 9.99e-04, train/loss: 2.6467, train/grad_norm: 2.5058, train/amp_scale: 1.0000]
|
| 47 |
+
2026-01-21 21:33:44,711 - easytorch-training - INFO - Start validation.
|
| 48 |
+
2026-01-21 21:33:51,425 - easytorch-training - INFO - Result <val>: [val/time: 6.54 (s), val/loss: 3.1982]
|
| 49 |
+
2026-01-21 21:33:51,552 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_best_val_loss.pt saved
|
| 50 |
+
2026-01-21 21:33:51,670 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_015000.pt saved
|
| 51 |
+
2026-01-21 21:33:51,670 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 11:39:17
|
| 52 |
+
2026-01-21 22:32:30,744 - easytorch-training - INFO - Iteration 20000 / 200018
|
| 53 |
+
2026-01-21 22:32:31,228 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 9.96e-04, train/loss: 2.5970, train/grad_norm: 2.4983, train/amp_scale: 1.0000]
|
| 54 |
+
2026-01-21 22:32:31,228 - easytorch-training - INFO - Start validation.
|
| 55 |
+
2026-01-21 22:32:37,925 - easytorch-training - INFO - Result <val>: [val/time: 6.52 (s), val/loss: 3.1878]
|
| 56 |
+
2026-01-21 22:32:38,053 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_best_val_loss.pt saved
|
| 57 |
+
2026-01-21 22:32:38,171 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_020000.pt saved
|
| 58 |
+
2026-01-21 22:32:38,173 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 11:09:25
|
| 59 |
+
2026-01-21 23:31:00,094 - easytorch-training - INFO - Iteration 25000 / 200018
|
| 60 |
+
2026-01-21 23:31:00,582 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 9.89e-04, train/loss: 2.5758, train/grad_norm: 2.4831, train/amp_scale: 1.0000]
|
| 61 |
+
2026-01-21 23:31:00,582 - easytorch-training - INFO - Start validation.
|
| 62 |
+
2026-01-21 23:31:07,290 - easytorch-training - INFO - Result <val>: [val/time: 6.53 (s), val/loss: 3.1744]
|
| 63 |
+
2026-01-21 23:31:07,419 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_best_val_loss.pt saved
|
| 64 |
+
2026-01-21 23:31:07,537 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_025000.pt saved
|
| 65 |
+
2026-01-21 23:31:07,537 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 10:49:12
|
| 66 |
+
2026-01-22 00:27:46,491 - easytorch-training - INFO - Iteration 30000 / 200018
|
| 67 |
+
2026-01-22 00:27:46,981 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.68 (s), train/lr: 9.79e-04, train/loss: 2.5595, train/grad_norm: 2.5564, train/amp_scale: 1.0000]
|
| 68 |
+
2026-01-22 00:27:46,981 - easytorch-training - INFO - Start validation.
|
| 69 |
+
2026-01-22 00:27:53,696 - easytorch-training - INFO - Result <val>: [val/time: 6.54 (s), val/loss: 3.1661]
|
| 70 |
+
2026-01-22 00:27:53,824 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_best_val_loss.pt saved
|
| 71 |
+
2026-01-22 00:27:53,943 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_030000.pt saved
|
| 72 |
+
2026-01-22 00:27:53,943 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 10:24:17
|
| 73 |
+
2026-01-22 01:27:10,403 - easytorch-training - INFO - Iteration 35000 / 200018
|
| 74 |
+
2026-01-22 01:27:10,888 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 9.66e-04, train/loss: 2.5426, train/grad_norm: 2.5068, train/amp_scale: 1.0000]
|
| 75 |
+
2026-01-22 01:27:10,888 - easytorch-training - INFO - Start validation.
|
| 76 |
+
2026-01-22 01:27:17,574 - easytorch-training - INFO - Result <val>: [val/time: 6.51 (s), val/loss: 3.1707]
|
| 77 |
+
2026-01-22 01:27:17,703 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_035000.pt saved
|
| 78 |
+
2026-01-22 01:27:17,704 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 10:21:28
|
| 79 |
+
2026-01-22 02:27:38,931 - easytorch-training - INFO - Iteration 40000 / 200018
|
| 80 |
+
2026-01-22 02:27:39,420 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.72 (s), train/lr: 9.49e-04, train/loss: 2.5410, train/grad_norm: 2.5819, train/amp_scale: 1.0000]
|
| 81 |
+
2026-01-22 02:27:39,420 - easytorch-training - INFO - Start validation.
|
| 82 |
+
2026-01-22 02:27:46,137 - easytorch-training - INFO - Result <val>: [val/time: 6.54 (s), val/loss: 3.1605]
|
| 83 |
+
2026-01-22 02:27:46,269 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_best_val_loss.pt saved
|
| 84 |
+
2026-01-22 02:27:46,389 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_040000.pt saved
|
| 85 |
+
2026-01-22 02:27:46,391 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 10:24:47
|
| 86 |
+
2026-01-22 03:27:49,005 - easytorch-training - INFO - Iteration 45000 / 200018
|
| 87 |
+
2026-01-22 03:27:49,495 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.72 (s), train/lr: 9.29e-04, train/loss: 2.5352, train/grad_norm: 2.5853, train/amp_scale: 1.0000]
|
| 88 |
+
2026-01-22 03:27:49,496 - easytorch-training - INFO - Start validation.
|
| 89 |
+
2026-01-22 03:27:56,201 - easytorch-training - INFO - Result <val>: [val/time: 6.53 (s), val/loss: 3.1562]
|
| 90 |
+
2026-01-22 03:27:56,333 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_best_val_loss.pt saved
|
| 91 |
+
2026-01-22 03:27:56,454 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_045000.pt saved
|
| 92 |
+
2026-01-22 03:27:56,454 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 10:25:58
|
| 93 |
+
2026-01-22 04:27:08,412 - easytorch-training - INFO - Iteration 50000 / 200018
|
| 94 |
+
2026-01-22 04:27:08,903 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 9.07e-04, train/loss: 2.5229, train/grad_norm: 2.5580, train/amp_scale: 1.0000]
|
| 95 |
+
2026-01-22 04:27:08,903 - easytorch-training - INFO - Start validation.
|
| 96 |
+
2026-01-22 04:27:15,629 - easytorch-training - INFO - Result <val>: [val/time: 6.55 (s), val/loss: 3.1524]
|
| 97 |
+
2026-01-22 04:27:15,758 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_best_val_loss.pt saved
|
| 98 |
+
2026-01-22 04:27:15,877 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_050000.pt saved
|
| 99 |
+
2026-01-22 04:27:15,879 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 10:23:33
|
| 100 |
+
2026-01-22 05:25:44,680 - easytorch-training - INFO - Iteration 55000 / 200018
|
| 101 |
+
2026-01-22 05:25:45,168 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 8.81e-04, train/loss: 2.5207, train/grad_norm: 2.4932, train/amp_scale: 1.0000]
|
| 102 |
+
2026-01-22 05:25:45,169 - easytorch-training - INFO - Start validation.
|
| 103 |
+
2026-01-22 05:25:51,875 - easytorch-training - INFO - Result <val>: [val/time: 6.53 (s), val/loss: 3.1542]
|
| 104 |
+
2026-01-22 05:25:52,018 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_055000.pt saved
|
| 105 |
+
2026-01-22 05:25:52,019 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 10:18:56
|
| 106 |
+
2026-01-22 06:25:48,720 - easytorch-training - INFO - Iteration 60000 / 200018
|
| 107 |
+
2026-01-22 06:25:49,204 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.72 (s), train/lr: 8.53e-04, train/loss: 2.5222, train/grad_norm: 2.4614, train/amp_scale: 1.0000]
|
| 108 |
+
2026-01-22 06:25:49,204 - easytorch-training - INFO - Start validation.
|
| 109 |
+
2026-01-22 06:25:55,879 - easytorch-training - INFO - Result <val>: [val/time: 6.50 (s), val/loss: 3.1517]
|
| 110 |
+
2026-01-22 06:25:55,997 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_best_val_loss.pt saved
|
| 111 |
+
2026-01-22 06:25:56,106 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_060000.pt saved
|
| 112 |
+
2026-01-22 06:25:56,107 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 10:19:59
|
| 113 |
+
2026-01-22 07:26:46,431 - easytorch-training - INFO - Iteration 65000 / 200018
|
| 114 |
+
2026-01-22 07:26:46,915 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.73 (s), train/lr: 8.23e-04, train/loss: 2.5805, train/grad_norm: 3.4873, train/amp_scale: 1.0000]
|
| 115 |
+
2026-01-22 07:26:46,916 - easytorch-training - INFO - Start validation.
|
| 116 |
+
2026-01-22 07:26:53,580 - easytorch-training - INFO - Result <val>: [val/time: 6.49 (s), val/loss: 3.4660]
|
| 117 |
+
2026-01-22 07:26:53,693 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200018/acf6c6763f53a27cd3a36171d821d82b/TimeMoE4_065000.pt saved
|
| 118 |
+
2026-01-22 07:26:53,693 - easytorch-training - INFO - The estimated training finish time is 2026-01-23 10:23:37
|
| 119 |
+
2026-01-22 07:37:10,038 - easytorch-training - ERROR - Traceback (most recent call last):
|
| 120 |
+
File "/home/nvidia/miniconda3/envs/zxx/lib/python3.11/site-packages/easytorch/launcher/launcher.py", line 31, in training_func
|
| 121 |
+
runner.train(cfg)
|
| 122 |
+
File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_iteration_runner.py", line 200, in train
|
| 123 |
+
self.train_iters(iteration=iteration, dataloader=self.train_data_loader)
|
| 124 |
+
File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 281, in train_iters
|
| 125 |
+
self.backward(loss, accumulating=accumulating)
|
| 126 |
+
File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 337, in backward
|
| 127 |
+
grad_norm = sum(
|
| 128 |
+
^^^^
|
| 129 |
+
File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 338, in <genexpr>
|
| 130 |
+
param.grad.data.norm(2).item() ** 2 for param in self.model.parameters() if param.grad is not None
|
| 131 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 132 |
+
KeyboardInterrupt
|
| 133 |
+
|