Upload Main_200005
Browse files- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_005000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_010000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_015000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_020000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_025000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_030000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_035000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_040000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_045000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_050000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_055000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_060000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_065000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_070000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_075000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_080000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_085000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_090000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_095000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_100000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_105000.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/cfg.txt +55 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/tensorboard/events.out.tfevents.1767866265.brev-5x9knwe1p.3279046.0 +3 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/timemoe_base.py +125 -0
- Main_200005/3deef857eb45e05066abe66fffa5e741/training_log_20260108095735.log +168 -0
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_005000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:653445b35f3dbb18b242d6eba895a605d7d0ee5f5072adf42e3193ef32b01506
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_010000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d1d5e4c4b2d520ee3cec703075591aba625353cacfdf06d27c329370e128783a
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_015000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a78cf923ac99263fbe2ee9996ea21bed318c66de5f5eead98268f0030336f97f
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_020000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2fb271a4d4c93f119d9db89a88432bae69e4416625a0dea1153f4f5bdf02574
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_025000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:717a1c310f0fed3aa47ff654ead91d2ca6fcf61d13d923aa82bf9195a415c597
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_030000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e3c17cc803f1a4f4f3438140420993de605969dbe60f6e097ccc31189e6306f
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_035000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b413471980faf7da6684ac8042cd03655995d559ad824a04b60d221f159be5ec
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_040000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:808aa05f19451e12f276c28ab65d53be741e528d92f797782f30c7d63c619e10
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_045000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:380fc5b3262c896f728c3c64ae10e0cd849f201b91f6062474af3b582e7cbc12
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_050000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a41cc59a4b70751e222c552506ba9f877fbf5f722ba30396e42cf0a0158321f
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_055000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7ab7b9c7cd0b58cfe001a6a95c056b48c8cb755f3513a01db80eac2e3dac624
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_060000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c5f3ac77f58d0dae71c6942a5e48464eecd768e014cce08f0b21af704d3a90f
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_065000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f89282d6542da36fbae3d5a5581e2acf7f3c24ee73cdc4907d06580642b408c
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_070000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea6c914570276447e79cfc69f351bdb5d7b7d599c644f3394ae4d3a51cbaeb8f
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_075000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47aff15071f4a2243cf779090acc2043b48672c35b88375f9e649de2865da723
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_080000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d457af1b68067e29661becf377a8fb5f04290aec065c8b6b087489e8956e3c62
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_085000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e76034357fb9df9bd7744286fcbb0f044b6b48fdcce757aa9daf740b08e38880
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_090000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d354b1f32f0dffe82e922e00bf6b350a5c461f17155d7d8690f3264a5193c5ca
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_095000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1130d319b51d16bf7be685dbdc6a91698f04b06b5226aeb0c34297140053211d
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_100000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:387630d204abf6816632b878c656267cc8b5829c0901d944ec644b78eec2510c
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_105000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2cc5bfe0f173826eb4f651764e51b65b5ec0a0d7e3e873ee79f1028f8605eece
|
| 3 |
+
size 144594680
|
Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:459630ea94a3afe92b8bdce3a96fd6813bd0fcebcca4fa1dde91686acdab7176
|
| 3 |
+
size 144597643
|
Main_200005/3deef857eb45e05066abe66fffa5e741/cfg.txt
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DESCRIPTION: TimeMoE Base
|
| 2 |
+
DEVICE: gpu
|
| 3 |
+
DEVICE_NUM: 4
|
| 4 |
+
RUNNER: <class 'baselines.TimeMoE4.runner.runner.TimeMoERunner'>
|
| 5 |
+
MODEL:
|
| 6 |
+
NAME: TimeMoE4
|
| 7 |
+
ARCH: <class 'baselines.TimeMoE4.arch.timemoe.TimeMoE4'>
|
| 8 |
+
PARAM:
|
| 9 |
+
model_id: baselines/TimeMoE/ckpt/TimeMoE-50M
|
| 10 |
+
from_pretrained: False
|
| 11 |
+
context_length: 4079
|
| 12 |
+
trust_remote_code: True
|
| 13 |
+
DTYPE: bfloat16
|
| 14 |
+
METRICS:
|
| 15 |
+
FUNCS:
|
| 16 |
+
TRAIN:
|
| 17 |
+
COMPILE_MODEL: True
|
| 18 |
+
NUM_ITERATIONS: 200005
|
| 19 |
+
CKPT_SAVE_DIR: checkpoints/TimeMoE4/Main_200005
|
| 20 |
+
CKPT_SAVE_STRATEGY: 5000
|
| 21 |
+
LOSS: fake_loss
|
| 22 |
+
OPTIM:
|
| 23 |
+
TYPE: AdamW
|
| 24 |
+
PARAM:
|
| 25 |
+
lr: 0.001
|
| 26 |
+
betas: (0.9, 0.95)
|
| 27 |
+
fused: True
|
| 28 |
+
LR_SCHEDULER:
|
| 29 |
+
TYPE: CosineWarmup
|
| 30 |
+
PARAM:
|
| 31 |
+
num_warmup_steps: 10000
|
| 32 |
+
num_training_steps: 200005
|
| 33 |
+
CLIP_GRAD_PARAM:
|
| 34 |
+
max_norm: 1.0
|
| 35 |
+
DATA:
|
| 36 |
+
BATCH_SIZE: 64
|
| 37 |
+
SHUFFLE: True
|
| 38 |
+
PIN_MEMORY: True
|
| 39 |
+
PREFETCH: True
|
| 40 |
+
GRAD_ACCUMULATION_STEPS: 1
|
| 41 |
+
VAL:
|
| 42 |
+
INTERVAL: 5000
|
| 43 |
+
DATA:
|
| 44 |
+
BATCH_SIZE: 128
|
| 45 |
+
EVAL:
|
| 46 |
+
USE_GPU: True
|
| 47 |
+
DATASET:
|
| 48 |
+
NAME: Main
|
| 49 |
+
TYPE: <class 'baselines.TimeMoE4.data.mix_dataset_v2.MixedSourceDataset_v2'>
|
| 50 |
+
PARAM:
|
| 51 |
+
num_valid_samples: 1000
|
| 52 |
+
INFERENCE:
|
| 53 |
+
GENERATION_PARAMS:
|
| 54 |
+
normalize: True
|
| 55 |
+
MD5: 3deef857eb45e05066abe66fffa5e741
|
Main_200005/3deef857eb45e05066abe66fffa5e741/tensorboard/events.out.tfevents.1767866265.brev-5x9knwe1p.3279046.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e19d6fa42d10bb05308a5a5b49a9bb2d0af56b2bf2677857f5e82643768b79fe
|
| 3 |
+
size 21789990
|
Main_200005/3deef857eb45e05066abe66fffa5e741/timemoe_base.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 采样概率变化
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from easydict import EasyDict
|
| 6 |
+
sys.path.append(os.path.abspath(__file__ + '/../../..'))
|
| 7 |
+
|
| 8 |
+
from ..arch import TimeMoE4
|
| 9 |
+
from ..data import MixedSourceDataset_v2
|
| 10 |
+
from ..runner import TimeMoERunner
|
| 11 |
+
from ..loss import fake_loss
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
############################## Hot Parameters ##############################
|
| 15 |
+
# Dataset & Metrics configuration
|
| 16 |
+
# Model architecture and parameters
|
| 17 |
+
|
| 18 |
+
pretrained = False # Whether to use a pretrained model
|
| 19 |
+
|
| 20 |
+
MODEL_ARCH = TimeMoE4
|
| 21 |
+
|
| 22 |
+
MODEL_PARAM = {
|
| 23 |
+
'model_id': "baselines/TimeMoE/ckpt/TimeMoE-50M",
|
| 24 |
+
'from_pretrained': pretrained,
|
| 25 |
+
'context_length': 4079,
|
| 26 |
+
'trust_remote_code': True,
|
| 27 |
+
}
|
| 28 |
+
DATA_NAME = "Main"
|
| 29 |
+
|
| 30 |
+
# N = 20_000_000
|
| 31 |
+
# batch size = 16*8
|
| 32 |
+
# 20_000_000 / 16 / 8 = 156250 iterations
|
| 33 |
+
# 20_000_000 * 4096 / 16 / 8 / 4096 = 156_250
|
| 34 |
+
|
| 35 |
+
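NUM_ITERATIONS = 200_005 # Total number of training iterations. NOTE(review): the earlier estimate 20_000_000 * 4096 / 16 / 4 / 4096 = 312,500 does not match this value; confirm which is intended.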
|
| 36 |
+
VAL_ITERATION_INTERVAL = 5_000 # 每VAL_ITERATION_INTERVAL执行一次验证
|
| 37 |
+
|
| 38 |
+
############################## General Configuration ##############################
|
| 39 |
+
CFG = EasyDict()
|
| 40 |
+
# General settings
|
| 41 |
+
CFG.DESCRIPTION = 'TimeMoE Base'
|
| 42 |
+
CFG.DEVICE = 'gpu'
|
| 43 |
+
CFG.DEVICE_NUM = 4
|
| 44 |
+
# Runner
|
| 45 |
+
CFG.RUNNER = TimeMoERunner
|
| 46 |
+
|
| 47 |
+
############################## Model Configuration ################################
|
| 48 |
+
CFG.MODEL = EasyDict()
|
| 49 |
+
CFG.MODEL.NAME = MODEL_ARCH.__name__
|
| 50 |
+
CFG.MODEL.ARCH = MODEL_ARCH
|
| 51 |
+
CFG.MODEL.PARAM = MODEL_PARAM
|
| 52 |
+
CFG.MODEL.DTYPE= 'bfloat16'
|
| 53 |
+
# CFG.MODEL.DTYPE= 'float32'
|
| 54 |
+
|
| 55 |
+
############################## Metrics Configuration ##############################
|
| 56 |
+
CFG.METRICS = EasyDict()
|
| 57 |
+
# Metrics settings
|
| 58 |
+
CFG.METRICS.FUNCS = EasyDict({})
|
| 59 |
+
|
| 60 |
+
############################## Training Configuration ##############################
|
| 61 |
+
CFG.TRAIN = EasyDict()
|
| 62 |
+
CFG.TRAIN.COMPILE_MODEL = True
|
| 63 |
+
CFG.TRAIN.NUM_ITERATIONS = NUM_ITERATIONS
|
| 64 |
+
CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
|
| 65 |
+
'checkpoints',
|
| 66 |
+
MODEL_ARCH.__name__,
|
| 67 |
+
'_'.join([DATA_NAME, str(CFG.TRAIN.NUM_ITERATIONS)])
|
| 68 |
+
)
|
| 69 |
+
CFG.TRAIN.CKPT_SAVE_STRATEGY = VAL_ITERATION_INTERVAL * 1 # Save strategy: write a checkpoint every VAL_ITERATION_INTERVAL * 1 iterations (i.e. at every validation step)
|
| 70 |
+
CFG.TRAIN.LOSS = fake_loss
|
| 71 |
+
# Optimizer settings
|
| 72 |
+
CFG.TRAIN.OPTIM = EasyDict()
|
| 73 |
+
CFG.TRAIN.OPTIM.TYPE = "AdamW"
|
| 74 |
+
CFG.TRAIN.OPTIM.PARAM = {
|
| 75 |
+
"lr": 1e-3,
|
| 76 |
+
"betas": (0.9, 0.95),
|
| 77 |
+
"fused": True,
|
| 78 |
+
# "weight_decay": 1e-1,
|
| 79 |
+
}
|
| 80 |
+
# Learning rate scheduler settings
|
| 81 |
+
CFG.TRAIN.LR_SCHEDULER = EasyDict()
|
| 82 |
+
CFG.TRAIN.LR_SCHEDULER.TYPE = "CosineWarmup"
|
| 83 |
+
CFG.TRAIN.LR_SCHEDULER.PARAM = {
|
| 84 |
+
'num_warmup_steps': int(NUM_ITERATIONS / 100 * 5), # 5%的warmup启动比例
|
| 85 |
+
'num_training_steps': NUM_ITERATIONS,
|
| 86 |
+
}
|
| 87 |
+
CFG.TRAIN.CLIP_GRAD_PARAM = {
|
| 88 |
+
'max_norm': 1.0
|
| 89 |
+
}
|
| 90 |
+
# Train data loader settings
|
| 91 |
+
CFG.TRAIN.DATA = EasyDict()
|
| 92 |
+
CFG.TRAIN.DATA.BATCH_SIZE = 64 # Per-GPU batch size; with 4 GPUs and GRAD_ACCUMULATION_STEPS = 1 the effective batch size is 256
|
| 93 |
+
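CFG.TRAIN.DATA.SHUFFLE = True # NOTE(review): the previous comment said "has to be False", which contradicts this value; confirm which setting is intended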
|
| 94 |
+
CFG.TRAIN.DATA.PIN_MEMORY = True
|
| 95 |
+
CFG.TRAIN.DATA.PREFETCH = True
|
| 96 |
+
CFG.TRAIN.GRAD_ACCUMULATION_STEPS = 1
|
| 97 |
+
# CFG.TRAIN.DATA.NUM_WORKERS = 4
|
| 98 |
+
|
| 99 |
+
############################## Validation Configuration ##############################
|
| 100 |
+
CFG.VAL = EasyDict()
|
| 101 |
+
CFG.VAL.INTERVAL = VAL_ITERATION_INTERVAL
|
| 102 |
+
CFG.VAL.DATA = EasyDict()
|
| 103 |
+
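CFG.VAL.DATA.BATCH_SIZE = 128 # NOTE(review): the previous "32 / 8" note does not correspond to 128; confirm the intended validation batch size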
|
| 104 |
+
|
| 105 |
+
############################## Evaluation Configuration ##############################
|
| 106 |
+
|
| 107 |
+
CFG.EVAL = EasyDict()
|
| 108 |
+
# Evaluation parameters
|
| 109 |
+
CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True
|
| 110 |
+
|
| 111 |
+
############################## Dataset Configuration ##############################
|
| 112 |
+
CFG.DATASET = EasyDict()
|
| 113 |
+
# Dataset settings
|
| 114 |
+
CFG.DATASET.NAME = DATA_NAME
|
| 115 |
+
CFG.DATASET.TYPE = MixedSourceDataset_v2
|
| 116 |
+
CFG.DATASET.PARAM = EasyDict({
|
| 117 |
+
'num_valid_samples': 1000
|
| 118 |
+
})
|
| 119 |
+
|
| 120 |
+
############################## Inference Configuration ##############################
|
| 121 |
+
CFG.INFERENCE = EasyDict()
|
| 122 |
+
CFG.INFERENCE.GENERATION_PARAMS = EasyDict({
|
| 123 |
+
'normalize': not pretrained
|
| 124 |
+
})
|
| 125 |
+
|
Main_200005/3deef857eb45e05066abe66fffa5e741/training_log_20260108095735.log
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-08 09:57:35,149 - easytorch-training - INFO - Initializing training.
|
| 2 |
+
2026-01-08 09:57:35,151 - easytorch-training - INFO - Set clip grad, param: {'max_norm': 1.0}
|
| 3 |
+
2026-01-08 09:57:35,152 - easytorch-training - INFO - Building training data loader.
|
| 4 |
+
2026-01-08 09:57:45,649 - easytorch-training - INFO - MixedSourceDataset initialized for 'train' mode.
|
| 5 |
+
2026-01-08 09:57:45,649 - easytorch-training - INFO - - real: 3201174 samples
|
| 6 |
+
2026-01-08 09:57:45,649 - easytorch-training - INFO - - synth: 2000000 samples
|
| 7 |
+
2026-01-08 09:57:45,649 - easytorch-training - INFO - Train dataset length: 3201174
|
| 8 |
+
2026-01-08 09:57:45,651 - easytorch-training - INFO - Set optim: AdamW (
|
| 9 |
+
Parameter Group 0
|
| 10 |
+
amsgrad: False
|
| 11 |
+
betas: (0.9, 0.95)
|
| 12 |
+
capturable: False
|
| 13 |
+
differentiable: False
|
| 14 |
+
eps: 1e-08
|
| 15 |
+
foreach: None
|
| 16 |
+
fused: True
|
| 17 |
+
lr: 0.001
|
| 18 |
+
maximize: False
|
| 19 |
+
weight_decay: 0.01
|
| 20 |
+
)
|
| 21 |
+
2026-01-08 09:57:45,651 - easytorch-training - INFO - Set lr_scheduler: <basicts.runners.optim.lr_schedulers.CosineWarmup object at 0x755940b1e350>
|
| 22 |
+
2026-01-08 09:57:45,653 - easytorch-training - INFO - Initializing validation.
|
| 23 |
+
2026-01-08 09:57:45,653 - easytorch-training - INFO - Building val data loader.
|
| 24 |
+
2026-01-08 09:57:45,932 - easytorch-training - INFO - Worker 0 initialized for cauker_univariate.
|
| 25 |
+
2026-01-08 09:58:09,126 - easytorch-training - INFO - MixedSourceDataset initialized for 'valid' mode.
|
| 26 |
+
2026-01-08 09:58:09,126 - easytorch-training - INFO - - real: 1000 samples
|
| 27 |
+
2026-01-08 09:58:09,126 - easytorch-training - INFO - Valid dataset length: 1000
|
| 28 |
+
2026-01-08 09:58:09,127 - easytorch-training - INFO - Number of parameters: 12038400
|
| 29 |
+
2026-01-08 09:58:09,127 - easytorch-training - INFO - Training with 4 GPUs, batch size per GPUs: 64, grad_accumulation_steps: 1
|
| 30 |
+
2026-01-08 09:58:09,127 - easytorch-training - INFO - Effective batch size: 256
|
| 31 |
+
2026-01-08 10:41:47,062 - easytorch-training - INFO - Iteration 5000 / 200005
|
| 32 |
+
2026-01-08 10:41:47,383 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.52 (s), train/lr: 2.50e-04, train/loss: 3.4083, train/grad_norm: 133.6961, train/amp_scale: 1.0000]
|
| 33 |
+
2026-01-08 10:41:47,383 - easytorch-training - INFO - Start validation.
|
| 34 |
+
2026-01-08 10:42:01,110 - easytorch-training - INFO - Result <val>: [val/time: 13.61 (s), val/loss: 3.3069]
|
| 35 |
+
2026-01-08 10:42:01,216 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt saved
|
| 36 |
+
2026-01-08 10:42:01,320 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_005000.pt saved
|
| 37 |
+
2026-01-08 10:42:01,321 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:12:59
|
| 38 |
+
2026-01-08 11:26:29,905 - easytorch-training - INFO - Iteration 10000 / 200005
|
| 39 |
+
2026-01-08 11:26:30,222 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.53 (s), train/lr: 7.50e-04, train/loss: 2.7186, train/grad_norm: 2.1180, train/amp_scale: 1.0000]
|
| 40 |
+
2026-01-08 11:26:30,222 - easytorch-training - INFO - Start validation.
|
| 41 |
+
2026-01-08 11:26:36,748 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 3.2427]
|
| 42 |
+
2026-01-08 11:26:36,864 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt saved
|
| 43 |
+
2026-01-08 11:26:36,969 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_010000.pt saved
|
| 44 |
+
2026-01-08 11:26:36,970 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:27:28
|
| 45 |
+
2026-01-08 12:11:57,053 - easytorch-training - INFO - Iteration 15000 / 200005
|
| 46 |
+
2026-01-08 12:11:57,372 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.54 (s), train/lr: 9.99e-04, train/loss: 2.6231, train/grad_norm: 1.8888, train/amp_scale: 1.0000]
|
| 47 |
+
2026-01-08 12:11:57,373 - easytorch-training - INFO - Start validation.
|
| 48 |
+
2026-01-08 12:12:03,905 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 3.2321]
|
| 49 |
+
2026-01-08 12:12:04,018 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt saved
|
| 50 |
+
2026-01-08 12:12:04,122 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_015000.pt saved
|
| 51 |
+
2026-01-08 12:12:04,123 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:43:45
|
| 52 |
+
2026-01-08 12:57:11,461 - easytorch-training - INFO - Iteration 20000 / 200005
|
| 53 |
+
2026-01-08 12:57:11,819 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.54 (s), train/lr: 9.96e-04, train/loss: 2.5773, train/grad_norm: 1.9563, train/amp_scale: 1.0000]
|
| 54 |
+
2026-01-08 12:57:11,819 - easytorch-training - INFO - Start validation.
|
| 55 |
+
2026-01-08 12:57:18,360 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.1896]
|
| 56 |
+
2026-01-08 12:57:18,471 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt saved
|
| 57 |
+
2026-01-08 12:57:18,577 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_020000.pt saved
|
| 58 |
+
2026-01-08 12:57:18,578 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:49:46
|
| 59 |
+
2026-01-08 13:42:08,090 - easytorch-training - INFO - Iteration 25000 / 200005
|
| 60 |
+
2026-01-08 13:42:08,408 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.54 (s), train/lr: 9.89e-04, train/loss: 2.5514, train/grad_norm: 2.0475, train/amp_scale: 1.0000]
|
| 61 |
+
2026-01-08 13:42:08,408 - easytorch-training - INFO - Start validation.
|
| 62 |
+
2026-01-08 13:42:14,945 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.1773]
|
| 63 |
+
2026-01-08 13:42:15,058 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt saved
|
| 64 |
+
2026-01-08 13:42:15,161 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_025000.pt saved
|
| 65 |
+
2026-01-08 13:42:15,162 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:51:00
|
| 66 |
+
2026-01-08 14:27:14,135 - easytorch-training - INFO - Iteration 30000 / 200005
|
| 67 |
+
2026-01-08 14:27:14,453 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.54 (s), train/lr: 9.79e-04, train/loss: 2.5403, train/grad_norm: 2.0965, train/amp_scale: 1.0000]
|
| 68 |
+
2026-01-08 14:27:14,453 - easytorch-training - INFO - Start validation.
|
| 69 |
+
2026-01-08 14:27:20,978 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.1741]
|
| 70 |
+
2026-01-08 14:27:21,091 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt saved
|
| 71 |
+
2026-01-08 14:27:21,195 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_030000.pt saved
|
| 72 |
+
2026-01-08 14:27:21,195 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:52:52
|
| 73 |
+
2026-01-08 15:11:20,762 - easytorch-training - INFO - Iteration 35000 / 200005
|
| 74 |
+
2026-01-08 15:11:21,083 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.53 (s), train/lr: 9.66e-04, train/loss: 2.5332, train/grad_norm: 2.1668, train/amp_scale: 1.0000]
|
| 75 |
+
2026-01-08 15:11:21,084 - easytorch-training - INFO - Start validation.
|
| 76 |
+
2026-01-08 15:11:27,609 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.1861]
|
| 77 |
+
2026-01-08 15:11:27,719 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_035000.pt saved
|
| 78 |
+
2026-01-08 15:11:27,719 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:48:32
|
| 79 |
+
2026-01-08 15:55:29,835 - easytorch-training - INFO - Iteration 40000 / 200005
|
| 80 |
+
2026-01-08 15:55:30,153 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.53 (s), train/lr: 9.49e-04, train/loss: 2.5232, train/grad_norm: 2.1491, train/amp_scale: 1.0000]
|
| 81 |
+
2026-01-08 15:55:30,153 - easytorch-training - INFO - Start validation.
|
| 82 |
+
2026-01-08 15:55:36,671 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.1705]
|
| 83 |
+
2026-01-08 15:55:36,792 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt saved
|
| 84 |
+
2026-01-08 15:55:36,903 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_040000.pt saved
|
| 85 |
+
2026-01-08 15:55:36,904 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:45:30
|
| 86 |
+
2026-01-08 16:40:01,848 - easytorch-training - INFO - Iteration 45000 / 200005
|
| 87 |
+
2026-01-08 16:40:02,165 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.53 (s), train/lr: 9.29e-04, train/loss: 2.5130, train/grad_norm: 2.2228, train/amp_scale: 1.0000]
|
| 88 |
+
2026-01-08 16:40:02,166 - easytorch-training - INFO - Start validation.
|
| 89 |
+
2026-01-08 16:40:08,698 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 3.1653]
|
| 90 |
+
2026-01-08 16:40:08,820 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt saved
|
| 91 |
+
2026-01-08 16:40:08,933 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_045000.pt saved
|
| 92 |
+
2026-01-08 16:40:08,933 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:44:50
|
| 93 |
+
2026-01-08 17:24:16,453 - easytorch-training - INFO - Iteration 50000 / 200005
|
| 94 |
+
2026-01-08 17:24:16,772 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.53 (s), train/lr: 9.07e-04, train/loss: 2.5088, train/grad_norm: 2.2215, train/amp_scale: 1.0000]
|
| 95 |
+
2026-01-08 17:24:16,772 - easytorch-training - INFO - Start validation.
|
| 96 |
+
2026-01-08 17:24:23,291 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.1643]
|
| 97 |
+
2026-01-08 17:24:23,403 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt saved
|
| 98 |
+
2026-01-08 17:24:23,504 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_050000.pt saved
|
| 99 |
+
2026-01-08 17:24:23,506 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:43:09
|
| 100 |
+
2026-01-08 18:08:09,060 - easytorch-training - INFO - Iteration 55000 / 200005
|
| 101 |
+
2026-01-08 18:08:09,380 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.52 (s), train/lr: 8.81e-04, train/loss: 2.5053, train/grad_norm: 2.1781, train/amp_scale: 1.0000]
|
| 102 |
+
2026-01-08 18:08:09,380 - easytorch-training - INFO - Start validation.
|
| 103 |
+
2026-01-08 18:08:15,939 - easytorch-training - INFO - Result <val>: [val/time: 6.44 (s), val/loss: 3.1559]
|
| 104 |
+
2026-01-08 18:08:16,051 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt saved
|
| 105 |
+
2026-01-08 18:08:16,155 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_055000.pt saved
|
| 106 |
+
2026-01-08 18:08:16,156 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:40:26
|
| 107 |
+
2026-01-08 18:53:09,811 - easytorch-training - INFO - Iteration 60000 / 200005
|
| 108 |
+
2026-01-08 18:53:10,128 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.54 (s), train/lr: 8.53e-04, train/loss: 2.5007, train/grad_norm: 2.1794, train/amp_scale: 1.0000]
|
| 109 |
+
2026-01-08 18:53:10,128 - easytorch-training - INFO - Start validation.
|
| 110 |
+
2026-01-08 18:53:16,637 - easytorch-training - INFO - Result <val>: [val/time: 6.39 (s), val/loss: 3.1610]
|
| 111 |
+
2026-01-08 18:53:16,744 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_060000.pt saved
|
| 112 |
+
2026-01-08 18:53:16,744 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:41:57
|
| 113 |
+
2026-01-08 19:36:29,147 - easytorch-training - INFO - Iteration 65000 / 200005
|
| 114 |
+
2026-01-08 19:36:29,465 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.52 (s), train/lr: 8.23e-04, train/loss: 2.4949, train/grad_norm: 2.1817, train/amp_scale: 1.0000]
|
| 115 |
+
2026-01-08 19:36:29,466 - easytorch-training - INFO - Start validation.
|
| 116 |
+
2026-01-08 19:36:36,007 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.1579]
|
| 117 |
+
2026-01-08 19:36:36,112 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_065000.pt saved
|
| 118 |
+
2026-01-08 19:36:36,113 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:38:02
|
| 119 |
+
2026-01-08 20:21:16,749 - easytorch-training - INFO - Iteration 70000 / 200005
|
| 120 |
+
2026-01-08 20:21:17,065 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.54 (s), train/lr: 7.90e-04, train/loss: 2.4959, train/grad_norm: 2.1763, train/amp_scale: 1.0000]
|
| 121 |
+
2026-01-08 20:21:17,066 - easytorch-training - INFO - Start validation.
|
| 122 |
+
2026-01-08 20:21:23,587 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.1523]
|
| 123 |
+
2026-01-08 20:21:23,698 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt saved
|
| 124 |
+
2026-01-08 20:21:23,801 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_070000.pt saved
|
| 125 |
+
2026-01-08 20:21:23,802 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:38:53
|
| 126 |
+
2026-01-08 21:05:50,545 - easytorch-training - INFO - Iteration 75000 / 200005
|
| 127 |
+
2026-01-08 21:05:50,863 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.53 (s), train/lr: 7.56e-04, train/loss: 2.4902, train/grad_norm: 2.1678, train/amp_scale: 1.0000]
|
| 128 |
+
2026-01-08 21:05:50,863 - easytorch-training - INFO - Start validation.
|
| 129 |
+
2026-01-08 21:05:57,411 - easytorch-training - INFO - Result <val>: [val/time: 6.43 (s), val/loss: 3.1493]
|
| 130 |
+
2026-01-08 21:05:57,531 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_best_val_loss.pt saved
|
| 131 |
+
2026-01-08 21:05:57,641 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_075000.pt saved
|
| 132 |
+
2026-01-08 21:05:57,642 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:39:01
|
| 133 |
+
2026-01-08 21:51:31,371 - easytorch-training - INFO - Iteration 80000 / 200005
|
| 134 |
+
2026-01-08 21:51:31,686 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.55 (s), train/lr: 7.20e-04, train/loss: 2.5014, train/grad_norm: 2.1775, train/amp_scale: 1.0000]
|
| 135 |
+
2026-01-08 21:51:31,686 - easytorch-training - INFO - Start validation.
|
| 136 |
+
2026-01-08 21:51:38,191 - easytorch-training - INFO - Result <val>: [val/time: 6.38 (s), val/loss: 3.1504]
|
| 137 |
+
2026-01-08 21:51:38,308 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_080000.pt saved
|
| 138 |
+
2026-01-08 21:51:38,308 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:41:54
|
| 139 |
+
2026-01-08 22:33:18,101 - easytorch-training - INFO - Iteration 85000 / 200005
|
| 140 |
+
2026-01-08 22:33:18,420 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.50 (s), train/lr: 6.82e-04, train/loss: 2.4792, train/grad_norm: 2.2245, train/amp_scale: 1.0000]
|
| 141 |
+
2026-01-08 22:33:18,421 - easytorch-training - INFO - Start validation.
|
| 142 |
+
2026-01-08 22:33:24,960 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.1497]
|
| 143 |
+
2026-01-08 22:33:25,066 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_085000.pt saved
|
| 144 |
+
2026-01-08 22:33:25,066 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:35:17
|
| 145 |
+
2026-01-08 23:19:27,157 - easytorch-training - INFO - Iteration 90000 / 200005
|
| 146 |
+
2026-01-08 23:19:27,478 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.55 (s), train/lr: 6.43e-04, train/loss: 2.5439, train/grad_norm: 2.4947, train/amp_scale: 1.0000]
|
| 147 |
+
2026-01-08 23:19:27,478 - easytorch-training - INFO - Start validation.
|
| 148 |
+
2026-01-08 23:19:34,008 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 3.3415]
|
| 149 |
+
2026-01-08 23:19:34,118 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_090000.pt saved
|
| 150 |
+
2026-01-08 23:19:34,119 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:39:07
|
| 151 |
+
2026-01-09 00:07:46,269 - easytorch-training - INFO - Iteration 95000 / 200005
|
| 152 |
+
2026-01-09 00:07:46,602 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 6.03e-04, train/loss: 4.5141, train/grad_norm: 18243.9237, train/amp_scale: 1.0000]
|
| 153 |
+
2026-01-09 00:07:46,602 - easytorch-training - INFO - Start validation.
|
| 154 |
+
2026-01-09 00:07:53,124 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 5.2037]
|
| 155 |
+
2026-01-09 00:07:53,232 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_095000.pt saved
|
| 156 |
+
2026-01-09 00:07:53,233 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 15:47:06
|
| 157 |
+
2026-01-09 01:02:39,127 - easytorch-training - INFO - Iteration 100000 / 200005
|
| 158 |
+
2026-01-09 01:02:39,443 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.66 (s), train/lr: 5.62e-04, train/loss: 5.6952, train/grad_norm: 59405430.2133, train/amp_scale: 1.0000]
|
| 159 |
+
2026-01-09 01:02:39,444 - easytorch-training - INFO - Start validation.
|
| 160 |
+
2026-01-09 01:02:45,959 - easytorch-training - INFO - Result <val>: [val/time: 6.39 (s), val/loss: 5.3386]
|
| 161 |
+
2026-01-09 01:02:46,064 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_100000.pt saved
|
| 162 |
+
2026-01-09 01:02:46,064 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 16:07:25
|
| 163 |
+
2026-01-09 01:46:34,959 - easytorch-training - INFO - Iteration 105000 / 200005
|
| 164 |
+
2026-01-09 01:46:35,275 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.53 (s), train/lr: 5.21e-04, train/loss: 5.5087, train/grad_norm: 691109484.9481, train/amp_scale: 1.0000]
|
| 165 |
+
2026-01-09 01:46:35,276 - easytorch-training - INFO - Start validation.
|
| 166 |
+
2026-01-09 01:46:41,776 - easytorch-training - INFO - Result <val>: [val/time: 6.38 (s), val/loss: 5.5645]
|
| 167 |
+
2026-01-09 01:46:41,878 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200005/3deef857eb45e05066abe66fffa5e741/TimeMoE4_105000.pt saved
|
| 168 |
+
2026-01-09 01:46:41,878 - easytorch-training - INFO - The estimated training finish time is 2026-01-09 16:04:57
|