Upload Main_100002
Browse files- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_005000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_010000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_015000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_020000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_025000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_030000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_035000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_040000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_045000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_050000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_055000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_060000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_065000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_070000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_075000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_080000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_085000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_090000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_095000.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_best_val_loss.pt +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/cfg.txt +55 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/tensorboard/events.out.tfevents.1768466342.brev-5x9knwe1p.1779457.0 +3 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/timemoe_base.py +125 -0
- Main_100002/371eb6ff0d4cfac53569c28370657ebd/training_log_20260115083849.log +152 -0
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_005000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:998450ed80bab9cc6c3c7177eae742be780c8bb88434d8afd26a046dfcffae1d
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_010000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:102f4f4757e0fe6d774eba90c2812d5f4723aece28801a77d4f9473cee819655
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_015000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b66426ee74c5ce5fc7c99852b490a51b1140aa1bbdfba22eb10401f1b27ecb38
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_020000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:352198520b23e7ef2151b84f806f5e5ffab1e0c5983b5557e9e6a6e1d43692ab
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_025000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6922548f9e2466c0d068b06b171a6c327edb53b8ec1d4a3f52b75c4c008ea70
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_030000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6d97eaa815aab63109d5992a3979af23dff208efcb0632dc31926ad68736e6f0
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_035000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bcf90c48ed91a86b4ef5041eb61ae12d184b94e6f9a8deed64f488c4304db6b0
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_040000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3bae84f254eef3c98109cc5607f9294e55f4927dfb725b2ff7ed0c6cffac41f3
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_045000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15ebd69c970667e2b24af65dd154ee0da569eebe1dd608ca89bf41500ce45262
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_050000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e47671c9ace7e0971b1b28f0d6f6aaedad7f19a00a29a8d4eb27d43388083640
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_055000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a11def54155e353ec214311dab96866cde9e10795483e301c7d01e4081bff8b
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_060000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cb4237edb183449549456fb23fbb193ef15ba34cb87d62d2292069ccc0cc9514
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_065000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b14b137188cc0bf8e79f55d995a51b47c401af33857d72b9773f4f5e1fb248ba
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_070000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83692de0aab93c2b012ffe0ed4514be8c956568bf5b52298bacc963de9b2c284
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_075000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c718741d7e726b0c8a9cdcfb7ac33b6bde0afe4035c0e468fd98e0297916fd4f
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_080000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2dc2665be8da91579bc2cda528e20f0793147f771fc74ef488cb89452467af9e
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_085000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73e90399f0a701e7942f86b9945af1d21e36f3a13d8eee02bfd43cb825aa0f35
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_090000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:376c02f1496e7c53864c4c49d65aa754337fea4b850fe23848239c7b0c5bbdd8
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_095000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5e1aca0a6391dfe6e415c997e49be7521b3f315d48ebb33202f61de562283a0
|
| 3 |
+
size 151982040
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_best_val_loss.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f6d2e0c162a48b81bbb2f17a1c407409325d1149f20f8e4a9b23f160c866460
|
| 3 |
+
size 151985051
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/cfg.txt
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DESCRIPTION: TimeMoE Base
|
| 2 |
+
DEVICE: gpu
|
| 3 |
+
DEVICE_NUM: 3
|
| 4 |
+
RUNNER: <class 'baselines.TimeMoE4.runner.runner.TimeMoERunner'>
|
| 5 |
+
MODEL:
|
| 6 |
+
NAME: TimeMoE4
|
| 7 |
+
ARCH: <class 'baselines.TimeMoE4.arch.timemoe.TimeMoE4'>
|
| 8 |
+
PARAM:
|
| 9 |
+
model_id: baselines/TimeMoE/ckpt/TimeMoE-50M
|
| 10 |
+
from_pretrained: False
|
| 11 |
+
context_length: 4079
|
| 12 |
+
trust_remote_code: True
|
| 13 |
+
DTYPE: bfloat16
|
| 14 |
+
METRICS:
|
| 15 |
+
FUNCS:
|
| 16 |
+
TRAIN:
|
| 17 |
+
COMPILE_MODEL: True
|
| 18 |
+
NUM_ITERATIONS: 100002
|
| 19 |
+
CKPT_SAVE_DIR: checkpoints/TimeMoE4/Main_100002
|
| 20 |
+
CKPT_SAVE_STRATEGY: 5000
|
| 21 |
+
LOSS: fake_loss
|
| 22 |
+
OPTIM:
|
| 23 |
+
TYPE: AdamW
|
| 24 |
+
PARAM:
|
| 25 |
+
lr: 0.001
|
| 26 |
+
betas: (0.9, 0.95)
|
| 27 |
+
fused: True
|
| 28 |
+
LR_SCHEDULER:
|
| 29 |
+
TYPE: CosineWarmup
|
| 30 |
+
PARAM:
|
| 31 |
+
num_warmup_steps: 5000
|
| 32 |
+
num_training_steps: 100002
|
| 33 |
+
CLIP_GRAD_PARAM:
|
| 34 |
+
max_norm: 1.0
|
| 35 |
+
DATA:
|
| 36 |
+
BATCH_SIZE: 85
|
| 37 |
+
SHUFFLE: True
|
| 38 |
+
PIN_MEMORY: True
|
| 39 |
+
PREFETCH: True
|
| 40 |
+
GRAD_ACCUMULATION_STEPS: 1
|
| 41 |
+
VAL:
|
| 42 |
+
INTERVAL: 5000
|
| 43 |
+
DATA:
|
| 44 |
+
BATCH_SIZE: 170
|
| 45 |
+
EVAL:
|
| 46 |
+
USE_GPU: True
|
| 47 |
+
DATASET:
|
| 48 |
+
NAME: Main
|
| 49 |
+
TYPE: <class 'baselines.TimeMoE4.data.mix_dataset_v2.MixedSourceDataset_v2'>
|
| 50 |
+
PARAM:
|
| 51 |
+
num_valid_samples: 1000
|
| 52 |
+
INFERENCE:
|
| 53 |
+
GENERATION_PARAMS:
|
| 54 |
+
normalize: True
|
| 55 |
+
MD5: 371eb6ff0d4cfac53569c28370657ebd
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/tensorboard/events.out.tfevents.1768466342.brev-5x9knwe1p.1779457.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c4f9848cadc6757bb1dcc629187a59c569d09aefaaecaf3c533f3c271771d02f
|
| 3 |
+
size 20187882
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/timemoe_base.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sampling probability changes
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from easydict import EasyDict
|
| 6 |
+
sys.path.append(os.path.abspath(__file__ + '/../../..'))
|
| 7 |
+
|
| 8 |
+
from ..arch import TimeMoE4
|
| 9 |
+
from ..data import MixedSourceDataset_v2
|
| 10 |
+
from ..runner import TimeMoERunner
|
| 11 |
+
from ..loss import fake_loss
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
############################## Hot Parameters ##############################
|
| 15 |
+
# Dataset & Metrics configuration
|
| 16 |
+
# Model architecture and parameters
|
| 17 |
+
|
| 18 |
+
pretrained = False # Whether to use a pretrained model
|
| 19 |
+
|
| 20 |
+
MODEL_ARCH = TimeMoE4
|
| 21 |
+
|
| 22 |
+
MODEL_PARAM = {
|
| 23 |
+
'model_id': "baselines/TimeMoE/ckpt/TimeMoE-50M",
|
| 24 |
+
'from_pretrained': pretrained,
|
| 25 |
+
'context_length': 4079,
|
| 26 |
+
'trust_remote_code': True,
|
| 27 |
+
}
|
| 28 |
+
DATA_NAME = "Main"
|
| 29 |
+
|
| 30 |
+
# N = 20_000_000
|
| 31 |
+
# batch size = 16*8
|
| 32 |
+
# 20_000_000 / 16 / 8 = 156250 iterations
|
| 33 |
+
# 20_000_000 * 4096 / 16 / 8 / 4096 = 156_250
|
| 34 |
+
|
| 35 |
+
NUM_ITERATIONS = 100_002 # Total number of training iterations. NOTE(review): the original estimate 20_000_000 * 4096 / 16 / 4 / 4096 = 312,500 does not match this value — confirm.
|
| 36 |
+
VAL_ITERATION_INTERVAL = 5_000 # Run validation once every VAL_ITERATION_INTERVAL iterations
|
| 37 |
+
|
| 38 |
+
############################## General Configuration ##############################
|
| 39 |
+
CFG = EasyDict()
|
| 40 |
+
# General settings
|
| 41 |
+
CFG.DESCRIPTION = 'TimeMoE Base'
|
| 42 |
+
CFG.DEVICE = 'gpu'
|
| 43 |
+
CFG.DEVICE_NUM = 3
|
| 44 |
+
# Runner
|
| 45 |
+
CFG.RUNNER = TimeMoERunner
|
| 46 |
+
|
| 47 |
+
############################## Model Configuration ################################
|
| 48 |
+
CFG.MODEL = EasyDict()
|
| 49 |
+
CFG.MODEL.NAME = MODEL_ARCH.__name__
|
| 50 |
+
CFG.MODEL.ARCH = MODEL_ARCH
|
| 51 |
+
CFG.MODEL.PARAM = MODEL_PARAM
|
| 52 |
+
CFG.MODEL.DTYPE= 'bfloat16'
|
| 53 |
+
# CFG.MODEL.DTYPE= 'float32'
|
| 54 |
+
|
| 55 |
+
############################## Metrics Configuration ##############################
|
| 56 |
+
CFG.METRICS = EasyDict()
|
| 57 |
+
# Metrics settings
|
| 58 |
+
CFG.METRICS.FUNCS = EasyDict({})
|
| 59 |
+
|
| 60 |
+
############################## Training Configuration ##############################
|
| 61 |
+
CFG.TRAIN = EasyDict()
|
| 62 |
+
CFG.TRAIN.COMPILE_MODEL = True
|
| 63 |
+
CFG.TRAIN.NUM_ITERATIONS = NUM_ITERATIONS
|
| 64 |
+
CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
|
| 65 |
+
'checkpoints',
|
| 66 |
+
MODEL_ARCH.__name__,
|
| 67 |
+
'_'.join([DATA_NAME, str(CFG.TRAIN.NUM_ITERATIONS)])
|
| 68 |
+
)
|
| 69 |
+
CFG.TRAIN.CKPT_SAVE_STRATEGY = VAL_ITERATION_INTERVAL * 1 # Save strategy: save a checkpoint every VAL_ITERATION_INTERVAL * 1 iterations (the earlier note said "* 5", which no longer matches the code)
|
| 70 |
+
CFG.TRAIN.LOSS = fake_loss
|
| 71 |
+
# Optimizer settings
|
| 72 |
+
CFG.TRAIN.OPTIM = EasyDict()
|
| 73 |
+
CFG.TRAIN.OPTIM.TYPE = "AdamW"
|
| 74 |
+
CFG.TRAIN.OPTIM.PARAM = {
|
| 75 |
+
"lr": 1e-3,
|
| 76 |
+
"betas": (0.9, 0.95),
|
| 77 |
+
"fused": True,
|
| 78 |
+
# "weight_decay": 1e-1,
|
| 79 |
+
}
|
| 80 |
+
# Learning rate scheduler settings
|
| 81 |
+
CFG.TRAIN.LR_SCHEDULER = EasyDict()
|
| 82 |
+
CFG.TRAIN.LR_SCHEDULER.TYPE = "CosineWarmup"
|
| 83 |
+
CFG.TRAIN.LR_SCHEDULER.PARAM = {
|
| 84 |
+
'num_warmup_steps': int(NUM_ITERATIONS / 100 * 5), # warm up over 5% of the training iterations
|
| 85 |
+
'num_training_steps': NUM_ITERATIONS,
|
| 86 |
+
}
|
| 87 |
+
CFG.TRAIN.CLIP_GRAD_PARAM = {
|
| 88 |
+
'max_norm': 1.0
|
| 89 |
+
}
|
| 90 |
+
# Train data loader settings
|
| 91 |
+
CFG.TRAIN.DATA = EasyDict()
|
| 92 |
+
CFG.TRAIN.DATA.BATCH_SIZE = 85 # NOTE(review): presumably the per-GPU batch size; the old "16 / 4" note does not match this value — confirm
|
| 93 |
+
CFG.TRAIN.DATA.SHUFFLE = True # NOTE(review): the original note said this "has to be False", which contradicts the value set here — confirm which is intended
|
| 94 |
+
CFG.TRAIN.DATA.PIN_MEMORY = True
|
| 95 |
+
CFG.TRAIN.DATA.PREFETCH = True
|
| 96 |
+
CFG.TRAIN.GRAD_ACCUMULATION_STEPS = 1
|
| 97 |
+
# CFG.TRAIN.DATA.NUM_WORKERS = 4
|
| 98 |
+
|
| 99 |
+
############################## Validation Configuration ##############################
|
| 100 |
+
CFG.VAL = EasyDict()
|
| 101 |
+
CFG.VAL.INTERVAL = VAL_ITERATION_INTERVAL
|
| 102 |
+
CFG.VAL.DATA = EasyDict()
|
| 103 |
+
CFG.VAL.DATA.BATCH_SIZE = 170 # NOTE(review): the old "32 / 8" note does not match this value — confirm
|
| 104 |
+
|
| 105 |
+
############################## Evaluation Configuration ##############################
|
| 106 |
+
|
| 107 |
+
CFG.EVAL = EasyDict()
|
| 108 |
+
# Evaluation parameters
|
| 109 |
+
CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True
|
| 110 |
+
|
| 111 |
+
############################## Dataset Configuration ##############################
|
| 112 |
+
CFG.DATASET = EasyDict()
|
| 113 |
+
# Dataset settings
|
| 114 |
+
CFG.DATASET.NAME = DATA_NAME
|
| 115 |
+
CFG.DATASET.TYPE = MixedSourceDataset_v2
|
| 116 |
+
CFG.DATASET.PARAM = EasyDict({
|
| 117 |
+
'num_valid_samples': 1000
|
| 118 |
+
})
|
| 119 |
+
|
| 120 |
+
############################## Inference Configuration ##############################
|
| 121 |
+
CFG.INFERENCE = EasyDict()
|
| 122 |
+
CFG.INFERENCE.GENERATION_PARAMS = EasyDict({
|
| 123 |
+
'normalize': not pretrained
|
| 124 |
+
})
|
| 125 |
+
|
Main_100002/371eb6ff0d4cfac53569c28370657ebd/training_log_20260115083849.log
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-15 08:38:49,877 - easytorch-training - INFO - Initializing training.
|
| 2 |
+
2026-01-15 08:38:49,878 - easytorch-training - INFO - Set clip grad, param: {'max_norm': 1.0}
|
| 3 |
+
2026-01-15 08:38:49,879 - easytorch-training - INFO - Building training data loader.
|
| 4 |
+
2026-01-15 08:39:02,306 - easytorch-training - INFO - MixedSourceDataset initialized for 'train' mode.
|
| 5 |
+
2026-01-15 08:39:02,306 - easytorch-training - INFO - - real: 3201174 samples
|
| 6 |
+
2026-01-15 08:39:02,307 - easytorch-training - INFO - - synth: 2000000 samples
|
| 7 |
+
2026-01-15 08:39:02,307 - easytorch-training - INFO - Train dataset length: 3201174
|
| 8 |
+
2026-01-15 08:39:02,309 - easytorch-training - INFO - Set optim: AdamW (
|
| 9 |
+
Parameter Group 0
|
| 10 |
+
amsgrad: False
|
| 11 |
+
betas: (0.9, 0.95)
|
| 12 |
+
capturable: False
|
| 13 |
+
differentiable: False
|
| 14 |
+
eps: 1e-08
|
| 15 |
+
foreach: None
|
| 16 |
+
fused: True
|
| 17 |
+
lr: 0.001
|
| 18 |
+
maximize: False
|
| 19 |
+
weight_decay: 0.01
|
| 20 |
+
)
|
| 21 |
+
2026-01-15 08:39:02,309 - easytorch-training - INFO - Set lr_scheduler: <basicts.runners.optim.lr_schedulers.CosineWarmup object at 0x77f375ad7950>
|
| 22 |
+
2026-01-15 08:39:02,310 - easytorch-training - INFO - Initializing validation.
|
| 23 |
+
2026-01-15 08:39:02,311 - easytorch-training - INFO - Building val data loader.
|
| 24 |
+
2026-01-15 08:39:04,081 - easytorch-training - INFO - Worker 0 initialized for cauker_univariate.
|
| 25 |
+
2026-01-15 08:39:29,237 - easytorch-training - INFO - MixedSourceDataset initialized for 'valid' mode.
|
| 26 |
+
2026-01-15 08:39:29,237 - easytorch-training - INFO - - real: 1000 samples
|
| 27 |
+
2026-01-15 08:39:29,237 - easytorch-training - INFO - Valid dataset length: 1000
|
| 28 |
+
2026-01-15 08:39:29,238 - easytorch-training - INFO - Number of parameters: 12653568
|
| 29 |
+
2026-01-15 08:39:29,238 - easytorch-training - INFO - Training with 3 GPUs, batch size per GPUs: 85, grad_accumulation_steps: 1
|
| 30 |
+
2026-01-15 08:39:29,238 - easytorch-training - INFO - Effective batch size: 255
|
| 31 |
+
2026-01-15 09:37:30,831 - easytorch-training - INFO - Iteration 5000 / 100002
|
| 32 |
+
2026-01-15 09:37:31,320 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 5.00e-04, train/loss: 3.2906, train/grad_norm: 5.5040, train/amp_scale: 1.0000]
|
| 33 |
+
2026-01-15 09:37:31,320 - easytorch-training - INFO - Start validation.
|
| 34 |
+
2026-01-15 09:37:43,761 - easytorch-training - INFO - Result <val>: [val/time: 12.26 (s), val/loss: 3.3327]
|
| 35 |
+
2026-01-15 09:37:43,870 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_best_val_loss.pt saved
|
| 36 |
+
2026-01-15 09:37:43,977 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_005000.pt saved
|
| 37 |
+
2026-01-15 09:37:43,978 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:04:25
|
| 38 |
+
2026-01-15 10:37:54,341 - easytorch-training - INFO - Iteration 10000 / 100002
|
| 39 |
+
2026-01-15 10:37:54,824 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.72 (s), train/lr: 9.98e-04, train/loss: 2.6038, train/grad_norm: 2.4474, train/amp_scale: 1.0000]
|
| 40 |
+
2026-01-15 10:37:54,825 - easytorch-training - INFO - Start validation.
|
| 41 |
+
2026-01-15 10:38:01,430 - easytorch-training - INFO - Result <val>: [val/time: 6.43 (s), val/loss: 3.2617]
|
| 42 |
+
2026-01-15 10:38:01,550 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_best_val_loss.pt saved
|
| 43 |
+
2026-01-15 10:38:01,661 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_010000.pt saved
|
| 44 |
+
2026-01-15 10:38:01,662 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:24:54
|
| 45 |
+
2026-01-15 11:37:28,330 - easytorch-training - INFO - Iteration 15000 / 100002
|
| 46 |
+
2026-01-15 11:37:28,820 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 9.84e-04, train/loss: 2.5290, train/grad_norm: 2.4423, train/amp_scale: 1.0000]
|
| 47 |
+
2026-01-15 11:37:28,820 - easytorch-training - INFO - Start validation.
|
| 48 |
+
2026-01-15 11:37:35,446 - easytorch-training - INFO - Result <val>: [val/time: 6.45 (s), val/loss: 3.2413]
|
| 49 |
+
2026-01-15 11:37:35,579 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_best_val_loss.pt saved
|
| 50 |
+
2026-01-15 11:37:35,697 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_015000.pt saved
|
| 51 |
+
2026-01-15 11:37:35,698 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:26:53
|
| 52 |
+
2026-01-15 12:36:32,303 - easytorch-training - INFO - Iteration 20000 / 100002
|
| 53 |
+
2026-01-15 12:36:32,791 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 9.57e-04, train/loss: 2.4877, train/grad_norm: 2.5034, train/amp_scale: 1.0000]
|
| 54 |
+
2026-01-15 12:36:32,791 - easytorch-training - INFO - Start validation.
|
| 55 |
+
2026-01-15 12:36:39,411 - easytorch-training - INFO - Result <val>: [val/time: 6.44 (s), val/loss: 3.2209]
|
| 56 |
+
2026-01-15 12:36:39,530 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_best_val_loss.pt saved
|
| 57 |
+
2026-01-15 12:36:39,640 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_020000.pt saved
|
| 58 |
+
2026-01-15 12:36:39,641 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:25:22
|
| 59 |
+
2026-01-15 13:40:54,833 - easytorch-training - INFO - Iteration 25000 / 100002
|
| 60 |
+
2026-01-15 13:40:55,324 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.77 (s), train/lr: 9.18e-04, train/loss: 2.4780, train/grad_norm: 2.5490, train/amp_scale: 1.0000]
|
| 61 |
+
2026-01-15 13:40:55,325 - easytorch-training - INFO - Start validation.
|
| 62 |
+
2026-01-15 13:41:01,925 - easytorch-training - INFO - Result <val>: [val/time: 6.43 (s), val/loss: 3.1968]
|
| 63 |
+
2026-01-15 13:41:02,055 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_best_val_loss.pt saved
|
| 64 |
+
2026-01-15 13:41:02,175 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_025000.pt saved
|
| 65 |
+
2026-01-15 13:41:02,176 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:45:42
|
| 66 |
+
2026-01-15 14:46:20,547 - easytorch-training - INFO - Iteration 30000 / 100002
|
| 67 |
+
2026-01-15 14:46:21,049 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.78 (s), train/lr: 8.67e-04, train/loss: 2.4605, train/grad_norm: 2.5594, train/amp_scale: 1.0000]
|
| 68 |
+
2026-01-15 14:46:21,050 - easytorch-training - INFO - Start validation.
|
| 69 |
+
2026-01-15 14:46:27,677 - easytorch-training - INFO - Result <val>: [val/time: 6.45 (s), val/loss: 3.2146]
|
| 70 |
+
2026-01-15 14:46:27,813 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_030000.pt saved
|
| 71 |
+
2026-01-15 14:46:27,814 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 05:02:45
|
| 72 |
+
2026-01-15 15:45:51,839 - easytorch-training - INFO - Iteration 35000 / 100002
|
| 73 |
+
2026-01-15 15:45:52,327 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 8.07e-04, train/loss: 2.4428, train/grad_norm: 2.6078, train/amp_scale: 1.0000]
|
| 74 |
+
2026-01-15 15:45:52,328 - easytorch-training - INFO - Start validation.
|
| 75 |
+
2026-01-15 15:45:58,922 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.2057]
|
| 76 |
+
2026-01-15 15:45:59,029 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_035000.pt saved
|
| 77 |
+
2026-01-15 15:45:59,030 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:58:04
|
| 78 |
+
2026-01-15 16:45:06,547 - easytorch-training - INFO - Iteration 40000 / 100002
|
| 79 |
+
2026-01-15 16:45:15,031 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 7.38e-04, train/loss: 2.4477, train/grad_norm: 2.6098, train/amp_scale: 1.0000]
|
| 80 |
+
2026-01-15 16:45:15,032 - easytorch-training - INFO - Start validation.
|
| 81 |
+
2026-01-15 16:45:21,616 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 3.2111]
|
| 82 |
+
2026-01-15 16:45:21,743 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_040000.pt saved
|
| 83 |
+
2026-01-15 16:45:21,743 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:54:11
|
| 84 |
+
2026-01-15 17:42:46,527 - easytorch-training - INFO - Iteration 45000 / 100002
|
| 85 |
+
2026-01-15 17:42:47,013 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.69 (s), train/lr: 6.62e-04, train/loss: 2.4363, train/grad_norm: 2.5512, train/amp_scale: 1.0000]
|
| 86 |
+
2026-01-15 17:42:47,014 - easytorch-training - INFO - Start validation.
|
| 87 |
+
2026-01-15 17:42:53,621 - easytorch-training - INFO - Result <val>: [val/time: 6.43 (s), val/loss: 3.1859]
|
| 88 |
+
2026-01-15 17:42:53,740 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_best_val_loss.pt saved
|
| 89 |
+
2026-01-15 17:42:53,847 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_045000.pt saved
|
| 90 |
+
2026-01-15 17:42:53,849 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:47:05
|
| 91 |
+
2026-01-15 18:41:53,454 - easytorch-training - INFO - Iteration 50000 / 100002
|
| 92 |
+
2026-01-15 18:41:53,939 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 5.82e-04, train/loss: 2.4239, train/grad_norm: 2.5121, train/amp_scale: 1.0000]
|
| 93 |
+
2026-01-15 18:41:53,939 - easytorch-training - INFO - Start validation.
|
| 94 |
+
2026-01-15 18:42:00,553 - easytorch-training - INFO - Result <val>: [val/time: 6.44 (s), val/loss: 3.1826]
|
| 95 |
+
2026-01-15 18:42:00,682 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_best_val_loss.pt saved
|
| 96 |
+
2026-01-15 18:42:00,800 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_050000.pt saved
|
| 97 |
+
2026-01-15 18:42:00,801 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:44:33
|
| 98 |
+
2026-01-15 19:41:10,140 - easytorch-training - INFO - Iteration 55000 / 100002
|
| 99 |
+
2026-01-15 19:41:10,624 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 5.00e-04, train/loss: 2.4189, train/grad_norm: 2.4195, train/amp_scale: 1.0000]
|
| 100 |
+
2026-01-15 19:41:10,624 - easytorch-training - INFO - Start validation.
|
| 101 |
+
2026-01-15 19:41:17,232 - easytorch-training - INFO - Result <val>: [val/time: 6.43 (s), val/loss: 3.1847]
|
| 102 |
+
2026-01-15 19:41:17,352 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_055000.pt saved
|
| 103 |
+
2026-01-15 19:41:17,353 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:42:47
|
| 104 |
+
2026-01-15 20:39:49,923 - easytorch-training - INFO - Iteration 60000 / 100002
|
| 105 |
+
2026-01-15 20:39:50,407 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 4.18e-04, train/loss: 2.4076, train/grad_norm: 2.4589, train/amp_scale: 1.0000]
|
| 106 |
+
2026-01-15 20:39:50,407 - easytorch-training - INFO - Start validation.
|
| 107 |
+
2026-01-15 20:39:57,015 - easytorch-training - INFO - Result <val>: [val/time: 6.43 (s), val/loss: 3.1806]
|
| 108 |
+
2026-01-15 20:39:57,148 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_best_val_loss.pt saved
|
| 109 |
+
2026-01-15 20:39:57,274 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_060000.pt saved
|
| 110 |
+
2026-01-15 20:39:57,274 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:40:17
|
| 111 |
+
2026-01-15 21:38:33,022 - easytorch-training - INFO - Iteration 65000 / 100002
|
| 112 |
+
2026-01-15 21:38:33,502 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 3.38e-04, train/loss: 2.6238, train/grad_norm: 65.1523, train/amp_scale: 1.0000]
|
| 113 |
+
2026-01-15 21:38:33,502 - easytorch-training - INFO - Start validation.
|
| 114 |
+
2026-01-15 21:38:40,080 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.3509]
|
| 115 |
+
2026-01-15 21:38:40,201 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_065000.pt saved
|
| 116 |
+
2026-01-15 21:38:40,202 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:38:15
|
| 117 |
+
2026-01-15 22:38:09,765 - easytorch-training - INFO - Iteration 70000 / 100002
|
| 118 |
+
2026-01-15 22:38:17,101 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 2.62e-04, train/loss: 3.0549, train/grad_norm: 9366.2361, train/amp_scale: 1.0000]
|
| 119 |
+
2026-01-15 22:38:17,102 - easytorch-training - INFO - Start validation.
|
| 120 |
+
2026-01-15 22:38:23,670 - easytorch-training - INFO - Result <val>: [val/time: 6.39 (s), val/loss: 3.6156]
|
| 121 |
+
2026-01-15 22:38:23,792 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_070000.pt saved
|
| 122 |
+
2026-01-15 22:38:23,793 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:37:57
|
| 123 |
+
2026-01-15 23:36:56,603 - easytorch-training - INFO - Iteration 75000 / 100002
|
| 124 |
+
2026-01-15 23:36:57,084 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 1.93e-04, train/loss: 3.4898, train/grad_norm: 1685043.3068, train/amp_scale: 1.0000]
|
| 125 |
+
2026-01-15 23:36:57,084 - easytorch-training - INFO - Start validation.
|
| 126 |
+
2026-01-15 23:37:03,670 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 4.1228]
|
| 127 |
+
2026-01-15 23:37:03,789 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_075000.pt saved
|
| 128 |
+
2026-01-15 23:37:03,790 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:36:16
|
| 129 |
+
2026-01-16 00:35:52,808 - easytorch-training - INFO - Iteration 80000 / 100002
|
| 130 |
+
2026-01-16 00:35:53,291 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 1.33e-04, train/loss: 5.3236, train/grad_norm: 180469406.3815, train/amp_scale: 1.0000]
|
| 131 |
+
2026-01-16 00:35:53,292 - easytorch-training - INFO - Start validation.
|
| 132 |
+
2026-01-16 00:35:59,860 - easytorch-training - INFO - Result <val>: [val/time: 6.39 (s), val/loss: 5.4014]
|
| 133 |
+
2026-01-16 00:35:59,982 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_080000.pt saved
|
| 134 |
+
2026-01-16 00:35:59,983 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:35:09
|
| 135 |
+
2026-01-16 01:34:21,323 - easytorch-training - INFO - Iteration 85000 / 100002
|
| 136 |
+
2026-01-16 01:34:21,807 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.70 (s), train/lr: 8.19e-05, train/loss: 5.3023, train/grad_norm: 110604970.9048, train/amp_scale: 1.0000]
|
| 137 |
+
2026-01-16 01:34:21,807 - easytorch-training - INFO - Start validation.
|
| 138 |
+
2026-01-16 01:34:28,420 - easytorch-training - INFO - Result <val>: [val/time: 6.44 (s), val/loss: 5.4143]
|
| 139 |
+
2026-01-16 01:34:28,529 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_085000.pt saved
|
| 140 |
+
2026-01-16 01:34:28,530 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:33:36
|
| 141 |
+
2026-01-16 02:34:20,133 - easytorch-training - INFO - Iteration 90000 / 100002
|
| 142 |
+
2026-01-16 02:34:20,617 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.72 (s), train/lr: 4.26e-05, train/loss: 5.8005, train/grad_norm: 240414947.6978, train/amp_scale: 1.0000]
|
| 143 |
+
2026-01-16 02:34:20,617 - easytorch-training - INFO - Start validation.
|
| 144 |
+
2026-01-16 02:34:27,188 - easytorch-training - INFO - Result <val>: [val/time: 6.39 (s), val/loss: 5.0262]
|
| 145 |
+
2026-01-16 02:34:27,307 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_090000.pt saved
|
| 146 |
+
2026-01-16 02:34:27,307 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:33:55
|
| 147 |
+
2026-01-16 03:33:48,850 - easytorch-training - INFO - Iteration 95000 / 100002
|
| 148 |
+
2026-01-16 03:33:49,330 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.71 (s), train/lr: 1.59e-05, train/loss: 5.5571, train/grad_norm: 62206349.6520, train/amp_scale: 1.0000]
|
| 149 |
+
2026-01-16 03:33:49,330 - easytorch-training - INFO - Start validation.
|
| 150 |
+
2026-01-16 03:33:55,897 - easytorch-training - INFO - Result <val>: [val/time: 6.39 (s), val/loss: 5.0098]
|
| 151 |
+
2026-01-16 03:33:56,016 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_100002/371eb6ff0d4cfac53569c28370657ebd/TimeMoE4_095000.pt saved
|
| 152 |
+
2026-01-16 03:33:56,017 - easytorch-training - INFO - The estimated training finish time is 2026-01-16 04:33:39
|