Upload Main_200003
Browse files- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_005000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_010000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_015000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_020000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_025000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_030000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_035000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_040000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_045000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_050000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_055000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_060000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_065000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_070000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_075000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_080000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_085000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_090000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_095000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_100000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_105000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_110000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_115000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_120000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_125000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_130000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_135000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_140000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_145000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_150000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_155000.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/cfg.txt +55 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/tensorboard/events.out.tfevents.1767768860.brev-5x9knwe1p.1631682.0 +3 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/timemoe_base.py +125 -0
- Main_200003/937fc7f323a4d7d3ae9480105345ef67/training_log_20260107065409.log +248 -0
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_005000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf780ffedf8689f164c00b01604d3fa320388a40244b6b6059f13af95aa6bdb7
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_010000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c1a09acbcf0fe687cb88ea0d03384132f11491a7096226ffffce2ddb5a65c02
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_015000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aeee0418fb43874438b2ec7ce7987ab7c2fc0c821660a8521f95589f4b6d60ea
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_020000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:171021c6931f55a59a50d59a02408a56bb43cad2de64dcb43b76efd123bb824d
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_025000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28e691f8f2cf4ff2a8baace32eb3faef0c366bf180fb01857edcd9805f736b02
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_030000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4ce08683280fe98fbb478e39076857eecbca88581825bfb5b88c9b86a3463db
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_035000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0be1b1e3d9f137b6ba131317c98e6eac5bdc4588e0115b2d6482691660369884
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_040000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2aac929c7893a4c068cdd2b9131f04c9faf3554454c6a5b7edf5f84b3e6bda0
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_045000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b90b959f2b911283e3c075f9837ef51059db4a4bcd23dbfd80e7694dcb3bb9e
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_050000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff370a0eed00a8b466ffdeea2ca6890eab0cd7b3b0c8df0f401eaead634a6930
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_055000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f8c88f0927765376a3476b354cd3a603d3b79b5aa547a890b26bba0ec59a8351
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_060000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c27efcb8515056904d818705f2876a835584316166feeaa703ae9b729ed015b3
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_065000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d13c6fbfd9afadffae54fda69c5bc2b6b5cf172910b4e329765ef1d0b95feb6e
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_070000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2820bfd429f3ba6b66033debf9f9dbf0469ab50449cde6027930e62d9eb8230
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_075000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:56fa6d050540aa44f4613247f90f3cc9ae137f84137d4c0d7a3b4deeb7b8a664
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_080000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d202a0b77faab6eb2dbe7043e511b1c7f3331edc1ecd50265556fbd34d8bc25f
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_085000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60c39f90a6d0ce5a0888860d6788c16c7f3dc575b8537d2226430a0a5c71bbf3
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_090000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a05a57fba88ff9b76926afb7dfa2bc608ca7baca29c7576794a817e1eded5140
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_095000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2396b4925f09c9e9cbfd50a3f0666b954bae7f0bd2068cedb19685ebf5355625
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_100000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:36b38c1885f1c9a0f0a9fc26ee1951e8f5ae481f2e48424baf442d9a2b2eebb2
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_105000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:862fe1414fdce7e8033417aec971c496f6274f806d62f5b1fc5436dd832386a4
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_110000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55b2554934ef3616e79e5993b18b64ecfab90bc84a3f3428f7decbe26df4e1cc
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_115000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d57ffec3b32c9f77c718e799b63de0336de8b39891477f1389dd359386b0124b
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_120000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de4608681855baeb0dd09b303bbd3241adad8136aecb66ef1be4d3d30282b95f
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_125000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:812f53d3460c5e7e55a1fd616ded1d07c8784aebf630729a2d9a7e4898653d12
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_130000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0332478c95dac46cc1428f7e8bab84650c6316c973963795ac55635c8651ac57
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_135000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cae9d91af2c1400cb78601d013e0da16210f966ecc661cf0db9dcfd09dc7b68e
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_140000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0226306a6fd7422c9484e986b56bb2d0a21c566a0b36d78c1978bd4759c90bd8
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_145000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5a7e9a071cbb91ad4f3406c8a50dc667754b5851259c9b1e23aafe3fa7723a2
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_150000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69a63b2fcefd5eb20ad8d6289cfca9d6026cf2e1f2971cf119de4936a8a786fe
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_155000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a68c4c189d6dee90fcc54460b16fbee964dd275a19511dae5157f359384a774b
|
| 3 |
+
size 151982040
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d4f83839a2ef74f2448d4b9391a28e5703bcb6f69e631d5535dc19475678a07d
|
| 3 |
+
size 151985051
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/cfg.txt
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DESCRIPTION: TimeMoE Base
|
| 2 |
+
DEVICE: gpu
|
| 3 |
+
DEVICE_NUM: 4
|
| 4 |
+
RUNNER: <class 'baselines.TimeMoE4.runner.runner.TimeMoERunner'>
|
| 5 |
+
MODEL:
|
| 6 |
+
NAME: TimeMoE4
|
| 7 |
+
ARCH: <class 'baselines.TimeMoE4.arch.timemoe.TimeMoE4'>
|
| 8 |
+
PARAM:
|
| 9 |
+
model_id: baselines/TimeMoE/ckpt/TimeMoE-50M
|
| 10 |
+
from_pretrained: False
|
| 11 |
+
context_length: 4079
|
| 12 |
+
trust_remote_code: True
|
| 13 |
+
DTYPE: bfloat16
|
| 14 |
+
METRICS:
|
| 15 |
+
FUNCS:
|
| 16 |
+
TRAIN:
|
| 17 |
+
COMPILE_MODEL: True
|
| 18 |
+
NUM_ITERATIONS: 200003
|
| 19 |
+
CKPT_SAVE_DIR: checkpoints/TimeMoE4/Main_200003
|
| 20 |
+
CKPT_SAVE_STRATEGY: 5000
|
| 21 |
+
LOSS: fake_loss
|
| 22 |
+
OPTIM:
|
| 23 |
+
TYPE: AdamW
|
| 24 |
+
PARAM:
|
| 25 |
+
lr: 0.0004
|
| 26 |
+
betas: (0.9, 0.95)
|
| 27 |
+
fused: True
|
| 28 |
+
LR_SCHEDULER:
|
| 29 |
+
TYPE: CosineWarmup
|
| 30 |
+
PARAM:
|
| 31 |
+
num_warmup_steps: 10000
|
| 32 |
+
num_training_steps: 200003
|
| 33 |
+
CLIP_GRAD_PARAM:
|
| 34 |
+
max_norm: 1.0
|
| 35 |
+
DATA:
|
| 36 |
+
BATCH_SIZE: 64
|
| 37 |
+
SHUFFLE: True
|
| 38 |
+
PIN_MEMORY: True
|
| 39 |
+
PREFETCH: True
|
| 40 |
+
GRAD_ACCUMULATION_STEPS: 1
|
| 41 |
+
VAL:
|
| 42 |
+
INTERVAL: 5000
|
| 43 |
+
DATA:
|
| 44 |
+
BATCH_SIZE: 128
|
| 45 |
+
EVAL:
|
| 46 |
+
USE_GPU: True
|
| 47 |
+
DATASET:
|
| 48 |
+
NAME: Main
|
| 49 |
+
TYPE: <class 'baselines.TimeMoE4.data.mix_dataset_v2.MixedSourceDataset_v2'>
|
| 50 |
+
PARAM:
|
| 51 |
+
num_valid_samples: 1000
|
| 52 |
+
INFERENCE:
|
| 53 |
+
GENERATION_PARAMS:
|
| 54 |
+
normalize: True
|
| 55 |
+
MD5: 937fc7f323a4d7d3ae9480105345ef67
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/tensorboard/events.out.tfevents.1767768860.brev-5x9knwe1p.1631682.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4112a746ba65eb6c329667113d7356a9113cb0e290d616aed45c89d9890924f6
|
| 3 |
+
size 32418418
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/timemoe_base.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sampling probability changes
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from easydict import EasyDict
|
| 6 |
+
sys.path.append(os.path.abspath(__file__ + '/../../..'))
|
| 7 |
+
|
| 8 |
+
from ..arch import TimeMoE4
|
| 9 |
+
from ..data import MixedSourceDataset_v2
|
| 10 |
+
from ..runner import TimeMoERunner
|
| 11 |
+
from ..loss import fake_loss
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
############################## Hot Parameters ##############################
|
| 15 |
+
# Dataset & Metrics configuration
|
| 16 |
+
# Model architecture and parameters
|
| 17 |
+
|
| 18 |
+
pretrained = False # Whether to use a pretrained model
|
| 19 |
+
|
| 20 |
+
MODEL_ARCH = TimeMoE4
|
| 21 |
+
|
| 22 |
+
MODEL_PARAM = {
|
| 23 |
+
'model_id': "baselines/TimeMoE/ckpt/TimeMoE-50M",
|
| 24 |
+
'from_pretrained': pretrained,
|
| 25 |
+
'context_length': 4079,
|
| 26 |
+
'trust_remote_code': True,
|
| 27 |
+
}
|
| 28 |
+
DATA_NAME = "Main"
|
| 29 |
+
|
| 30 |
+
# N = 20_000_000
|
| 31 |
+
# batch size = 16*8
|
| 32 |
+
# 20_000_000 / 16 / 8 = 156250 iterations
|
| 33 |
+
# 20_000_000 * 4096 / 16 / 8 / 4096 = 156_250
|
| 34 |
+
|
| 35 |
+
NUM_ITERATIONS = 200_003 # Total number of training iterations. NOTE(review): the earlier estimate "20_000_000 * 4096 / 16 / 4 / 4096 = 312,500" does not match this value - confirm which is intended
|
| 36 |
+
VAL_ITERATION_INTERVAL = 5_000 # Run validation once every VAL_ITERATION_INTERVAL iterations
|
| 37 |
+
|
| 38 |
+
############################## General Configuration ##############################
|
| 39 |
+
CFG = EasyDict()
|
| 40 |
+
# General settings
|
| 41 |
+
CFG.DESCRIPTION = 'TimeMoE Base'
|
| 42 |
+
CFG.DEVICE = 'gpu'
|
| 43 |
+
CFG.DEVICE_NUM = 4
|
| 44 |
+
# Runner
|
| 45 |
+
CFG.RUNNER = TimeMoERunner
|
| 46 |
+
|
| 47 |
+
############################## Model Configuration ################################
|
| 48 |
+
CFG.MODEL = EasyDict()
|
| 49 |
+
CFG.MODEL.NAME = MODEL_ARCH.__name__
|
| 50 |
+
CFG.MODEL.ARCH = MODEL_ARCH
|
| 51 |
+
CFG.MODEL.PARAM = MODEL_PARAM
|
| 52 |
+
CFG.MODEL.DTYPE= 'bfloat16'
|
| 53 |
+
# CFG.MODEL.DTYPE= 'float32'
|
| 54 |
+
|
| 55 |
+
############################## Metrics Configuration ##############################
|
| 56 |
+
CFG.METRICS = EasyDict()
|
| 57 |
+
# Metrics settings
|
| 58 |
+
CFG.METRICS.FUNCS = EasyDict({})
|
| 59 |
+
|
| 60 |
+
############################## Training Configuration ##############################
|
| 61 |
+
CFG.TRAIN = EasyDict()
|
| 62 |
+
CFG.TRAIN.COMPILE_MODEL = True
|
| 63 |
+
CFG.TRAIN.NUM_ITERATIONS = NUM_ITERATIONS
|
| 64 |
+
CFG.TRAIN.CKPT_SAVE_DIR = os.path.join(
|
| 65 |
+
'checkpoints',
|
| 66 |
+
MODEL_ARCH.__name__,
|
| 67 |
+
'_'.join([DATA_NAME, str(CFG.TRAIN.NUM_ITERATIONS)])
|
| 68 |
+
)
|
| 69 |
+
CFG.TRAIN.CKPT_SAVE_STRATEGY = VAL_ITERATION_INTERVAL * 1 # Checkpoint save strategy: save the model every VAL_ITERATION_INTERVAL * 1 iterations (the previous note said "* 5", which did not match the code)
|
| 70 |
+
CFG.TRAIN.LOSS = fake_loss
|
| 71 |
+
# Optimizer settings
|
| 72 |
+
CFG.TRAIN.OPTIM = EasyDict()
|
| 73 |
+
CFG.TRAIN.OPTIM.TYPE = "AdamW"
|
| 74 |
+
CFG.TRAIN.OPTIM.PARAM = {
|
| 75 |
+
"lr": 4e-4,
|
| 76 |
+
"betas": (0.9, 0.95),
|
| 77 |
+
"fused": True,
|
| 78 |
+
# "weight_decay": 1e-1,
|
| 79 |
+
}
|
| 80 |
+
# Learning rate scheduler settings
|
| 81 |
+
CFG.TRAIN.LR_SCHEDULER = EasyDict()
|
| 82 |
+
CFG.TRAIN.LR_SCHEDULER.TYPE = "CosineWarmup"
|
| 83 |
+
CFG.TRAIN.LR_SCHEDULER.PARAM = {
|
| 84 |
+
'num_warmup_steps': int(NUM_ITERATIONS / 100 * 5), # warmup lasts 5% of the total iterations
|
| 85 |
+
'num_training_steps': NUM_ITERATIONS,
|
| 86 |
+
}
|
| 87 |
+
CFG.TRAIN.CLIP_GRAD_PARAM = {
|
| 88 |
+
'max_norm': 1.0
|
| 89 |
+
}
|
| 90 |
+
# Train data loader settings
|
| 91 |
+
CFG.TRAIN.DATA = EasyDict()
|
| 92 |
+
CFG.TRAIN.DATA.BATCH_SIZE = 64 # per-GPU batch size (the training log reports 4 GPUs x 64 = 256 effective); the old "16 / 4" note was stale
|
| 93 |
+
CFG.TRAIN.DATA.SHUFFLE = True # NOTE(review): an earlier note said this "has to be False", but the value is True - confirm which is intended
|
| 94 |
+
CFG.TRAIN.DATA.PIN_MEMORY = True
|
| 95 |
+
CFG.TRAIN.DATA.PREFETCH = True
|
| 96 |
+
CFG.TRAIN.GRAD_ACCUMULATION_STEPS = 1
|
| 97 |
+
# CFG.TRAIN.DATA.NUM_WORKERS = 4
|
| 98 |
+
|
| 99 |
+
############################## Validation Configuration ##############################
|
| 100 |
+
CFG.VAL = EasyDict()
|
| 101 |
+
CFG.VAL.INTERVAL = VAL_ITERATION_INTERVAL
|
| 102 |
+
CFG.VAL.DATA = EasyDict()
|
| 103 |
+
CFG.VAL.DATA.BATCH_SIZE = 128 # NOTE(review): the old "32 / 8" note does not match 128 - confirm the intended validation batch size
|
| 104 |
+
|
| 105 |
+
############################## Evaluation Configuration ##############################
|
| 106 |
+
|
| 107 |
+
CFG.EVAL = EasyDict()
|
| 108 |
+
# Evaluation parameters
|
| 109 |
+
CFG.EVAL.USE_GPU = True # Whether to use GPU for evaluation. Default: True
|
| 110 |
+
|
| 111 |
+
############################## Dataset Configuration ##############################
|
| 112 |
+
CFG.DATASET = EasyDict()
|
| 113 |
+
# Dataset settings
|
| 114 |
+
CFG.DATASET.NAME = DATA_NAME
|
| 115 |
+
CFG.DATASET.TYPE = MixedSourceDataset_v2
|
| 116 |
+
CFG.DATASET.PARAM = EasyDict({
|
| 117 |
+
'num_valid_samples': 1000
|
| 118 |
+
})
|
| 119 |
+
|
| 120 |
+
############################## Inference Configuration ##############################
|
| 121 |
+
CFG.INFERENCE = EasyDict()
|
| 122 |
+
CFG.INFERENCE.GENERATION_PARAMS = EasyDict({
|
| 123 |
+
'normalize': not pretrained
|
| 124 |
+
})
|
| 125 |
+
|
Main_200003/937fc7f323a4d7d3ae9480105345ef67/training_log_20260107065409.log
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-07 06:54:09,447 - easytorch-training - INFO - Initializing training.
|
| 2 |
+
2026-01-07 06:54:09,447 - easytorch-training - INFO - Set clip grad, param: {'max_norm': 1.0}
|
| 3 |
+
2026-01-07 06:54:09,448 - easytorch-training - INFO - Building training data loader.
|
| 4 |
+
2026-01-07 06:54:20,632 - easytorch-training - INFO - MixedSourceDataset initialized for 'train' mode.
|
| 5 |
+
2026-01-07 06:54:20,632 - easytorch-training - INFO - - real: 3201174 samples
|
| 6 |
+
2026-01-07 06:54:20,633 - easytorch-training - INFO - - synth: 2000000 samples
|
| 7 |
+
2026-01-07 06:54:20,633 - easytorch-training - INFO - Train dataset length: 3201174
|
| 8 |
+
2026-01-07 06:54:20,635 - easytorch-training - INFO - Set optim: AdamW (
|
| 9 |
+
Parameter Group 0
|
| 10 |
+
amsgrad: False
|
| 11 |
+
betas: (0.9, 0.95)
|
| 12 |
+
capturable: False
|
| 13 |
+
differentiable: False
|
| 14 |
+
eps: 1e-08
|
| 15 |
+
foreach: None
|
| 16 |
+
fused: True
|
| 17 |
+
lr: 0.0004
|
| 18 |
+
maximize: False
|
| 19 |
+
weight_decay: 0.01
|
| 20 |
+
)
|
| 21 |
+
2026-01-07 06:54:20,635 - easytorch-training - INFO - Set lr_scheduler: <basicts.runners.optim.lr_schedulers.CosineWarmup object at 0x7e51283ca150>
|
| 22 |
+
2026-01-07 06:54:20,637 - easytorch-training - INFO - Initializing validation.
|
| 23 |
+
2026-01-07 06:54:20,638 - easytorch-training - INFO - Building val data loader.
|
| 24 |
+
2026-01-07 06:54:24,141 - easytorch-training - INFO - Worker 0 initialized for cauker_univariate.
|
| 25 |
+
2026-01-07 06:54:49,987 - easytorch-training - INFO - MixedSourceDataset initialized for 'valid' mode.
|
| 26 |
+
2026-01-07 06:54:49,987 - easytorch-training - INFO - - real: 1000 samples
|
| 27 |
+
2026-01-07 06:54:49,987 - easytorch-training - INFO - Valid dataset length: 1000
|
| 28 |
+
2026-01-07 06:54:49,988 - easytorch-training - INFO - Number of parameters: 12653568
|
| 29 |
+
2026-01-07 06:54:49,988 - easytorch-training - INFO - Training with 4 GPUs, batch size per GPUs: 64, grad_accumulation_steps: 1
|
| 30 |
+
2026-01-07 06:54:49,989 - easytorch-training - INFO - Effective batch size: 256
|
| 31 |
+
2026-01-07 07:43:41,002 - easytorch-training - INFO - Iteration 5000 / 200003
|
| 32 |
+
2026-01-07 07:43:41,369 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 1.00e-04, train/loss: 3.2162, train/grad_norm: 30.2086, train/amp_scale: 1.0000]
|
| 33 |
+
2026-01-07 07:43:41,370 - easytorch-training - INFO - Start validation.
|
| 34 |
+
2026-01-07 07:43:55,259 - easytorch-training - INFO - Result <val>: [val/time: 13.77 (s), val/loss: 3.3376]
|
| 35 |
+
2026-01-07 07:43:55,371 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 36 |
+
2026-01-07 07:43:55,477 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_005000.pt saved
|
| 37 |
+
2026-01-07 07:43:55,478 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:38:31
|
| 38 |
+
2026-01-07 08:47:40,179 - easytorch-training - INFO - Iteration 10000 / 200003
|
| 39 |
+
2026-01-07 08:47:40,551 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.76 (s), train/lr: 3.00e-04, train/loss: 2.1032, train/grad_norm: 4.2165, train/amp_scale: 1.0000]
|
| 40 |
+
2026-01-07 08:47:40,551 - easytorch-training - INFO - Start validation.
|
| 41 |
+
2026-01-07 08:47:47,100 - easytorch-training - INFO - Result <val>: [val/time: 6.43 (s), val/loss: 3.2769]
|
| 42 |
+
2026-01-07 08:47:47,222 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 43 |
+
2026-01-07 08:47:47,331 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_010000.pt saved
|
| 44 |
+
2026-01-07 08:47:47,332 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 20:33:58
|
| 45 |
+
2026-01-07 09:37:19,647 - easytorch-training - INFO - Iteration 15000 / 200003
|
| 46 |
+
2026-01-07 09:37:20,017 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.59 (s), train/lr: 4.00e-04, train/loss: 1.8988, train/grad_norm: 3.5223, train/amp_scale: 1.0000]
|
| 47 |
+
2026-01-07 09:37:20,018 - easytorch-training - INFO - Start validation.
|
| 48 |
+
2026-01-07 09:37:26,549 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 3.2378]
|
| 49 |
+
2026-01-07 09:37:26,666 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 50 |
+
2026-01-07 09:37:26,774 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_015000.pt saved
|
| 51 |
+
2026-01-07 09:37:26,774 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 19:03:02
|
| 52 |
+
2026-01-07 10:26:10,634 - easytorch-training - INFO - Iteration 20000 / 200003
|
| 53 |
+
2026-01-07 10:26:11,007 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 3.98e-04, train/loss: 1.8353, train/grad_norm: 3.4002, train/amp_scale: 1.0000]
|
| 54 |
+
2026-01-07 10:26:11,007 - easytorch-training - INFO - Start validation.
|
| 55 |
+
2026-01-07 10:26:17,533 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.2155]
|
| 56 |
+
2026-01-07 10:26:17,650 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 57 |
+
2026-01-07 10:26:17,760 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_020000.pt saved
|
| 58 |
+
2026-01-07 10:26:17,761 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 18:09:29
|
| 59 |
+
2026-01-07 11:15:21,133 - easytorch-training - INFO - Iteration 25000 / 200003
|
| 60 |
+
2026-01-07 11:15:21,505 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.59 (s), train/lr: 3.96e-04, train/loss: 1.7968, train/grad_norm: 3.2657, train/amp_scale: 1.0000]
|
| 61 |
+
2026-01-07 11:15:21,505 - easytorch-training - INFO - Start validation.
|
| 62 |
+
2026-01-07 11:15:28,016 - easytorch-training - INFO - Result <val>: [val/time: 6.39 (s), val/loss: 3.2073]
|
| 63 |
+
2026-01-07 11:15:28,132 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 64 |
+
2026-01-07 11:15:28,244 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_025000.pt saved
|
| 65 |
+
2026-01-07 11:15:28,245 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 17:39:57
|
| 66 |
+
2026-01-07 12:04:34,928 - easytorch-training - INFO - Iteration 30000 / 200003
|
| 67 |
+
2026-01-07 12:04:35,300 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.59 (s), train/lr: 3.92e-04, train/loss: 1.7817, train/grad_norm: 3.2711, train/amp_scale: 1.0000]
|
| 68 |
+
2026-01-07 12:04:35,301 - easytorch-training - INFO - Start validation.
|
| 69 |
+
2026-01-07 12:04:41,848 - easytorch-training - INFO - Result <val>: [val/time: 6.43 (s), val/loss: 3.2007]
|
| 70 |
+
2026-01-07 12:04:41,977 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 71 |
+
2026-01-07 12:04:42,095 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_030000.pt saved
|
| 72 |
+
2026-01-07 12:04:42,096 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 17:20:39
|
| 73 |
+
2026-01-07 12:52:50,770 - easytorch-training - INFO - Iteration 35000 / 200003
|
| 74 |
+
2026-01-07 12:52:51,138 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 3.86e-04, train/loss: 1.7605, train/grad_norm: 3.1830, train/amp_scale: 1.0000]
|
| 75 |
+
2026-01-07 12:52:51,139 - easytorch-training - INFO - Start validation.
|
| 76 |
+
2026-01-07 12:52:57,665 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.1925]
|
| 77 |
+
2026-01-07 12:52:57,783 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 78 |
+
2026-01-07 12:52:57,890 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_035000.pt saved
|
| 79 |
+
2026-01-07 12:52:57,891 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 17:01:19
|
| 80 |
+
2026-01-07 13:41:36,580 - easytorch-training - INFO - Iteration 40000 / 200003
|
| 81 |
+
2026-01-07 13:41:36,948 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 3.80e-04, train/loss: 1.7497, train/grad_norm: 3.1247, train/amp_scale: 1.0000]
|
| 82 |
+
2026-01-07 13:41:36,949 - easytorch-training - INFO - Start validation.
|
| 83 |
+
2026-01-07 13:41:43,464 - easytorch-training - INFO - Result <val>: [val/time: 6.39 (s), val/loss: 3.1783]
|
| 84 |
+
2026-01-07 13:41:43,580 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 85 |
+
2026-01-07 13:41:43,692 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_040000.pt saved
|
| 86 |
+
2026-01-07 13:41:43,692 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 16:49:20
|
| 87 |
+
2026-01-07 14:30:28,198 - easytorch-training - INFO - Iteration 45000 / 200003
|
| 88 |
+
2026-01-07 14:30:28,571 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 3.72e-04, train/loss: 1.7527, train/grad_norm: 3.1179, train/amp_scale: 1.0000]
|
| 89 |
+
2026-01-07 14:30:28,571 - easytorch-training - INFO - Start validation.
|
| 90 |
+
2026-01-07 14:30:35,095 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.1884]
|
| 91 |
+
2026-01-07 14:30:35,203 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_045000.pt saved
|
| 92 |
+
2026-01-07 14:30:35,203 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 16:40:26
|
| 93 |
+
2026-01-07 15:19:32,332 - easytorch-training - INFO - Iteration 50000 / 200003
|
| 94 |
+
2026-01-07 15:19:32,703 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.59 (s), train/lr: 3.63e-04, train/loss: 1.7219, train/grad_norm: 3.1227, train/amp_scale: 1.0000]
|
| 95 |
+
2026-01-07 15:19:32,704 - easytorch-training - INFO - Start validation.
|
| 96 |
+
2026-01-07 15:19:39,229 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.1710]
|
| 97 |
+
2026-01-07 15:19:39,354 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 98 |
+
2026-01-07 15:19:39,471 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_050000.pt saved
|
| 99 |
+
2026-01-07 15:19:39,473 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 16:34:09
|
| 100 |
+
2026-01-07 16:08:29,693 - easytorch-training - INFO - Iteration 55000 / 200003
|
| 101 |
+
2026-01-07 16:08:30,063 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.59 (s), train/lr: 3.53e-04, train/loss: 1.6934, train/grad_norm: 3.1966, train/amp_scale: 1.0000]
|
| 102 |
+
2026-01-07 16:08:30,063 - easytorch-training - INFO - Start validation.
|
| 103 |
+
2026-01-07 16:08:36,589 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 3.1712]
|
| 104 |
+
2026-01-07 16:08:36,708 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_055000.pt saved
|
| 105 |
+
2026-01-07 16:08:36,709 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 16:28:36
|
| 106 |
+
2026-01-07 16:57:25,056 - easytorch-training - INFO - Iteration 60000 / 200003
|
| 107 |
+
2026-01-07 16:57:25,429 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 3.41e-04, train/loss: 1.6860, train/grad_norm: 3.1026, train/amp_scale: 1.0000]
|
| 108 |
+
2026-01-07 16:57:25,430 - easytorch-training - INFO - Start validation.
|
| 109 |
+
2026-01-07 16:57:31,953 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.1621]
|
| 110 |
+
2026-01-07 16:57:32,079 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 111 |
+
2026-01-07 16:57:32,196 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_060000.pt saved
|
| 112 |
+
2026-01-07 16:57:32,197 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 16:23:52
|
| 113 |
+
2026-01-07 17:46:40,426 - easytorch-training - INFO - Iteration 65000 / 200003
|
| 114 |
+
2026-01-07 17:46:40,798 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.59 (s), train/lr: 3.29e-04, train/loss: 1.6761, train/grad_norm: 3.0234, train/amp_scale: 1.0000]
|
| 115 |
+
2026-01-07 17:46:40,798 - easytorch-training - INFO - Start validation.
|
| 116 |
+
2026-01-07 17:46:47,379 - easytorch-training - INFO - Result <val>: [val/time: 6.46 (s), val/loss: 3.1677]
|
| 117 |
+
2026-01-07 17:46:47,498 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_065000.pt saved
|
| 118 |
+
2026-01-07 17:46:47,498 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 16:20:53
|
| 119 |
+
2026-01-07 18:34:08,164 - easytorch-training - INFO - Iteration 70000 / 200003
|
| 120 |
+
2026-01-07 18:34:08,534 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.57 (s), train/lr: 3.16e-04, train/loss: 1.6859, train/grad_norm: 3.1570, train/amp_scale: 1.0000]
|
| 121 |
+
2026-01-07 18:34:08,535 - easytorch-training - INFO - Start validation.
|
| 122 |
+
2026-01-07 18:34:15,064 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 3.1601]
|
| 123 |
+
2026-01-07 18:34:15,188 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 124 |
+
2026-01-07 18:34:15,304 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_070000.pt saved
|
| 125 |
+
2026-01-07 18:34:15,304 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 16:13:12
|
| 126 |
+
2026-01-07 19:21:47,633 - easytorch-training - INFO - Iteration 75000 / 200003
|
| 127 |
+
2026-01-07 19:21:48,003 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.57 (s), train/lr: 3.02e-04, train/loss: 1.6886, train/grad_norm: 3.0815, train/amp_scale: 1.0000]
|
| 128 |
+
2026-01-07 19:21:48,004 - easytorch-training - INFO - Start validation.
|
| 129 |
+
2026-01-07 19:21:54,547 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.1612]
|
| 130 |
+
2026-01-07 19:21:54,657 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_075000.pt saved
|
| 131 |
+
2026-01-07 19:21:54,658 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 16:07:04
|
| 132 |
+
2026-01-07 20:10:41,673 - easytorch-training - INFO - Iteration 80000 / 200003
|
| 133 |
+
2026-01-07 20:10:42,120 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 2.88e-04, train/loss: 1.6720, train/grad_norm: 3.0019, train/amp_scale: 1.0000]
|
| 134 |
+
2026-01-07 20:10:42,120 - easytorch-training - INFO - Start validation.
|
| 135 |
+
2026-01-07 20:10:49,850 - easytorch-training - INFO - Result <val>: [val/time: 7.61 (s), val/loss: 3.1598]
|
| 136 |
+
2026-01-07 20:10:49,968 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 137 |
+
2026-01-07 20:10:50,076 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_080000.pt saved
|
| 138 |
+
2026-01-07 20:10:50,077 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 16:04:52
|
| 139 |
+
2026-01-07 20:59:31,371 - easytorch-training - INFO - Iteration 85000 / 200003
|
| 140 |
+
2026-01-07 20:59:31,737 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 2.73e-04, train/loss: 1.6886, train/grad_norm: 3.0279, train/amp_scale: 1.0000]
|
| 141 |
+
2026-01-07 20:59:31,738 - easytorch-training - INFO - Start validation.
|
| 142 |
+
2026-01-07 20:59:38,254 - easytorch-training - INFO - Result <val>: [val/time: 6.39 (s), val/loss: 3.1565]
|
| 143 |
+
2026-01-07 20:59:38,380 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 144 |
+
2026-01-07 20:59:38,498 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_085000.pt saved
|
| 145 |
+
2026-01-07 20:59:38,499 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 16:02:38
|
| 146 |
+
2026-01-07 21:46:32,409 - easytorch-training - INFO - Iteration 90000 / 200003
|
| 147 |
+
2026-01-07 21:46:32,778 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.56 (s), train/lr: 2.57e-04, train/loss: 1.6757, train/grad_norm: 2.9444, train/amp_scale: 1.0000]
|
| 148 |
+
2026-01-07 21:46:32,778 - easytorch-training - INFO - Start validation.
|
| 149 |
+
2026-01-07 21:46:39,315 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 3.1490]
|
| 150 |
+
2026-01-07 21:46:39,444 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 151 |
+
2026-01-07 21:46:39,562 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_090000.pt saved
|
| 152 |
+
2026-01-07 21:46:39,563 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:56:41
|
| 153 |
+
2026-01-07 22:35:28,461 - easytorch-training - INFO - Iteration 95000 / 200003
|
| 154 |
+
2026-01-07 22:35:28,830 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 2.41e-04, train/loss: 1.6787, train/grad_norm: 3.0766, train/amp_scale: 1.0000]
|
| 155 |
+
2026-01-07 22:35:28,830 - easytorch-training - INFO - Start validation.
|
| 156 |
+
2026-01-07 22:35:35,488 - easytorch-training - INFO - Result <val>: [val/time: 6.54 (s), val/loss: 3.1522]
|
| 157 |
+
2026-01-07 22:35:35,596 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_095000.pt saved
|
| 158 |
+
2026-01-07 22:35:35,597 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:55:24
|
| 159 |
+
2026-01-07 23:23:36,357 - easytorch-training - INFO - Iteration 100000 / 200003
|
| 160 |
+
2026-01-07 23:23:36,728 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 2.25e-04, train/loss: 1.6803, train/grad_norm: 2.9411, train/amp_scale: 1.0000]
|
| 161 |
+
2026-01-07 23:23:36,729 - easytorch-training - INFO - Start validation.
|
| 162 |
+
2026-01-07 23:23:43,275 - easytorch-training - INFO - Result <val>: [val/time: 6.42 (s), val/loss: 3.1480]
|
| 163 |
+
2026-01-07 23:23:43,392 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 164 |
+
2026-01-07 23:23:43,502 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_100000.pt saved
|
| 165 |
+
2026-01-07 23:23:43,503 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:52:38
|
| 166 |
+
2026-01-08 00:12:26,912 - easytorch-training - INFO - Iteration 105000 / 200003
|
| 167 |
+
2026-01-08 00:12:27,283 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 2.08e-04, train/loss: 1.6578, train/grad_norm: 2.9696, train/amp_scale: 1.0000]
|
| 168 |
+
2026-01-08 00:12:27,284 - easytorch-training - INFO - Start validation.
|
| 169 |
+
2026-01-08 00:12:33,832 - easytorch-training - INFO - Result <val>: [val/time: 6.43 (s), val/loss: 3.1484]
|
| 170 |
+
2026-01-08 00:12:33,946 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_105000.pt saved
|
| 171 |
+
2026-01-08 00:12:33,947 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:51:29
|
| 172 |
+
2026-01-08 01:00:28,211 - easytorch-training - INFO - Iteration 110000 / 200003
|
| 173 |
+
2026-01-08 01:00:28,578 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.57 (s), train/lr: 1.92e-04, train/loss: 1.6578, train/grad_norm: 2.9220, train/amp_scale: 1.0000]
|
| 174 |
+
2026-01-08 01:00:28,578 - easytorch-training - INFO - Start validation.
|
| 175 |
+
2026-01-08 01:00:35,509 - easytorch-training - INFO - Result <val>: [val/time: 6.81 (s), val/loss: 3.1463]
|
| 176 |
+
2026-01-08 01:00:35,625 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 177 |
+
2026-01-08 01:00:35,733 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_110000.pt saved
|
| 178 |
+
2026-01-08 01:00:35,733 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:48:58
|
| 179 |
+
2026-01-08 01:48:57,594 - easytorch-training - INFO - Iteration 115000 / 200003
|
| 180 |
+
2026-01-08 01:48:57,963 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 1.75e-04, train/loss: 1.6509, train/grad_norm: 2.9594, train/amp_scale: 1.0000]
|
| 181 |
+
2026-01-08 01:48:57,963 - easytorch-training - INFO - Start validation.
|
| 182 |
+
2026-01-08 01:49:04,521 - easytorch-training - INFO - Result <val>: [val/time: 6.44 (s), val/loss: 3.1445]
|
| 183 |
+
2026-01-08 01:49:04,637 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_best_val_loss.pt saved
|
| 184 |
+
2026-01-08 01:49:04,745 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_115000.pt saved
|
| 185 |
+
2026-01-08 01:49:04,746 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:47:27
|
| 186 |
+
2026-01-08 02:37:42,819 - easytorch-training - INFO - Iteration 120000 / 200003
|
| 187 |
+
2026-01-08 02:37:43,186 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 1.59e-04, train/loss: 1.6437, train/grad_norm: 3.2303, train/amp_scale: 1.0000]
|
| 188 |
+
2026-01-08 02:37:43,187 - easytorch-training - INFO - Start validation.
|
| 189 |
+
2026-01-08 02:37:49,711 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 3.1487]
|
| 190 |
+
2026-01-08 02:37:49,820 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_120000.pt saved
|
| 191 |
+
2026-01-08 02:37:49,821 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:46:31
|
| 192 |
+
2026-01-08 03:27:03,918 - easytorch-training - INFO - Iteration 125000 / 200003
|
| 193 |
+
2026-01-08 03:27:04,286 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.59 (s), train/lr: 1.43e-04, train/loss: 1.8331, train/grad_norm: 57.6050, train/amp_scale: 1.0000]
|
| 194 |
+
2026-01-08 03:27:04,287 - easytorch-training - INFO - Start validation.
|
| 195 |
+
2026-01-08 03:27:10,815 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 3.3145]
|
| 196 |
+
2026-01-08 03:27:10,922 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_125000.pt saved
|
| 197 |
+
2026-01-08 03:27:10,923 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:46:37
|
| 198 |
+
2026-01-08 04:16:06,610 - easytorch-training - INFO - Iteration 130000 / 200003
|
| 199 |
+
2026-01-08 04:16:06,980 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.59 (s), train/lr: 1.27e-04, train/loss: 2.4557, train/grad_norm: 1590.0738, train/amp_scale: 1.0000]
|
| 200 |
+
2026-01-08 04:16:06,980 - easytorch-training - INFO - Start validation.
|
| 201 |
+
2026-01-08 04:16:13,506 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 4.2894]
|
| 202 |
+
2026-01-08 04:16:13,613 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_130000.pt saved
|
| 203 |
+
2026-01-08 04:16:13,614 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:46:14
|
| 204 |
+
2026-01-08 05:04:55,295 - easytorch-training - INFO - Iteration 135000 / 200003
|
| 205 |
+
2026-01-08 05:04:55,663 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 1.12e-04, train/loss: 3.9402, train/grad_norm: 11837.9622, train/amp_scale: 1.0000]
|
| 206 |
+
2026-01-08 05:04:55,664 - easytorch-training - INFO - Start validation.
|
| 207 |
+
2026-01-08 05:05:02,197 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 4.5390]
|
| 208 |
+
2026-01-08 05:05:02,306 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_135000.pt saved
|
| 209 |
+
2026-01-08 05:05:02,307 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:45:32
|
| 210 |
+
2026-01-08 05:53:41,209 - easytorch-training - INFO - Iteration 140000 / 200003
|
| 211 |
+
2026-01-08 05:53:41,574 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 9.77e-05, train/loss: 4.2236, train/grad_norm: 497822.2043, train/amp_scale: 1.0000]
|
| 212 |
+
2026-01-08 05:53:41,574 - easytorch-training - INFO - Start validation.
|
| 213 |
+
2026-01-08 05:53:48,090 - easytorch-training - INFO - Result <val>: [val/time: 6.39 (s), val/loss: 4.3929]
|
| 214 |
+
2026-01-08 05:53:48,197 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_140000.pt saved
|
| 215 |
+
2026-01-08 05:53:48,198 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:44:49
|
| 216 |
+
2026-01-08 06:42:18,216 - easytorch-training - INFO - Iteration 145000 / 200003
|
| 217 |
+
2026-01-08 06:42:18,586 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 8.38e-05, train/loss: 4.6652, train/grad_norm: 261043219.4423, train/amp_scale: 1.0000]
|
| 218 |
+
2026-01-08 06:42:18,587 - easytorch-training - INFO - Start validation.
|
| 219 |
+
2026-01-08 06:42:25,113 - easytorch-training - INFO - Result <val>: [val/time: 6.40 (s), val/loss: 4.6455]
|
| 220 |
+
2026-01-08 06:42:25,225 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_145000.pt saved
|
| 221 |
+
2026-01-08 06:42:25,226 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:43:56
|
| 222 |
+
2026-01-08 07:30:49,543 - easytorch-training - INFO - Iteration 150000 / 200003
|
| 223 |
+
2026-01-08 07:30:49,926 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 7.08e-05, train/loss: 4.8012, train/grad_norm: 542310146.5395, train/amp_scale: 1.0000]
|
| 224 |
+
2026-01-08 07:30:49,926 - easytorch-training - INFO - Start validation.
|
| 225 |
+
2026-01-08 07:30:56,443 - easytorch-training - INFO - Result <val>: [val/time: 6.39 (s), val/loss: 4.5128]
|
| 226 |
+
2026-01-08 07:30:56,554 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_150000.pt saved
|
| 227 |
+
2026-01-08 07:30:56,555 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:43:00
|
| 228 |
+
2026-01-08 08:19:05,663 - easytorch-training - INFO - Iteration 155000 / 200003
|
| 229 |
+
2026-01-08 08:19:06,031 - easytorch-training - INFO - Result <train>: [train/iter_time: 0.58 (s), train/lr: 5.86e-05, train/loss: 4.5055, train/grad_norm: 370485325.2526, train/amp_scale: 1.0000]
|
| 230 |
+
2026-01-08 08:19:06,031 - easytorch-training - INFO - Start validation.
|
| 231 |
+
2026-01-08 08:19:12,566 - easytorch-training - INFO - Result <val>: [val/time: 6.41 (s), val/loss: 4.8368]
|
| 232 |
+
2026-01-08 08:19:12,674 - easytorch-training - INFO - Checkpoint checkpoints/TimeMoE4/Main_200003/937fc7f323a4d7d3ae9480105345ef67/TimeMoE4_155000.pt saved
|
| 233 |
+
2026-01-08 08:19:12,675 - easytorch-training - INFO - The estimated training finish time is 2026-01-08 15:41:48
|
| 234 |
+
2026-01-08 08:30:31,699 - easytorch-training - ERROR - Traceback (most recent call last):
|
| 235 |
+
File "/home/nvidia/miniconda3/envs/zxx/lib/python3.11/site-packages/easytorch/launcher/launcher.py", line 31, in training_func
|
| 236 |
+
runner.train(cfg)
|
| 237 |
+
File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_iteration_runner.py", line 200, in train
|
| 238 |
+
self.train_iters(iteration=iteration, dataloader=self.train_data_loader)
|
| 239 |
+
File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 281, in train_iters
|
| 240 |
+
self.backward(loss, accumulating=accumulating)
|
| 241 |
+
File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 337, in backward
|
| 242 |
+
grad_norm = sum(
|
| 243 |
+
^^^^
|
| 244 |
+
File "/lp-dev/zhouxx/BasicTS/basicts/runners/base_utsf_runner.py", line 338, in <genexpr>
|
| 245 |
+
param.grad.data.norm(2).item() ** 2 for param in self.model.parameters() if param.grad is not None
|
| 246 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 247 |
+
KeyboardInterrupt
|
| 248 |
+
|