Upload folder using huggingface_hub
Browse files- logs/output_run_20260131_hung_pitapat.log +136 -0
- logs/run_20260131_hung_pitapat_train.csv +18 -0
- logs/run_20260131_hung_pitapat_train.png +0 -0
- logs/run_20260131_hung_pitapat_val.csv +1 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/_CHECKPOINT_METADATA +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/_METADATA +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/_sharding +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/array_metadatas/process_0 +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/d/24908042f611bcc78959e7a8c56485f7 +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/manifest.ocdbt +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/6468bf454fcb2abaed124681f9a1cd48 +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/7763b086c54b886a362d829ce51d7c91 +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/81a3e4c8ed51983cfba1b04728dcef9d +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/c2a448087757f68112629249bf32a694 +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/c4b6ab29f0fbc27680bcca4261f1b1f1 +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/c902b46c692fb50b81287f337039a7ba +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/cba8c1c0f064d4c3234f460c76e9bf1f +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/e054c24c67b06441f4b1473692431e4b +3 -0
- model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/manifest.ocdbt +3 -0
logs/output_run_20260131_hung_pitapat.log
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-31 04:28:54,215 - root - INFO - Run: run_20260131_hung_pitapat
|
| 2 |
+
2026-01-31 04:28:54,215 - root - INFO - Log directory: /root/tiny_moe/training_runs/Tiny_MoE/logs
|
| 3 |
+
2026-01-31 04:28:54,215 - root - INFO - Output dir: /root/tiny_moe/training_runs
|
| 4 |
+
2026-01-31 04:28:59,515 - jax._src.xla_bridge - INFO - Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory
|
| 5 |
+
2026-01-31 04:29:02,782 - root - INFO - Flax version: 0.11.1
|
| 6 |
+
2026-01-31 04:29:02,783 - root - INFO - Optax version: 0.2.6
|
| 7 |
+
2026-01-31 04:29:02,783 - root - INFO - Platform: gpu
|
| 8 |
+
2026-01-31 04:29:02,783 - root - INFO - Num Devices: 8
|
| 9 |
+
2026-01-31 04:29:02,783 - root - INFO - Devices: [CudaDevice(id=0), CudaDevice(id=1), CudaDevice(id=2), CudaDevice(id=3), CudaDevice(id=4), CudaDevice(id=5), CudaDevice(id=6), CudaDevice(id=7)]
|
| 10 |
+
2026-01-31 04:29:03,665 - root - INFO - Model config:
|
| 11 |
+
Config(name='Tiny_MoE',
|
| 12 |
+
dtype=<class 'jax.numpy.bfloat16'>,
|
| 13 |
+
vocab_size=50304,
|
| 14 |
+
block_size=2048,
|
| 15 |
+
n_layer=30,
|
| 16 |
+
n_embed=672,
|
| 17 |
+
n_glu_hidden=2048,
|
| 18 |
+
n_head=12,
|
| 19 |
+
n_kv_head=4,
|
| 20 |
+
n_experts=8,
|
| 21 |
+
init_stddev=0.02,
|
| 22 |
+
expert_load_factor=1.25,
|
| 23 |
+
aux_loss_coeff=0.01,
|
| 24 |
+
moe_bias=False,
|
| 25 |
+
mlp_bias=False,
|
| 26 |
+
attention_bias=False,
|
| 27 |
+
load_balance_loss_coeff=0.01,
|
| 28 |
+
z_loss_coeff=0.0005,
|
| 29 |
+
expert_top_k=2,
|
| 30 |
+
ln_epsilon=1e-05,
|
| 31 |
+
rope_theta=0.0001,
|
| 32 |
+
expert_partition_spec=PartitionSpec('devices',),
|
| 33 |
+
sdpa_implementation='cudnn')
|
| 34 |
+
2026-01-31 04:30:49,061 - root - INFO - Parameter Count: 1,062,185,520
|
| 35 |
+
2026-01-31 04:30:49,061 - root - INFO - Sharded / MoE Parameter Count: 992,210,160
|
| 36 |
+
2026-01-31 04:30:49,061 - root - INFO - Replicated Parameter Count: 69,975,360
|
| 37 |
+
2026-01-31 04:30:50,495 - root - INFO - Weight decay param count: 1,062,140,928
|
| 38 |
+
2026-01-31 04:30:50,496 - root - INFO - Training config:
|
| 39 |
+
TrainerConfig(num_tokens=100000000000,
|
| 40 |
+
num_tokens_per_batch=262144,
|
| 41 |
+
mB=128,
|
| 42 |
+
T=2048,
|
| 43 |
+
max_steps=381469,
|
| 44 |
+
max_lr=0.001,
|
| 45 |
+
min_lr=0.0001,
|
| 46 |
+
max_grad_norm=1.0,
|
| 47 |
+
weight_decay=0.1,
|
| 48 |
+
adam_b1=0.9,
|
| 49 |
+
adam_b2=0.95,
|
| 50 |
+
warmup_steps=3814,
|
| 51 |
+
print_interval=100,
|
| 52 |
+
val=True,
|
| 53 |
+
val_interval=5000,
|
| 54 |
+
val_batches=50,
|
| 55 |
+
checkpoint_model=True,
|
| 56 |
+
checkpoint_optimizer=False,
|
| 57 |
+
checkpoint_interval=10000)
|
| 58 |
+
2026-01-31 04:30:50,496 - root - INFO - Effective batch size per device: 16
|
| 59 |
+
2026-01-31 04:30:51,627 - root - INFO - ModdedNanoGPTDataLoader: 1030 shards (train)
|
| 60 |
+
2026-01-31 04:30:51,628 - root - INFO - Downloading fineweb_train_000001.bin from kjj0/fineweb100B-gpt2...
|
| 61 |
+
2026-01-31 04:30:52,175 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000001.bin "HTTP/1.1 302 Found"
|
| 62 |
+
2026-01-31 04:30:52,204 - httpx - INFO - HTTP Request: GET https://huggingface.co/api/datasets/kjj0/fineweb100B-gpt2/xet-read-token/50d1422b27e1a928440c26a8829f3f827f44ac56 "HTTP/1.1 200 OK"
|
| 63 |
+
2026-01-31 04:30:54,055 - root - INFO - HuggingfaceDataLoader initialized:
|
| 64 |
+
------------------------
|
| 65 |
+
label: train
|
| 66 |
+
shards: 1,030
|
| 67 |
+
shard size: 100,000,000
|
| 68 |
+
batch size: 128
|
| 69 |
+
block size: 2048
|
| 70 |
+
device rank: 1
|
| 71 |
+
start shard: 0
|
| 72 |
+
start pos: 0
|
| 73 |
+
------------------------
|
| 74 |
+
2026-01-31 04:30:54,055 - root - INFO - ModdedNanoGPTDataLoader: 1 shards (val)
|
| 75 |
+
2026-01-31 04:30:54,056 - root - INFO - Downloading fineweb_val_000000.bin from kjj0/fineweb100B-gpt2...
|
| 76 |
+
2026-01-31 04:30:54,101 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_val_000000.bin "HTTP/1.1 302 Found"
|
| 77 |
+
2026-01-31 04:30:56,300 - root - INFO - Starting from step: 0
|
| 78 |
+
2026-01-31 04:32:47,073 - root - INFO - 0 | lr: 0.0000 | loss: 13.8289 | logits loss: 13.4375 | load balance loss: 30.1355 | z loss: 145.0000 | avg iter time: 0.00ms | avg tok/sec: 0.00 | tokens processed: 262,144
|
| 79 |
+
2026-01-31 04:36:03,300 - root - INFO - 100 | lr: 0.0000 | loss: 8.5689 | logits loss: 8.2500 | load balance loss: 30.3916 | z loss: 28.7500 | avg iter time: 1954.95ms | avg tok/sec: 134,092.44 | tokens processed: 26,476,544
|
| 80 |
+
2026-01-31 04:37:38,414 - root - INFO - 200 | lr: 0.0001 | loss: 7.1279 | logits loss: 6.8125 | load balance loss: 30.3412 | z loss: 14.6250 | avg iter time: 943.79ms | avg tok/sec: 277,755.86 | tokens processed: 52,690,944
|
| 81 |
+
2026-01-31 04:39:14,180 - root - INFO - 300 | lr: 0.0001 | loss: 6.3679 | logits loss: 6.0625 | load balance loss: 30.5275 | z loss: 13.5625 | avg iter time: 950.36ms | avg tok/sec: 275,836.23 | tokens processed: 78,905,344
|
| 82 |
+
2026-01-31 04:40:30,043 - root - INFO - Downloading fineweb_train_000002.bin from kjj0/fineweb100B-gpt2...
|
| 83 |
+
2026-01-31 04:40:30,085 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000002.bin "HTTP/1.1 302 Found"
|
| 84 |
+
2026-01-31 04:40:51,047 - root - INFO - 400 | lr: 0.0001 | loss: 6.0706 | logits loss: 5.7500 | load balance loss: 30.2374 | z loss: 12.0000 | avg iter time: 961.32ms | avg tok/sec: 272,690.93 | tokens processed: 105,119,744
|
| 85 |
+
2026-01-31 04:42:25,669 - root - INFO - 500 | lr: 0.0001 | loss: 5.6715 | logits loss: 5.3750 | load balance loss: 30.1538 | z loss: 10.2500 | avg iter time: 938.92ms | avg tok/sec: 279,198.09 | tokens processed: 131,334,144
|
| 86 |
+
2026-01-31 04:44:00,603 - root - INFO - 600 | lr: 0.0002 | loss: 5.3976 | logits loss: 5.0938 | load balance loss: 30.1780 | z loss: 7.1562 | avg iter time: 942.06ms | avg tok/sec: 278,266.03 | tokens processed: 157,548,544
|
| 87 |
+
2026-01-31 04:45:36,143 - root - INFO - 700 | lr: 0.0002 | loss: 5.2308 | logits loss: 4.9375 | load balance loss: 30.1354 | z loss: 6.0625 | avg iter time: 948.16ms | avg tok/sec: 276,477.28 | tokens processed: 183,762,944
|
| 88 |
+
2026-01-31 04:46:33,973 - root - INFO - Downloading fineweb_train_000003.bin from kjj0/fineweb100B-gpt2...
|
| 89 |
+
2026-01-31 04:46:34,020 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000003.bin "HTTP/1.1 302 Found"
|
| 90 |
+
2026-01-31 04:46:34,038 - httpx - INFO - HTTP Request: GET https://huggingface.co/api/datasets/kjj0/fineweb100B-gpt2/xet-read-token/50d1422b27e1a928440c26a8829f3f827f44ac56 "HTTP/1.1 200 OK"
|
| 91 |
+
2026-01-31 04:47:12,875 - root - INFO - 800 | lr: 0.0002 | loss: 5.1568 | logits loss: 4.8438 | load balance loss: 30.0979 | z loss: 4.6562 | avg iter time: 960.05ms | avg tok/sec: 273,052.39 | tokens processed: 209,977,344
|
| 92 |
+
2026-01-31 04:48:47,805 - root - INFO - 900 | lr: 0.0002 | loss: 4.9039 | logits loss: 4.5938 | load balance loss: 30.1480 | z loss: 5.1875 | avg iter time: 942.09ms | avg tok/sec: 278,259.15 | tokens processed: 236,191,744
|
| 93 |
+
2026-01-31 04:50:22,528 - root - INFO - 1000 | lr: 0.0003 | loss: 4.8597 | logits loss: 4.5625 | load balance loss: 30.1017 | z loss: 3.9844 | avg iter time: 939.85ms | avg tok/sec: 278,921.88 | tokens processed: 262,406,144
|
| 94 |
+
2026-01-31 04:51:58,308 - root - INFO - 1100 | lr: 0.0003 | loss: 4.7187 | logits loss: 4.4062 | load balance loss: 30.0914 | z loss: 3.6094 | avg iter time: 950.45ms | avg tok/sec: 275,810.45 | tokens processed: 288,620,544
|
| 95 |
+
2026-01-31 04:52:38,995 - root - INFO - Downloading fineweb_train_000004.bin from kjj0/fineweb100B-gpt2...
|
| 96 |
+
2026-01-31 04:52:39,047 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000004.bin "HTTP/1.1 302 Found"
|
| 97 |
+
2026-01-31 04:53:35,267 - root - INFO - 1200 | lr: 0.0003 | loss: 4.4736 | logits loss: 4.1562 | load balance loss: 30.1062 | z loss: 3.2188 | avg iter time: 962.26ms | avg tok/sec: 272,424.03 | tokens processed: 314,834,944
|
| 98 |
+
2026-01-31 04:55:10,157 - root - INFO - 1300 | lr: 0.0003 | loss: 4.4982 | logits loss: 4.1875 | load balance loss: 30.1618 | z loss: 3.4375 | avg iter time: 941.60ms | avg tok/sec: 278,401.74 | tokens processed: 341,049,344
|
| 99 |
+
2026-01-31 04:56:45,466 - root - INFO - 1400 | lr: 0.0004 | loss: 4.4795 | logits loss: 4.1875 | load balance loss: 30.1030 | z loss: 3.0781 | avg iter time: 945.87ms | avg tok/sec: 277,145.59 | tokens processed: 367,263,744
|
| 100 |
+
2026-01-31 04:58:19,963 - root - INFO - 1500 | lr: 0.0004 | loss: 4.3952 | logits loss: 4.0938 | load balance loss: 30.1506 | z loss: 2.8438 | avg iter time: 937.66ms | avg tok/sec: 279,572.79 | tokens processed: 393,478,144
|
| 101 |
+
2026-01-31 04:58:42,570 - root - INFO - Downloading fineweb_train_000005.bin from kjj0/fineweb100B-gpt2...
|
| 102 |
+
2026-01-31 04:58:42,654 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000005.bin "HTTP/1.1 302 Found"
|
| 103 |
+
2026-01-31 04:59:56,717 - root - INFO - 1600 | lr: 0.0004 | loss: 4.4342 | logits loss: 4.1250 | load balance loss: 30.0915 | z loss: 2.7188 | avg iter time: 960.24ms | avg tok/sec: 272,999.04 | tokens processed: 419,692,544
|
| 104 |
+
2026-01-31 04:59:59,653 - root - WARNING - Received KeyboardInterrupt. Exiting...
|
| 105 |
+
2026-01-31 05:00:00,409 - absl - INFO - orbax-checkpoint version: 0.11.32
|
| 106 |
+
2026-01-31 05:00:00,410 - absl - INFO - save_device_host_concurrent_bytes=None
|
| 107 |
+
2026-01-31 05:00:00,410 - absl - INFO - Created BasePyTreeCheckpointHandler: use_ocdbt=True, use_zarr3=False, pytree_metadata_options=PyTreeMetadataOptions(support_rich_types=False), array_metadata_store=<orbax.checkpoint._src.metadata.array_metadata_store.Store object at 0x73e8a136fd90>, enable_pinned_host_transfer=True, save_concurrent_bytes: 96000000000 (89.4 GiB), restore_concurrent_bytes: 96000000000 (89.4 GiB)
|
| 108 |
+
2026-01-31 05:00:00,410 - absl - INFO - [thread=MainThread] Failed to get flag value for EXPERIMENTAL_ORBAX_USE_DISTRIBUTED_PROCESS_ID.
|
| 109 |
+
2026-01-31 05:00:00,410 - absl - INFO - [process=0][thread=MainThread] Using barrier_sync_fn: <function get_barrier_sync_fn.<locals>.<lambda> at 0x73e33e2d45e0> timeout: 600 secs and primary_host=0 for async checkpoint writes
|
| 110 |
+
2026-01-31 05:00:00,411 - absl - INFO - [process=0] Started async saving checkpoint to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt.
|
| 111 |
+
2026-01-31 05:00:00,412 - absl - INFO - Using ThreadSafeKeyValueSignalingClient
|
| 112 |
+
2026-01-31 05:00:00,434 - absl - INFO - Creating tmp directory /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt.orbax-checkpoint-tmp
|
| 113 |
+
2026-01-31 05:00:00,497 - absl - INFO - Wrote Metadata={'item_handlers': None, 'metrics': {}, 'performance_metrics': {}, 'init_timestamp_nsecs': 1769835600469761863, 'commit_timestamp_nsecs': None, 'custom_metadata': {}}, json={"item_handlers": null, "metrics": {}, "performance_metrics": {}, "init_timestamp_nsecs": 1769835600469761863, "commit_timestamp_nsecs": null, "custom_metadata": {}} to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt.orbax-checkpoint-tmp/_CHECKPOINT_METADATA
|
| 114 |
+
2026-01-31 05:00:00,543 - absl - INFO - Scheduling D2H of 482 prioritized jax.Array.
|
| 115 |
+
2026-01-31 05:00:00,544 - absl - INFO - Transferring arrays to host memory with options: use_replica_parallel=True, min_slice_bytes_for_replica_parallel=None, max_replicas_for_replica_parallel=None, enable_pinned_host_transfer=True
|
| 116 |
+
2026-01-31 05:00:09,991 - absl - INFO - [process=0][thread=MainThread] Initiated "orbax.checkpoint._src.serialization.jax_array_handlers.ArrayHandler".serialize. Time taken: 9.450570s
|
| 117 |
+
2026-01-31 05:00:10,008 - absl - INFO - [process=0] /jax/checkpoint/write/blocking_gbytes_per_sec: 423.285 MiB/s (total gbytes: 4.0 GiB) (time elapsed: 9 seconds) (per-host)
|
| 118 |
+
2026-01-31 05:00:10,008 - absl - INFO - [process=0][thread=MainThread] Initiated Pytree async_save. Time taken: 9.588295s (batch_requests_ready=0.044028s, total_serialization_initiated=9.538232s, others=0.006035s)
|
| 119 |
+
2026-01-31 05:00:10,076 - absl - INFO - [process=0][thread=async_save] Background save thread started.
|
| 120 |
+
2026-01-31 05:00:10,078 - absl - INFO - Finished blocking save. Time taken: 9.668147s. Continuing background save to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt.
|
| 121 |
+
2026-01-31 05:00:10,092 - absl - INFO - [process=0][thread=MainThread] Waiting for background save thread=async_save.
|
| 122 |
+
2026-01-31 05:00:10,212 - absl - INFO - [process=0][thread=array_type_handler] Wrote 482 array_metadata.ArrayMetadata to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt.orbax-checkpoint-tmp/array_metadatas/process_0
|
| 123 |
+
2026-01-31 05:00:15,137 - absl - INFO - [process=0][thread=write_metadata_after_commits] Commit + Array metadata written. Time taken: 5.127631s (commit=5.093810s, array_metadata_write=0.033821s)
|
| 124 |
+
2026-01-31 05:00:15,141 - absl - INFO - [process=0] /jax/checkpoint/write/gbytes_per_sec: 275.704 MiB/s (total gbytes: 4.0 GiB) (time elapsed: 14 seconds) (per-host)
|
| 125 |
+
2026-01-31 05:00:15,141 - absl - INFO - [process=0][thread=async_save] 2 Handler Commit operations completed. Time taken: 5.050204s.
|
| 126 |
+
2026-01-31 05:00:15,143 - absl - INFO - Read Metadata={'item_handlers': None, 'metrics': {}, 'performance_metrics': {}, 'init_timestamp_nsecs': 1769835600469761863, 'commit_timestamp_nsecs': None, 'custom_metadata': {}} from /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt.orbax-checkpoint-tmp/_CHECKPOINT_METADATA
|
| 127 |
+
2026-01-31 05:00:15,145 - absl - INFO - Updated Metadata={'item_handlers': 'orbax.checkpoint._src.handlers.standard_checkpoint_handler.StandardCheckpointHandler', 'metrics': {}, 'performance_metrics': {}, 'init_timestamp_nsecs': 1769835600469761863, 'commit_timestamp_nsecs': None, 'custom_metadata': {}} to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt.orbax-checkpoint-tmp/_CHECKPOINT_METADATA
|
| 128 |
+
2026-01-31 05:00:15,148 - absl - INFO - [process=0][thread=async_save] Skipped cross-host ArrayMetadata validation because only one process is found: process_index=0.
|
| 129 |
+
2026-01-31 05:00:15,170 - absl - INFO - [process=0][thread=async_save] Pytree save finalize (merge_ocdbt + ArrayMetadata validation) completed. Time taken: 0.028268s. use_zarr3=False, enable_post_merge_validation=True, directory=/root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt.orbax-checkpoint-tmp
|
| 130 |
+
2026-01-31 05:00:15,172 - absl - INFO - Renaming /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt.orbax-checkpoint-tmp to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt
|
| 131 |
+
2026-01-31 05:00:15,174 - absl - INFO - [process=0][thread=async_save] Finished saving checkpoint (finalized tmp dir) to `/root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt`.
|
| 132 |
+
2026-01-31 05:00:15,175 - absl - INFO - Finished async_save (blocking + background). Time taken: 14.764462s. directory=/root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt
|
| 133 |
+
2026-01-31 05:00:15,175 - absl - INFO - [process=0][thread=async_save] Background save thread done. Time taken: 5.084088s.
|
| 134 |
+
2026-01-31 05:00:15,175 - absl - INFO - [process=0][thread=MainThread] Done with waiting for background save thread=async_save.
|
| 135 |
+
2026-01-31 05:00:15,188 - absl - INFO - [process=0][thread=MainThread] No errors found in background save thread=async_save.
|
| 136 |
+
2026-01-31 05:00:15,188 - root - INFO - Training completed.
|
logs/run_20260131_hung_pitapat_train.csv
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
step,lr,loss,load_balance_loss,z_loss,time,tokens_processed,tokens_per_sec
|
| 2 |
+
0,2.6219192e-07,13.828883171081543,30.13552474975586,145.0,0,262144,0
|
| 3 |
+
100,2.6481384e-05,8.568872451782227,30.391618728637695,28.75,1954.9498462677002,26476544,134092.44257619866
|
| 4 |
+
200,5.2700576e-05,7.127870082855225,30.341196060180664,14.625,943.7928676605225,52690944,277755.86040378077
|
| 5 |
+
300,7.8919766e-05,6.367865085601807,30.527496337890625,13.5625,950.361020565033,78905344,275836.2288934614
|
| 6 |
+
400,0.00010513896,6.070593357086182,30.237375259399414,12.0,961.3227725028992,105119744,272690.9291012446
|
| 7 |
+
500,0.00013135816,5.671494960784912,30.15382194519043,10.25,938.9175844192505,131334144,279198.0940075205
|
| 8 |
+
600,0.00015757735,5.397560119628906,30.1779842376709,7.15625,942.0625352859497,157548544,278266.02818933874
|
| 9 |
+
700,0.00018379654,5.230807304382324,30.135395050048828,6.0625,948.1574845314026,183762944,276477.277537451
|
| 10 |
+
800,0.00021001573,5.1568450927734375,30.09786033630371,4.65625,960.0502038002014,209977344,273052.3872213619
|
| 11 |
+
900,0.00023623492,4.903904914855957,30.148042678833008,5.1875,942.0858097076416,236191744,278259.1535704708
|
| 12 |
+
1000,0.00026245412,4.8597412109375,30.101675033569336,3.984375,939.8473906517029,262406144,278921.87881505507
|
| 13 |
+
1100,0.0002886733,4.7186808586120605,30.091373443603516,3.609375,950.4498624801636,288620544,275810.4455041374
|
| 14 |
+
1200,0.0003148925,4.473597526550293,30.106189727783203,3.21875,962.2645902633667,314834944,272424.0324880422
|
| 15 |
+
1300,0.0003411117,4.498225688934326,30.161773681640625,3.4375,941.6033172607422,341049344,278401.7379660621
|
| 16 |
+
1400,0.00036733088,4.479489803314209,30.10297393798828,3.078125,945.871090888977,367263744,277145.58836302307
|
| 17 |
+
1500,0.0003935501,4.39519739151001,30.150604248046875,2.84375,937.6592016220093,393478144,279572.79099541745
|
| 18 |
+
1600,0.00041976926,4.434188365936279,30.091474533081055,2.71875,960.2378177642822,419692544,272999.0374783913
|
logs/run_20260131_hung_pitapat_train.png
ADDED
|
logs/run_20260131_hung_pitapat_val.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
step,loss,logits_loss
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/_CHECKPOINT_METADATA
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be4871b687172a0326d16d3ae1d3d560b809d1a03354610355db355daff9fc04
|
| 3 |
+
size 262
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/_METADATA
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6eaa0d18ed5c59340f63513422099b39027741ec86d76fea36e4e01c7c72c40
|
| 3 |
+
size 162596
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/_sharding
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:310491281805c39e94a9cc686e2ba0c2795f38bdff3d41ac295261d1d7c8b372
|
| 3 |
+
size 156539
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/array_metadatas/process_0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:728cb06ed5264ae669cc552162d189ea4aae41ba3e75b9681986bb590ac955d1
|
| 3 |
+
size 66277
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/d/24908042f611bcc78959e7a8c56485f7
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c09df69e6c749393afdb4f66e3bf5825eadbeff477bf360fe969744134724f5
|
| 3 |
+
size 190729
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/manifest.ocdbt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71a313ea1ea802277cc31a9f68fd4326bea6aed1850d381cf78bee8fad679247
|
| 3 |
+
size 120
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/6468bf454fcb2abaed124681f9a1cd48
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c3ca7e8916bdd27266e83ea779519972dfe0a3db2d192e08be15adc67091256
|
| 3 |
+
size 22159360
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/7763b086c54b886a362d829ce51d7c91
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:85803d68d57d83ad650fa3306d9d28cf6b2154f26a6a9cc92baac232fdb56769
|
| 3 |
+
size 572
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/81a3e4c8ed51983cfba1b04728dcef9d
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c4b84c165fe2fb44cd3c30cce087fa1dae7cd35bf20ceac204e8f716c7cfeeb
|
| 3 |
+
size 402571264
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/c2a448087757f68112629249bf32a694
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cac926e70a70202636036d0424d3ebf1d7b59da61d9faef6abd6f66b959c6ca2
|
| 3 |
+
size 581
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/c4b6ab29f0fbc27680bcca4261f1b1f1
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4402340fc51ec443ce06684e7502c73f0a883be22c3f463ce8e43c6c7e2b80cf
|
| 3 |
+
size 2149543936
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/c902b46c692fb50b81287f337039a7ba
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:410c37a4800a6f973fbd8101a97cb0c61c61046e5811ef1994da84efb11ac468
|
| 3 |
+
size 650
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/cba8c1c0f064d4c3234f460c76e9bf1f
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3520fc3acc6b56aa780ecc508b86fb29091105f1cfc50baa64a1e36906f6edb2
|
| 3 |
+
size 1363394560
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/d/e054c24c67b06441f4b1473692431e4b
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3edcb06f5d9b10e6300bc5e0adc1637570e0f6f73a5c57a909a9b1fc10dd1571
|
| 3 |
+
size 235
|
model_checkpoints/run_20260131_hung_pitapat/checkpoint-1605.pt/ocdbt.process_0/manifest.ocdbt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9793cc05cc75ccf13396cea3a2788a827da898361595d58aa8dc82a9a3fdc2f
|
| 3 |
+
size 397
|