Upload folder using huggingface_hub
Browse files- logs/output_run_20260131_noise_molinist.log +135 -0
- logs/run_20260131_noise_molinist_train.csv +17 -0
- logs/run_20260131_noise_molinist_train.png +0 -0
- logs/run_20260131_noise_molinist_val.csv +1 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/_CHECKPOINT_METADATA +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/_METADATA +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/_sharding +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/array_metadatas/process_0 +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/d/57e6fb1468fe020412b55af209042b6e +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/manifest.ocdbt +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/0e59d698e59e62f9753eaf6de21c796e +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/1c4ce7cfb6556d0c016aa8059d3ace0c +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/5ef09c7b5ec10afec38c6f32ca8052f0 +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/6af3dfd430e34f6af04c4b0fcf2967b0 +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/91c773dc8f398563d883a22a2f0ac638 +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/97234e61701d74f7fe8cb669afc9cd72 +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/9c4b0f7d711461b9666b2a001cfd4d99 +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/c9a85aa78bd1f23e216b7232f7eeeccc +3 -0
- model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/manifest.ocdbt +3 -0
logs/output_run_20260131_noise_molinist.log
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-31 03:41:50,750 - root - INFO - Run: run_20260131_noise_molinist
|
| 2 |
+
2026-01-31 03:41:50,750 - root - INFO - Log directory: /root/tiny_moe/training_runs/Tiny_MoE/logs
|
| 3 |
+
2026-01-31 03:41:50,750 - root - INFO - Output dir: /root/tiny_moe/training_runs
|
| 4 |
+
2026-01-31 03:41:53,160 - jax._src.xla_bridge - INFO - Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory
|
| 5 |
+
2026-01-31 03:41:56,518 - root - INFO - Flax version: 0.11.1
|
| 6 |
+
2026-01-31 03:41:56,518 - root - INFO - Optax version: 0.2.6
|
| 7 |
+
2026-01-31 03:41:56,518 - root - INFO - Platform: gpu
|
| 8 |
+
2026-01-31 03:41:56,518 - root - INFO - Num Devices: 8
|
| 9 |
+
2026-01-31 03:41:56,518 - root - INFO - Devices: [CudaDevice(id=0), CudaDevice(id=1), CudaDevice(id=2), CudaDevice(id=3), CudaDevice(id=4), CudaDevice(id=5), CudaDevice(id=6), CudaDevice(id=7)]
|
| 10 |
+
2026-01-31 03:41:59,446 - root - INFO - Model config:
|
| 11 |
+
Config(name='Tiny_MoE',
|
| 12 |
+
dtype=<class 'jax.numpy.bfloat16'>,
|
| 13 |
+
vocab_size=50304,
|
| 14 |
+
block_size=2048,
|
| 15 |
+
n_layer=30,
|
| 16 |
+
n_embed=672,
|
| 17 |
+
n_glu_hidden=2048,
|
| 18 |
+
n_head=12,
|
| 19 |
+
n_kv_head=4,
|
| 20 |
+
n_experts=8,
|
| 21 |
+
init_stddev=0.02,
|
| 22 |
+
expert_load_factor=1.25,
|
| 23 |
+
aux_loss_coeff=0.01,
|
| 24 |
+
moe_bias=False,
|
| 25 |
+
mlp_bias=False,
|
| 26 |
+
attention_bias=False,
|
| 27 |
+
load_balance_loss_coeff=0.01,
|
| 28 |
+
z_loss_coeff=0.0005,
|
| 29 |
+
expert_top_k=2,
|
| 30 |
+
ln_epsilon=1e-05,
|
| 31 |
+
rope_theta=0.0001,
|
| 32 |
+
expert_partition_spec=PartitionSpec('devices',),
|
| 33 |
+
sdpa_implementation='cudnn')
|
| 34 |
+
2026-01-31 03:43:40,423 - root - INFO - Parameter Count: 1,062,182,160
|
| 35 |
+
2026-01-31 03:43:40,423 - root - INFO - Sharded / MoE Parameter Count: 992,210,160
|
| 36 |
+
2026-01-31 03:43:40,423 - root - INFO - Replicated Parameter Count: 69,972,000
|
| 37 |
+
2026-01-31 03:43:41,629 - root - INFO - Weight decay param count: 1,062,140,928
|
| 38 |
+
2026-01-31 03:43:41,629 - root - INFO - Training config:
|
| 39 |
+
TrainerConfig(num_tokens=100000000000,
|
| 40 |
+
num_tokens_per_batch=262144,
|
| 41 |
+
mB=128,
|
| 42 |
+
T=2048,
|
| 43 |
+
max_steps=381469,
|
| 44 |
+
max_lr=0.001,
|
| 45 |
+
min_lr=0.0001,
|
| 46 |
+
max_grad_norm=1.0,
|
| 47 |
+
weight_decay=0.1,
|
| 48 |
+
adam_b1=0.9,
|
| 49 |
+
adam_b2=0.95,
|
| 50 |
+
warmup_steps=3814,
|
| 51 |
+
print_interval=100,
|
| 52 |
+
val=True,
|
| 53 |
+
val_interval=5000,
|
| 54 |
+
val_batches=50,
|
| 55 |
+
checkpoint_model=True,
|
| 56 |
+
checkpoint_optimizer=False,
|
| 57 |
+
checkpoint_interval=10000)
|
| 58 |
+
2026-01-31 03:43:41,629 - root - INFO - Effective batch size per device: 16
|
| 59 |
+
2026-01-31 03:43:42,565 - root - INFO - ModdedNanoGPTDataLoader: 1030 shards (train)
|
| 60 |
+
2026-01-31 03:43:42,566 - root - INFO - Downloading fineweb_train_000001.bin from kjj0/fineweb100B-gpt2...
|
| 61 |
+
2026-01-31 03:43:45,388 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000001.bin "HTTP/1.1 302 Found"
|
| 62 |
+
2026-01-31 03:43:45,423 - httpx - INFO - HTTP Request: GET https://huggingface.co/api/datasets/kjj0/fineweb100B-gpt2/xet-read-token/50d1422b27e1a928440c26a8829f3f827f44ac56 "HTTP/1.1 200 OK"
|
| 63 |
+
2026-01-31 03:43:47,435 - root - INFO - HuggingfaceDataLoader initialized:
|
| 64 |
+
------------------------
|
| 65 |
+
label: train
|
| 66 |
+
shards: 1,030
|
| 67 |
+
shard size: 100,000,000
|
| 68 |
+
batch size: 128
|
| 69 |
+
block size: 2048
|
| 70 |
+
device rank: 1
|
| 71 |
+
start shard: 0
|
| 72 |
+
start pos: 0
|
| 73 |
+
------------------------
|
| 74 |
+
2026-01-31 03:43:47,436 - root - INFO - ModdedNanoGPTDataLoader: 1 shards (val)
|
| 75 |
+
2026-01-31 03:43:47,436 - root - INFO - Downloading fineweb_val_000000.bin from kjj0/fineweb100B-gpt2...
|
| 76 |
+
2026-01-31 03:43:47,474 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_val_000000.bin "HTTP/1.1 302 Found"
|
| 77 |
+
2026-01-31 03:43:49,417 - root - INFO - Starting from step: 0
|
| 78 |
+
2026-01-31 03:45:21,400 - root - INFO - 0 | lr: 0.0000 | loss: 13.8291 | logits loss: 13.4375 | load balance loss: 30.1163 | z loss: 146.0000 | avg iter time: 0.00ms | avg tok/sec: 0.00 | tokens processed: 262,144
|
| 79 |
+
2026-01-31 03:48:11,099 - root - INFO - 100 | lr: 0.0000 | loss: 8.5676 | logits loss: 8.2500 | load balance loss: 30.2992 | z loss: 29.8750 | avg iter time: 1689.63ms | avg tok/sec: 155,149.17 | tokens processed: 26,476,544
|
| 80 |
+
2026-01-31 03:49:41,598 - root - INFO - 200 | lr: 0.0001 | loss: 7.1793 | logits loss: 6.8750 | load balance loss: 30.4909 | z loss: 13.2500 | avg iter time: 897.60ms | avg tok/sec: 292,049.90 | tokens processed: 52,690,944
|
| 81 |
+
2026-01-31 03:51:11,728 - root - INFO - 300 | lr: 0.0001 | loss: 6.3569 | logits loss: 6.0625 | load balance loss: 30.2167 | z loss: 11.9375 | avg iter time: 893.90ms | avg tok/sec: 293,259.62 | tokens processed: 78,905,344
|
| 82 |
+
2026-01-31 03:52:23,895 - root - INFO - Downloading fineweb_train_000002.bin from kjj0/fineweb100B-gpt2...
|
| 83 |
+
2026-01-31 03:52:23,946 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000002.bin "HTTP/1.1 302 Found"
|
| 84 |
+
2026-01-31 03:52:48,290 - root - INFO - 400 | lr: 0.0001 | loss: 6.0437 | logits loss: 5.7500 | load balance loss: 30.2020 | z loss: 11.4375 | avg iter time: 958.36ms | avg tok/sec: 273,534.57 | tokens processed: 105,119,744
|
| 85 |
+
2026-01-31 03:54:18,499 - root - INFO - 500 | lr: 0.0001 | loss: 5.6435 | logits loss: 5.3438 | load balance loss: 30.1641 | z loss: 8.0000 | avg iter time: 894.71ms | avg tok/sec: 292,994.58 | tokens processed: 131,334,144
|
| 86 |
+
2026-01-31 03:55:48,609 - root - INFO - 600 | lr: 0.0002 | loss: 5.3482 | logits loss: 5.0312 | load balance loss: 30.1197 | z loss: 5.4688 | avg iter time: 893.73ms | avg tok/sec: 293,313.41 | tokens processed: 157,548,544
|
| 87 |
+
2026-01-31 03:57:19,032 - root - INFO - 700 | lr: 0.0002 | loss: 5.1981 | logits loss: 4.9062 | load balance loss: 30.1319 | z loss: 4.9062 | avg iter time: 896.88ms | avg tok/sec: 292,285.86 | tokens processed: 183,762,944
|
| 88 |
+
2026-01-31 03:58:13,933 - root - INFO - Downloading fineweb_train_000003.bin from kjj0/fineweb100B-gpt2...
|
| 89 |
+
2026-01-31 03:58:13,985 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000003.bin "HTTP/1.1 302 Found"
|
| 90 |
+
2026-01-31 03:58:14,272 - httpx - INFO - HTTP Request: GET https://huggingface.co/api/datasets/kjj0/fineweb100B-gpt2/xet-read-token/50d1422b27e1a928440c26a8829f3f827f44ac56 "HTTP/1.1 200 OK"
|
| 91 |
+
2026-01-31 03:58:51,901 - root - INFO - 800 | lr: 0.0002 | loss: 5.1305 | logits loss: 4.8125 | load balance loss: 30.1114 | z loss: 4.3750 | avg iter time: 921.34ms | avg tok/sec: 284,525.32 | tokens processed: 209,977,344
|
| 92 |
+
2026-01-31 04:00:21,992 - root - INFO - 900 | lr: 0.0002 | loss: 4.8817 | logits loss: 4.5625 | load balance loss: 30.2340 | z loss: 5.2812 | avg iter time: 893.65ms | avg tok/sec: 293,341.94 | tokens processed: 236,191,744
|
| 93 |
+
2026-01-31 04:01:51,909 - root - INFO - 1000 | lr: 0.0003 | loss: 4.8395 | logits loss: 4.5312 | load balance loss: 30.1275 | z loss: 3.5156 | avg iter time: 891.78ms | avg tok/sec: 293,956.16 | tokens processed: 262,406,144
|
| 94 |
+
2026-01-31 04:03:22,268 - root - INFO - 1100 | lr: 0.0003 | loss: 4.7095 | logits loss: 4.4062 | load balance loss: 30.0912 | z loss: 3.1875 | avg iter time: 896.20ms | avg tok/sec: 292,504.97 | tokens processed: 288,620,544
|
| 95 |
+
2026-01-31 04:04:01,025 - root - INFO - Downloading fineweb_train_000004.bin from kjj0/fineweb100B-gpt2...
|
| 96 |
+
2026-01-31 04:04:01,091 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000004.bin "HTTP/1.1 302 Found"
|
| 97 |
+
2026-01-31 04:04:54,938 - root - INFO - 1200 | lr: 0.0003 | loss: 4.4683 | logits loss: 4.1562 | load balance loss: 30.0939 | z loss: 2.9844 | avg iter time: 919.31ms | avg tok/sec: 285,152.31 | tokens processed: 314,834,944
|
| 98 |
+
2026-01-31 04:06:25,085 - root - INFO - 1300 | lr: 0.0003 | loss: 4.4734 | logits loss: 4.1562 | load balance loss: 30.1123 | z loss: 2.9219 | avg iter time: 894.21ms | avg tok/sec: 293,158.49 | tokens processed: 341,049,344
|
| 99 |
+
2026-01-31 04:07:55,049 - root - INFO - 1400 | lr: 0.0004 | loss: 4.4978 | logits loss: 4.1875 | load balance loss: 30.2392 | z loss: 3.6875 | avg iter time: 892.26ms | avg tok/sec: 293,797.17 | tokens processed: 367,263,744
|
| 100 |
+
2026-01-31 04:09:25,288 - root - INFO - 1500 | lr: 0.0004 | loss: 4.5336 | logits loss: 4.2188 | load balance loss: 30.2998 | z loss: 3.7969 | avg iter time: 895.08ms | avg tok/sec: 292,872.50 | tokens processed: 393,478,144
|
| 101 |
+
2026-01-31 04:09:47,025 - root - INFO - Downloading fineweb_train_000005.bin from kjj0/fineweb100B-gpt2...
|
| 102 |
+
2026-01-31 04:09:47,071 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000005.bin "HTTP/1.1 302 Found"
|
| 103 |
+
2026-01-31 04:09:51,583 - root - WARNING - Received KeyboardInterrupt. Exiting...
|
| 104 |
+
2026-01-31 04:09:51,936 - absl - INFO - orbax-checkpoint version: 0.11.32
|
| 105 |
+
2026-01-31 04:09:51,936 - absl - INFO - save_device_host_concurrent_bytes=None
|
| 106 |
+
2026-01-31 04:09:51,936 - absl - INFO - Created BasePyTreeCheckpointHandler: use_ocdbt=True, use_zarr3=False, pytree_metadata_options=PyTreeMetadataOptions(support_rich_types=False), array_metadata_store=<orbax.checkpoint._src.metadata.array_metadata_store.Store object at 0x7ecf0116fd90>, enable_pinned_host_transfer=True, save_concurrent_bytes: 96000000000 (89.4 GiB), restore_concurrent_bytes: 96000000000 (89.4 GiB)
|
| 107 |
+
2026-01-31 04:09:51,936 - absl - INFO - [thread=MainThread] Failed to get flag value for EXPERIMENTAL_ORBAX_USE_DISTRIBUTED_PROCESS_ID.
|
| 108 |
+
2026-01-31 04:09:51,936 - absl - INFO - [process=0][thread=MainThread] Using barrier_sync_fn: <function get_barrier_sync_fn.<locals>.<lambda> at 0x7ec8b447f7e0> timeout: 600 secs and primary_host=0 for async checkpoint writes
|
| 109 |
+
2026-01-31 04:09:51,938 - absl - INFO - [process=0] Started async saving checkpoint to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt.
|
| 110 |
+
2026-01-31 04:09:51,939 - absl - INFO - Using ThreadSafeKeyValueSignalingClient
|
| 111 |
+
2026-01-31 04:09:51,962 - absl - INFO - Creating tmp directory /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt.orbax-checkpoint-tmp
|
| 112 |
+
2026-01-31 04:09:52,027 - absl - INFO - Wrote Metadata={'item_handlers': None, 'metrics': {}, 'performance_metrics': {}, 'init_timestamp_nsecs': 1769832591994428506, 'commit_timestamp_nsecs': None, 'custom_metadata': {}}, json={"item_handlers": null, "metrics": {}, "performance_metrics": {}, "init_timestamp_nsecs": 1769832591994428506, "commit_timestamp_nsecs": null, "custom_metadata": {}} to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt.orbax-checkpoint-tmp/_CHECKPOINT_METADATA
|
| 113 |
+
2026-01-31 04:09:52,045 - absl - INFO - Scheduling D2H of 422 prioritized jax.Array.
|
| 114 |
+
2026-01-31 04:09:52,047 - absl - INFO - Transferring arrays to host memory with options: use_replica_parallel=True, min_slice_bytes_for_replica_parallel=None, max_replicas_for_replica_parallel=None, enable_pinned_host_transfer=True
|
| 115 |
+
2026-01-31 04:09:59,922 - absl - INFO - [process=0][thread=MainThread] Initiated "orbax.checkpoint._src.serialization.jax_array_handlers.ArrayHandler".serialize. Time taken: 7.877772s
|
| 116 |
+
2026-01-31 04:09:59,933 - absl - INFO - [process=0] /jax/checkpoint/write/blocking_gbytes_per_sec: 508.106 MiB/s (total gbytes: 4.0 GiB) (time elapsed: 7 seconds) (per-host)
|
| 117 |
+
2026-01-31 04:09:59,943 - absl - INFO - [process=0][thread=MainThread] Initiated Pytree async_save. Time taken: 7.997003s (batch_requests_ready=0.032371s, total_serialization_initiated=7.949082s, others=0.015550s)
|
| 118 |
+
2026-01-31 04:09:59,988 - absl - INFO - [process=0][thread=async_save] Background save thread started.
|
| 119 |
+
2026-01-31 04:09:59,988 - absl - INFO - Finished blocking save. Time taken: 8.051523s. Continuing background save to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt.
|
| 120 |
+
2026-01-31 04:09:59,995 - absl - INFO - [process=0][thread=MainThread] Waiting for background save thread=async_save.
|
| 121 |
+
2026-01-31 04:10:00,074 - absl - INFO - [process=0][thread=array_type_handler] Wrote 422 array_metadata.ArrayMetadata to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt.orbax-checkpoint-tmp/array_metadatas/process_0
|
| 122 |
+
2026-01-31 04:10:04,820 - absl - INFO - [process=0][thread=write_metadata_after_commits] Commit + Array metadata written. Time taken: 4.886604s (commit=4.859642s, array_metadata_write=0.026962s)
|
| 123 |
+
2026-01-31 04:10:04,824 - absl - INFO - [process=0] /jax/checkpoint/write/gbytes_per_sec: 315.158 MiB/s (total gbytes: 4.0 GiB) (time elapsed: 12 seconds) (per-host)
|
| 124 |
+
2026-01-31 04:10:04,824 - absl - INFO - [process=0][thread=async_save] 2 Handler Commit operations completed. Time taken: 4.834271s.
|
| 125 |
+
2026-01-31 04:10:04,826 - absl - INFO - Read Metadata={'item_handlers': None, 'metrics': {}, 'performance_metrics': {}, 'init_timestamp_nsecs': 1769832591994428506, 'commit_timestamp_nsecs': None, 'custom_metadata': {}} from /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt.orbax-checkpoint-tmp/_CHECKPOINT_METADATA
|
| 126 |
+
2026-01-31 04:10:04,828 - absl - INFO - Updated Metadata={'item_handlers': 'orbax.checkpoint._src.handlers.standard_checkpoint_handler.StandardCheckpointHandler', 'metrics': {}, 'performance_metrics': {}, 'init_timestamp_nsecs': 1769832591994428506, 'commit_timestamp_nsecs': None, 'custom_metadata': {}} to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt.orbax-checkpoint-tmp/_CHECKPOINT_METADATA
|
| 127 |
+
2026-01-31 04:10:04,831 - absl - INFO - [process=0][thread=async_save] Skipped cross-host ArrayMetadata validation because only one process is found: process_index=0.
|
| 128 |
+
2026-01-31 04:10:04,856 - absl - INFO - [process=0][thread=async_save] Pytree save finalize (merge_ocdbt + ArrayMetadata validation) completed. Time taken: 0.030280s. use_zarr3=False, enable_post_merge_validation=True, directory=/root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt.orbax-checkpoint-tmp
|
| 129 |
+
2026-01-31 04:10:04,857 - absl - INFO - Renaming /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt.orbax-checkpoint-tmp to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt
|
| 130 |
+
2026-01-31 04:10:04,859 - absl - INFO - [process=0][thread=async_save] Finished saving checkpoint (finalized tmp dir) to `/root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt`.
|
| 131 |
+
2026-01-31 04:10:04,859 - absl - INFO - Finished async_save (blocking + background). Time taken: 12.922672s. directory=/root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt
|
| 132 |
+
2026-01-31 04:10:04,860 - absl - INFO - [process=0][thread=async_save] Background save thread done. Time taken: 4.869863s.
|
| 133 |
+
2026-01-31 04:10:04,860 - absl - INFO - [process=0][thread=MainThread] Done with waiting for background save thread=async_save.
|
| 134 |
+
2026-01-31 04:10:04,871 - absl - INFO - [process=0][thread=MainThread] No errors found in background save thread=async_save.
|
| 135 |
+
2026-01-31 04:10:04,872 - root - INFO - Training completed.
|
logs/run_20260131_noise_molinist_train.csv
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
step,lr,loss,load_balance_loss,z_loss,time,tokens_processed,tokens_per_sec
|
| 2 |
+
0,2.6219192e-07,13.829061508178711,30.116331100463867,146.0,0,262144,0
|
| 3 |
+
100,2.6481384e-05,8.567622184753418,30.29916763305664,29.875,1689.6255588531494,26476544,155149.16818489233
|
| 4 |
+
200,5.2700576e-05,7.179270267486572,30.49093246459961,13.25,897.600040435791,52690944,292049.8977169467
|
| 5 |
+
300,7.8919766e-05,6.356916904449463,30.21665382385254,11.9375,893.8973736763,78905344,293259.6153872672
|
| 6 |
+
400,0.00010513896,6.043698310852051,30.202049255371094,11.4375,958.3578491210938,105119744,273534.567740444
|
| 7 |
+
500,0.00013135816,5.643509387969971,30.16407012939453,8.0,894.7059607505798,131334144,292994.5831366589
|
| 8 |
+
600,0.00015757735,5.348178386688232,30.11968421936035,5.46875,893.7334275245667,157548544,293313.4108299807
|
| 9 |
+
700,0.00018379654,5.198059558868408,30.131858825683594,4.90625,896.8753933906555,183762944,292285.8648278434
|
| 10 |
+
800,0.00021001573,5.130524635314941,30.111366271972656,4.375,921.3380289077759,209977344,284525.32271002146
|
| 11 |
+
900,0.00023623492,4.881711483001709,30.23398780822754,5.28125,893.6465215682983,236191744,293341.935175837
|
| 12 |
+
1000,0.00026245412,4.8395304679870605,30.12749481201172,3.515625,891.7792367935181,262406144,293956.1599825593
|
| 13 |
+
1100,0.0002886733,4.709547996520996,30.091182708740234,3.1875,896.2035751342773,288620544,292504.97015783854
|
| 14 |
+
1200,0.0003148925,4.468289852142334,30.09386444091797,2.984375,919.312219619751,314834944,285152.3066977494
|
| 15 |
+
1300,0.0003411117,4.473387718200684,30.11233901977539,2.921875,894.2057299613953,341049344,293158.48827239935
|
| 16 |
+
1400,0.00036733088,4.497753620147705,30.239192962646484,3.6875,892.2618389129639,367263744,293797.1664454104
|
| 17 |
+
1500,0.0003935501,4.533601760864258,30.299835205078125,3.796875,895.078911781311,393478144,292872.50157452933
|
logs/run_20260131_noise_molinist_train.png
ADDED
|
logs/run_20260131_noise_molinist_val.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
step,loss,logits_loss
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/_CHECKPOINT_METADATA
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1db0f3a2595c7b52b7d907a100182748bf16ed8f243a3226244fa9d628826fd
|
| 3 |
+
size 262
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/_METADATA
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f11aef6af980387400f0d088a4e26a45fbe95f1bf7093b76ec74c19fc3e3396
|
| 3 |
+
size 141276
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/_sharding
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3954f37c73c953f25b799d9cf1f2ff7d2b6093b6b48605bd85eecb00b97acf17
|
| 3 |
+
size 136999
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/array_metadatas/process_0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e0d26b2b58f0bc3c646880d5a399ddf2afd1c5e11c7ea2d800d8f1d86447e788
|
| 3 |
+
size 58497
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/d/57e6fb1468fe020412b55af209042b6e
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:63977d0e7a70c32846cb724b398acf6ed75ab05df3ce01fcd4dce506a5491046
|
| 3 |
+
size 176536
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/manifest.ocdbt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:63bc58faf212d66134962213b11170c29695169f33238dd8585b2c210825109f
|
| 3 |
+
size 120
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/0e59d698e59e62f9753eaf6de21c796e
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:53d56b0a5b5d6a225e9f8d6fd640ab20662a8e315c4871b3821d043b0b5836b7
|
| 3 |
+
size 656711680
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/1c4ce7cfb6556d0c016aa8059d3ace0c
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:975e9b80be83327d6d81e37b75419685a142ba36ec3aea1153f81e089f96dc31
|
| 3 |
+
size 1125806080
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/5ef09c7b5ec10afec38c6f32ca8052f0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c08c9f61a32076770e78b2e1f112752c348cc928680535d52a3e14cd3b39ce63
|
| 3 |
+
size 586
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/6af3dfd430e34f6af04c4b0fcf2967b0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9771eb413c5fe10b05ea935e4d0fa8fcc8d88c5faa81a3d9f9589a99f5227dc4
|
| 3 |
+
size 2148331520
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/91c773dc8f398563d883a22a2f0ac638
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10a29a2bc2dc261a96f651ed333da8ff47e13da03fbcf28bb4ab3ab814b8375c
|
| 3 |
+
size 250
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/97234e61701d74f7fe8cb669afc9cd72
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:143021af6ef3f7e567a1b7d74fe4e9c7e97602cb64660ef706ef884ac74a8ca6
|
| 3 |
+
size 545
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/9c4b0f7d711461b9666b2a001cfd4d99
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c8273d626e84b6d2f5d8e615b7fe56ba60da92b664517f81a99d28fe7c4c987
|
| 3 |
+
size 538
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/d/c9a85aa78bd1f23e216b7232f7eeeccc
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:56abc8658926f15339e262fc26050fc02f19c3471c0669b70abd929ac36027ad
|
| 3 |
+
size 7114752
|
model_checkpoints/run_20260131_noise_molinist/checkpoint-1527.pt/ocdbt.process_0/manifest.ocdbt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3e6afec18cc8e5c779e0e4c2b8ec1dd98431ffc8a143c60ba10b9b6c92759d2d
|
| 3 |
+
size 397
|