Upload folder using huggingface_hub
Browse files- logs/output_run_20260201_schochat_binnogue.log +125 -0
- logs/run_20260201_schochat_binnogue_train.csv +10 -0
- logs/run_20260201_schochat_binnogue_train.png +0 -0
- logs/run_20260201_schochat_binnogue_val.csv +1 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/_CHECKPOINT_METADATA +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/_METADATA +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/_sharding +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/array_metadatas/process_0 +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/d/4cf13d30c132c8167f7da87863156061 +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/manifest.ocdbt +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/11e26cbac06b98246a4c15a183667147 +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/3654a848f4b338660d943b6a879de37f +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/38fe789b61c161acbfa5d40f887feee4 +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/4b8bacca8870cc07a225fbee75575100 +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/aee83c1497702ba66492dadb6e887508 +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/afae1194d0b381896ee5f0054f162220 +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/db5168775c1987f9b1a76c55276f1e23 +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/efd73f5d0912834dfc3383dac421655e +3 -0
- model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/manifest.ocdbt +3 -0
logs/output_run_20260201_schochat_binnogue.log
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-02-01 05:25:15,094 - root - INFO - Run: run_20260201_schochat_binnogue
|
| 2 |
+
2026-02-01 05:25:15,094 - root - INFO - Log directory: /root/tiny_moe/training_runs/Tiny_MoE/logs
|
| 3 |
+
2026-02-01 05:25:15,094 - root - INFO - Output dir: /root/tiny_moe/training_runs
|
| 4 |
+
2026-02-01 05:25:16,627 - jax._src.xla_bridge - INFO - Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory
|
| 5 |
+
2026-02-01 05:25:17,819 - root - INFO - Flax version: 0.11.1
|
| 6 |
+
2026-02-01 05:25:17,819 - root - INFO - Optax version: 0.2.6
|
| 7 |
+
2026-02-01 05:25:17,819 - root - INFO - Platform: gpu
|
| 8 |
+
2026-02-01 05:25:17,819 - root - INFO - Num Devices: 8
|
| 9 |
+
2026-02-01 05:25:17,819 - root - INFO - Devices: [CudaDevice(id=0), CudaDevice(id=1), CudaDevice(id=2), CudaDevice(id=3), CudaDevice(id=4), CudaDevice(id=5), CudaDevice(id=6), CudaDevice(id=7)]
|
| 10 |
+
2026-02-01 05:25:18,288 - root - INFO - Model config:
|
| 11 |
+
Config(name='Tiny_MoE',
|
| 12 |
+
dtype=<class 'jax.numpy.bfloat16'>,
|
| 13 |
+
vocab_size=50304,
|
| 14 |
+
block_size=2048,
|
| 15 |
+
n_layer=30,
|
| 16 |
+
n_embed=672,
|
| 17 |
+
n_glu_hidden=2048,
|
| 18 |
+
n_head=12,
|
| 19 |
+
n_kv_head=4,
|
| 20 |
+
n_experts=8,
|
| 21 |
+
init_stddev=0.02,
|
| 22 |
+
expert_load_factor=1.25,
|
| 23 |
+
aux_loss_coeff=0.01,
|
| 24 |
+
moe_bias=True,
|
| 25 |
+
mlp_bias=False,
|
| 26 |
+
attention_bias=False,
|
| 27 |
+
load_balance_loss_coeff=0.01,
|
| 28 |
+
z_loss_coeff=0.0005,
|
| 29 |
+
expert_top_k=2,
|
| 30 |
+
ln_epsilon=1e-05,
|
| 31 |
+
rope_theta=0.0001,
|
| 32 |
+
expert_partition_spec=PartitionSpec('devices',),
|
| 33 |
+
sdpa_implementation='cudnn')
|
| 34 |
+
2026-02-01 05:25:55,210 - root - INFO - Parameter Count: 1,062,185,520
|
| 35 |
+
2026-02-01 05:25:55,211 - root - INFO - Sharded / MoE Parameter Count: 992,210,160
|
| 36 |
+
2026-02-01 05:25:55,211 - root - INFO - Replicated Parameter Count: 69,975,360
|
| 37 |
+
2026-02-01 05:25:55,778 - root - INFO - Weight decay param count: 1,062,140,928
|
| 38 |
+
2026-02-01 05:25:55,779 - root - INFO - Training config:
|
| 39 |
+
TrainerConfig(num_tokens=100000000000,
|
| 40 |
+
num_tokens_per_batch=262144,
|
| 41 |
+
mB=128,
|
| 42 |
+
T=2048,
|
| 43 |
+
max_steps=381469,
|
| 44 |
+
max_lr=0.001,
|
| 45 |
+
min_lr=0.0001,
|
| 46 |
+
max_grad_norm=1.0,
|
| 47 |
+
weight_decay=0.1,
|
| 48 |
+
adam_b1=0.9,
|
| 49 |
+
adam_b2=0.95,
|
| 50 |
+
warmup_steps=3814,
|
| 51 |
+
print_interval=100,
|
| 52 |
+
val=True,
|
| 53 |
+
val_interval=5000,
|
| 54 |
+
val_batches=50,
|
| 55 |
+
checkpoint_model=True,
|
| 56 |
+
checkpoint_optimizer=False,
|
| 57 |
+
checkpoint_interval=10000)
|
| 58 |
+
2026-02-01 05:25:55,779 - root - INFO - Effective batch size per device: 16
|
| 59 |
+
2026-02-01 05:25:56,946 - root - INFO - ModdedNanoGPTDataLoader: 1030 shards (train)
|
| 60 |
+
2026-02-01 05:25:56,946 - root - INFO - Downloading fineweb_train_000001.bin from kjj0/fineweb100B-gpt2...
|
| 61 |
+
2026-02-01 05:25:57,282 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000001.bin "HTTP/1.1 302 Found"
|
| 62 |
+
2026-02-01 05:25:57,283 - huggingface_hub.utils._http - WARNING - Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
|
| 63 |
+
2026-02-01 05:25:57,319 - httpx - INFO - HTTP Request: GET https://huggingface.co/api/datasets/kjj0/fineweb100B-gpt2/xet-read-token/50d1422b27e1a928440c26a8829f3f827f44ac56 "HTTP/1.1 200 OK"
|
| 64 |
+
2026-02-01 05:25:58,819 - root - INFO - HuggingfaceDataLoader initialized:
|
| 65 |
+
------------------------
|
| 66 |
+
label: train
|
| 67 |
+
shards: 1,030
|
| 68 |
+
shard size: 100,000,000
|
| 69 |
+
batch size: 128
|
| 70 |
+
block size: 2048
|
| 71 |
+
device rank: 1
|
| 72 |
+
start shard: 0
|
| 73 |
+
start pos: 0
|
| 74 |
+
------------------------
|
| 75 |
+
2026-02-01 05:25:58,820 - root - INFO - ModdedNanoGPTDataLoader: 1 shards (val)
|
| 76 |
+
2026-02-01 05:25:58,820 - root - INFO - Downloading fineweb_val_000000.bin from kjj0/fineweb100B-gpt2...
|
| 77 |
+
2026-02-01 05:25:58,868 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_val_000000.bin "HTTP/1.1 302 Found"
|
| 78 |
+
2026-02-01 05:26:00,506 - root - INFO - Starting from step: 0
|
| 79 |
+
2026-02-01 05:26:46,917 - root - INFO - 0 | lr: 0.0000 | loss: 13.8267 | logits loss: 13.4375 | load balance loss: 30.1344 | z loss: 145.0000 | avg iter time: 0.00ms | avg tok/sec: 0.00 | tokens processed: 262,144
|
| 80 |
+
2026-02-01 05:29:29,739 - root - INFO - 100 | lr: 0.0000 | loss: 8.5611 | logits loss: 8.2500 | load balance loss: 30.3308 | z loss: 31.7500 | avg iter time: 1617.04ms | avg tok/sec: 162,113.59 | tokens processed: 26,476,544
|
| 81 |
+
2026-02-01 05:31:31,389 - root - INFO - 200 | lr: 0.0001 | loss: 7.1300 | logits loss: 6.8125 | load balance loss: 30.2802 | z loss: 16.5000 | avg iter time: 1205.32ms | avg tok/sec: 217,489.18 | tokens processed: 52,690,944
|
| 82 |
+
2026-02-01 05:33:33,287 - root - INFO - 300 | lr: 0.0001 | loss: 6.4070 | logits loss: 6.0938 | load balance loss: 30.4117 | z loss: 16.5000 | avg iter time: 1207.82ms | avg tok/sec: 217,038.16 | tokens processed: 78,905,344
|
| 83 |
+
2026-02-01 05:35:10,452 - root - INFO - Downloading fineweb_train_000002.bin from kjj0/fineweb100B-gpt2...
|
| 84 |
+
2026-02-01 05:35:10,793 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000002.bin "HTTP/1.1 302 Found"
|
| 85 |
+
2026-02-01 05:35:36,999 - root - INFO - 400 | lr: 0.0001 | loss: 6.0872 | logits loss: 5.7812 | load balance loss: 30.2631 | z loss: 12.4375 | avg iter time: 1226.04ms | avg tok/sec: 213,813.83 | tokens processed: 105,119,744
|
| 86 |
+
2026-02-01 05:37:38,351 - root - INFO - 500 | lr: 0.0001 | loss: 5.6965 | logits loss: 5.3750 | load balance loss: 30.1777 | z loss: 10.3750 | avg iter time: 1202.32ms | avg tok/sec: 218,031.47 | tokens processed: 131,334,144
|
| 87 |
+
2026-02-01 05:39:39,704 - root - INFO - 600 | lr: 0.0002 | loss: 5.4016 | logits loss: 5.0938 | load balance loss: 30.1175 | z loss: 6.7500 | avg iter time: 1202.36ms | avg tok/sec: 218,024.13 | tokens processed: 157,548,544
|
| 88 |
+
2026-02-01 05:41:41,327 - root - INFO - 700 | lr: 0.0002 | loss: 5.2496 | logits loss: 4.9375 | load balance loss: 30.1906 | z loss: 6.1875 | avg iter time: 1205.04ms | avg tok/sec: 217,538.92 | tokens processed: 183,762,944
|
| 89 |
+
2026-02-01 05:42:55,342 - root - INFO - Downloading fineweb_train_000003.bin from kjj0/fineweb100B-gpt2...
|
| 90 |
+
2026-02-01 05:42:55,425 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/datasets/kjj0/fineweb100B-gpt2/resolve/main/fineweb_train_000003.bin "HTTP/1.1 302 Found"
|
| 91 |
+
2026-02-01 05:42:55,460 - httpx - INFO - HTTP Request: GET https://huggingface.co/api/datasets/kjj0/fineweb100B-gpt2/xet-read-token/50d1422b27e1a928440c26a8829f3f827f44ac56 "HTTP/1.1 200 OK"
|
| 92 |
+
2026-02-01 05:43:44,404 - root - INFO - 800 | lr: 0.0002 | loss: 5.1733 | logits loss: 4.8750 | load balance loss: 30.1197 | z loss: 4.4062 | avg iter time: 1219.62ms | avg tok/sec: 214,939.79 | tokens processed: 209,977,344
|
| 93 |
+
2026-02-01 05:44:01,359 - root - WARNING - Received KeyboardInterrupt. Exiting...
|
| 94 |
+
2026-02-01 05:44:01,522 - absl - INFO - orbax-checkpoint version: 0.11.32
|
| 95 |
+
2026-02-01 05:44:01,522 - absl - INFO - save_device_host_concurrent_bytes=None
|
| 96 |
+
2026-02-01 05:44:01,522 - absl - INFO - Created BasePyTreeCheckpointHandler: use_ocdbt=True, use_zarr3=False, pytree_metadata_options=PyTreeMetadataOptions(support_rich_types=False), array_metadata_store=<orbax.checkpoint._src.metadata.array_metadata_store.Store object at 0x743fb1503b10>, enable_pinned_host_transfer=True, save_concurrent_bytes: 96000000000 (89.4 GiB), restore_concurrent_bytes: 96000000000 (89.4 GiB)
|
| 97 |
+
2026-02-01 05:44:01,522 - absl - INFO - [thread=MainThread] Failed to get flag value for EXPERIMENTAL_ORBAX_USE_DISTRIBUTED_PROCESS_ID.
|
| 98 |
+
2026-02-01 05:44:01,522 - absl - INFO - [process=0][thread=MainThread] Using barrier_sync_fn: <function get_barrier_sync_fn.<locals>.<lambda> at 0x74380c5904a0> timeout: 600 secs and primary_host=0 for async checkpoint writes
|
| 99 |
+
2026-02-01 05:44:01,523 - absl - INFO - [process=0] Started async saving checkpoint to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt.
|
| 100 |
+
2026-02-01 05:44:01,524 - absl - INFO - Using ThreadSafeKeyValueSignalingClient
|
| 101 |
+
2026-02-01 05:44:01,553 - absl - INFO - Creating tmp directory /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt.orbax-checkpoint-tmp
|
| 102 |
+
2026-02-01 05:44:01,577 - absl - INFO - Scheduling D2H of 482 prioritized jax.Array.
|
| 103 |
+
2026-02-01 05:44:01,578 - absl - INFO - Transferring arrays to host memory with options: use_replica_parallel=True, min_slice_bytes_for_replica_parallel=None, max_replicas_for_replica_parallel=None, enable_pinned_host_transfer=True
|
| 104 |
+
2026-02-01 05:44:01,596 - absl - INFO - Wrote Metadata={'item_handlers': None, 'metrics': {}, 'performance_metrics': {}, 'init_timestamp_nsecs': 1769924641595559797, 'commit_timestamp_nsecs': None, 'custom_metadata': {}}, json={"item_handlers": null, "metrics": {}, "performance_metrics": {}, "init_timestamp_nsecs": 1769924641595559797, "commit_timestamp_nsecs": null, "custom_metadata": {}} to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt.orbax-checkpoint-tmp/_CHECKPOINT_METADATA
|
| 105 |
+
2026-02-01 05:44:05,003 - absl - INFO - [process=0][thread=MainThread] Initiated "orbax.checkpoint._src.serialization.jax_array_handlers.ArrayHandler".serialize. Time taken: 3.426580s
|
| 106 |
+
2026-02-01 05:44:05,026 - absl - INFO - [process=0] /jax/checkpoint/write/blocking_gbytes_per_sec: 1.134 GiB/s (total gbytes: 4.0 GiB) (time elapsed: 3 seconds) (per-host)
|
| 107 |
+
2026-02-01 05:44:05,029 - absl - INFO - [process=0][thread=MainThread] Initiated Pytree async_save. Time taken: 3.499053s (batch_requests_ready=0.018013s, total_serialization_initiated=3.455816s, others=0.025224s)
|
| 108 |
+
2026-02-01 05:44:05,032 - absl - INFO - [process=0][thread=async_save] Background save thread started.
|
| 109 |
+
2026-02-01 05:44:05,034 - absl - INFO - Finished blocking save. Time taken: 3.511357s. Continuing background save to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt.
|
| 110 |
+
2026-02-01 05:44:05,035 - absl - INFO - [process=0][thread=MainThread] Waiting for background save thread=async_save.
|
| 111 |
+
2026-02-01 05:44:05,065 - absl - INFO - [process=0][thread=array_type_handler] Wrote 482 array_metadata.ArrayMetadata to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt.orbax-checkpoint-tmp/array_metadatas/process_0
|
| 112 |
+
2026-02-01 05:44:06,100 - absl - INFO - [process=0][thread=write_metadata_after_commits] Commit + Array metadata written. Time taken: 1.071635s (commit=1.056806s, array_metadata_write=0.014830s)
|
| 113 |
+
2026-02-01 05:44:06,102 - absl - INFO - [process=0] /jax/checkpoint/write/gbytes_per_sec: 887.631 MiB/s (total gbytes: 4.0 GiB) (time elapsed: 4 seconds) (per-host)
|
| 114 |
+
2026-02-01 05:44:06,102 - absl - INFO - [process=0][thread=async_save] 2 Handler Commit operations completed. Time taken: 1.068292s.
|
| 115 |
+
2026-02-01 05:44:06,104 - absl - INFO - Read Metadata={'item_handlers': None, 'metrics': {}, 'performance_metrics': {}, 'init_timestamp_nsecs': 1769924641595559797, 'commit_timestamp_nsecs': None, 'custom_metadata': {}} from /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt.orbax-checkpoint-tmp/_CHECKPOINT_METADATA
|
| 116 |
+
2026-02-01 05:44:06,105 - absl - INFO - Updated Metadata={'item_handlers': 'orbax.checkpoint._src.handlers.standard_checkpoint_handler.StandardCheckpointHandler', 'metrics': {}, 'performance_metrics': {}, 'init_timestamp_nsecs': 1769924641595559797, 'commit_timestamp_nsecs': None, 'custom_metadata': {}} to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt.orbax-checkpoint-tmp/_CHECKPOINT_METADATA
|
| 117 |
+
2026-02-01 05:44:06,108 - absl - INFO - [process=0][thread=async_save] Skipped cross-host ArrayMetadata validation because only one process is found: process_index=0.
|
| 118 |
+
2026-02-01 05:44:06,119 - absl - INFO - [process=0][thread=async_save] Pytree save finalize (merge_ocdbt + ArrayMetadata validation) completed. Time taken: 0.015843s. use_zarr3=False, enable_post_merge_validation=True, directory=/root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt.orbax-checkpoint-tmp
|
| 119 |
+
2026-02-01 05:44:06,120 - absl - INFO - Renaming /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt.orbax-checkpoint-tmp to /root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt
|
| 120 |
+
2026-02-01 05:44:06,121 - absl - INFO - [process=0][thread=async_save] Finished saving checkpoint (finalized tmp dir) to `/root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt`.
|
| 121 |
+
2026-02-01 05:44:06,122 - absl - INFO - Finished async_save (blocking + background). Time taken: 4.599616s. directory=/root/tiny_moe/training_runs/Tiny_MoE/model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt
|
| 122 |
+
2026-02-01 05:44:06,122 - absl - INFO - [process=0][thread=async_save] Background save thread done. Time taken: 1.088146s.
|
| 123 |
+
2026-02-01 05:44:06,123 - absl - INFO - [process=0][thread=MainThread] Done with waiting for background save thread=async_save.
|
| 124 |
+
2026-02-01 05:44:06,128 - absl - INFO - [process=0][thread=MainThread] No errors found in background save thread=async_save.
|
| 125 |
+
2026-02-01 05:44:06,129 - root - INFO - Training completed.
|
logs/run_20260201_schochat_binnogue_train.csv
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
step,lr,loss,load_balance_loss,z_loss,time,tokens_processed,tokens_per_sec
|
| 2 |
+
0,2.6219192e-07,13.826749801635742,30.134389877319336,145.0,0,262144,0
|
| 3 |
+
100,2.6481384e-05,8.561071395874023,30.330848693847656,31.75,1617.0390391349792,26476544,162113.58764735304
|
| 4 |
+
200,5.2700576e-05,7.1299638748168945,30.28018569946289,16.5,1205.319755077362,52690944,217489.17571103328
|
| 5 |
+
300,7.8919766e-05,6.407045364379883,30.4117374420166,16.5,1207.8244638442993,78905344,217038.16063274653
|
| 6 |
+
400,0.00010513896,6.087183952331543,30.26305389404297,12.4375,1226.0385417938232,105119744,213813.8329782486
|
| 7 |
+
500,0.00013135816,5.6964545249938965,30.177724838256836,10.375,1202.321858406067,131334144,218031.46816903717
|
| 8 |
+
600,0.00015757735,5.40157413482666,30.11750030517578,6.75,1202.3623037338257,157548544,218024.13397853202
|
| 9 |
+
700,0.00018379654,5.249599456787109,30.19064712524414,6.1875,1205.0441336631775,183762944,217538.9205066841
|
| 10 |
+
800,0.00021001573,5.173303127288818,30.119709014892578,4.40625,1219.615981578827,209977344,214939.78757202515
|
logs/run_20260201_schochat_binnogue_train.png
ADDED
|
logs/run_20260201_schochat_binnogue_val.csv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
step,loss,logits_loss
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/_CHECKPOINT_METADATA
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7bd59556a8578c405d15552959f984e6a44d589d9d7e4472f426bb0e0e1c7520
|
| 3 |
+
size 262
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/_METADATA
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6eaa0d18ed5c59340f63513422099b39027741ec86d76fea36e4e01c7c72c40
|
| 3 |
+
size 162596
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/_sharding
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:310491281805c39e94a9cc686e2ba0c2795f38bdff3d41ac295261d1d7c8b372
|
| 3 |
+
size 156539
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/array_metadatas/process_0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:728cb06ed5264ae669cc552162d189ea4aae41ba3e75b9681986bb590ac955d1
|
| 3 |
+
size 66277
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/d/4cf13d30c132c8167f7da87863156061
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:78b5602809ef585944e3ebe3484b6af8fba25e9716c9b3466d8be7c5515c4c37
|
| 3 |
+
size 183606
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/manifest.ocdbt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64b2b8c55570deababb0a0ec4f8487b418341eabec3d0f602fc487ab257f7f28
|
| 3 |
+
size 120
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/11e26cbac06b98246a4c15a183667147
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea8305040c12719aff986daafb304f321c7fa2719d9f7c929c2f9f9fe6ec950a
|
| 3 |
+
size 35852288
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/3654a848f4b338660d943b6a879de37f
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3e1ff6f5e25cca901a75949a750357c0d8330a74040abe6e813f012a00e2e098
|
| 3 |
+
size 1617940480
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/38fe789b61c161acbfa5d40f887feee4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9773357c57eed6bd6be9d848fd618a3bb27a36adca1944dbcb2e2eae6ee85acd
|
| 3 |
+
size 674
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/4b8bacca8870cc07a225fbee75575100
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:df1a05448690fc043093a5322bf9a436458a63d5006517468f343e07d38193f5
|
| 3 |
+
size 199
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/aee83c1497702ba66492dadb6e887508
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7775644e3d99372ca4b61d86ba24569e3c11ac53bbceca97561b1e0e4c409bf5
|
| 3 |
+
size 698
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/afae1194d0b381896ee5f0054f162220
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0cc965cd355ae1bd75ff8185dd7f823710d962494cb79b8d08a4db1a6ed2a4aa
|
| 3 |
+
size 572
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/db5168775c1987f9b1a76c55276f1e23
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92ced9554d76c890a2f331585b9bc8fd19bfbd1a98bb0de3cde7716325fe842d
|
| 3 |
+
size 2150162432
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/d/efd73f5d0912834dfc3383dac421655e
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:80638a858bd0df013742fe08b0dd820338cef719b9b9088a3b8135f7f7003db2
|
| 3 |
+
size 129007616
|
model_checkpoints/run_20260201_schochat_binnogue/checkpoint-815.pt/ocdbt.process_0/manifest.ocdbt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:769ee7f026eb43fdc9a78d119741d598ddabb297674087dda6d66f0b9a12edf4
|
| 3 |
+
size 395
|