trixyL committed on
Commit ·
a81731e
1
Parent(s): bbd1c10
dump: train artifacts
Browse files
- README.md +41 -42
- aliases/best.json +12 -0
- aliases/latest.json +9 -0
- config/config.json +115 -0
- config/train.toml +88 -0
- manifest.json +84 -0
- versions/v001000/manifest.json +50 -0
- versions/v001000/model.safetensors +3 -0
- versions/v001000/opt_shard_rank0000.bin +3 -0
- versions/v001000/rng_rank0000.json +0 -0
- versions/v002000/manifest.json +50 -0
- versions/v002000/model.safetensors +3 -0
- versions/v002000/opt_shard_rank0000.bin +3 -0
- versions/v002000/rng_rank0000.json +0 -0
- versions/v003000/manifest.json +50 -0
- versions/v003000/model.safetensors +3 -0
- versions/v003000/opt_shard_rank0000.bin +3 -0
- versions/v003000/rng_rank0000.json +0 -0
- versions/v004000/manifest.json +50 -0
- versions/v004000/model.safetensors +3 -0
- versions/v004000/opt_shard_rank0000.bin +3 -0
- versions/v004000/rng_rank0000.json +0 -0
- versions/v005000/manifest.json +50 -0
- versions/v005000/model.safetensors +3 -0
- versions/v005000/opt_shard_rank0000.bin +3 -0
- versions/v005000/rng_rank0000.json +0 -0
- versions/v006000/manifest.json +50 -0
- versions/v006000/model.safetensors +3 -0
- versions/v006000/opt_shard_rank0000.bin +3 -0
- versions/v006000/rng_rank0000.json +0 -0
README.md
CHANGED
|
@@ -1,42 +1,41 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: apache-2.0
|
| 3 |
-
datasets:
|
| 4 |
-
- ylecun/mnist
|
| 5 |
-
language:
|
| 6 |
-
- en
|
| 7 |
-
tags:
|
| 8 |
-
- mnist
|
| 9 |
-
- '784'
|
| 10 |
-
- '32'
|
| 11 |
-
- transformerlm
|
| 12 |
-
- diffusion
|
| 13 |
-
---
|
| 14 |
-
# 🧠✨ TransformerLM (Diffusion
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
## ✅ Key Facts
|
| 19 |
-
|
| 20 |
-
- **Model type:** Diffusion Transformer
|
| 21 |
-
- **Dataset:**
|
| 22 |
-
- **Context length:** 784
|
| 23 |
-
- **Layers:** 12
|
| 24 |
-
- **Heads:** 8
|
| 25 |
-
- **d_model:** 256
|
| 26 |
-
- **d_ff:** 1024
|
| 27 |
-
- **Training setup:** Single
|
| 28 |
-
- **Runtime:** ~2 hours ⏱️
|
| 29 |
-
|
| 30 |
-
## 📦 What’s Inside
|
| 31 |
-
|
| 32 |
-
- 6k steps
|
| 33 |
-
- Optimizer state
|
| 34 |
-
- RNG state
|
| 35 |
-
- Safetensors weights
|
| 36 |
-
- Run config
|
| 37 |
-
|
| 38 |
-
## 🚀 Reproducibility
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
Exact commit that launched the train: https://github.com/triloy8/transformerlm/commit/84a190a106ecefb7cad49f47eac24963d97fe000
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
datasets:
|
| 4 |
+
- ylecun/mnist
|
| 5 |
+
language:
|
| 6 |
+
- en
|
| 7 |
+
tags:
|
| 8 |
+
- mnist
|
| 9 |
+
- '784'
|
| 10 |
+
- '32'
|
| 11 |
+
- transformerlm
|
| 12 |
+
- diffusion
|
| 13 |
+
---
|
| 14 |
+
# 🧠✨ TransformerLM (Diffusion 784, 32) — MNIST
|
| 15 |
+
|
| 16 |
+
Training run artifacts from https://github.com/triloy8/transformerlm: a minimal masked discrete diffusion Transformer trained on **MNIST** with a **fixed 784‑token context** (28×28 image tokens).
|
| 17 |
+
|
| 18 |
+
## ✅ Key Facts
|
| 19 |
+
|
| 20 |
+
- **Model type:** Diffusion Transformer with LLaDA‑style objective
|
| 21 |
+
- **Dataset:** MNIST
|
| 22 |
+
- **Context length:** 784 tokens (28×28 image)
|
| 23 |
+
- **Layers:** 8
|
| 24 |
+
- **Heads:** 16
|
| 25 |
+
- **d_model:** 256
|
| 26 |
+
- **d_ff:** 1024
|
| 27 |
+
- **Training setup:** Single NVIDIA A40 (48GB)
|
| 28 |
+
- **Runtime:** ~2 hours ⏱️
|
| 29 |
+
|
| 30 |
+
## 📦 What’s Inside
|
| 31 |
+
|
| 32 |
+
- 6k steps (full run), including:
|
| 33 |
+
- Optimizer state
|
| 34 |
+
- RNG state
|
| 35 |
+
- Safetensors weights
|
| 36 |
+
- Run config
|
| 37 |
+
|
| 38 |
+
## 🚀 Reproducibility
|
| 39 |
+
|
| 40 |
+
Exact commit that launched the run:
|
| 41 |
+
https://github.com/triloy8/transformerlm/commit/84a190a106ecefb7cad49f47eac24963d97fe000
|
|
|
aliases/best.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alias": "best",
|
| 3 |
+
"manifest_key": "runs/2026-02-04_21-50-53/versions/v005000/manifest.json",
|
| 4 |
+
"metric_name": "val_loss",
|
| 5 |
+
"mode": "min",
|
| 6 |
+
"run_id": "2026-02-04_21-50-53",
|
| 7 |
+
"schema_version": 1,
|
| 8 |
+
"status": "active",
|
| 9 |
+
"step": 5000,
|
| 10 |
+
"value": 0.35444357991218567,
|
| 11 |
+
"version_id": "v005000"
|
| 12 |
+
}
|
aliases/latest.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alias": "latest",
|
| 3 |
+
"manifest_key": "runs/2026-02-04_21-50-53/versions/v006000/manifest.json",
|
| 4 |
+
"run_id": "2026-02-04_21-50-53",
|
| 5 |
+
"schema_version": 1,
|
| 6 |
+
"status": "active",
|
| 7 |
+
"step": 6000,
|
| 8 |
+
"version_id": "v006000"
|
| 9 |
+
}
|
config/config.json
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"checkpointing": {
|
| 3 |
+
"best_metric_name": "val_loss",
|
| 4 |
+
"best_mode": "min",
|
| 5 |
+
"ckpting_save_iter": 1000,
|
| 6 |
+
"enabled": true,
|
| 7 |
+
"resume_from": null,
|
| 8 |
+
"resume_optimizer": true,
|
| 9 |
+
"run_id": null
|
| 10 |
+
},
|
| 11 |
+
"compile": null,
|
| 12 |
+
"data": {
|
| 13 |
+
"cache_all": true,
|
| 14 |
+
"dataset_config": null,
|
| 15 |
+
"dataset_name": "ylecun/mnist",
|
| 16 |
+
"megatron_train_prefix": null,
|
| 17 |
+
"megatron_val_prefix": null,
|
| 18 |
+
"pad_random_shift": false,
|
| 19 |
+
"pad_token_id": null,
|
| 20 |
+
"pipeline_mode": "mnist",
|
| 21 |
+
"runs_path": "runs",
|
| 22 |
+
"shuffle_buffer_size": 0,
|
| 23 |
+
"shuffle_seed": 3407,
|
| 24 |
+
"text_field": "image",
|
| 25 |
+
"tokenizer": null,
|
| 26 |
+
"train_split": "train",
|
| 27 |
+
"val_split": "test"
|
| 28 |
+
},
|
| 29 |
+
"ddp": {
|
| 30 |
+
"backend": "nccl",
|
| 31 |
+
"bucket_size_mb": 200,
|
| 32 |
+
"master_addr": "127.0.0.1",
|
| 33 |
+
"master_port": "29500",
|
| 34 |
+
"nccl_p2p_disable": true,
|
| 35 |
+
"node_rank": 0,
|
| 36 |
+
"num_gpus_per_node": 1,
|
| 37 |
+
"num_nodes": 1
|
| 38 |
+
},
|
| 39 |
+
"logging": {
|
| 40 |
+
"architecture": "TransformerImage",
|
| 41 |
+
"backend": "wandb",
|
| 42 |
+
"dataset": "MNIST",
|
| 43 |
+
"log_activation_norms": false,
|
| 44 |
+
"log_grad_norms": true,
|
| 45 |
+
"log_p_mask_bucket_loss": false,
|
| 46 |
+
"log_weight_norms": true,
|
| 47 |
+
"p_mask_bucket_edges": null,
|
| 48 |
+
"run_name": null,
|
| 49 |
+
"val_log_every": 8,
|
| 50 |
+
"val_log_samples": 1
|
| 51 |
+
},
|
| 52 |
+
"model": {
|
| 53 |
+
"attention_backend": "torch_sdpa",
|
| 54 |
+
"attention_sdp_backend": "auto",
|
| 55 |
+
"context_length": 784,
|
| 56 |
+
"d_ff": 1024,
|
| 57 |
+
"d_model": 256,
|
| 58 |
+
"device": "cuda",
|
| 59 |
+
"dtype": "float32",
|
| 60 |
+
"eot_token_id": null,
|
| 61 |
+
"label_vocab_size": 11,
|
| 62 |
+
"mask_token_id": 32,
|
| 63 |
+
"model_type": "image",
|
| 64 |
+
"noise_epsilon": 0.001,
|
| 65 |
+
"null_label_id": 10,
|
| 66 |
+
"num_heads": 16,
|
| 67 |
+
"num_layers": 8,
|
| 68 |
+
"pixel_bins": 32,
|
| 69 |
+
"random_trunc_prob": 0.0,
|
| 70 |
+
"rope_theta": 10000.0,
|
| 71 |
+
"vocab_size": 33
|
| 72 |
+
},
|
| 73 |
+
"optimizer": {
|
| 74 |
+
"betas": [
|
| 75 |
+
0.9,
|
| 76 |
+
0.95
|
| 77 |
+
],
|
| 78 |
+
"cosine_cycle_iters": 60000,
|
| 79 |
+
"eps": 1e-08,
|
| 80 |
+
"grad_clip_max_l2_norm": 3.0,
|
| 81 |
+
"initial_learning_rate": 0.0001,
|
| 82 |
+
"lr_schedule": "constant_with_warmup",
|
| 83 |
+
"max_learning_rate": 0.003,
|
| 84 |
+
"min_learning_rate": 0.0003,
|
| 85 |
+
"muon": null,
|
| 86 |
+
"optimizer_name": "adamw",
|
| 87 |
+
"warmup_iters": 200,
|
| 88 |
+
"weight_decay": 0.1
|
| 89 |
+
},
|
| 90 |
+
"train_infer": null,
|
| 91 |
+
"training": {
|
| 92 |
+
"amp_dtype": "bfloat16",
|
| 93 |
+
"amp_enabled": true,
|
| 94 |
+
"batch_size": 256,
|
| 95 |
+
"deterministic_mask": false,
|
| 96 |
+
"eot_mask_loss": false,
|
| 97 |
+
"grad_accum_steps": 1,
|
| 98 |
+
"max_train_iteration": 120000,
|
| 99 |
+
"max_val_iteration": 10,
|
| 100 |
+
"objective": "diffusion",
|
| 101 |
+
"p_mask_override": null,
|
| 102 |
+
"repeat_masking_seed": null,
|
| 103 |
+
"seed": 3407,
|
| 104 |
+
"skip_validation": false,
|
| 105 |
+
"train_loss_ema_decay": 0.99,
|
| 106 |
+
"uncond_label_dropout_prob": 0.1,
|
| 107 |
+
"val_freq_iteration": 250
|
| 108 |
+
},
|
| 109 |
+
"wandb": {
|
| 110 |
+
"architecture": null,
|
| 111 |
+
"dataset": null,
|
| 112 |
+
"entity": "yiltro8-org",
|
| 113 |
+
"project": "mnist_diffusion"
|
| 114 |
+
}
|
| 115 |
+
}
|
config/train.toml
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[model]
|
| 2 |
+
model_type = "image"
|
| 3 |
+
label_vocab_size = 11
|
| 4 |
+
vocab_size = 33
|
| 5 |
+
pixel_bins = 32
|
| 6 |
+
context_length = 784
|
| 7 |
+
d_model = 256
|
| 8 |
+
num_layers = 8
|
| 9 |
+
num_heads = 16
|
| 10 |
+
d_ff = 1024
|
| 11 |
+
rope_theta = 10000.0
|
| 12 |
+
attention_backend = "torch_sdpa"
|
| 13 |
+
attention_sdp_backend = "auto"
|
| 14 |
+
device = "cuda"
|
| 15 |
+
dtype = "float32"
|
| 16 |
+
mask_token_id = 32
|
| 17 |
+
null_label_id = 10
|
| 18 |
+
random_trunc_prob = 0.0
|
| 19 |
+
|
| 20 |
+
[optimizer]
|
| 21 |
+
optimizer_name = "adamw"
|
| 22 |
+
betas = [0.9, 0.95]
|
| 23 |
+
eps = 1e-8
|
| 24 |
+
weight_decay = 0.1
|
| 25 |
+
initial_learning_rate = 0.0001
|
| 26 |
+
max_learning_rate = 0.003
|
| 27 |
+
min_learning_rate = 0.0003
|
| 28 |
+
warmup_iters = 200
|
| 29 |
+
cosine_cycle_iters = 60000
|
| 30 |
+
grad_clip_max_l2_norm = 3.0
|
| 31 |
+
lr_schedule = "constant_with_warmup"
|
| 32 |
+
|
| 33 |
+
[training]
|
| 34 |
+
batch_size = 256
|
| 35 |
+
max_train_iteration = 120000
|
| 36 |
+
max_val_iteration = 10
|
| 37 |
+
val_freq_iteration = 250
|
| 38 |
+
seed = 3407
|
| 39 |
+
skip_validation = false
|
| 40 |
+
grad_accum_steps = 1
|
| 41 |
+
train_loss_ema_decay = 0.99
|
| 42 |
+
amp_enabled = true
|
| 43 |
+
amp_dtype = "bfloat16"
|
| 44 |
+
objective = "diffusion"
|
| 45 |
+
uncond_label_dropout_prob = 0.1
|
| 46 |
+
|
| 47 |
+
[data]
|
| 48 |
+
runs_path = "./runs"
|
| 49 |
+
dataset_name = "ylecun/mnist"
|
| 50 |
+
train_split = "train"
|
| 51 |
+
val_split = "test"
|
| 52 |
+
text_field = "image"
|
| 53 |
+
pipeline_mode = "mnist"
|
| 54 |
+
shuffle_buffer_size = 0
|
| 55 |
+
cache_all = true
|
| 56 |
+
shuffle_seed = 3407
|
| 57 |
+
|
| 58 |
+
[logging]
|
| 59 |
+
backend = "wandb"
|
| 60 |
+
architecture = "TransformerImage"
|
| 61 |
+
dataset = "MNIST"
|
| 62 |
+
log_activation_norms = false
|
| 63 |
+
log_weight_norms = true
|
| 64 |
+
log_grad_norms = true
|
| 65 |
+
log_p_mask_bucket_loss = false
|
| 66 |
+
val_log_every = 8
|
| 67 |
+
val_log_samples = 1
|
| 68 |
+
|
| 69 |
+
[wandb]
|
| 70 |
+
entity = "yiltro8-org"
|
| 71 |
+
project = "mnist_diffusion"
|
| 72 |
+
|
| 73 |
+
[ddp]
|
| 74 |
+
backend = "nccl"
|
| 75 |
+
num_nodes = 1
|
| 76 |
+
num_gpus_per_node = 1
|
| 77 |
+
node_rank = 0
|
| 78 |
+
master_addr = "127.0.0.1"
|
| 79 |
+
master_port = "29500"
|
| 80 |
+
bucket_size_mb = 200
|
| 81 |
+
nccl_p2p_disable = true
|
| 82 |
+
|
| 83 |
+
[checkpointing]
|
| 84 |
+
enabled = true
|
| 85 |
+
ckpting_save_iter = 1000
|
| 86 |
+
resume_optimizer = true
|
| 87 |
+
best_metric_name = "val_loss"
|
| 88 |
+
best_mode = "min"
|
manifest.json
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"aliases": {
|
| 3 |
+
"best": {
|
| 4 |
+
"metric_name": "val_loss",
|
| 5 |
+
"mode": "min",
|
| 6 |
+
"status": "active",
|
| 7 |
+
"step": 5000,
|
| 8 |
+
"value": 0.35444357991218567,
|
| 9 |
+
"version_id": "v005000"
|
| 10 |
+
},
|
| 11 |
+
"latest": {
|
| 12 |
+
"step": 6000,
|
| 13 |
+
"version_id": "v006000"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"config": {
|
| 17 |
+
"bytes": 1700,
|
| 18 |
+
"key": "runs/2026-02-04_21-50-53/config/train.toml",
|
| 19 |
+
"sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
|
| 20 |
+
},
|
| 21 |
+
"created_at": "2026-02-04T21:50:55.488444Z",
|
| 22 |
+
"paths": {
|
| 23 |
+
"layout_version": 1,
|
| 24 |
+
"root_local": "runs/2026-02-04_21-50-53"
|
| 25 |
+
},
|
| 26 |
+
"run_id": "2026-02-04_21-50-53",
|
| 27 |
+
"schema_version": 1,
|
| 28 |
+
"versions": [
|
| 29 |
+
{
|
| 30 |
+
"created_at": "2026-02-04T22:13:06.568747Z",
|
| 31 |
+
"metrics": {
|
| 32 |
+
"val_loss": 0.39340153336524963
|
| 33 |
+
},
|
| 34 |
+
"model_key": "runs/2026-02-04_21-50-53/versions/v001000/model.safetensors",
|
| 35 |
+
"step": 1000,
|
| 36 |
+
"version_id": "v001000"
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"created_at": "2026-02-04T22:35:00.278291Z",
|
| 40 |
+
"metrics": {
|
| 41 |
+
"val_loss": 0.3754102289676666
|
| 42 |
+
},
|
| 43 |
+
"model_key": "runs/2026-02-04_21-50-53/versions/v002000/model.safetensors",
|
| 44 |
+
"step": 2000,
|
| 45 |
+
"version_id": "v002000"
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"created_at": "2026-02-04T22:56:53.759137Z",
|
| 49 |
+
"metrics": {
|
| 50 |
+
"val_loss": 0.3638891577720642
|
| 51 |
+
},
|
| 52 |
+
"model_key": "runs/2026-02-04_21-50-53/versions/v003000/model.safetensors",
|
| 53 |
+
"step": 3000,
|
| 54 |
+
"version_id": "v003000"
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"created_at": "2026-02-04T23:18:47.962640Z",
|
| 58 |
+
"metrics": {
|
| 59 |
+
"val_loss": 0.3601241409778595
|
| 60 |
+
},
|
| 61 |
+
"model_key": "runs/2026-02-04_21-50-53/versions/v004000/model.safetensors",
|
| 62 |
+
"step": 4000,
|
| 63 |
+
"version_id": "v004000"
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"created_at": "2026-02-04T23:40:37.656498Z",
|
| 67 |
+
"metrics": {
|
| 68 |
+
"val_loss": 0.35444357991218567
|
| 69 |
+
},
|
| 70 |
+
"model_key": "runs/2026-02-04_21-50-53/versions/v005000/model.safetensors",
|
| 71 |
+
"step": 5000,
|
| 72 |
+
"version_id": "v005000"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"created_at": "2026-02-05T00:02:31.367167Z",
|
| 76 |
+
"metrics": {
|
| 77 |
+
"val_loss": 0.35603439807891846
|
| 78 |
+
},
|
| 79 |
+
"model_key": "runs/2026-02-04_21-50-53/versions/v006000/model.safetensors",
|
| 80 |
+
"step": 6000,
|
| 81 |
+
"version_id": "v006000"
|
| 82 |
+
}
|
| 83 |
+
]
|
| 84 |
+
}
|
versions/v001000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1700,
|
| 6 |
+
"key": "runs/2026-02-04_21-50-53/config/train.toml",
|
| 7 |
+
"sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-02-04T22:13:06.568747Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.39340153336524963
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42058920,
|
| 15 |
+
"key": "runs/2026-02-04_21-50-53/versions/v001000/model.safetensors",
|
| 16 |
+
"sha256": "ebb11d87c5025c61bd4ab43d5b68eb9ff3e55cf5c7fca808b6c1515a051d2a31"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84167913,
|
| 23 |
+
"key": "runs/2026-02-04_21-50-53/versions/v001000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "206f53052a9aebc2e21613a571a3ed22f51c66697990f9d227dd4b1e46e7e4d5"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-02-04_21-50-53"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 1001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-02-04_21-50-53/versions/v001000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-02-04_21-50-53",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 1000,
|
| 49 |
+
"version_id": "v001000"
|
| 50 |
+
}
|
versions/v001000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebb11d87c5025c61bd4ab43d5b68eb9ff3e55cf5c7fca808b6c1515a051d2a31
|
| 3 |
+
size 42058920
|
versions/v001000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:206f53052a9aebc2e21613a571a3ed22f51c66697990f9d227dd4b1e46e7e4d5
|
| 3 |
+
size 84167913
|
versions/v001000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
versions/v002000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1700,
|
| 6 |
+
"key": "runs/2026-02-04_21-50-53/config/train.toml",
|
| 7 |
+
"sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-02-04T22:35:00.278291Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.3754102289676666
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42058920,
|
| 15 |
+
"key": "runs/2026-02-04_21-50-53/versions/v002000/model.safetensors",
|
| 16 |
+
"sha256": "370edaeeb9ef1fcd3b9b6c32d40541851f1bc884d64fc2bfecc9c968472d16d6"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84167913,
|
| 23 |
+
"key": "runs/2026-02-04_21-50-53/versions/v002000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "d4d7423c4d74633a72d4ab0f00202566d80817559fd2b8fcac56f31688af9e98"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-02-04_21-50-53"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 2001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-02-04_21-50-53/versions/v002000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-02-04_21-50-53",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 2000,
|
| 49 |
+
"version_id": "v002000"
|
| 50 |
+
}
|
versions/v002000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:370edaeeb9ef1fcd3b9b6c32d40541851f1bc884d64fc2bfecc9c968472d16d6
|
| 3 |
+
size 42058920
|
versions/v002000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d4d7423c4d74633a72d4ab0f00202566d80817559fd2b8fcac56f31688af9e98
|
| 3 |
+
size 84167913
|
versions/v002000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
versions/v003000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1700,
|
| 6 |
+
"key": "runs/2026-02-04_21-50-53/config/train.toml",
|
| 7 |
+
"sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-02-04T22:56:53.759137Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.3638891577720642
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42058920,
|
| 15 |
+
"key": "runs/2026-02-04_21-50-53/versions/v003000/model.safetensors",
|
| 16 |
+
"sha256": "67f022191a131ad52b4f6efc1fc53e98f9d5d314f04c4e6862ed5eadc555a722"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84167913,
|
| 23 |
+
"key": "runs/2026-02-04_21-50-53/versions/v003000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "a3c8d55a6d1c0da06c531b4fb6a0895588ad004d500c84a1e2550b9ad648621a"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-02-04_21-50-53"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 3001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-02-04_21-50-53/versions/v003000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-02-04_21-50-53",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 3000,
|
| 49 |
+
"version_id": "v003000"
|
| 50 |
+
}
|
versions/v003000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:67f022191a131ad52b4f6efc1fc53e98f9d5d314f04c4e6862ed5eadc555a722
|
| 3 |
+
size 42058920
|
versions/v003000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3c8d55a6d1c0da06c531b4fb6a0895588ad004d500c84a1e2550b9ad648621a
|
| 3 |
+
size 84167913
|
versions/v003000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
versions/v004000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1700,
|
| 6 |
+
"key": "runs/2026-02-04_21-50-53/config/train.toml",
|
| 7 |
+
"sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-02-04T23:18:47.962640Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.3601241409778595
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42058920,
|
| 15 |
+
"key": "runs/2026-02-04_21-50-53/versions/v004000/model.safetensors",
|
| 16 |
+
"sha256": "78f02963ef05ad8a637ecfb3a7ac40ad473ed11332b17214ed9aaa23d728d77b"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84167913,
|
| 23 |
+
"key": "runs/2026-02-04_21-50-53/versions/v004000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "7fde52ddeef434b923da0ac4420d1ec51880349dd8ba165dd9b4f372a73076c2"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-02-04_21-50-53"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 4001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-02-04_21-50-53/versions/v004000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-02-04_21-50-53",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 4000,
|
| 49 |
+
"version_id": "v004000"
|
| 50 |
+
}
|
versions/v004000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:78f02963ef05ad8a637ecfb3a7ac40ad473ed11332b17214ed9aaa23d728d77b
|
| 3 |
+
size 42058920
|
versions/v004000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7fde52ddeef434b923da0ac4420d1ec51880349dd8ba165dd9b4f372a73076c2
|
| 3 |
+
size 84167913
|
versions/v004000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
versions/v005000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1700,
|
| 6 |
+
"key": "runs/2026-02-04_21-50-53/config/train.toml",
|
| 7 |
+
"sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-02-04T23:40:37.656498Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.35444357991218567
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42058920,
|
| 15 |
+
"key": "runs/2026-02-04_21-50-53/versions/v005000/model.safetensors",
|
| 16 |
+
"sha256": "7a3d231a2049a290f190ffa8dd6c33fa95419cfc7fbbb52c73b635e472e62252"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84167913,
|
| 23 |
+
"key": "runs/2026-02-04_21-50-53/versions/v005000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "ee794b557148fe2f9a5076107ce616de0abae8dd2d010fb37f9b3573c2050373"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-02-04_21-50-53"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 5001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-02-04_21-50-53/versions/v005000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-02-04_21-50-53",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 5000,
|
| 49 |
+
"version_id": "v005000"
|
| 50 |
+
}
|
versions/v005000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7a3d231a2049a290f190ffa8dd6c33fa95419cfc7fbbb52c73b635e472e62252
|
| 3 |
+
size 42058920
|
versions/v005000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee794b557148fe2f9a5076107ce616de0abae8dd2d010fb37f9b3573c2050373
|
| 3 |
+
size 84167913
|
versions/v005000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
versions/v006000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1700,
|
| 6 |
+
"key": "runs/2026-02-04_21-50-53/config/train.toml",
|
| 7 |
+
"sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-02-05T00:02:31.367167Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.35603439807891846
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42058920,
|
| 15 |
+
"key": "runs/2026-02-04_21-50-53/versions/v006000/model.safetensors",
|
| 16 |
+
"sha256": "f863ca7bfd2fc11fc6cf4f3df57567655a43bf4cf9ccaa66f254ed6ed248c9e0"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84167913,
|
| 23 |
+
"key": "runs/2026-02-04_21-50-53/versions/v006000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "96198f5eb55fde3b7040b5ee768b14a6d28e1c6539d49f9953c71e22367a5dad"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-02-04_21-50-53"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 6001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-02-04_21-50-53/versions/v006000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-02-04_21-50-53",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 6000,
|
| 49 |
+
"version_id": "v006000"
|
| 50 |
+
}
|
versions/v006000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f863ca7bfd2fc11fc6cf4f3df57567655a43bf4cf9ccaa66f254ed6ed248c9e0
|
| 3 |
+
size 42058920
|
versions/v006000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:96198f5eb55fde3b7040b5ee768b14a6d28e1c6539d49f9953c71e22367a5dad
|
| 3 |
+
size 84167913
|
versions/v006000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|