trixyL committed on
Commit
a81731e
·
1 Parent(s): bbd1c10

dump: train artifacts

Browse files
README.md CHANGED
@@ -1,42 +1,41 @@
1
- ---
2
- license: apache-2.0
3
- datasets:
4
- - ylecun/mnist
5
- language:
6
- - en
7
- tags:
8
- - mnist
9
- - '784'
10
- - '32'
11
- - transformerlm
12
- - diffusion
13
- ---
14
- # 🧠✨ TransformerLM (Diffusion 512, 32) — MNIST
15
-
16
- This is the result of the code from https://github.com/triloy8/transformerlm, a minimal diffusion Transformer trained on **MNIST** with a **784 fixed token context** =.
17
-
18
- ## ✅ Key Facts
19
-
20
- - **Model type:** Diffusion Transformer w/ LLaDA like objective
21
- - **Dataset:** SimpleStories
22
- - **Context length:** 784 tokens -> 28 * 28 image
23
- - **Layers:** 12
24
- - **Heads:** 8
25
- - **d_model:** 256
26
- - **d_ff:** 1024
27
- - **Training setup:** Single **NVIDIA A40 48GB**
28
- - **Runtime:** ~2 hours ⏱️
29
-
30
- ## 📦 What’s Inside
31
-
32
- - 6k steps from a 6k run, including:
33
- - Optimizer state
34
- - RNG state
35
- - Safetensors weights
36
- - Run config
37
-
38
- ## 🚀 Reproducibility
39
-
40
- To reproduce the run:
41
-
42
- Exact commit that launched the train: https://github.com/triloy8/transformerlm/commit/84a190a106ecefb7cad49f47eac24963d97fe000
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - ylecun/mnist
5
+ language:
6
+ - en
7
+ tags:
8
+ - mnist
9
+ - '784'
10
+ - '32'
11
+ - transformerlm
12
+ - diffusion
13
+ ---
14
+ # 🧠✨ TransformerLM (Diffusion 784, 32) — MNIST
15
+
16
+ Training run artifacts from https://github.com/triloy8/transformerlm: a minimal masked discrete diffusion Transformer trained on **MNIST** with a **fixed 784‑token context** (28×28 image tokens).
17
+
18
+ ## ✅ Key Facts
19
+
20
+ - **Model type:** Diffusion Transformer with LLaDA‑style objective
21
+ - **Dataset:** MNIST
22
+ - **Context length:** 784 tokens (28×28 image)
23
+ - **Layers:** 8
24
+ - **Heads:** 16
25
+ - **d_model:** 256
26
+ - **d_ff:** 1024
27
+ - **Training setup:** Single NVIDIA A40 (48GB)
28
+ - **Runtime:** ~2 hours ⏱️
29
+
30
+ ## 📦 What’s Inside
31
+
32
+ - Checkpoints every 1k steps up to 6k steps (full run; best val_loss at step 5k), including:
33
+ - Optimizer state
34
+ - RNG state
35
+ - Safetensors weights
36
+ - Run config
37
+
38
+ ## 🚀 Reproducibility
39
+
40
+ Exact commit that launched the run:
41
+ https://github.com/triloy8/transformerlm/commit/84a190a106ecefb7cad49f47eac24963d97fe000
 
aliases/best.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alias": "best",
3
+ "manifest_key": "runs/2026-02-04_21-50-53/versions/v005000/manifest.json",
4
+ "metric_name": "val_loss",
5
+ "mode": "min",
6
+ "run_id": "2026-02-04_21-50-53",
7
+ "schema_version": 1,
8
+ "status": "active",
9
+ "step": 5000,
10
+ "value": 0.35444357991218567,
11
+ "version_id": "v005000"
12
+ }
aliases/latest.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alias": "latest",
3
+ "manifest_key": "runs/2026-02-04_21-50-53/versions/v006000/manifest.json",
4
+ "run_id": "2026-02-04_21-50-53",
5
+ "schema_version": 1,
6
+ "status": "active",
7
+ "step": 6000,
8
+ "version_id": "v006000"
9
+ }
config/config.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "checkpointing": {
3
+ "best_metric_name": "val_loss",
4
+ "best_mode": "min",
5
+ "ckpting_save_iter": 1000,
6
+ "enabled": true,
7
+ "resume_from": null,
8
+ "resume_optimizer": true,
9
+ "run_id": null
10
+ },
11
+ "compile": null,
12
+ "data": {
13
+ "cache_all": true,
14
+ "dataset_config": null,
15
+ "dataset_name": "ylecun/mnist",
16
+ "megatron_train_prefix": null,
17
+ "megatron_val_prefix": null,
18
+ "pad_random_shift": false,
19
+ "pad_token_id": null,
20
+ "pipeline_mode": "mnist",
21
+ "runs_path": "runs",
22
+ "shuffle_buffer_size": 0,
23
+ "shuffle_seed": 3407,
24
+ "text_field": "image",
25
+ "tokenizer": null,
26
+ "train_split": "train",
27
+ "val_split": "test"
28
+ },
29
+ "ddp": {
30
+ "backend": "nccl",
31
+ "bucket_size_mb": 200,
32
+ "master_addr": "127.0.0.1",
33
+ "master_port": "29500",
34
+ "nccl_p2p_disable": true,
35
+ "node_rank": 0,
36
+ "num_gpus_per_node": 1,
37
+ "num_nodes": 1
38
+ },
39
+ "logging": {
40
+ "architecture": "TransformerImage",
41
+ "backend": "wandb",
42
+ "dataset": "MNIST",
43
+ "log_activation_norms": false,
44
+ "log_grad_norms": true,
45
+ "log_p_mask_bucket_loss": false,
46
+ "log_weight_norms": true,
47
+ "p_mask_bucket_edges": null,
48
+ "run_name": null,
49
+ "val_log_every": 8,
50
+ "val_log_samples": 1
51
+ },
52
+ "model": {
53
+ "attention_backend": "torch_sdpa",
54
+ "attention_sdp_backend": "auto",
55
+ "context_length": 784,
56
+ "d_ff": 1024,
57
+ "d_model": 256,
58
+ "device": "cuda",
59
+ "dtype": "float32",
60
+ "eot_token_id": null,
61
+ "label_vocab_size": 11,
62
+ "mask_token_id": 32,
63
+ "model_type": "image",
64
+ "noise_epsilon": 0.001,
65
+ "null_label_id": 10,
66
+ "num_heads": 16,
67
+ "num_layers": 8,
68
+ "pixel_bins": 32,
69
+ "random_trunc_prob": 0.0,
70
+ "rope_theta": 10000.0,
71
+ "vocab_size": 33
72
+ },
73
+ "optimizer": {
74
+ "betas": [
75
+ 0.9,
76
+ 0.95
77
+ ],
78
+ "cosine_cycle_iters": 60000,
79
+ "eps": 1e-08,
80
+ "grad_clip_max_l2_norm": 3.0,
81
+ "initial_learning_rate": 0.0001,
82
+ "lr_schedule": "constant_with_warmup",
83
+ "max_learning_rate": 0.003,
84
+ "min_learning_rate": 0.0003,
85
+ "muon": null,
86
+ "optimizer_name": "adamw",
87
+ "warmup_iters": 200,
88
+ "weight_decay": 0.1
89
+ },
90
+ "train_infer": null,
91
+ "training": {
92
+ "amp_dtype": "bfloat16",
93
+ "amp_enabled": true,
94
+ "batch_size": 256,
95
+ "deterministic_mask": false,
96
+ "eot_mask_loss": false,
97
+ "grad_accum_steps": 1,
98
+ "max_train_iteration": 120000,
99
+ "max_val_iteration": 10,
100
+ "objective": "diffusion",
101
+ "p_mask_override": null,
102
+ "repeat_masking_seed": null,
103
+ "seed": 3407,
104
+ "skip_validation": false,
105
+ "train_loss_ema_decay": 0.99,
106
+ "uncond_label_dropout_prob": 0.1,
107
+ "val_freq_iteration": 250
108
+ },
109
+ "wandb": {
110
+ "architecture": null,
111
+ "dataset": null,
112
+ "entity": "yiltro8-org",
113
+ "project": "mnist_diffusion"
114
+ }
115
+ }
config/train.toml ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ model_type = "image"
3
+ label_vocab_size = 11
4
+ vocab_size = 33
5
+ pixel_bins = 32
6
+ context_length = 784
7
+ d_model = 256
8
+ num_layers = 8
9
+ num_heads = 16
10
+ d_ff = 1024
11
+ rope_theta = 10000.0
12
+ attention_backend = "torch_sdpa"
13
+ attention_sdp_backend = "auto"
14
+ device = "cuda"
15
+ dtype = "float32"
16
+ mask_token_id = 32
17
+ null_label_id = 10
18
+ random_trunc_prob = 0.0
19
+
20
+ [optimizer]
21
+ optimizer_name = "adamw"
22
+ betas = [0.9, 0.95]
23
+ eps = 1e-8
24
+ weight_decay = 0.1
25
+ initial_learning_rate = 0.0001
26
+ max_learning_rate = 0.003
27
+ min_learning_rate = 0.0003
28
+ warmup_iters = 200
29
+ cosine_cycle_iters = 60000
30
+ grad_clip_max_l2_norm = 3.0
31
+ lr_schedule = "constant_with_warmup"
32
+
33
+ [training]
34
+ batch_size = 256
35
+ max_train_iteration = 120000
36
+ max_val_iteration = 10
37
+ val_freq_iteration = 250
38
+ seed = 3407
39
+ skip_validation = false
40
+ grad_accum_steps = 1
41
+ train_loss_ema_decay = 0.99
42
+ amp_enabled = true
43
+ amp_dtype = "bfloat16"
44
+ objective = "diffusion"
45
+ uncond_label_dropout_prob = 0.1
46
+
47
+ [data]
48
+ runs_path = "./runs"
49
+ dataset_name = "ylecun/mnist"
50
+ train_split = "train"
51
+ val_split = "test"
52
+ text_field = "image"
53
+ pipeline_mode = "mnist"
54
+ shuffle_buffer_size = 0
55
+ cache_all = true
56
+ shuffle_seed = 3407
57
+
58
+ [logging]
59
+ backend = "wandb"
60
+ architecture = "TransformerImage"
61
+ dataset = "MNIST"
62
+ log_activation_norms = false
63
+ log_weight_norms = true
64
+ log_grad_norms = true
65
+ log_p_mask_bucket_loss = false
66
+ val_log_every = 8
67
+ val_log_samples = 1
68
+
69
+ [wandb]
70
+ entity = "yiltro8-org"
71
+ project = "mnist_diffusion"
72
+
73
+ [ddp]
74
+ backend = "nccl"
75
+ num_nodes = 1
76
+ num_gpus_per_node = 1
77
+ node_rank = 0
78
+ master_addr = "127.0.0.1"
79
+ master_port = "29500"
80
+ bucket_size_mb = 200
81
+ nccl_p2p_disable = true
82
+
83
+ [checkpointing]
84
+ enabled = true
85
+ ckpting_save_iter = 1000
86
+ resume_optimizer = true
87
+ best_metric_name = "val_loss"
88
+ best_mode = "min"
manifest.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aliases": {
3
+ "best": {
4
+ "metric_name": "val_loss",
5
+ "mode": "min",
6
+ "status": "active",
7
+ "step": 5000,
8
+ "value": 0.35444357991218567,
9
+ "version_id": "v005000"
10
+ },
11
+ "latest": {
12
+ "step": 6000,
13
+ "version_id": "v006000"
14
+ }
15
+ },
16
+ "config": {
17
+ "bytes": 1700,
18
+ "key": "runs/2026-02-04_21-50-53/config/train.toml",
19
+ "sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
20
+ },
21
+ "created_at": "2026-02-04T21:50:55.488444Z",
22
+ "paths": {
23
+ "layout_version": 1,
24
+ "root_local": "runs/2026-02-04_21-50-53"
25
+ },
26
+ "run_id": "2026-02-04_21-50-53",
27
+ "schema_version": 1,
28
+ "versions": [
29
+ {
30
+ "created_at": "2026-02-04T22:13:06.568747Z",
31
+ "metrics": {
32
+ "val_loss": 0.39340153336524963
33
+ },
34
+ "model_key": "runs/2026-02-04_21-50-53/versions/v001000/model.safetensors",
35
+ "step": 1000,
36
+ "version_id": "v001000"
37
+ },
38
+ {
39
+ "created_at": "2026-02-04T22:35:00.278291Z",
40
+ "metrics": {
41
+ "val_loss": 0.3754102289676666
42
+ },
43
+ "model_key": "runs/2026-02-04_21-50-53/versions/v002000/model.safetensors",
44
+ "step": 2000,
45
+ "version_id": "v002000"
46
+ },
47
+ {
48
+ "created_at": "2026-02-04T22:56:53.759137Z",
49
+ "metrics": {
50
+ "val_loss": 0.3638891577720642
51
+ },
52
+ "model_key": "runs/2026-02-04_21-50-53/versions/v003000/model.safetensors",
53
+ "step": 3000,
54
+ "version_id": "v003000"
55
+ },
56
+ {
57
+ "created_at": "2026-02-04T23:18:47.962640Z",
58
+ "metrics": {
59
+ "val_loss": 0.3601241409778595
60
+ },
61
+ "model_key": "runs/2026-02-04_21-50-53/versions/v004000/model.safetensors",
62
+ "step": 4000,
63
+ "version_id": "v004000"
64
+ },
65
+ {
66
+ "created_at": "2026-02-04T23:40:37.656498Z",
67
+ "metrics": {
68
+ "val_loss": 0.35444357991218567
69
+ },
70
+ "model_key": "runs/2026-02-04_21-50-53/versions/v005000/model.safetensors",
71
+ "step": 5000,
72
+ "version_id": "v005000"
73
+ },
74
+ {
75
+ "created_at": "2026-02-05T00:02:31.367167Z",
76
+ "metrics": {
77
+ "val_loss": 0.35603439807891846
78
+ },
79
+ "model_key": "runs/2026-02-04_21-50-53/versions/v006000/model.safetensors",
80
+ "step": 6000,
81
+ "version_id": "v006000"
82
+ }
83
+ ]
84
+ }
versions/v001000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1700,
6
+ "key": "runs/2026-02-04_21-50-53/config/train.toml",
7
+ "sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
8
+ },
9
+ "created_at": "2026-02-04T22:13:06.568747Z",
10
+ "metrics": {
11
+ "val_loss": 0.39340153336524963
12
+ },
13
+ "model": {
14
+ "bytes": 42058920,
15
+ "key": "runs/2026-02-04_21-50-53/versions/v001000/model.safetensors",
16
+ "sha256": "ebb11d87c5025c61bd4ab43d5b68eb9ff3e55cf5c7fca808b6c1515a051d2a31"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84167913,
23
+ "key": "runs/2026-02-04_21-50-53/versions/v001000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "206f53052a9aebc2e21613a571a3ed22f51c66697990f9d227dd4b1e46e7e4d5"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-02-04_21-50-53"
32
+ },
33
+ "resume": {
34
+ "base_step": 1001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-02-04_21-50-53/versions/v001000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-02-04_21-50-53",
47
+ "schema_version": 1,
48
+ "step": 1000,
49
+ "version_id": "v001000"
50
+ }
versions/v001000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebb11d87c5025c61bd4ab43d5b68eb9ff3e55cf5c7fca808b6c1515a051d2a31
3
+ size 42058920
versions/v001000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:206f53052a9aebc2e21613a571a3ed22f51c66697990f9d227dd4b1e46e7e4d5
3
+ size 84167913
versions/v001000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff
 
versions/v002000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1700,
6
+ "key": "runs/2026-02-04_21-50-53/config/train.toml",
7
+ "sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
8
+ },
9
+ "created_at": "2026-02-04T22:35:00.278291Z",
10
+ "metrics": {
11
+ "val_loss": 0.3754102289676666
12
+ },
13
+ "model": {
14
+ "bytes": 42058920,
15
+ "key": "runs/2026-02-04_21-50-53/versions/v002000/model.safetensors",
16
+ "sha256": "370edaeeb9ef1fcd3b9b6c32d40541851f1bc884d64fc2bfecc9c968472d16d6"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84167913,
23
+ "key": "runs/2026-02-04_21-50-53/versions/v002000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "d4d7423c4d74633a72d4ab0f00202566d80817559fd2b8fcac56f31688af9e98"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-02-04_21-50-53"
32
+ },
33
+ "resume": {
34
+ "base_step": 2001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-02-04_21-50-53/versions/v002000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-02-04_21-50-53",
47
+ "schema_version": 1,
48
+ "step": 2000,
49
+ "version_id": "v002000"
50
+ }
versions/v002000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:370edaeeb9ef1fcd3b9b6c32d40541851f1bc884d64fc2bfecc9c968472d16d6
3
+ size 42058920
versions/v002000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4d7423c4d74633a72d4ab0f00202566d80817559fd2b8fcac56f31688af9e98
3
+ size 84167913
versions/v002000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff
 
versions/v003000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1700,
6
+ "key": "runs/2026-02-04_21-50-53/config/train.toml",
7
+ "sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
8
+ },
9
+ "created_at": "2026-02-04T22:56:53.759137Z",
10
+ "metrics": {
11
+ "val_loss": 0.3638891577720642
12
+ },
13
+ "model": {
14
+ "bytes": 42058920,
15
+ "key": "runs/2026-02-04_21-50-53/versions/v003000/model.safetensors",
16
+ "sha256": "67f022191a131ad52b4f6efc1fc53e98f9d5d314f04c4e6862ed5eadc555a722"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84167913,
23
+ "key": "runs/2026-02-04_21-50-53/versions/v003000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "a3c8d55a6d1c0da06c531b4fb6a0895588ad004d500c84a1e2550b9ad648621a"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-02-04_21-50-53"
32
+ },
33
+ "resume": {
34
+ "base_step": 3001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-02-04_21-50-53/versions/v003000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-02-04_21-50-53",
47
+ "schema_version": 1,
48
+ "step": 3000,
49
+ "version_id": "v003000"
50
+ }
versions/v003000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67f022191a131ad52b4f6efc1fc53e98f9d5d314f04c4e6862ed5eadc555a722
3
+ size 42058920
versions/v003000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3c8d55a6d1c0da06c531b4fb6a0895588ad004d500c84a1e2550b9ad648621a
3
+ size 84167913
versions/v003000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff
 
versions/v004000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1700,
6
+ "key": "runs/2026-02-04_21-50-53/config/train.toml",
7
+ "sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
8
+ },
9
+ "created_at": "2026-02-04T23:18:47.962640Z",
10
+ "metrics": {
11
+ "val_loss": 0.3601241409778595
12
+ },
13
+ "model": {
14
+ "bytes": 42058920,
15
+ "key": "runs/2026-02-04_21-50-53/versions/v004000/model.safetensors",
16
+ "sha256": "78f02963ef05ad8a637ecfb3a7ac40ad473ed11332b17214ed9aaa23d728d77b"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84167913,
23
+ "key": "runs/2026-02-04_21-50-53/versions/v004000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "7fde52ddeef434b923da0ac4420d1ec51880349dd8ba165dd9b4f372a73076c2"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-02-04_21-50-53"
32
+ },
33
+ "resume": {
34
+ "base_step": 4001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-02-04_21-50-53/versions/v004000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-02-04_21-50-53",
47
+ "schema_version": 1,
48
+ "step": 4000,
49
+ "version_id": "v004000"
50
+ }
versions/v004000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78f02963ef05ad8a637ecfb3a7ac40ad473ed11332b17214ed9aaa23d728d77b
3
+ size 42058920
versions/v004000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fde52ddeef434b923da0ac4420d1ec51880349dd8ba165dd9b4f372a73076c2
3
+ size 84167913
versions/v004000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff
 
versions/v005000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1700,
6
+ "key": "runs/2026-02-04_21-50-53/config/train.toml",
7
+ "sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
8
+ },
9
+ "created_at": "2026-02-04T23:40:37.656498Z",
10
+ "metrics": {
11
+ "val_loss": 0.35444357991218567
12
+ },
13
+ "model": {
14
+ "bytes": 42058920,
15
+ "key": "runs/2026-02-04_21-50-53/versions/v005000/model.safetensors",
16
+ "sha256": "7a3d231a2049a290f190ffa8dd6c33fa95419cfc7fbbb52c73b635e472e62252"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84167913,
23
+ "key": "runs/2026-02-04_21-50-53/versions/v005000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "ee794b557148fe2f9a5076107ce616de0abae8dd2d010fb37f9b3573c2050373"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-02-04_21-50-53"
32
+ },
33
+ "resume": {
34
+ "base_step": 5001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-02-04_21-50-53/versions/v005000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-02-04_21-50-53",
47
+ "schema_version": 1,
48
+ "step": 5000,
49
+ "version_id": "v005000"
50
+ }
versions/v005000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a3d231a2049a290f190ffa8dd6c33fa95419cfc7fbbb52c73b635e472e62252
3
+ size 42058920
versions/v005000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee794b557148fe2f9a5076107ce616de0abae8dd2d010fb37f9b3573c2050373
3
+ size 84167913
versions/v005000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff
 
versions/v006000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1700,
6
+ "key": "runs/2026-02-04_21-50-53/config/train.toml",
7
+ "sha256": "391209bbf0737f88212f9f90b609e8db15c2ed63b217ca26bb56dbb84ced42e5"
8
+ },
9
+ "created_at": "2026-02-05T00:02:31.367167Z",
10
+ "metrics": {
11
+ "val_loss": 0.35603439807891846
12
+ },
13
+ "model": {
14
+ "bytes": 42058920,
15
+ "key": "runs/2026-02-04_21-50-53/versions/v006000/model.safetensors",
16
+ "sha256": "f863ca7bfd2fc11fc6cf4f3df57567655a43bf4cf9ccaa66f254ed6ed248c9e0"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84167913,
23
+ "key": "runs/2026-02-04_21-50-53/versions/v006000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "96198f5eb55fde3b7040b5ee768b14a6d28e1c6539d49f9953c71e22367a5dad"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-02-04_21-50-53"
32
+ },
33
+ "resume": {
34
+ "base_step": 6001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-02-04_21-50-53/versions/v006000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-02-04_21-50-53",
47
+ "schema_version": 1,
48
+ "step": 6000,
49
+ "version_id": "v006000"
50
+ }
versions/v006000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f863ca7bfd2fc11fc6cf4f3df57567655a43bf4cf9ccaa66f254ed6ed248c9e0
3
+ size 42058920
versions/v006000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96198f5eb55fde3b7040b5ee768b14a6d28e1c6539d49f9953c71e22367a5dad
3
+ size 84167913
versions/v006000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff