Justin Brown
committed on
Add checkpoint (fineweb): ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt
Browse files
- README.md +17 -15
- checkpoints/fineweb/ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt +3 -0
- configs/fineweb/ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt.config.json +62 -0
- metadata/fineweb/ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt.metadata.json +88 -0
README.md
CHANGED
|
@@ -5,28 +5,30 @@ tags:
|
|
| 5 |
- asa
|
| 6 |
- asm
|
| 7 |
- language-model
|
| 8 |
-
- wikitext
|
| 9 |
library_name: pytorch
|
| 10 |
---
|
| 11 |
|
| 12 |
-
# ASA
|
| 13 |
|
| 14 |
-
This
|
| 15 |
|
| 16 |
-
##
|
| 17 |
-
|
| 18 |
-
- `
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
## Provenance
|
| 22 |
-
- Code: https://github.com/digitaldaimyo/ASA
|
| 23 |
-
- Revision: `codex/implement-runnable-and-releasable-setup`
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
import torch
|
| 28 |
-
ckpt = torch.load("ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt", map_location="cpu")
|
| 29 |
-
print(ckpt.keys())
|
| 30 |
|
| 31 |
## Notes
|
| 32 |
-
|
|
|
|
|
|
|
|
|
| 5 |
- asa
|
| 6 |
- asm
|
| 7 |
- language-model
|
|
|
|
| 8 |
library_name: pytorch
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# ASA / ASM Checkpoints (Bucket Repo)
|
| 12 |
|
| 13 |
+
This repository stores **training checkpoints** for Addressed State Models (ASM) built from Addressed State Attention (ASA).
|
| 14 |
|
| 15 |
+
## Repository layout
|
| 16 |
+
|
| 17 |
+
- `checkpoints/<dataset>/...pt`
|
| 18 |
+
Raw PyTorch checkpoints (training artifacts).
|
| 19 |
+
|
| 20 |
+
- `configs/<dataset>/<checkpoint>.config.json`
|
| 21 |
+
Training config extracted from the checkpoint (when present).
|
| 22 |
+
|
| 23 |
+
- `metadata/<dataset>/<checkpoint>.metadata.json`
|
| 24 |
+
SHA256 + file size + provenance pointers.
|
| 25 |
|
| 26 |
## Provenance
|
|
|
|
|
|
|
| 27 |
|
| 28 |
+
- Code: https://github.com/digitaldaimyo/ASA
|
| 29 |
+
- Revision: `main`
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
## Notes
|
| 32 |
+
|
| 33 |
+
- These are raw training artifacts, primarily intended for research and reproduction.
|
| 34 |
+
- New uploads may overwrite **rolling pointers** (e.g. `last.pt`, `best.pt`), but the Hub preserves history by commit.
|
checkpoints/fineweb/ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13e44f9f44d6ed669099918f1efa0440849b83ab374293e027887a35c385b055
|
| 3 |
+
size 2889860681
|
configs/fineweb/ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt.config.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"dataset_name": "HuggingFaceFW/fineweb",
|
| 3 |
+
"dataset_config": "sample-10BT",
|
| 4 |
+
"tokenizer_name": "gpt2",
|
| 5 |
+
"max_seq_len": 1024,
|
| 6 |
+
"stride_frac_val": 0.5,
|
| 7 |
+
"seed": 1337,
|
| 8 |
+
"micro_batch_size": 2,
|
| 9 |
+
"grad_accum_steps": 16,
|
| 10 |
+
"train_samples_target": 800000,
|
| 11 |
+
"val_samples_target": 25000,
|
| 12 |
+
"batch_size": 4,
|
| 13 |
+
"learning_rate": 0.0003,
|
| 14 |
+
"weight_decay": 0.01,
|
| 15 |
+
"betas": [
|
| 16 |
+
0.9,
|
| 17 |
+
0.95
|
| 18 |
+
],
|
| 19 |
+
"grad_clip": 1.0,
|
| 20 |
+
"warmup_steps": 300,
|
| 21 |
+
"total_steps": 50000,
|
| 22 |
+
"eval_interval": 1000,
|
| 23 |
+
"log_interval": 100,
|
| 24 |
+
"vocab_size": 50257,
|
| 25 |
+
"embed_dim": 1024,
|
| 26 |
+
"num_layers": 15,
|
| 27 |
+
"num_heads": 16,
|
| 28 |
+
"num_slots": 32,
|
| 29 |
+
"mlp_ratio": 4.0,
|
| 30 |
+
"dropout": 0.1,
|
| 31 |
+
"tie_weights": true,
|
| 32 |
+
"read_temperature": 1.0,
|
| 33 |
+
"write_temperature": 1.0,
|
| 34 |
+
"slot_dropout": 0.05,
|
| 35 |
+
"state_fp32": true,
|
| 36 |
+
"normalize_k": false,
|
| 37 |
+
"use_abs_pos": false,
|
| 38 |
+
"use_rope_keys": true,
|
| 39 |
+
"rope_base": 10000.0,
|
| 40 |
+
"use_alibi_write": true,
|
| 41 |
+
"alibi_strength_init": 0.1,
|
| 42 |
+
"learn_alibi_strength": true,
|
| 43 |
+
"min_strength": 0.0,
|
| 44 |
+
"use_content_read": true,
|
| 45 |
+
"content_read_init": -4.0,
|
| 46 |
+
"content_read_max_gamma": 3.0,
|
| 47 |
+
"use_slotspace_refine": true,
|
| 48 |
+
"slotspace_dim": 16,
|
| 49 |
+
"slotspace_gate_init": -4.0,
|
| 50 |
+
"slotspace_dropout": 0.05,
|
| 51 |
+
"slotspace_signed_weights": true,
|
| 52 |
+
"use_rope_slotspace": true,
|
| 53 |
+
"rope_base_slotspace": 100000.0,
|
| 54 |
+
"write_chunk_size": 1024,
|
| 55 |
+
"enable_compiled": true,
|
| 56 |
+
"eval_max_batches": 150,
|
| 57 |
+
"analytics_last_k": 4,
|
| 58 |
+
"output_dir": "./drive/MyDrive/asm_outputs",
|
| 59 |
+
"tag": "asm_fineweb_1024t_1024d_16h_16sd_32s_1024cs_15l",
|
| 60 |
+
"cache_dir": "./drive/MyDrive/asm_caches/fineweb",
|
| 61 |
+
"val_windows_cache": "./drive/MyDrive/asm_nlp/val_cache_fineweb_u32_windows_1024.u32tok"
|
| 62 |
+
}
|
metadata/fineweb/ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt.metadata.json
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"repo_id": "DigitalShogun/ASA-ASM-wikitext103-raw",
|
| 3 |
+
"dataset": "fineweb",
|
| 4 |
+
"exported_at_utc": "2026-01-30T16:21:48.689906+00:00",
|
| 5 |
+
"checkpoint_filename": "ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt",
|
| 6 |
+
"checkpoint_sha256": "13e44f9f44d6ed669099918f1efa0440849b83ab374293e027887a35c385b055",
|
| 7 |
+
"checkpoint_bytes": 2889860681,
|
| 8 |
+
"github_repo": "https://github.com/digitaldaimyo/ASA",
|
| 9 |
+
"github_revision": "main",
|
| 10 |
+
"train_config_in_ckpt": {
|
| 11 |
+
"dataset_name": "HuggingFaceFW/fineweb",
|
| 12 |
+
"dataset_config": "sample-10BT",
|
| 13 |
+
"tokenizer_name": "gpt2",
|
| 14 |
+
"max_seq_len": 1024,
|
| 15 |
+
"stride_frac_val": 0.5,
|
| 16 |
+
"seed": 1337,
|
| 17 |
+
"micro_batch_size": 2,
|
| 18 |
+
"grad_accum_steps": 16,
|
| 19 |
+
"train_samples_target": 800000,
|
| 20 |
+
"val_samples_target": 25000,
|
| 21 |
+
"batch_size": 4,
|
| 22 |
+
"learning_rate": 0.0003,
|
| 23 |
+
"weight_decay": 0.01,
|
| 24 |
+
"betas": [
|
| 25 |
+
0.9,
|
| 26 |
+
0.95
|
| 27 |
+
],
|
| 28 |
+
"grad_clip": 1.0,
|
| 29 |
+
"warmup_steps": 300,
|
| 30 |
+
"total_steps": 50000,
|
| 31 |
+
"eval_interval": 1000,
|
| 32 |
+
"log_interval": 100,
|
| 33 |
+
"vocab_size": 50257,
|
| 34 |
+
"embed_dim": 1024,
|
| 35 |
+
"num_layers": 15,
|
| 36 |
+
"num_heads": 16,
|
| 37 |
+
"num_slots": 32,
|
| 38 |
+
"mlp_ratio": 4.0,
|
| 39 |
+
"dropout": 0.1,
|
| 40 |
+
"tie_weights": true,
|
| 41 |
+
"read_temperature": 1.0,
|
| 42 |
+
"write_temperature": 1.0,
|
| 43 |
+
"slot_dropout": 0.05,
|
| 44 |
+
"state_fp32": true,
|
| 45 |
+
"normalize_k": false,
|
| 46 |
+
"use_abs_pos": false,
|
| 47 |
+
"use_rope_keys": true,
|
| 48 |
+
"rope_base": 10000.0,
|
| 49 |
+
"use_alibi_write": true,
|
| 50 |
+
"alibi_strength_init": 0.1,
|
| 51 |
+
"learn_alibi_strength": true,
|
| 52 |
+
"min_strength": 0.0,
|
| 53 |
+
"use_content_read": true,
|
| 54 |
+
"content_read_init": -4.0,
|
| 55 |
+
"content_read_max_gamma": 3.0,
|
| 56 |
+
"use_slotspace_refine": true,
|
| 57 |
+
"slotspace_dim": 16,
|
| 58 |
+
"slotspace_gate_init": -4.0,
|
| 59 |
+
"slotspace_dropout": 0.05,
|
| 60 |
+
"slotspace_signed_weights": true,
|
| 61 |
+
"use_rope_slotspace": true,
|
| 62 |
+
"rope_base_slotspace": 100000.0,
|
| 63 |
+
"write_chunk_size": 1024,
|
| 64 |
+
"enable_compiled": true,
|
| 65 |
+
"eval_max_batches": 150,
|
| 66 |
+
"analytics_last_k": 4,
|
| 67 |
+
"output_dir": "./drive/MyDrive/asm_outputs",
|
| 68 |
+
"tag": "asm_fineweb_1024t_1024d_16h_16sd_32s_1024cs_15l",
|
| 69 |
+
"cache_dir": "./drive/MyDrive/asm_caches/fineweb",
|
| 70 |
+
"val_windows_cache": "./drive/MyDrive/asm_nlp/val_cache_fineweb_u32_windows_1024.u32tok"
|
| 71 |
+
},
|
| 72 |
+
"ckpt_step_inferred": 17500,
|
| 73 |
+
"ckpt_step_fields": {
|
| 74 |
+
"global_step": null,
|
| 75 |
+
"step": 17500,
|
| 76 |
+
"train_step": null,
|
| 77 |
+
"epoch": null,
|
| 78 |
+
"iter": null,
|
| 79 |
+
"iteration": null
|
| 80 |
+
},
|
| 81 |
+
"ckpt_keys": [
|
| 82 |
+
"cfg",
|
| 83 |
+
"model",
|
| 84 |
+
"opt",
|
| 85 |
+
"step",
|
| 86 |
+
"best_val"
|
| 87 |
+
]
|
| 88 |
+
}
|