Justin Brown committed on
Add checkpoint + card (ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt)
Browse files
ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ebad7be0e98fa7c50aa7cb6c5b709f0b4cf218b2a021a6f8aab21e5ecd94165
|
| 3 |
+
size 680885861
|
README.md
CHANGED
|
@@ -1,3 +1,31 @@
|
|
| 1 |
-
---
|
| 2 |
-
license:
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- addressed-state-attention
|
| 5 |
+
- asa
|
| 6 |
+
- asm
|
| 7 |
+
- language-model
|
| 8 |
+
- wikitext
|
| 9 |
+
library_name: pytorch
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# ASA-ASM (Wikitext-103 raw) — Checkpoint
|
| 13 |
+
|
| 14 |
+
This repo contains a checkpoint for an **Addressed State Model (ASM)** built from **Addressed State Attention (ASA)**.
|
| 15 |
+
|
| 16 |
+
## What's included
|
| 17 |
+
- `/ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt`: training checkpoint (PyTorch)
|
| 18 |
+
- `config.json` (if present): training config serialized from checkpoint
|
| 19 |
+
- `metadata.json`: SHA256 + provenance info (GitHub source pointer)
|
| 20 |
+
|
| 21 |
+
## Provenance
|
| 22 |
+
- Code: https://github.com/digitaldaimyo/ASA
|
| 23 |
+
- Revision: `codex/implement-runnable-and-releasable-setup`
|
| 24 |
+
|
| 25 |
+
## Quick load (raw checkpoint)
|
| 26 |
+
```python
|
| 27 |
+
import torch
|
| 28 |
+
ckpt = torch.load("ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt", map_location="cpu")
|
| 29 |
+
print(ckpt.keys())
|
| 30 |
+
```

## Notes
|
| 31 |
+
This is a raw training artifact. A lightweight inference wrapper / conversion can be added later.
|
config.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"dataset_name": "wikitext",
|
| 3 |
+
"dataset_config": "wikitext-103-raw-v1",
|
| 4 |
+
"tokenizer_name": "gpt2",
|
| 5 |
+
"max_seq_len": 1024,
|
| 6 |
+
"stride_frac_val": 0.5,
|
| 7 |
+
"seed": 1337,
|
| 8 |
+
"train_samples_target": 100000000,
|
| 9 |
+
"val_samples_target": 25000,
|
| 10 |
+
"batch_size": 32,
|
| 11 |
+
"learning_rate": 0.0003,
|
| 12 |
+
"weight_decay": 0.01,
|
| 13 |
+
"betas": [
|
| 14 |
+
0.9,
|
| 15 |
+
0.95
|
| 16 |
+
],
|
| 17 |
+
"grad_clip": 1.0,
|
| 18 |
+
"warmup_steps": 1000,
|
| 19 |
+
"total_steps": 75000,
|
| 20 |
+
"eval_interval": 1000,
|
| 21 |
+
"log_interval": 100,
|
| 22 |
+
"vocab_size": 50257,
|
| 23 |
+
"embed_dim": 384,
|
| 24 |
+
"num_layers": 21,
|
| 25 |
+
"num_heads": 8,
|
| 26 |
+
"num_slots": 16,
|
| 27 |
+
"mlp_ratio": 4.0,
|
| 28 |
+
"dropout": 0.1,
|
| 29 |
+
"tie_weights": true,
|
| 30 |
+
"read_temperature": 1.0,
|
| 31 |
+
"write_temperature": 1.0,
|
| 32 |
+
"slot_dropout": 0.05,
|
| 33 |
+
"state_fp32": true,
|
| 34 |
+
"normalize_k": false,
|
| 35 |
+
"use_abs_pos": false,
|
| 36 |
+
"use_rope_keys": true,
|
| 37 |
+
"rope_base": 10000.0,
|
| 38 |
+
"use_alibi_write": true,
|
| 39 |
+
"alibi_strength_init": 0.1,
|
| 40 |
+
"learn_alibi_strength": true,
|
| 41 |
+
"min_strength": 0.0,
|
| 42 |
+
"use_content_read": true,
|
| 43 |
+
"content_read_init": -4.0,
|
| 44 |
+
"content_read_max_gamma": 3.0,
|
| 45 |
+
"use_slotspace_refine": true,
|
| 46 |
+
"slotspace_dim": 32,
|
| 47 |
+
"slotspace_gate_init": -4.0,
|
| 48 |
+
"slotspace_dropout": 0.05,
|
| 49 |
+
"slotspace_signed_weights": true,
|
| 50 |
+
"use_rope_slotspace": true,
|
| 51 |
+
"rope_base_slotspace": 100000.0,
|
| 52 |
+
"write_chunk_size": 128,
|
| 53 |
+
"slotspace_chunk_size": 128,
|
| 54 |
+
"eval_max_batches": 150,
|
| 55 |
+
"analytics_last_k": 32,
|
| 56 |
+
"output_dir": "./drive/MyDrive/asm_outputs",
|
| 57 |
+
"tag": "asm_wikitext_1024t_384d_32sd_16s_35l",
|
| 58 |
+
"cache_dir": "./drive/MyDrive/asm_caches",
|
| 59 |
+
"val_windows_cache": "./drive/MyDrive/asm_nlp/val_cache_wikitext_windows_1024.pkl"
|
| 60 |
+
}
|
metadata.json
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"repo_id": "DigitalShogun/ASA-ASM-wikitext103-raw",
|
| 3 |
+
"exported_at_utc": "2026-01-25T14:38:05.116518+00:00",
|
| 4 |
+
"checkpoint_filename": "ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt",
|
| 5 |
+
"checkpoint_sha256": "3ebad7be0e98fa7c50aa7cb6c5b709f0b4cf218b2a021a6f8aab21e5ecd94165",
|
| 6 |
+
"checkpoint_bytes": 680885861,
|
| 7 |
+
"github_repo": "https://github.com/digitaldaimyo/ASA",
|
| 8 |
+
"github_revision": "codex/implement-runnable-and-releasable-setup",
|
| 9 |
+
"train_config_in_ckpt": {
|
| 10 |
+
"dataset_name": "wikitext",
|
| 11 |
+
"dataset_config": "wikitext-103-raw-v1",
|
| 12 |
+
"tokenizer_name": "gpt2",
|
| 13 |
+
"max_seq_len": 1024,
|
| 14 |
+
"stride_frac_val": 0.5,
|
| 15 |
+
"seed": 1337,
|
| 16 |
+
"train_samples_target": 100000000,
|
| 17 |
+
"val_samples_target": 25000,
|
| 18 |
+
"batch_size": 32,
|
| 19 |
+
"learning_rate": 0.0003,
|
| 20 |
+
"weight_decay": 0.01,
|
| 21 |
+
"betas": [
|
| 22 |
+
0.9,
|
| 23 |
+
0.95
|
| 24 |
+
],
|
| 25 |
+
"grad_clip": 1.0,
|
| 26 |
+
"warmup_steps": 1000,
|
| 27 |
+
"total_steps": 75000,
|
| 28 |
+
"eval_interval": 1000,
|
| 29 |
+
"log_interval": 100,
|
| 30 |
+
"vocab_size": 50257,
|
| 31 |
+
"embed_dim": 384,
|
| 32 |
+
"num_layers": 21,
|
| 33 |
+
"num_heads": 8,
|
| 34 |
+
"num_slots": 16,
|
| 35 |
+
"mlp_ratio": 4.0,
|
| 36 |
+
"dropout": 0.1,
|
| 37 |
+
"tie_weights": true,
|
| 38 |
+
"read_temperature": 1.0,
|
| 39 |
+
"write_temperature": 1.0,
|
| 40 |
+
"slot_dropout": 0.05,
|
| 41 |
+
"state_fp32": true,
|
| 42 |
+
"normalize_k": false,
|
| 43 |
+
"use_abs_pos": false,
|
| 44 |
+
"use_rope_keys": true,
|
| 45 |
+
"rope_base": 10000.0,
|
| 46 |
+
"use_alibi_write": true,
|
| 47 |
+
"alibi_strength_init": 0.1,
|
| 48 |
+
"learn_alibi_strength": true,
|
| 49 |
+
"min_strength": 0.0,
|
| 50 |
+
"use_content_read": true,
|
| 51 |
+
"content_read_init": -4.0,
|
| 52 |
+
"content_read_max_gamma": 3.0,
|
| 53 |
+
"use_slotspace_refine": true,
|
| 54 |
+
"slotspace_dim": 32,
|
| 55 |
+
"slotspace_gate_init": -4.0,
|
| 56 |
+
"slotspace_dropout": 0.05,
|
| 57 |
+
"slotspace_signed_weights": true,
|
| 58 |
+
"use_rope_slotspace": true,
|
| 59 |
+
"rope_base_slotspace": 100000.0,
|
| 60 |
+
"write_chunk_size": 128,
|
| 61 |
+
"slotspace_chunk_size": 128,
|
| 62 |
+
"eval_max_batches": 150,
|
| 63 |
+
"analytics_last_k": 32,
|
| 64 |
+
"output_dir": "./drive/MyDrive/asm_outputs",
|
| 65 |
+
"tag": "asm_wikitext_1024t_384d_32sd_16s_35l",
|
| 66 |
+
"cache_dir": "./drive/MyDrive/asm_caches",
|
| 67 |
+
"val_windows_cache": "./drive/MyDrive/asm_nlp/val_cache_wikitext_windows_1024.pkl"
|
| 68 |
+
},
|
| 69 |
+
"ckpt_step_fields": {
|
| 70 |
+
"global_step": null,
|
| 71 |
+
"step": 75000,
|
| 72 |
+
"train_step": null,
|
| 73 |
+
"epoch": null
|
| 74 |
+
}
|
| 75 |
+
}
|