Justin Brown committed on
Add checkpoint + card (ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt)
Browse files
ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ebad7be0e98fa7c50aa7cb6c5b709f0b4cf218b2a021a6f8aab21e5ecd94165
|
| 3 |
+
size 680885861
|
README.md
CHANGED
|
@@ -1,3 +1,31 @@
|
|
| 1 |
-
---
|
| 2 |
-
license:
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- addressed-state-attention
|
| 5 |
+
- asa
|
| 6 |
+
- asm
|
| 7 |
+
- language-model
|
| 8 |
+
- wikitext
|
| 9 |
+
library_name: pytorch
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# ASA-ASM (Wikitext-103 raw) — Checkpoint
|
| 13 |
+
|
| 14 |
+
This repo contains a checkpoint for an **Addressed State Model (ASM)** built from **Addressed State Attention (ASA)**.
|
| 15 |
+
|
| 16 |
+
## What's included
|
| 17 |
+
- `/ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt`: training checkpoint (PyTorch)
|
| 18 |
+
- `config.json` (if present): training config serialized from checkpoint
|
| 19 |
+
- `metadata.json`: SHA256 + provenance info (GitHub source pointer)
|
| 20 |
+
|
| 21 |
+
## Provenance
|
| 22 |
+
- Code: https://github.com/digitaldaimyo/ASA
|
| 23 |
+
- Revision: `codex/implement-runnable-and-releasable-setup`
|
| 24 |
+
|
| 25 |
+
## Quick load (raw checkpoint)
|
| 26 |
+
```python
|
| 27 |
+
import torch
|
| 28 |
+
ckpt = torch.load("ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt", map_location="cpu")
|
| 29 |
+
print(ckpt.keys())
|
| 30 |
+
```

## Notes
|
| 31 |
+
This is a raw training artifact. A lightweight inference wrapper / conversion can be added later.
|
config.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"dataset_name": "wikitext",
|
| 3 |
+
"dataset_config": "wikitext-103-raw-v1",
|
| 4 |
+
"tokenizer_name": "gpt2",
|
| 5 |
+
"max_seq_len": 1024,
|
| 6 |
+
"stride_frac_val": 0.5,
|
| 7 |
+
"seed": 1337,
|
| 8 |
+
"train_samples_target": 100000000,
|
| 9 |
+
"val_samples_target": 25000,
|
| 10 |
+
"batch_size": 32,
|
| 11 |
+
"learning_rate": 0.0003,
|
| 12 |
+
"weight_decay": 0.01,
|
| 13 |
+
"betas": [
|
| 14 |
+
0.9,
|
| 15 |
+
0.95
|
| 16 |
+
],
|
| 17 |
+
"grad_clip": 1.0,
|
| 18 |
+
"warmup_steps": 1000,
|
| 19 |
+
"total_steps": 75000,
|
| 20 |
+
"eval_interval": 1000,
|
| 21 |
+
"log_interval": 100,
|
| 22 |
+
"vocab_size": 50257,
|
| 23 |
+
"embed_dim": 384,
|
| 24 |
+
"num_layers": 21,
|
| 25 |
+
"num_heads": 8,
|
| 26 |
+
"num_slots": 16,
|
| 27 |
+
"mlp_ratio": 4.0,
|
| 28 |
+
"dropout": 0.1,
|
| 29 |
+
"tie_weights": true,
|
| 30 |
+
"read_temperature": 1.0,
|
| 31 |
+
"write_temperature": 1.0,
|
| 32 |
+
"slot_dropout": 0.05,
|
| 33 |
+
"state_fp32": true,
|
| 34 |
+
"normalize_k": false,
|
| 35 |
+
"use_abs_pos": false,
|
| 36 |
+
"use_rope_keys": true,
|
| 37 |
+
"rope_base": 10000.0,
|
| 38 |
+
"use_alibi_write": true,
|
| 39 |
+
"alibi_strength_init": 0.1,
|
| 40 |
+
"learn_alibi_strength": true,
|
| 41 |
+
"min_strength": 0.0,
|
| 42 |
+
"use_content_read": true,
|
| 43 |
+
"content_read_init": -4.0,
|
| 44 |
+
"content_read_max_gamma": 3.0,
|
| 45 |
+
"use_slotspace_refine": true,
|
| 46 |
+
"slotspace_dim": 32,
|
| 47 |
+
"slotspace_gate_init": -4.0,
|
| 48 |
+
"slotspace_dropout": 0.05,
|
| 49 |
+
"slotspace_signed_weights": true,
|
| 50 |
+
"use_rope_slotspace": true,
|
| 51 |
+
"rope_base_slotspace": 100000.0,
|
| 52 |
+
"write_chunk_size": 128,
|
| 53 |
+
"slotspace_chunk_size": 128,
|
| 54 |
+
"eval_max_batches": 150,
|
| 55 |
+
"analytics_last_k": 32,
|
| 56 |
+
"output_dir": "./drive/MyDrive/asm_outputs",
|
| 57 |
+
"tag": "asm_wikitext_1024t_384d_32sd_16s_35l",
|
| 58 |
+
"cache_dir": "./drive/MyDrive/asm_caches",
|
| 59 |
+
"val_windows_cache": "./drive/MyDrive/asm_nlp/val_cache_wikitext_windows_1024.pkl"
|
| 60 |
+
}
|
metadata.json
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"repo_id": "DigitalShogun/ASA-ASM-wikitext103-raw",
|
| 3 |
+
"exported_at_utc": "2026-01-25T14:38:05.116518+00:00",
|
| 4 |
+
"checkpoint_filename": "ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt",
|
| 5 |
+
"checkpoint_sha256": "3ebad7be0e98fa7c50aa7cb6c5b709f0b4cf218b2a021a6f8aab21e5ecd94165",
|
| 6 |
+
"checkpoint_bytes": 680885861,
|
| 7 |
+
"github_repo": "https://github.com/digitaldaimyo/ASA",
|
| 8 |
+
"github_revision": "codex/implement-runnable-and-releasable-setup",
|
| 9 |
+
"train_config_in_ckpt": {
|
| 10 |
+
"dataset_name": "wikitext",
|
| 11 |
+
"dataset_config": "wikitext-103-raw-v1",
|
| 12 |
+
"tokenizer_name": "gpt2",
|
| 13 |
+
"max_seq_len": 1024,
|
| 14 |
+
"stride_frac_val": 0.5,
|
| 15 |
+
"seed": 1337,
|
| 16 |
+
"train_samples_target": 100000000,
|
| 17 |
+
"val_samples_target": 25000,
|
| 18 |
+
"batch_size": 32,
|
| 19 |
+
"learning_rate": 0.0003,
|
| 20 |
+
"weight_decay": 0.01,
|
| 21 |
+
"betas": [
|
| 22 |
+
0.9,
|
| 23 |
+
0.95
|
| 24 |
+
],
|
| 25 |
+
"grad_clip": 1.0,
|
| 26 |
+
"warmup_steps": 1000,
|
| 27 |
+
"total_steps": 75000,
|
| 28 |
+
"eval_interval": 1000,
|
| 29 |
+
"log_interval": 100,
|
| 30 |
+
"vocab_size": 50257,
|
| 31 |
+
"embed_dim": 384,
|
| 32 |
+
"num_layers": 21,
|
| 33 |
+
"num_heads": 8,
|
| 34 |
+
"num_slots": 16,
|
| 35 |
+
"mlp_ratio": 4.0,
|
| 36 |
+
"dropout": 0.1,
|
| 37 |
+
"tie_weights": true,
|
| 38 |
+
"read_temperature": 1.0,
|
| 39 |
+
"write_temperature": 1.0,
|
| 40 |
+
"slot_dropout": 0.05,
|
| 41 |
+
"state_fp32": true,
|
| 42 |
+
"normalize_k": false,
|
| 43 |
+
"use_abs_pos": false,
|
| 44 |
+
"use_rope_keys": true,
|
| 45 |
+
"rope_base": 10000.0,
|
| 46 |
+
"use_alibi_write": true,
|
| 47 |
+
"alibi_strength_init": 0.1,
|
| 48 |
+
"learn_alibi_strength": true,
|
| 49 |
+
"min_strength": 0.0,
|
| 50 |
+
"use_content_read": true,
|
| 51 |
+
"content_read_init": -4.0,
|
| 52 |
+
"content_read_max_gamma": 3.0,
|
| 53 |
+
"use_slotspace_refine": true,
|
| 54 |
+
"slotspace_dim": 32,
|
| 55 |
+
"slotspace_gate_init": -4.0,
|
| 56 |
+
"slotspace_dropout": 0.05,
|
| 57 |
+
"slotspace_signed_weights": true,
|
| 58 |
+
"use_rope_slotspace": true,
|
| 59 |
+
"rope_base_slotspace": 100000.0,
|
| 60 |
+
"write_chunk_size": 128,
|
| 61 |
+
"slotspace_chunk_size": 128,
|
| 62 |
+
"eval_max_batches": 150,
|
| 63 |
+
"analytics_last_k": 32,
|
| 64 |
+
"output_dir": "./drive/MyDrive/asm_outputs",
|
| 65 |
+
"tag": "asm_wikitext_1024t_384d_32sd_16s_35l",
|
| 66 |
+
"cache_dir": "./drive/MyDrive/asm_caches",
|
| 67 |
+
"val_windows_cache": "./drive/MyDrive/asm_nlp/val_cache_wikitext_windows_1024.pkl"
|
| 68 |
+
},
|
| 69 |
+
"ckpt_step_fields": {
|
| 70 |
+
"global_step": null,
|
| 71 |
+
"step": 75000,
|
| 72 |
+
"train_step": null,
|
| 73 |
+
"epoch": null
|
| 74 |
+
}
|
| 75 |
+
}
|