Add checkpoint (fineweb): ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt

Browse files

Files changed (4) hide show

README.md +17 -15
checkpoints/fineweb/ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt +3 -0
configs/fineweb/ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt.config.json +62 -0
metadata/fineweb/ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt.metadata.json +88 -0

README.md CHANGED Viewed

@@ -5,28 +5,30 @@ tags:
 - asa
 - asm
 - language-model
-- wikitext
 library_name: pytorch
 ---
-# ASA-ASM (Wikitext-103 raw) — Checkpoint
-This repo contains a checkpoint for an **Addressed State Model (ASM)** built from **Addressed State Attention (ASA)**.
-## What's included
-- `/ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt`: training checkpoint (PyTorch)
-- `config.json` (if present): training config serialized from checkpoint
-- `metadata.json`: SHA256 + provenance info (GitHub source pointer)
 ## Provenance
-- Code: https://github.com/digitaldaimyo/ASA
-- Revision: `codex/implement-runnable-and-releasable-setup`
-## Quick load (raw checkpoint)
-```python
-import torch
-ckpt = torch.load("ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt", map_location="cpu")
-print(ckpt.keys())
 ## Notes
-This is a raw training artifact. A lightweight inference wrapper / conversion can be added later.

 - asa
 - asm
 - language-model
 library_name: pytorch
 ---
+# ASA / ASM Checkpoints (Bucket Repo)
+This repository stores **training checkpoints** for Addressed State Models (ASM) built from Addressed State Attention (ASA).
+## Repository layout
+- `checkpoints/<dataset>/...pt`
+  Raw PyTorch checkpoints (training artifacts).
+- `configs/<dataset>/<checkpoint>.config.json`
+  Training config extracted from the checkpoint (when the checkpoint contains one).
+- `metadata/<dataset>/<checkpoint>.metadata.json`
+  SHA-256 checksum, file size in bytes, and provenance pointers (source repository and revision).
 ## Provenance
+- Code: https://github.com/digitaldaimyo/ASA
+- Revision: `main`
 ## Notes
+- These are raw training artifacts, primarily intended for research and reproduction.
+- New uploads may overwrite **rolling pointers** (e.g. `last.pt`, `best.pt`), but earlier versions remain retrievable from the Hub's commit history.

checkpoints/fineweb/ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13e44f9f44d6ed669099918f1efa0440849b83ab374293e027887a35c385b055
+size 2889860681

configs/fineweb/ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt.config.json ADDED Viewed

	@@ -0,0 +1,62 @@

+{
+  "dataset_name": "HuggingFaceFW/fineweb",
+  "dataset_config": "sample-10BT",
+  "tokenizer_name": "gpt2",
+  "max_seq_len": 1024,
+  "stride_frac_val": 0.5,
+  "seed": 1337,
+  "micro_batch_size": 2,
+  "grad_accum_steps": 16,
+  "train_samples_target": 800000,
+  "val_samples_target": 25000,
+  "batch_size": 4,
+  "learning_rate": 0.0003,
+  "weight_decay": 0.01,
+  "betas": [
+    0.9,
+    0.95
+  ],
+  "grad_clip": 1.0,
+  "warmup_steps": 300,
+  "total_steps": 50000,
+  "eval_interval": 1000,
+  "log_interval": 100,
+  "vocab_size": 50257,
+  "embed_dim": 1024,
+  "num_layers": 15,
+  "num_heads": 16,
+  "num_slots": 32,
+  "mlp_ratio": 4.0,
+  "dropout": 0.1,
+  "tie_weights": true,
+  "read_temperature": 1.0,
+  "write_temperature": 1.0,
+  "slot_dropout": 0.05,
+  "state_fp32": true,
+  "normalize_k": false,
+  "use_abs_pos": false,
+  "use_rope_keys": true,
+  "rope_base": 10000.0,
+  "use_alibi_write": true,
+  "alibi_strength_init": 0.1,
+  "learn_alibi_strength": true,
+  "min_strength": 0.0,
+  "use_content_read": true,
+  "content_read_init": -4.0,
+  "content_read_max_gamma": 3.0,
+  "use_slotspace_refine": true,
+  "slotspace_dim": 16,
+  "slotspace_gate_init": -4.0,
+  "slotspace_dropout": 0.05,
+  "slotspace_signed_weights": true,
+  "use_rope_slotspace": true,
+  "rope_base_slotspace": 100000.0,
+  "write_chunk_size": 1024,
+  "enable_compiled": true,
+  "eval_max_batches": 150,
+  "analytics_last_k": 4,
+  "output_dir": "./drive/MyDrive/asm_outputs",
+  "tag": "asm_fineweb_1024t_1024d_16h_16sd_32s_1024cs_15l",
+  "cache_dir": "./drive/MyDrive/asm_caches/fineweb",
+  "val_windows_cache": "./drive/MyDrive/asm_nlp/val_cache_fineweb_u32_windows_1024.u32tok"
+}

metadata/fineweb/ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt.metadata.json ADDED Viewed

	@@ -0,0 +1,88 @@

+{
+  "repo_id": "DigitalShogun/ASA-ASM-wikitext103-raw",
+  "dataset": "fineweb",
+  "exported_at_utc": "2026-01-30T16:21:48.689906+00:00",
+  "checkpoint_filename": "ASA_ASM_fineweb_T1024_L15_D1024_H16_K16_S32_step17500_last.pt",
+  "checkpoint_sha256": "13e44f9f44d6ed669099918f1efa0440849b83ab374293e027887a35c385b055",
+  "checkpoint_bytes": 2889860681,
+  "github_repo": "https://github.com/digitaldaimyo/ASA",
+  "github_revision": "main",
+  "train_config_in_ckpt": {
+    "dataset_name": "HuggingFaceFW/fineweb",
+    "dataset_config": "sample-10BT",
+    "tokenizer_name": "gpt2",
+    "max_seq_len": 1024,
+    "stride_frac_val": 0.5,
+    "seed": 1337,
+    "micro_batch_size": 2,
+    "grad_accum_steps": 16,
+    "train_samples_target": 800000,
+    "val_samples_target": 25000,
+    "batch_size": 4,
+    "learning_rate": 0.0003,
+    "weight_decay": 0.01,
+    "betas": [
+      0.9,
+      0.95
+    ],
+    "grad_clip": 1.0,
+    "warmup_steps": 300,
+    "total_steps": 50000,
+    "eval_interval": 1000,
+    "log_interval": 100,
+    "vocab_size": 50257,
+    "embed_dim": 1024,
+    "num_layers": 15,
+    "num_heads": 16,
+    "num_slots": 32,
+    "mlp_ratio": 4.0,
+    "dropout": 0.1,
+    "tie_weights": true,
+    "read_temperature": 1.0,
+    "write_temperature": 1.0,
+    "slot_dropout": 0.05,
+    "state_fp32": true,
+    "normalize_k": false,
+    "use_abs_pos": false,
+    "use_rope_keys": true,
+    "rope_base": 10000.0,
+    "use_alibi_write": true,
+    "alibi_strength_init": 0.1,
+    "learn_alibi_strength": true,
+    "min_strength": 0.0,
+    "use_content_read": true,
+    "content_read_init": -4.0,
+    "content_read_max_gamma": 3.0,
+    "use_slotspace_refine": true,
+    "slotspace_dim": 16,
+    "slotspace_gate_init": -4.0,
+    "slotspace_dropout": 0.05,
+    "slotspace_signed_weights": true,
+    "use_rope_slotspace": true,
+    "rope_base_slotspace": 100000.0,
+    "write_chunk_size": 1024,
+    "enable_compiled": true,
+    "eval_max_batches": 150,
+    "analytics_last_k": 4,
+    "output_dir": "./drive/MyDrive/asm_outputs",
+    "tag": "asm_fineweb_1024t_1024d_16h_16sd_32s_1024cs_15l",
+    "cache_dir": "./drive/MyDrive/asm_caches/fineweb",
+    "val_windows_cache": "./drive/MyDrive/asm_nlp/val_cache_fineweb_u32_windows_1024.u32tok"
+  },
+  "ckpt_step_inferred": 17500,
+  "ckpt_step_fields": {
+    "global_step": null,
+    "step": 17500,
+    "train_step": null,
+    "epoch": null,
+    "iter": null,
+    "iteration": null
+  },
+  "ckpt_keys": [
+    "cfg",
+    "model",
+    "opt",
+    "step",
+    "best_val"
+  ]
+}