{ "repo_id": "DigitalShogun/ASA-ASM-wikitext103-raw", "exported_at_utc": "2026-01-25T14:38:05.116518+00:00", "checkpoint_filename": "ASA_ASM_wt103-rawv1_gpt2_T1024_L21_D384_H8_K16_M32_ropek1_alibi1_gamma1_step75000_best.pt", "checkpoint_sha256": "3ebad7be0e98fa7c50aa7cb6c5b709f0b4cf218b2a021a6f8aab21e5ecd94165", "checkpoint_bytes": 680885861, "github_repo": "https://github.com/digitaldaimyo/ASA", "github_revision": "codex/implement-runnable-and-releasable-setup", "train_config_in_ckpt": { "dataset_name": "wikitext", "dataset_config": "wikitext-103-raw-v1", "tokenizer_name": "gpt2", "max_seq_len": 1024, "stride_frac_val": 0.5, "seed": 1337, "train_samples_target": 100000000, "val_samples_target": 25000, "batch_size": 32, "learning_rate": 0.0003, "weight_decay": 0.01, "betas": [ 0.9, 0.95 ], "grad_clip": 1.0, "warmup_steps": 1000, "total_steps": 75000, "eval_interval": 1000, "log_interval": 100, "vocab_size": 50257, "embed_dim": 384, "num_layers": 21, "num_heads": 8, "num_slots": 16, "mlp_ratio": 4.0, "dropout": 0.1, "tie_weights": true, "read_temperature": 1.0, "write_temperature": 1.0, "slot_dropout": 0.05, "state_fp32": true, "normalize_k": false, "use_abs_pos": false, "use_rope_keys": true, "rope_base": 10000.0, "use_alibi_write": true, "alibi_strength_init": 0.1, "learn_alibi_strength": true, "min_strength": 0.0, "use_content_read": true, "content_read_init": -4.0, "content_read_max_gamma": 3.0, "use_slotspace_refine": true, "slotspace_dim": 32, "slotspace_gate_init": -4.0, "slotspace_dropout": 0.05, "slotspace_signed_weights": true, "use_rope_slotspace": true, "rope_base_slotspace": 100000.0, "write_chunk_size": 128, "slotspace_chunk_size": 128, "eval_max_batches": 150, "analytics_last_k": 32, "output_dir": "./drive/MyDrive/asm_outputs", "tag": "asm_wikitext_1024t_384d_32sd_16s_35l", "cache_dir": "./drive/MyDrive/asm_caches", "val_windows_cache": "./drive/MyDrive/asm_nlp/val_cache_wikitext_windows_1024.pkl" }, "ckpt_step_fields": { "global_step": null, "step": 75000, "train_step": null, "epoch": null } }