{
  "dataset_name": "wikitext",
  "dataset_config": "wikitext-103-raw-v1",
  "tokenizer_name": "gpt2",
  "max_seq_len": 1024,
  "stride_frac_val": 0.5,
  "seed": 1337,
  "train_samples_target": 100000000,
  "val_samples_target": 25000,
  "batch_size": 32,
  "learning_rate": 0.0003,
  "weight_decay": 0.01,
  "betas": [
    0.9,
    0.95
  ],
  "grad_clip": 1.0,
  "warmup_steps": 1000,
  "total_steps": 75000,
  "eval_interval": 1000,
  "log_interval": 100,
  "vocab_size": 50257,
  "embed_dim": 384,
  "num_layers": 21,
  "num_heads": 8,
  "num_slots": 16,
  "mlp_ratio": 4.0,
  "dropout": 0.1,
  "tie_weights": true,
  "read_temperature": 1.0,
  "write_temperature": 1.0,
  "slot_dropout": 0.05,
  "state_fp32": true,
  "normalize_k": false,
  "use_abs_pos": false,
  "use_rope_keys": true,
  "rope_base": 10000.0,
  "use_alibi_write": true,
  "alibi_strength_init": 0.1,
  "learn_alibi_strength": true,
  "min_strength": 0.0,
  "use_content_read": true,
  "content_read_init": -4.0,
  "content_read_max_gamma": 3.0,
  "use_slotspace_refine": true,
  "slotspace_dim": 32,
  "slotspace_gate_init": -4.0,
  "slotspace_dropout": 0.05,
  "slotspace_signed_weights": true,
  "use_rope_slotspace": true,
  "rope_base_slotspace": 100000.0,
  "write_chunk_size": 128,
  "slotspace_chunk_size": 128,
  "eval_max_batches": 150,
  "analytics_last_k": 32,
  "output_dir": "./drive/MyDrive/asm_outputs",
  "tag": "asm_wikitext_1024t_384d_32sd_16s_21l",
  "cache_dir": "./drive/MyDrive/asm_caches",
  "val_windows_cache": "./drive/MyDrive/asm_nlp/val_cache_wikitext_windows_1024.pkl"
}