{ "dataset_name": "wikitext", "dataset_config": "wikitext-103-raw-v1", "tokenizer_name": "gpt2", "max_seq_len": 1024, "stride_frac_val": 0.5, "seed": 1337, "train_samples_target": 100000000, "val_samples_target": 25000, "batch_size": 32, "learning_rate": 0.0003, "weight_decay": 0.01, "betas": [ 0.9, 0.95 ], "grad_clip": 1.0, "warmup_steps": 1000, "total_steps": 75000, "eval_interval": 1000, "log_interval": 100, "vocab_size": 50257, "embed_dim": 384, "num_layers": 21, "num_heads": 8, "num_slots": 16, "mlp_ratio": 4.0, "dropout": 0.1, "tie_weights": true, "read_temperature": 1.0, "write_temperature": 1.0, "slot_dropout": 0.05, "state_fp32": true, "normalize_k": false, "use_abs_pos": false, "use_rope_keys": true, "rope_base": 10000.0, "use_alibi_write": true, "alibi_strength_init": 0.1, "learn_alibi_strength": true, "min_strength": 0.0, "use_content_read": true, "content_read_init": -4.0, "content_read_max_gamma": 3.0, "use_slotspace_refine": true, "slotspace_dim": 32, "slotspace_gate_init": -4.0, "slotspace_dropout": 0.05, "slotspace_signed_weights": true, "use_rope_slotspace": true, "rope_base_slotspace": 100000.0, "write_chunk_size": 128, "slotspace_chunk_size": 128, "eval_max_batches": 150, "analytics_last_k": 32, "output_dir": "./drive/MyDrive/asm_outputs", "tag": "asm_wikitext_1024t_384d_32sd_16s_21l", "cache_dir": "./drive/MyDrive/asm_caches", "val_windows_cache": "./drive/MyDrive/asm_nlp/val_cache_wikitext_windows_1024.pkl" }