{
  "model_type": "FlashSTU",
  "_name_or_path": "FlashSTU-340M-0408",
  "architectures": ["STUForCausalLM"],
  "dim": 1024,
  "num_heads": 4,
  "num_layers": 12,
  "seq_len": 4096,
  "window_size": 512,
  "vocab_size": 200064,
  "weight_tying": true,
  "inter_dim": 4096,
  "mlp_scale": 12,
  "bias": false,
  "num_eigh": 24,
  "r": 896,
  "use_hankel_L": false,
  "num_epochs": 1,
  "global_bsz": 524288,
  "bsz": 8,
  "warmup_steps": 1907,
  "eval_period": 50,
  "save_period": 1000,
  "max_lr": 4.0e-4,
  "min_lr": 4.0e-5,
  "max_norm": 1.0,
  "fsdp": true,
  "ddp": false,
  "reshard_after_forward_policy": "default",
  "mixed_precision": true,
  "torch_dtype": "bfloat16",
  "cpu_offload": false,
  "sharding_strategy": "full_shard",
  "state_dict_type": "full",
  "auto_wrap_policy": "partial",
  "backward_prefetch": "backward_pre",
  "forward_prefetch": false,
  "sync_module_states": true,
  "use_orig_params": true,
  "device_id": null,
  "precision": {
    "param": "bfloat16",
    "reduce": "bfloat16",
    "buffer": "bfloat16"
  },
  "fsdp_modules": [
    "STULayer",
    "AttentionLayer"
  ],
  "num_workers": 0,
  "snapshot_every_n_steps": 50,
  "use_activation_checkpointing": true,
  "use_flash_fft": false,
  "use_tensordot": true,
  "use_attn": true,
  "use_alibi": false,
  "softcap": 50.0,
  "rope_theta": 10000.0,
  "torch_compile": true,
  "torch_compile_kwargs": {
    "mode": "default",
    "fullgraph": false
  },
  "enable_compiled_autograd": false
}