{
"model_type": "FlashSTU",
"_name_or_path": "FlashSTU-340M-0408",
"architectures": ["STUForCausalLM"],
"dim": 1024,
"num_heads": 4,
"num_layers": 12,
"seq_len": 4096,
"window_size": 512,
"vocab_size": 200064,
"weight_tying": true,
"inter_dim": 4096,
"mlp_scale": 12,
"bias": false,
"num_eigh": 24,
"r": 896,
"use_hankel_L": false,
"num_epochs": 1,
"global_bsz": 524288,
"bsz": 8,
"warmup_steps": 1907,
"eval_period": 50,
"save_period": 1000,
"max_lr": 4.0e-4,
"min_lr": 4.0e-5,
"max_norm": 1.0,
"fsdp": true,
"ddp": false,
"reshard_after_forward_policy": "default",
"mixed_precision": true,
"torch_dtype": "bfloat16",
"cpu_offload": false,
"sharding_strategy": "full_shard",
"state_dict_type": "full",
"auto_wrap_policy": "partial",
"backward_prefetch": "backward_pre",
"forward_prefetch": false,
"sync_module_states": true,
"use_orig_params": true,
"device_id": null,
"precision": {
"param": "bfloat16",
"reduce": "bfloat16",
"buffer": "bfloat16"
},
"fsdp_modules": [
"STULayer",
"AttentionLayer"
],
"num_workers": 0,
"snapshot_every_n_steps": 50,
"use_activation_checkpointing": true,
"use_flash_fft": false,
"use_tensordot": true,
"use_attn": true,
"use_alibi": false,
"softcap": 50.0,
"rope_theta": 10000.0,
"torch_compile": true,
"torch_compile_kwargs": {
"mode": "default",
"fullgraph": false
},
"enable_compiled_autograd": false
}