windsornguyen committed
Commit 79d6f8d · verified · 1 Parent(s): 0759474

Upload config.json

Files changed (1)
config.json +65 -0
config.json ADDED
@@ -0,0 +1,65 @@
+{
+    "model_type": "FlashSTU",
+    "_name_or_path": "FlashSTU-340M-0408",
+    "architectures": ["STUForCausalLM"],
+    "dim": 1024,
+    "num_heads": 4,
+    "num_layers": 12,
+    "seq_len": 4096,
+    "window_size": 512,
+    "vocab_size": 200064,
+    "weight_tying": true,
+    "inter_dim": 4096,
+    "mlp_scale": 12,
+    "bias": false,
+    "num_eigh": 24,
+    "r": 896,
+    "use_hankel_L": false,
+    "num_epochs": 1,
+    "global_bsz": 524288,
+    "bsz": 8,
+    "warmup_steps": 1907,
+    "eval_period": 50,
+    "save_period": 1000,
+    "max_lr": 4.0e-4,
+    "min_lr": 4.0e-5,
+    "max_norm": 1.0,
+    "fsdp": true,
+    "ddp": false,
+    "reshard_after_forward_policy": "default",
+    "mixed_precision": true,
+    "torch_dtype": "bfloat16",
+    "cpu_offload": false,
+    "sharding_strategy": "full_shard",
+    "state_dict_type": "full",
+    "auto_wrap_policy": "partial",
+    "backward_prefetch": "backward_pre",
+    "forward_prefetch": false,
+    "sync_module_states": true,
+    "use_orig_params": true,
+    "device_id": null,
+    "precision": {
+        "param": "bfloat16",
+        "reduce": "bfloat16",
+        "buffer": "bfloat16"
+    },
+    "fsdp_modules": [
+        "STULayer",
+        "AttentionLayer"
+    ],
+    "num_workers": 0,
+    "snapshot_every_n_steps": 50,
+    "use_activation_checkpointing": true,
+    "use_flash_fft": false,
+    "use_tensordot": true,
+    "use_attn": true,
+    "use_alibi": false,
+    "softcap": 50.0,
+    "rope_theta": 10000.0,
+    "torch_compile": true,
+    "torch_compile_kwargs": {
+        "mode": "default",
+        "fullgraph": false
+    },
+    "enable_compiled_autograd": false
+}
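For reference, a minimal sketch of fetching and inspecting this config from the Hub. The repo id is an assumption inferred from the `_name_or_path` field, and the batch arithmetic assumes `global_bsz` counts tokens per optimizer step (524288 = 2**19) while `bsz` is a per-device micro-batch in sequences; neither is confirmed by the commit itself. Since `model_type` is the custom `FlashSTU`, loading through `transformers.AutoConfig` would additionally need the repo's own modeling code (`trust_remote_code=True`), so plain JSON parsing is used here.

```python
import json

from huggingface_hub import hf_hub_download

# Hypothetical repo id, inferred from "_name_or_path" above;
# substitute the actual Hub repo this commit belongs to.
repo_id = "windsornguyen/FlashSTU-340M-0408"

# Fetch only config.json from the Hub (cached locally after the first call).
config_path = hf_hub_download(repo_id=repo_id, filename="config.json")

with open(config_path) as f:
    cfg = json.load(f)

# Derived batch arithmetic, under the assumptions stated above.
tokens_per_microbatch = cfg["bsz"] * cfg["seq_len"]              # 8 * 4096 = 32768
total_microbatches = cfg["global_bsz"] // tokens_per_microbatch  # 16, split across ranks

print(f"{cfg['num_layers']} layers, dim={cfg['dim']}, heads={cfg['num_heads']}")
print(f"micro-batches per optimizer step (all ranks combined): {total_microbatches}")
```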