File size: 1,244 Bytes
5a1cdf2
 
c062128
5a1cdf2
43bd63d
5a1cdf2
 
 
43bd63d
5a1cdf2
 
 
 
 
 
 
 
 
43bd63d
5a1cdf2
43bd63d
 
5a1cdf2
 
 
43bd63d
5a1cdf2
 
 
43bd63d
5a1cdf2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43bd63d
5a1cdf2
 
 
43bd63d
 
5a1cdf2
43bd63d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
{
  "model_type": "ministu",
  "_name_or_path": "STU_500M",
  "architectures": ["MiniSTU"],
  "n_embd": 896,
  "n_heads": 8,
  "n_layers": 12,
  "seq_len": 8192,
  "weight_tying": true,
  "window_size": 1024,
  "vocab_size": 200064,
  "mlp_scale": 12,
  "bias": false,
  "dropout": 0.0,
  "num_eigh": 24,
  "use_hankel_L": false,
  "num_epochs": 1,
  "global_bsz": 524288,
  "bsz": 2,
  "warmup_steps": 1907,
  "eval_period": 50,
  "save_period": 500,
  "max_lr": 3.0e-3,
  "min_lr": 3.0e-5,
  "max_norm": 1.0,
  "dilation": 2,
  "fsdp": true,
  "ddp": false,
  "mixed_precision": true,
  "torch_dtype": "bfloat16",
  "use_cpu_offload": false,
  "sharding_strategy": "full_shard",
  "state_dict_type": "full",
  "auto_wrap_policy": "partial",
  "backward_prefetch": "backward_pre",
  "forward_prefetch": false,
  "sync_module_states": true,
  "use_orig_params": true,
  "device_id": null,
  "precision": {
    "param": "bfloat16",
    "reduce": "bfloat16",
    "buffer": "bfloat16"
  },
  "fsdp_modules": [
    "STU",
    "Attention",
    "MLP"
  ],
  "use_activation_checkpointing": true,
  "use_flash_fft": true,
  "use_approx": true,
  "use_attn": true,
  "softcap": 50.0,
  "theta": 10000.0,
  "use_alibi": false,
  "torch_compile": false
}