yagizdevre committed on
Commit
ff98655
·
1 Parent(s): bd4690a
__pycache__/__init__.cpython-312.pyc ADDED
Binary file (280 Bytes). View file
 
configuration_ministu.py CHANGED
@@ -4,31 +4,31 @@ from transformers import PretrainedConfig, AutoConfig
4
  class MiniSTUConfig(PretrainedConfig):
5
  model_type = "ministu"
6
 
7
- def __init__(
8
- self,
9
- bsz: int = 1,
10
- dim: int = 896,
11
- num_heads: int = 8,
12
- num_layers: int = 12,
13
- seq_len: int = 8192,
14
- weight_tying: bool = False,
15
- window_size: int = 1024,
16
- vocab_size: int = 200064,
17
- mlp_scale: int = 12,
18
- bias: bool = False,
19
- dropout: float = 0.0,
20
- num_eigh: int = 24,
21
- use_hankel_L: bool = False,
22
- use_flash_fft: bool = True,
23
- use_approx: bool = True,
24
- use_attn: bool = True,
25
- softcap: float = 50.0,
26
- theta: float = 10_000.0,
27
- use_alibi: bool = False,
28
- dilation: int = 2,
29
- torch_dtype: torch.dtype = torch.bfloat16,
30
- device: torch.device = None,
31
- **kwargs,
32
  ):
33
  super().__init__(**kwargs)
34
  self.bsz = bsz
 
4
  class MiniSTUConfig(PretrainedConfig):
5
  model_type = "ministu"
6
 
7
+ def __init__(
8
+ self,
9
+ bsz: int = 1,
10
+ dim: int = 896,
11
+ num_heads: int = 8,
12
+ num_layers: int = 12,
13
+ seq_len: int = 8192,
14
+ weight_tying: bool = False,
15
+ window_size: int = 1024,
16
+ vocab_size: int = 200064,
17
+ mlp_scale: int = 12,
18
+ bias: bool = False,
19
+ dropout: float = 0.0,
20
+ num_eigh: int = 24,
21
+ use_hankel_L: bool = False,
22
+ use_flash_fft: bool = True,
23
+ use_approx: bool = True,
24
+ use_attn: bool = True,
25
+ softcap: float = 50.0,
26
+ theta: float = 10_000.0,
27
+ use_alibi: bool = False,
28
+ dilation: int = 2,
29
+ torch_dtype: torch.dtype = torch.bfloat16,
30
+ device: torch.device = None,
31
+ **kwargs,
32
  ):
33
  super().__init__(**kwargs)
34
  self.bsz = bsz