Add custom modeling code
modeling_nebula.py  CHANGED  +12 -3
@@ -12,9 +12,17 @@ class NebulaConfig(PretrainedConfig):
     def __init__(self, dim=1280, n_layers=14, n_heads=10, n_kv_heads=10, vocab_size=60729,
                  multiple_of=256, ffn_dim_multiplier=8/3, norm_eps=1e-5, max_seq_len=2048,
                  dropout=0.1, use_cache=True, **kwargs):
-        self.dim
-        self.
-        self.
+        self.dim = dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.n_kv_heads = n_kv_heads
+        self.vocab_size = vocab_size
+        self.multiple_of = multiple_of
+        self.ffn_dim_multiplier = ffn_dim_multiplier
+        self.norm_eps = norm_eps
+        self.max_seq_len = max_seq_len
+        self.dropout = dropout
+        self.use_cache = use_cache
         super().__init__(**kwargs)

 class RMSNorm(nn.Module):
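For reference, the new __init__ only stores the constructor arguments as plain attributes before delegating the remaining **kwargs to PretrainedConfig. A minimal usage sketch, assuming transformers is installed and modeling_nebula.py is importable; the ./nebula-config output directory is a hypothetical path, not part of this commit:

from modeling_nebula import NebulaConfig

# Defaults come from the signature above; any of them can be overridden per keyword.
config = NebulaConfig(n_layers=16, dropout=0.0)
print(config.dim, config.n_layers, config.ffn_dim_multiplier)  # 1280 16 2.666...

# PretrainedConfig handles JSON (de)serialization once the attributes are plain Python values.
config.save_pretrained("./nebula-config")                  # writes config.json
reloaded = NebulaConfig.from_pretrained("./nebula-config")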
@@ -32,6 +40,7 @@ class RoPE(nn.Module):
         super().__init__()
         self.dim = config.dim // config.n_heads
         self.max_seq_len = config.max_seq_len
+        # The device will be inferred from the model, so we don't need it in the config
         self._build_cache(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
     def _build_cache(self, device, base=10000):
         theta = 1.0 / (base ** (torch.arange(0, self.dim, 2, device=device).float() / self.dim))
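The hunk above ends at the theta line, so the rest of _build_cache is not shown in this view. A hedged sketch of how a rotary-embedding cache is commonly completed from those inverse frequencies; the buffer names freqs_cos / freqs_sin and the outer-product step are assumptions, not code taken from this commit:

import torch

def _build_cache(self, device, base=10000):
    # Inverse frequencies, one per pair of channels (same line as in the diff above).
    theta = 1.0 / (base ** (torch.arange(0, self.dim, 2, device=device).float() / self.dim))
    # One angle per (position, frequency) pair: shape (max_seq_len, dim // 2).
    positions = torch.arange(self.max_seq_len, device=device).float()
    freqs = torch.outer(positions, theta)
    # Cache cos/sin tables as non-persistent buffers so they follow the module across
    # devices (e.g. on .to(device)) but are not written into the state dict.
    self.register_buffer("freqs_cos", freqs.cos(), persistent=False)
    self.register_buffer("freqs_sin", freqs.sin(), persistent=False)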