Fleshing out model, config
Browse files
model.py
CHANGED
|
@@ -22,11 +22,13 @@ class TransformerBlock(nn.Module):
|
|
| 22 |
super().__init__()
|
| 23 |
self.config = config
|
| 24 |
|
| 25 |
-
|
| 26 |
-
self.
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def forward(self, x: t.Tensor) -> t.Tensor:
|
| 32 |
pass
|
|
|
|
# Body of TransformerBlock.__init__ (reconstructed from the "after" side of the
# diff; the `def __init__(self, config)` line lies outside this hunk).
super().__init__()
self.config = config

# Token embedding: maps token ids to d_model-dimensional vectors.
# NOTE(review): the config defines both `num_embeddings` and `vocab_size`
# (both flagged "Unsure about this") — confirm which one is the true
# vocabulary size for this embedding table.
self.embed = nn.Embedding(config.num_embeddings, config.d_model)

# Position-wise feed-forward layer with a SoLU nonlinearity.
self.linear = nn.Sequential(
    nn.Linear(config.d_model, config.d_model),
    SoLU(),
)

# BUG FIX: the original read `nn.LayerNorm(normalized_shape)`, but
# `normalized_shape` is never defined anywhere in the hunk — a NameError
# the moment the module is constructed. The normalised dimension is the
# model width, and the config already carries a layer-norm epsilon.
self.layer_norm = nn.LayerNorm(config.d_model, eps=config.ln_eps)

# NOTE(review): an unembedding is conventionally a projection
# d_model -> vocab (nn.Linear); nn.Embedding here looks like a
# copy-paste of `self.embed` — confirm intent before shipping.
self.unembed = nn.Embedding(config.num_embeddings, config.d_model)
|
| 32 |
|
| 33 |
def forward(self, x: t.Tensor) -> t.Tensor:
    """Run the transformer block on input *x*.

    Not implemented yet. The original stub was `pass`, which silently
    returns ``None`` despite the ``t.Tensor`` return annotation — callers
    would only fail later with a confusing downstream error. Raising
    ``NotImplementedError`` makes the missing implementation fail loudly
    at the call site.
    """
    raise NotImplementedError("TransformerBlock.forward is not implemented yet")
|
utils.py
CHANGED
|
@@ -1,6 +1,10 @@
|
|
| 1 |
@dataclass
|
| 2 |
class OsSoluConfig:
|
| 3 |
-
d_model: int = 512
|
| 4 |
-
vocab_size: int = 65536
|
| 5 |
-
learning_rate: float = 1e-3
|
| 6 |
-
num_embeddings: int = 1024
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
@dataclass
class OsSoluConfig:
    """Hyperparameters for the OS-SoLU model.

    Attributes:
        d_model: Hidden size of the model.
        vocab_size: Vocabulary size of the input sequence. Unsure about this.
        learning_rate: Learning rate for the optimiser.
        num_embeddings: Number of embeddings. Unsure about this.
        num_blocks: Number of transformer blocks.
        dropout: Probability of dropout.
        ln_eps: Layer norm epsilon.
        num_heads: Number of attention heads in each attention layer.
    """

    d_model: int = 512
    vocab_size: int = 65536
    learning_rate: float = 1e-3
    num_embeddings: int = 1024
    num_blocks: int = 1
    dropout: float = 0.1
    ln_eps: float = 1e-3
    num_heads: int = 4