Fleshing out model, config
Browse files
model.py
CHANGED
|
@@ -22,11 +22,13 @@ class TransformerBlock(nn.Module):
|
|
| 22 |
super().__init__()
|
| 23 |
self.config = config
|
| 24 |
|
| 25 |
-
|
| 26 |
-
self.
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def forward(self, x: t.Tensor) -> t.Tensor:
|
| 32 |
pass
|
|
|
|
# Body of TransformerBlock.__init__ (reconstructed from the "after" side of the
# diff; the `def __init__(self, config)` line lies outside this hunk).
super().__init__()
self.config = config

# Token embedding: maps token ids to d_model-dimensional vectors.
# NOTE(review): the config defines both `num_embeddings` and `vocab_size`
# (both flagged "Unsure about this") — confirm which one is the true
# vocabulary size for this embedding table.
self.embed = nn.Embedding(config.num_embeddings, config.d_model)

# Position-wise feed-forward layer with a SoLU nonlinearity.
self.linear = nn.Sequential(
    nn.Linear(config.d_model, config.d_model),
    SoLU(),
)

# BUG FIX: the original read `nn.LayerNorm(normalized_shape)`, but
# `normalized_shape` is never defined anywhere in the hunk — a NameError
# the moment the module is constructed. The normalised dimension is the
# model width, and the config already carries a layer-norm epsilon.
self.layer_norm = nn.LayerNorm(config.d_model, eps=config.ln_eps)

# NOTE(review): an unembedding is conventionally a projection
# d_model -> vocab (nn.Linear); nn.Embedding here looks like a
# copy-paste of `self.embed` — confirm intent before shipping.
self.unembed = nn.Embedding(config.num_embeddings, config.d_model)
|
| 32 |
|
| 33 |
def forward(self, x: t.Tensor) -> t.Tensor:
    """Run the transformer block on input *x*.

    Not implemented yet. The original stub was `pass`, which silently
    returns ``None`` despite the ``t.Tensor`` return annotation — callers
    would only fail later with a confusing downstream error. Raising
    ``NotImplementedError`` makes the missing implementation fail loudly
    at the call site.
    """
    raise NotImplementedError("TransformerBlock.forward is not implemented yet")
|
utils.py
CHANGED
|
@@ -1,6 +1,10 @@
|
|
| 1 |
@dataclass
|
| 2 |
class OsSoluConfig:
|
| 3 |
-
d_model: int = 512
|
| 4 |
-
vocab_size: int = 65536
|
| 5 |
-
learning_rate: float = 1e-3
|
| 6 |
-
num_embeddings: int = 1024
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
@dataclass
class OsSoluConfig:
    """Hyperparameters for the OS-SoLU model.

    Attributes:
        d_model: Hidden size of the model.
        vocab_size: Vocabulary size of the input sequence. Unsure about this.
        learning_rate: Learning rate for the optimiser.
        num_embeddings: Number of embeddings. Unsure about this.
        num_blocks: Number of transformer blocks.
        dropout: Probability of dropout.
        ln_eps: Layer norm epsilon.
        num_heads: Number of attention heads in each attention layer.
    """

    d_model: int = 512
    vocab_size: int = 65536
    learning_rate: float = 1e-3
    num_embeddings: int = 1024
    num_blocks: int = 1
    dropout: float = 0.1
    ln_eps: float = 1e-3
    num_heads: int = 4