Skeleton for classes

Files changed (5) hide show

README.MD CHANGED Viewed

	@@ -1 +1,3 @@
1	- # Open-source Softmax Linear Unit


1	+ # Open-source Softmax Linear Unit
2	+
3	+ Replicating the results in the paper [Softmax Linear Units](https://transformer-circuits.pub/2022/solu/index.html) published recently by Anthropic.

main.py ADDED Viewed

+import torch as t
+import torch.nn as nn
+import torch.functional as F
+import torch.optim as optim
+def parse_args():
+    # TODO: command-line args for hparams
+    pass
+def train():
+    # TODO: training loop
+    pass
+def eval():
+    pass
+def setup():
+    # TODO: wandb logging, load configs, all that stuff
+    pass
+if __name__=="__main__":
+    parse_args()

model.py ADDED Viewed

+import torch as t
+import torch.nn as nn
+import torch.functional as F
+import torch.optim as optim
+import wandb
+import fancy_einsum
+from einops import rearrange, repeat, reduce
+class OsSoluModel(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.config = config
+        self.transformer_block = TransformerBlock(config)
+    def forward(self, x: t.Tensor) -> t.Tensor:
+        pass
+class TransformerBlock(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.config = config
+        # Embed,
+        self.embed = nn.Embedding(num_embeddings, config.d_model)
+        # One MLP, one attention
+        # one layernorm, one dropout (?)
+        # Unembed
+    def forward(self, x: t.Tensor) -> t.Tensor:
+        pass
+class RotaryAttention(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+    def forward(self, x: t.Tensor, attention_mask: t.Tensor) -> t.Tensor:
+        # Compute pre-softmax attention scores
+        # Apply attention mask
+        # Compute softmax
+        # Apply final einsum
+        # Return attention output
+        pass

requirements.txt ADDED Viewed

+torch
+wandb
+einops
+fancy_einsum
+tqdm
+ipykernel
+notebook
+ipywidgets
+jupyter
+matplotlib
+numpy-stl
+plotly

utils.py ADDED Viewed

+@dataclass
+class OsSoluConfig:
+    d_model: int = 512
+    vocab_size: int = 65536 # Unsure about this.
+    learning_rate: float = 1e-3
+    num_embeddings: int = 1024 # Unsure about this.