Update modeling_avey.py
Browse files- modeling_avey.py +13 -12
modeling_avey.py
CHANGED
|
@@ -12,17 +12,19 @@ from .configuration_avey import AveyConfig
|
|
| 12 |
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
| 13 |
from torch.utils.checkpoint import checkpoint
|
| 14 |
|
|
|
|
|
|
|
| 15 |
# torch._dynamo.config.allow_unspec_int_on_nn_module = True
|
| 16 |
|
| 17 |
class Contextualizer(nn.Module):
|
| 18 |
-
def __init__(self, config: AveyConfig,
|
| 19 |
super().__init__()
|
| 20 |
self.eps = config.eps
|
| 21 |
-
self.
|
| 22 |
-
if self.
|
| 23 |
self.spatial_proj = nn.Parameter(torch.empty(config.chunk_size, config.chunk_size))
|
| 24 |
nn.init.xavier_normal_(self.spatial_proj)
|
| 25 |
-
|
| 26 |
def cosim(self, embeddings: torch.Tensor) -> torch.Tensor:
|
| 27 |
norm = torch.sqrt(torch.sum(embeddings ** 2, dim=-1, keepdim=True) + self.eps)
|
| 28 |
normalized = embeddings / norm
|
|
@@ -32,7 +34,7 @@ class Contextualizer(nn.Module):
|
|
| 32 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 33 |
_, T, _ = x.shape
|
| 34 |
x0, x1 = x.chunk(2, dim=-1)
|
| 35 |
-
if self.
|
| 36 |
x0 = self.spatial_proj[:T, :T] @ x0
|
| 37 |
else:
|
| 38 |
sim_scores = self.cosim(x0)
|
|
@@ -44,7 +46,7 @@ class Contextualizer(nn.Module):
|
|
| 44 |
|
| 45 |
|
| 46 |
class ContextualizerLayer(nn.Module):
|
| 47 |
-
def __init__(self, config: AveyConfig,
|
| 48 |
super().__init__()
|
| 49 |
expanded_dim = config.d_embed * config.expansion_factor
|
| 50 |
self.split_factor = [
|
|
@@ -58,7 +60,7 @@ class ContextualizerLayer(nn.Module):
|
|
| 58 |
self.split_factor[1] -= 1
|
| 59 |
|
| 60 |
self.enricher = nn.Linear(config.d_embed, expanded_dim)
|
| 61 |
-
self.contextualizer = Contextualizer(config,
|
| 62 |
proj_in_features = int(self.split_factor[0] / 2 + self.split_factor[1])
|
| 63 |
self.fuser = nn.Linear(proj_in_features, config.d_embed)
|
| 64 |
|
|
@@ -71,12 +73,12 @@ class ContextualizerLayer(nn.Module):
|
|
| 71 |
|
| 72 |
|
| 73 |
class AveyLayer(nn.Module):
|
| 74 |
-
def __init__(self, config: AveyConfig,
|
| 75 |
super().__init__()
|
| 76 |
self.rms_norm = nn.RMSNorm(config.d_embed, eps=config.eps)
|
| 77 |
-
self.ctxt = ContextualizerLayer(config,
|
| 78 |
|
| 79 |
-
@torch.compile
|
| 80 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 81 |
return x + self.ctxt(self.rms_norm(x))
|
| 82 |
|
|
@@ -206,13 +208,12 @@ class AveyPreTrainedModel(PreTrainedModel):
|
|
| 206 |
if module.padding_idx is not None:
|
| 207 |
module.weight.data[module.padding_idx].zero_()
|
| 208 |
|
| 209 |
-
|
| 210 |
class AveyModel(AveyPreTrainedModel):
|
| 211 |
def __init__(self, config: AveyConfig):
|
| 212 |
super().__init__(config)
|
| 213 |
self.config = config
|
| 214 |
self.embeddings = nn.Embedding(config.vocab_size, config.d_embed)
|
| 215 |
-
self.layers = nn.ModuleList([AveyLayer(config, i) for i in range(config.n_layers)])
|
| 216 |
self.ranker = Ranker(config)
|
| 217 |
self.post_init()
|
| 218 |
|
|
|
|
| 12 |
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
| 13 |
from torch.utils.checkpoint import checkpoint
|
| 14 |
|
| 15 |
+
import torch
|
| 16 |
+
|
| 17 |
# torch._dynamo.config.allow_unspec_int_on_nn_module = True
|
| 18 |
|
| 19 |
class Contextualizer(nn.Module):
|
| 20 |
+
def __init__(self, config: AveyConfig, static: bool):
    """Set up the contextualizer.

    Args:
        config: Model configuration; supplies ``eps`` (numerical-stability
            epsilon used by ``cosim``) and ``chunk_size``.
        static: When True, token mixing uses a learned fixed
            ``chunk_size x chunk_size`` spatial projection; otherwise the
            forward path falls back to similarity-based mixing.
    """
    super().__init__()
    self.eps = config.eps
    self.static = static
    # The learned spatial projection is only needed — and only
    # allocated — on the static mixing path.
    if static:
        weight = torch.empty(config.chunk_size, config.chunk_size)
        nn.init.xavier_normal_(weight)
        self.spatial_proj = nn.Parameter(weight)
|
| 27 |
+
|
| 28 |
def cosim(self, embeddings: torch.Tensor) -> torch.Tensor:
|
| 29 |
norm = torch.sqrt(torch.sum(embeddings ** 2, dim=-1, keepdim=True) + self.eps)
|
| 30 |
normalized = embeddings / norm
|
|
|
|
| 34 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 35 |
_, T, _ = x.shape
|
| 36 |
x0, x1 = x.chunk(2, dim=-1)
|
| 37 |
+
if self.static:
|
| 38 |
x0 = self.spatial_proj[:T, :T] @ x0
|
| 39 |
else:
|
| 40 |
sim_scores = self.cosim(x0)
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
class ContextualizerLayer(nn.Module):
|
| 49 |
+
def __init__(self, config: AveyConfig, static: bool):
|
| 50 |
super().__init__()
|
| 51 |
expanded_dim = config.d_embed * config.expansion_factor
|
| 52 |
self.split_factor = [
|
|
|
|
| 60 |
self.split_factor[1] -= 1
|
| 61 |
|
| 62 |
self.enricher = nn.Linear(config.d_embed, expanded_dim)
|
| 63 |
+
self.contextualizer = Contextualizer(config, static)
|
| 64 |
proj_in_features = int(self.split_factor[0] / 2 + self.split_factor[1])
|
| 65 |
self.fuser = nn.Linear(proj_in_features, config.d_embed)
|
| 66 |
|
|
|
|
| 73 |
|
| 74 |
|
| 75 |
class AveyLayer(nn.Module):
    """Pre-norm residual block: ``x + ContextualizerLayer(RMSNorm(x))``."""

    def __init__(self, config: AveyConfig, static: bool):
        """Build the layer.

        Args:
            config: Model configuration; supplies ``d_embed`` and ``eps``.
            static: Forwarded to ``ContextualizerLayer`` to select the
                static vs. similarity-based mixing path.
        """
        super().__init__()
        self.rms_norm = nn.RMSNorm(config.d_embed, eps=config.eps)
        self.ctxt = ContextualizerLayer(config, static)

    @torch.compile
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the contextualizer to the normalized input and add the
        residual (skip) connection."""
        normed = self.rms_norm(x)
        return x + self.ctxt(normed)
|
| 84 |
|
|
|
|
| 208 |
if module.padding_idx is not None:
|
| 209 |
module.weight.data[module.padding_idx].zero_()
|
| 210 |
|
|
|
|
| 211 |
class AveyModel(AveyPreTrainedModel):
|
| 212 |
def __init__(self, config: AveyConfig):
    """Build the Avey backbone: token embeddings, a stack of
    ``AveyLayer`` blocks, and the ``Ranker`` head.

    Args:
        config: Model configuration; supplies ``vocab_size``, ``d_embed``
            and ``n_layers``.
    """
    super().__init__(config)
    self.config = config
    self.embeddings = nn.Embedding(config.vocab_size, config.d_embed)
    # Layers alternate mixing strategies: even-indexed layers
    # (0, 2, ...) use the static path, odd-indexed layers do not.
    self.layers = nn.ModuleList(
        [AveyLayer(config, idx % 2 == 0) for idx in range(config.n_layers)]
    )
    self.ranker = Ranker(config)
    self.post_init()
|
| 219 |
|