Raras-AI
/

gemeo-sus

@@ -50,6 +50,10 @@ class CDFv13Config:
     use_swiglu: bool = True
     use_rmsnorm: bool = True
     tie_embeddings: bool = True
     # Diffusion forcing
     cond_dropout: float = 0.10
     # KG conditioning (GATED adapters)
@@ -187,6 +191,14 @@ class CDFv13Block(nn.Module):
         self.norm2 = norm_cls(cfg.d_model)
         self.qkv = nn.Linear(cfg.d_model, 3 * cfg.d_model, bias=False)
         self.proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
         if cfg.use_swiglu:
             self.mlp = SwiGLU(cfg.d_model, cfg.ffn, cfg.dropout)
         else:
@@ -204,12 +216,21 @@ class CDFv13Block(nn.Module):
             self.kg_xattn = GatedKGCrossAttention(
                 cfg.d_model, cfg.kg_dim, cfg.n_heads, cfg.dropout)
-    def forward(self, x, attn_mask, kg_raw=None):
         B, T, D = x.shape
         # MSA
         h = self.norm1(x)
         qkv = self.qkv(h).reshape(B, T, 3, self.cfg.n_heads, self.head_dim)
         q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)
         q, k = self.rope(q, k, T)
         out = F.scaled_dot_product_attention(
             q, k, v,
@@ -217,12 +238,17 @@ class CDFv13Block(nn.Module):
             dropout_p=self.cfg.dropout if self.training else 0.0,
         )
         out = out.transpose(1, 2).reshape(B, T, D)
-        x = x + self.dropout(self.proj(out))
         # Gated KG cross-attn (if enabled at this layer)
         if self.use_kg_in_layer and kg_raw is not None:
             x = self.kg_xattn(x, kg_raw)
         # MLP
-        x = x + self.mlp(self.norm2(x))
         return x
@@ -258,7 +284,15 @@ class CDFv13Transformer(nn.Module):
         # Block-causal mask buffer
         T = c.max_seq_len
         block_id = torch.arange(T) // c.block_size
-        mask = block_id.unsqueeze(0) < block_id.unsqueeze(1)
         self.register_buffer("block_mask", mask, persistent=False)
         # Init
@@ -270,14 +304,25 @@ class CDFv13Transformer(nn.Module):
             if m.bias is not None: nn.init.zeros_(m.bias)
         elif isinstance(m, nn.Embedding):
             nn.init.normal_(m.weight, mean=0.0, std=0.02)
     def forward(self, x, sigma, cond, kg_raw=None):
         B, T = x.shape
-        h = self.tok_emb(x) + self.sigma_emb(sigma) + self.cond_emb(cond).unsqueeze(1)
         h = self.emb_dropout(h)
         mask = self.block_mask[:T, :T]
         for blk in self.blocks:
-            h = blk(h, mask, kg_raw=kg_raw)
         h = self.final_norm(h)
         return self.head(h)
@@ -312,3 +357,52 @@ class CDFv13Transformer(nn.Module):
         ).reshape(B, T)
         n = corrupt.float().sum().clamp(min=1.0)
         return (ce * corrupt.float()).sum() / n

     use_swiglu: bool = True
     use_rmsnorm: bool = True
     tie_embeddings: bool = True
+    # SOTA upgrades (opt-in; default off keeps backward-compat with v13 checkpoints)
+    use_qk_norm: bool = False      # RMSNorm on Q,K per head before RoPE (Gemma2/3-style)
+    use_adaln: bool = False        # AdaLN-Zero (DiT/SD3) per-token sigma+cond conditioning
+    bidirectional: bool = False    # full attention (pure masked diffusion); else block-causal
     # Diffusion forcing
     cond_dropout: float = 0.10
     # KG conditioning (GATED adapters)
         self.norm2 = norm_cls(cfg.d_model)
         self.qkv = nn.Linear(cfg.d_model, 3 * cfg.d_model, bias=False)
         self.proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+        self.head_dim = cfg.d_model // cfg.n_heads
+        # QK-norm: per-head RMSNorm on Q,K before RoPE (stabilises attn logits)
+        if cfg.use_qk_norm:
+            self.q_norm = RMSNorm(self.head_dim)
+            self.k_norm = RMSNorm(self.head_dim)
+        # AdaLN-Zero: per-token modulation (shift/scale/gate) for MSA + MLP
+        if cfg.use_adaln:
+            self.adaln = nn.Sequential(nn.SiLU(), nn.Linear(cfg.d_model, 6 * cfg.d_model, bias=True))
         if cfg.use_swiglu:
             self.mlp = SwiGLU(cfg.d_model, cfg.ffn, cfg.dropout)
         else:
             self.kg_xattn = GatedKGCrossAttention(
                 cfg.d_model, cfg.kg_dim, cfg.n_heads, cfg.dropout)
+    def forward(self, x, attn_mask, kg_raw=None, cond_vec=None):
+        """Run one block: self-attention, optional gated KG cross-attention, MLP.
+
+        Args:
+            x: activations, unpacked below as (B, T, D).
+            attn_mask: attention mask supplied by the caller. It is not
+                referenced in the lines visible in this patch view.
+            kg_raw: optional knowledge-graph features; the cross-attention
+                step only runs when this is given and the layer has KG enabled.
+            cond_vec: optional conditioning input for the AdaLN projection;
+                ignored unless cfg.use_adaln is set.
+
+        Returns:
+            The updated activations `x` after the residual branches.
+        """
         B, T, D = x.shape
+        # AdaLN-Zero modulation (per-token shift/scale/gate) from sigma+cond
+        if self.cfg.use_adaln and cond_vec is not None:
+            # One projection yields six equal chunks along the last dim:
+            # (shift, scale, gate) for attention, then the same for the MLP.
+            sh_msa, sc_msa, g_msa, sh_mlp, sc_mlp, g_mlp = self.adaln(cond_vec).chunk(6, dim=-1)
+        else:
+            # No modulation: every `is not None` check below takes the plain path.
+            sh_msa = sc_msa = g_msa = sh_mlp = sc_mlp = g_mlp = None
         # MSA
         h = self.norm1(x)
+        if sc_msa is not None:
+            # Scale is applied as (1 + scale), so a zero scale/shift is a no-op.
+            h = h * (1 + sc_msa) + sh_msa
+        # Fused QKV: (B, T, 3*D) -> (B, T, 3, heads, head_dim), then split into
+        # q, k, v each laid out as (B, heads, T, head_dim).
         qkv = self.qkv(h).reshape(B, T, 3, self.cfg.n_heads, self.head_dim)
         q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)
+        if self.cfg.use_qk_norm:
+            # Normalise over head_dim before the rotary embedding is applied.
+            q = self.q_norm(q); k = self.k_norm(k)
         q, k = self.rope(q, k, T)
+        # NOTE(review): the patch hunks are not contiguous after `q, k, v,`, so
+        # the argument that forwards attn_mask is presumably outside this view.
+        # The mask buffer is documented as True => BLOCKED, whereas a boolean
+        # attn_mask in scaled_dot_product_attention means True => may attend,
+        # so the hidden line must invert it - TODO confirm.
         out = F.scaled_dot_product_attention(
             q, k, v,
             dropout_p=self.cfg.dropout if self.training else 0.0,
         )
+        # (B, heads, T, head_dim) -> (B, T, D): merge the heads back together.
         out = out.transpose(1, 2).reshape(B, T, D)
+        attn_out = self.dropout(self.proj(out))
+        # Residual add; with AdaLN the branch is multiplied by its gate first.
+        x = x + (g_msa * attn_out if g_msa is not None else attn_out)
         # Gated KG cross-attn (if enabled at this layer)
         if self.use_kg_in_layer and kg_raw is not None:
+            # The adapter's output replaces x outright; any residual connection
+            # is presumably inside GatedKGCrossAttention - TODO confirm.
             x = self.kg_xattn(x, kg_raw)
         # MLP
+        h2 = self.norm2(x)
+        if sc_mlp is not None:
+            h2 = h2 * (1 + sc_mlp) + sh_mlp
+        mlp_out = self.mlp(h2)
+        # Same gated-residual pattern as the attention branch.
+        x = x + (g_mlp * mlp_out if g_mlp is not None else mlp_out)
         return x
         # Block-causal mask buffer
         T = c.max_seq_len
         block_id = torch.arange(T) // c.block_size
+        # Block-causal (Diffusion Forcing): a query may attend to its own block and
+        # all EARLIER blocks; future blocks are masked. mask[i,j]=True => BLOCKED.
+        # (Fixes a prior inverted mask that blocked the past instead of the future.)
+        # Set cfg.bidirectional=True for full bidirectional attention (pure masked
+        # diffusion / gap-fill), which disables the causal mask entirely.
+        if getattr(c, "bidirectional", False):
+            mask = torch.zeros(T, T, dtype=torch.bool)
+        else:
+            mask = block_id.unsqueeze(0) > block_id.unsqueeze(1)
         self.register_buffer("block_mask", mask, persistent=False)
         # Init
             if m.bias is not None: nn.init.zeros_(m.bias)
         elif isinstance(m, nn.Embedding):
             nn.init.normal_(m.weight, mean=0.0, std=0.02)
+        # AdaLN-Zero: zero the modulation output so each block starts as identity
+        if self.cfg.use_adaln:
+            for blk in self.blocks:
+                nn.init.zeros_(blk.adaln[-1].weight)
+                nn.init.zeros_(blk.adaln[-1].bias)
     def forward(self, x, sigma, cond, kg_raw=None):
+        """Embed the tokens, run every block, and project through the output head.
+
+        Args:
+            x: token ids, unpacked below as (B, T).
+            sigma: noise levels fed to `sigma_emb`; the loss in this file
+                passes one value per token, i.e. (B, T).
+            cond: conditioning input fed to `cond_emb`; its embedding gets a
+                new axis at dim 1 so it broadcasts over the sequence.
+            kg_raw: optional knowledge-graph features handed to every block.
+
+        Returns:
+            The output of `self.head` on the final-normalised activations.
+        """
         B, T = x.shape
+        cond_vec = None
+        if self.cfg.use_adaln:
+            # AdaLN path: conditioning enters via per-token modulation, not additive
+            cond_vec = self.sigma_emb(sigma) + self.cond_emb(cond).unsqueeze(1)
+            h = self.tok_emb(x)
+        else:
+            # Legacy path: noise level and condition are summed into the
+            # token embeddings, and the blocks receive cond_vec=None.
+            h = self.tok_emb(x) + self.sigma_emb(sigma) + self.cond_emb(cond).unsqueeze(1)
         h = self.emb_dropout(h)
+        # The buffer is registered from a mask built for max_seq_len; crop it
+        # to the current length.
         mask = self.block_mask[:T, :T]
         for blk in self.blocks:
+            h = blk(h, mask, kg_raw=kg_raw, cond_vec=cond_vec)
         h = self.final_norm(h)
         return self.head(h)
         ).reshape(B, T)
         n = corrupt.float().sum().clamp(min=1.0)
         return (ce * corrupt.float()).sum() / n
+    @staticmethod
+    def recurrence_weights(x_clean, struct_ids, lam: float = 0.25, w_min: float = 0.02):
+        """RAVEN recurrence-aware weights (Rajamohan et al., arXiv 2603.24562).
+        w[i,t] = max(lam ** count, w_min), where `count` is the number of prior
+        occurrences of token x[i,t] earlier in patient i's sequence. First
+        occurrences get full weight; repeats decay geometrically toward w_min.
+        Structural tokens get weight 0. Vectorized (no Python Counter loop).
+        Returns a (B, T) float tensor on x_clean.device.
+
+        Args:
+            x_clean: token ids, unpacked below as (B, T).
+            struct_ids: collection of structural token ids; it is truth-tested,
+                sorted, and turned into a tensor, so pass a set or list. A
+                falsy value (None or empty) skips the zeroing step.
+            lam: decay base applied once per prior occurrence.
+            w_min: floor for the decayed weight of non-structural tokens.
+
+        Note:
+            The pairwise comparison materialises a (B, T, T) boolean tensor,
+            so memory grows quadratically with sequence length.
+        """
+        B, T = x_clean.shape
+        device = x_clean.device
+        # prior-occurrence count per position via equality-with-earlier-positions
+        eq = (x_clean.unsqueeze(2) == x_clean.unsqueeze(1))      # (B,T,T): eq[b,t,s] = x[b,t]==x[b,s]
+        # Strictly-lower triangle (diagonal=-1) so a position never counts itself.
+        earlier = torch.tril(torch.ones(T, T, device=device), diagonal=-1).bool()  # [t,s]=True if s<t
+        count = (eq & earlier.unsqueeze(0)).sum(dim=2).float()   # (B,T): #earlier positions s<t with same token
+        # count == 0 gives lam ** 0 == 1, i.e. full weight on first occurrences.
+        w = torch.clamp(lam ** count, min=w_min)
+        # NOTE(review): no padding mask is applied here; if sequences are
+        # padded, pad positions keep a non-zero weight unless the pad id is
+        # listed in struct_ids - confirm with the caller.
+        if struct_ids:
+            sid = torch.tensor(sorted(struct_ids), device=device)
+            # Compare every token against every structural id, reduce with any().
+            is_struct = (x_clean.unsqueeze(-1) == sid).any(-1)
+            # Applied after the clamp, so the w_min floor does not apply to these.
+            w = w.masked_fill(is_struct, 0.0)
+        return w
+    def recurrence_aware_loss(self, x_clean, cond, struct_ids, kg_raw=None,
+                              lam: float = 0.25, w_min: float = 0.02,
+                              mode: str = "uniform") -> torch.Tensor:
+        """Diffusion-forcing loss reweighted by RAVEN recurrence decay — the
+        objective that makes GEMEO predict NOVEL events, not repeats. This is the
+        loss used to train the released `gemeo-sus` flagship.
+
+        Args:
+            x_clean: uncorrupted token ids, unpacked below as (B, T); also the
+                cross-entropy targets.
+            cond: per-sample conditioning; replaced by zeros for dropped
+                samples. Assumes shape (B,) so the (B,) drop mask lines up -
+                TODO confirm.
+            struct_ids: structural token ids, forwarded to `recurrence_weights`.
+            kg_raw: optional knowledge-graph features; scaled by a (B, 1, 1)
+                keep mask, so presumably 3-D with batch first - TODO confirm.
+            lam: decay base, forwarded to `recurrence_weights`.
+            w_min: weight floor, forwarded to `recurrence_weights`.
+            mode: "logit_normal" samples sigma as sigmoid(N(0, 1)); any other
+                value falls back to uniform sampling.
+
+        Returns:
+            Scalar tensor: weighted cross-entropy over corrupted positions.
+        """
+        B, T = x_clean.shape
+        device = x_clean.device
+        # Conditioning dropout: with probability cfg.cond_dropout a sample's
+        # condition is swapped for zeros.
+        drop = torch.rand(B, device=device) < self.cfg.cond_dropout
+        cond = torch.where(drop, torch.zeros_like(cond), cond)
+        if kg_raw is not None:
+            # Independent draw from the one above, same probability.
+            drop_kg = (torch.rand(B, device=device) < self.cfg.cond_dropout).float()
+            kg_raw = kg_raw * (1 - drop_kg).reshape(B, 1, 1)
+        # A separate noise level per token, clamped away from 0 and 1.
+        if mode == "logit_normal":
+            sigma = torch.sigmoid(torch.randn(B, T, device=device)).clamp(0.01, 0.99)
+        else:
+            sigma = torch.rand(B, T, device=device).clamp(0.01, 0.99)
+        # Each token is replaced by the mask token with probability sigma.
+        corrupt = torch.rand(B, T, device=device) < sigma
+        x_noisy = torch.where(corrupt, self.cfg.mask_token, x_clean)
+        # NOTE(review): calling self.forward directly bypasses nn.Module hooks;
+        # confirm this is intended rather than self(...).
+        logits = self.forward(x_noisy, sigma, cond, kg_raw=kg_raw)
+        # Per-position CE against the clean tokens, reshaped back to (B, T).
+        ce = F.cross_entropy(
+            logits.reshape(-1, self.cfg.vocab_size), x_clean.reshape(-1),
+            reduction="none").reshape(B, T)
+        # Only corrupted positions contribute; uncorrupted ones get weight 0.
+        w = self.recurrence_weights(x_clean, struct_ids, lam, w_min) * corrupt.float()
+        # NOTE(review): the weights are fractional, so the min=1.0 floor is
+        # more than a divide-by-zero guard - whenever the total weight is below
+        # 1 the loss is divided by 1 instead of the true sum. Confirm intended.
+        return (ce * w).sum() / w.sum().clamp(min=1.0)