Add TMT source code for direct installation from HuggingFace

Files changed (8)

tmt/model/attention.py CHANGED Viewed

@@ -11,7 +11,7 @@ Formula: attn = softmax(QK^T / sqrt(d)) * sigmoid(W_decay * temporal_distance)
 from __future__ import annotations
 import math
-from typing import Optional, Tuple
 import torch
 import torch.nn as nn
@@ -89,10 +89,10 @@ class MeshAttention(nn.Module):
             dst_local = dst_global % S
             mask[b_idx, src_local, dst_local] = edge_weight.float()
-        # Also allow causal self (diagonal) so every token has at least itself
-        diag_mask = torch.zeros(S, S, device=x.device)
-        diag_mask.fill_diagonal_(0.0)
-        mask = mask + diag_mask.unsqueeze(0)
         # Apply graph mask
         scores = scores + mask.unsqueeze(1)  # broadcast over heads

 from __future__ import annotations
 import math
+from typing import Optional
 import torch
 import torch.nn as nn
             dst_local = dst_global % S
             mask[b_idx, src_local, dst_local] = edge_weight.float()
+        # Unmask the diagonal so that every token can always attend to itself.
+        # Assign 0 directly rather than adding: adding 0 to a -inf entry leaves it -inf.
+        diag_idx = torch.arange(S, device=x.device)
+        mask[:, diag_idx, diag_idx] = 0.0
         # Apply graph mask
         scores = scores + mask.unsqueeze(1)  # broadcast over heads

tmt/model/config.py CHANGED Viewed

@@ -5,7 +5,7 @@ Novel vs standard: a single config surface that governs dynamic graph topology
 (graph_k), per-token adaptive depth (exit_threshold), temporal decay rate, and
 the dual-stream FFN — none of which exist in vanilla transformer configs.
 """
-from dataclasses import dataclass, field
 @dataclass

 (graph_k), per-token adaptive depth (exit_threshold), temporal decay rate, and
 the dual-stream FFN — none of which exist in vanilla transformer configs.
 """
+from dataclasses import dataclass
 @dataclass

tmt/model/ffn.py CHANGED Viewed

@@ -11,7 +11,6 @@ from __future__ import annotations
 import torch
 import torch.nn as nn
-from einops import rearrange
 from torch import Tensor
 from .config import TMTConfig

 import torch
 import torch.nn as nn
 from torch import Tensor
 from .config import TMTConfig

tmt/model/memory.py CHANGED Viewed

@@ -53,7 +53,6 @@ class MemoryAnchorCross(nn.Module):
             memory_state: (M, D) updated memory anchors (detached for logging)
         """
         B, S, D = x.shape
-        M = self.n_anchors
         scale = self.d_head ** -0.5
         # Queries come from tokens, Keys/Values from memory anchors
@@ -75,13 +74,14 @@ class MemoryAnchorCross(nn.Module):
         out = rearrange(out, "b h s d -> b s (h d)")
         out = self.out_proj(out)
-        # EMA update of memory anchors using mean token representation
-        with torch.no_grad():
-            token_mean = x.mean(dim=1).mean(dim=0)  # (D,) across batch
-            self.memory.data = (
-                self.ema_alpha * self.memory.data
-                + (1 - self.ema_alpha) * token_mean.unsqueeze(0)
-            )
         return out, self.memory.detach()

             memory_state: (M, D) updated memory anchors (detached for logging)
         """
         B, S, D = x.shape
         scale = self.d_head ** -0.5
         # Queries come from tokens, Keys/Values from memory anchors
         out = rearrange(out, "b h s d -> b s (h d)")
         out = self.out_proj(out)
+        # EMA update only during training — eval must be deterministic
+        if self.training:
+            with torch.no_grad():
+                token_mean = x.mean(dim=1).mean(dim=0)  # (D,) across batch
+                self.memory.data = (
+                    self.ema_alpha * self.memory.data
+                    + (1 - self.ema_alpha) * token_mean.unsqueeze(0)
+                )
         return out, self.memory.detach()

tmt/model/mesh.py CHANGED Viewed

@@ -36,8 +36,6 @@ def build_mesh(
                     global node indices (0 … B*S-1).
         edge_weight:(E,) cosine similarity of each edge.
     """
-    N = batch_size * seq_len  # total nodes
     # Normalise for cosine similarity
     x_norm = F.normalize(x, p=2, dim=-1)  # (N, D)

                     global node indices (0 … B*S-1).
         edge_weight:(E,) cosine similarity of each edge.
     """
     # Normalise for cosine similarity
     x_norm = F.normalize(x, p=2, dim=-1)  # (N, D)

tmt/model/model.py CHANGED Viewed

@@ -9,7 +9,7 @@ intermediate diagnostic tensors (exit_masks, graph edges, memory state).
 """
 from __future__ import annotations
-from dataclasses import dataclass, field
 from typing import List, Optional, Tuple
 import torch

 """
 from __future__ import annotations
+from dataclasses import dataclass
 from typing import List, Optional, Tuple
 import torch

tmt/training/loss.py CHANGED Viewed

@@ -48,7 +48,7 @@ def compute_loss(
     # Exit gate auxiliary: encourage decisiveness
     # Loss = -E[|conf - 0.5|] — penalise uncertainty
-    gate_loss = torch.zeros(1, device=logits.device)
     for conf in confidences:
         gate_loss = gate_loss + -(conf - 0.5).abs().mean()
     gate_loss = gate_loss / max(len(confidences), 1)

     # Exit gate auxiliary: encourage decisiveness
     # Loss = -E[|conf - 0.5|] — penalise uncertainty
+    gate_loss = torch.zeros((), device=logits.device)
     for conf in confidences:
         gate_loss = gate_loss + -(conf - 0.5).abs().mean()
     gate_loss = gate_loss / max(len(confidences), 1)

tmt/training/trainer.py CHANGED Viewed

@@ -7,12 +7,11 @@ Logs: train loss, val perplexity, exit rate per layer, and memory anchor norms.
 from __future__ import annotations
 import os
-from dataclasses import dataclass, field
 from typing import Optional
 import torch
 import torch.nn as nn
-from torch import Tensor
 from torch.optim import AdamW
 from torch.utils.data import DataLoader

 from __future__ import annotations
 import os
+from dataclasses import dataclass
 from typing import Optional
 import torch
 import torch.nn as nn
 from torch.optim import AdamW
 from torch.utils.data import DataLoader