Create pipeline/mdlm/model.py
pipeline/mdlm/model.py  ADDED  (+292 -0)
@@ -0,0 +1,292 @@
"""
MDLM — Masked Diffusion Language Model for governed structures.

Architecture:
- Small transformer encoder (4 layers, 128 dim, 4 heads)
- Absorbing-state masking: tokens → <MASK> at rate alpha(t)
- Denoising: predict original token from masked sequence
- Loss: cross-entropy on masked positions (reweighted MLM)

Masking schedules:
A: hierarchical (Tier 1 → Tier 2 → Tier 3+readiness)
B: flat (operators only, no readiness staging)
C: uniform random
D: inverted (CL first, Tier 1 last)

Per PLAN-GHA-002 §4.4: A > B > C > D predicted.
"""

from __future__ import annotations

import math
import random
from enum import Enum

try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    HAS_TORCH = True
except ImportError:
    HAS_TORCH = False

from pipeline.mdlm.tokenizer import (
    VOCAB_SIZE, MASK, PAD, NEVER_MASKED,
    TIER_1_TOKENS, TIER_2_TOKENS, TIER_3_TOKENS,
    get_tier, pad_sequence,
)


class MaskingSchedule(str, Enum):
    """Masking schedule variants for the hierarchical hypothesis test."""
    HIERARCHICAL = "A"  # hierarchical: Tier 1 → Tier 2 → CL+PreAttest
    FLAT = "B"          # flat: operators only, uniform within tiers
    UNIFORM = "C"       # uniform random over all maskable tokens
    INVERTED = "D"      # inverted: CL first, Tier 1 last


if HAS_TORCH:

    class StructureModel(nn.Module):
        """Small transformer for governed structure denoising."""

        def __init__(
            self,
            vocab_size: int = VOCAB_SIZE,
            d_model: int = 128,
            nhead: int = 4,
            num_layers: int = 4,
            max_len: int = 40,
            dropout: float = 0.1,
        ):
            super().__init__()
            self.d_model = d_model
            self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=PAD)
            self.pos_embedding = nn.Embedding(max_len, d_model)
            self.timestep_embedding = nn.Embedding(1000, d_model)  # diffusion timestep

            encoder_layer = nn.TransformerEncoderLayer(
                d_model=d_model, nhead=nhead, dim_feedforward=d_model * 4,
                dropout=dropout, batch_first=True,
            )
            self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
            self.output_proj = nn.Linear(d_model, vocab_size)

        def forward(self, x: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
            """
            x: (batch, seq_len) — token ids with some positions masked
            t: (batch,) — diffusion timestep (0 = clean, T = fully masked)
            Returns: (batch, seq_len, vocab_size) — logits for each position
            """
            B, L = x.shape
            positions = torch.arange(L, device=x.device).unsqueeze(0).expand(B, -1)

            h = self.embedding(x) + self.pos_embedding(positions)
            h = h + self.timestep_embedding(t).unsqueeze(1)

            pad_mask = (x == PAD)
            h = self.transformer(h, src_key_padding_mask=pad_mask)
            return self.output_proj(h)


    def apply_mask(
        tokens: torch.Tensor,
        mask_rate: float,
        schedule: MaskingSchedule,
        timestep: int = 0,
        total_timesteps: int = 100,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Apply masking schedule to a batch of token sequences.

        Returns:
            masked_tokens: tokens with some positions replaced by MASK
            mask_positions: boolean tensor (True = was masked)
        """
        B, L = tokens.shape
        masked = tokens.clone()
        mask_positions = torch.zeros(B, L, dtype=torch.bool, device=tokens.device)

        for b in range(B):
            for i in range(L):
                tok = tokens[b, i].item()
                if tok in NEVER_MASKED:
                    continue

                tier = get_tier(tok)
                if tier == 0:
                    continue

                # Compute per-tier mask probability based on schedule
                p = _tier_mask_prob(tier, mask_rate, schedule, timestep, total_timesteps)

                if random.random() < p:
                    masked[b, i] = MASK
                    mask_positions[b, i] = True

        return masked, mask_positions


    def _tier_mask_prob(
        tier: int,
        base_rate: float,
        schedule: MaskingSchedule,
        timestep: int,
        total_timesteps: int,
    ) -> float:
        """Compute mask probability for a token based on its tier and the schedule."""
        t_frac = timestep / max(total_timesteps, 1)  # 0 = clean, 1 = fully masked

        if schedule == MaskingSchedule.UNIFORM:
            return base_rate

        if schedule == MaskingSchedule.HIERARCHICAL:
            # Tier 1: masked last, unmasked first
            # Tier 3 (CL+PreAttest): masked first, unmasked last
            if tier == 1:
                return base_rate * max(0.0, (t_frac - 0.66) / 0.34) if t_frac > 0.66 else 0.0
            elif tier == 2:
                return base_rate * max(0.0, (t_frac - 0.33) / 0.34) if t_frac > 0.33 else 0.0
            else:  # tier 3
                return base_rate * min(1.0, t_frac / 0.33)

        if schedule == MaskingSchedule.FLAT:
            # Same staging as schedule A, but witness tokens take tier 2 priority
            if tier == 1:
                return base_rate * max(0.0, (t_frac - 0.66) / 0.34) if t_frac > 0.66 else 0.0
            elif tier == 2:
                return base_rate * max(0.0, (t_frac - 0.33) / 0.34) if t_frac > 0.33 else 0.0
            else:
                return base_rate * min(1.0, t_frac / 0.33)

        if schedule == MaskingSchedule.INVERTED:
            # Inverted: Tier 1 masked first
            if tier == 1:
                return base_rate * min(1.0, t_frac / 0.33)
            elif tier == 2:
                return base_rate * max(0.0, (t_frac - 0.33) / 0.34) if t_frac > 0.33 else 0.0
            else:
                return base_rate * max(0.0, (t_frac - 0.66) / 0.34) if t_frac > 0.66 else 0.0

        return base_rate


    def compute_loss(
        model: StructureModel,
        batch: torch.Tensor,
        schedule: MaskingSchedule,
        timestep: int,
        total_timesteps: int = 100,
        mask_rate: float = 0.5,
    ) -> torch.Tensor:
        """Compute MDLM denoising loss on a batch.

        Loss = cross-entropy on masked positions only.
        Returns zero loss if no positions were masked (avoids NaN).
        """
        device = next(model.parameters()).device
        batch = batch.to(device)
        t_tensor = torch.full((batch.size(0),), timestep, dtype=torch.long, device=device)

        masked, mask_pos = apply_mask(batch, mask_rate, schedule, timestep, total_timesteps)

        # If nothing was masked, return zero loss
        if not mask_pos.any():
            return torch.tensor(0.0, device=device, requires_grad=True)

        logits = model(masked, t_tensor)

        # Loss only on masked positions
        loss = F.cross_entropy(
            logits[mask_pos],
            batch[mask_pos],
            ignore_index=PAD,
        )
        return loss


    def generate(
        model: StructureModel,
        num_samples: int,
        seq_len: int,
        schedule: MaskingSchedule,
        total_timesteps: int = 50,
        g_slots: int = 3,
        s_slots: int = 4,
        f_slots: int = 3,
    ) -> torch.Tensor:
        """Generate governed structures by template-guided iterative unmasking.

        The channel_b frame is IMPOSED (governance), not learned:
            <BOS> <G> [MASK slots] </G> <S> [MASK slots] </S> <F> [MASK slots] </F>
            [witness MASK slots] <EOS>

        The model fills in operator tokens and witness attestation status.
        This respects PROPOSE ≠ PROMOTE: the frame is governance,
        the content is what the kernel crystallizes.

        g_slots, s_slots, f_slots: number of operator MASK slots per modality.
        Should match the corpus distribution.
        """
        device = next(model.parameters()).device
        from pipeline.mdlm.tokenizer import (
            BOS, EOS, G_OPEN, G_CLOSE, S_OPEN, S_CLOSE, F_OPEN, F_CLOSE,
            WIT_OFFSET, ATTESTED,
        )

        # Build template with configurable slot counts
        template = [BOS, G_OPEN] + [MASK] * g_slots + [G_CLOSE,
                    S_OPEN] + [MASK] * s_slots + [S_CLOSE,
                    F_OPEN] + [MASK] * f_slots + [F_CLOSE]
        # 7 witness pairs: WIT_TOKEN MASK
        for w in range(7):
            template.extend([WIT_OFFSET + w, MASK])
        template.append(EOS)

        # Pad to seq_len
        while len(template) < seq_len:
            template.append(PAD)
        template = template[:seq_len]

        samples = torch.tensor([template] * num_samples, dtype=torch.long, device=device)

        model.eval()
        with torch.no_grad():
            for step in range(total_timesteps, -1, -1):
                t_tensor = torch.full((num_samples,), step, dtype=torch.long, device=device)
                logits = model(samples, t_tensor)
                probs = F.softmax(logits, dim=-1)

                t_frac = step / total_timesteps

                for b in range(num_samples):
                    for i in range(seq_len):
                        if samples[b, i].item() != MASK:
                            continue

                        pred = torch.multinomial(probs[b, i], 1).item()
                        tier = get_tier(pred)

                        # Tier-based unmasking schedule
                        should_unmask = False
                        if schedule == MaskingSchedule.HIERARCHICAL:
                            should_unmask = (tier == 1 and t_frac >= 0.66) or \
                                            (tier == 2 and 0.33 <= t_frac < 0.66) or \
                                            (tier == 3 and t_frac < 0.33) or \
                                            (step == 0)  # unmask everything at final step
                        else:
                            should_unmask = True

                        if should_unmask:
                            samples[b, i] = pred

        # Final pass: force-unmask any remaining MASK tokens
        remaining = (samples == MASK)
        if remaining.any():
            t_tensor = torch.zeros((num_samples,), dtype=torch.long, device=device)
            logits = model(samples, t_tensor)
            for b in range(num_samples):
                for i in range(seq_len):
                    if samples[b, i].item() == MASK:
                        samples[b, i] = logits[b, i].argmax().item()

        return samples
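
Reviewer note: below is a minimal driver sketch showing how this module's pieces (StructureModel, compute_loss, generate) fit together; it is not part of the commit. It assumes `batches` is a list of (B, 40) LongTensor token-id batches produced by pipeline.mdlm.tokenizer, and the optimizer, learning rate, epoch count, and per-batch timestep sampling are illustrative placeholders, not anything the file prescribes.

# Illustrative training/sampling driver for pipeline/mdlm/model.py (sketch, not part of the diff).
# Assumes `batches` is a list of (B, 40) LongTensor token-id batches from pipeline.mdlm.tokenizer;
# optimizer choice and hyperparameters below are placeholders.
import random
import torch
from pipeline.mdlm.model import StructureModel, MaskingSchedule, compute_loss, generate

TOTAL_TIMESTEPS = 100

def train_and_sample(batches: list[torch.Tensor], schedule: MaskingSchedule,
                     epochs: int = 5) -> torch.Tensor:
    """Train a fresh StructureModel under one masking schedule, then sample from it."""
    model = StructureModel(max_len=40)
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
    for _ in range(epochs):
        model.train()
        for batch in batches:
            # Random diffusion timestep per batch; higher t means more positions get masked.
            t = random.randint(1, TOTAL_TIMESTEPS)
            loss = compute_loss(model, batch, schedule, timestep=t,
                                total_timesteps=TOTAL_TIMESTEPS, mask_rate=0.5)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # Template-guided reverse diffusion: the governance frame is imposed, slots are filled in.
    return generate(model, num_samples=16, seq_len=40,
                    schedule=schedule, total_timesteps=50)

# Compare schedules A-D (PLAN-GHA-002 §4.4 predicts A > B > C > D).
# `batches` must be supplied by the corpus pipeline, e.g.:
# for schedule in MaskingSchedule:
#     samples = train_and_sample(batches, schedule)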