zirobtc committed on
Commit
3c212d2
·
verified ·
1 Parent(s): 0b85347

Upload 2 files

Files changed (2)
  1. models/diffloss.py +308 -0
  2. models/llama_model.py +1894 -0
models/diffloss.py ADDED
@@ -0,0 +1,308 @@
+ # models/diffloss.py
+
+ import math
+ import torch
+ import torch.nn as nn
+ from torch.utils.checkpoint import checkpoint
+ from models.diffusion import create_diffusion
+
+
+ # ---------------- utils ----------------
+ def modulate(x, shift, scale):
+     return x * (1 + scale) + shift
+
+
+ class TimestepEmbedder(nn.Module):
+     def __init__(self, hidden_size, frequency_embedding_size=256):
+         super().__init__()
+         self.mlp = nn.Sequential(
+             nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+             nn.SiLU(),
+             nn.Linear(hidden_size, hidden_size, bias=True),
+         )
+         self.frequency_embedding_size = frequency_embedding_size
+
+     @staticmethod
+     def timestep_embedding(t, dim, max_period=10000):
+         half = dim // 2
+         freqs = torch.exp(-math.log(max_period) * torch.arange(0, half, dtype=torch.float32) / half).to(t.device)
+         args = t[:, None].float() * freqs[None]
+         emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+         if dim % 2:
+             emb = torch.cat([emb, torch.zeros_like(emb[:, :1])], dim=-1)
+         return emb
+
+     def forward(self, t):
+         return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))
+
+
+ class SinPos1D(nn.Module):
+     def __init__(self, dim):
+         super().__init__()
+         self.dim = dim
+     def forward(self, L, device, dtype):
+         pe = torch.zeros(L, self.dim, device=device, dtype=torch.float32)
+         pos = torch.arange(0, L, device=device, dtype=torch.float32).unsqueeze(1)
+         div = torch.exp(torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) * (-math.log(10000.0) / self.dim))
+         pe[:, 0::2] = torch.sin(pos * div)
+         pe[:, 1::2] = torch.cos(pos * div)
+         return pe.to(dtype)
+
+
+ # --------------- DiT block (causal) ---------------
+ class TemporalDiTBlock(nn.Module):
+     """
+     Transformer block with AdaLN (DiT-style), **causal** self-attention over time.
+     """
+     def __init__(self, dim, n_heads, mlp_ratio=4.0, dropout=0.0):
+         super().__init__()
+         self.dim = dim
+         self.n_heads = n_heads
+         self.norm1 = nn.LayerNorm(dim, eps=1e-6)
+         self.attn = nn.MultiheadAttention(dim, n_heads, dropout=dropout, batch_first=True)
+         self.norm2 = nn.LayerNorm(dim, eps=1e-6)
+         hidden = int(dim * mlp_ratio)
+         self.ffn = nn.Sequential(
+             nn.Linear(dim, 2 * hidden, bias=True),
+             nn.SiLU(),
+             nn.Linear(2 * hidden, dim, bias=True),
+         )
+         # AdaLN params: shift/scale/gate for attn and ffn
+         self.adaLN = nn.Sequential(nn.SiLU(), nn.Linear(dim, 6 * dim, bias=True))
+         nn.init.constant_(self.adaLN[-1].weight, 0)
+         nn.init.constant_(self.adaLN[-1].bias, 0)
+
+     def forward(self, x, y, causal_mask):
+         """
+         x: [B, L, D], y: [B, D], causal_mask: [L, L] bool, True = mask (disallow)
+         """
+         s1, sc1, g1, s2, sc2, g2 = self.adaLN(y).chunk(6, dim=-1)  # [B, D] each
+
+         # attn (causal)
+         h = modulate(self.norm1(x), s1.unsqueeze(1), sc1.unsqueeze(1))
+         # torch's attn expects attn_mask shape [L, L] or [B*nH, L, L]; True means -inf
+         h, _ = self.attn(h, h, h, attn_mask=causal_mask, need_weights=False)
+         x = x + g1.unsqueeze(1) * h
+
+         # ffn
+         h2 = modulate(self.norm2(x), s2.unsqueeze(1), sc2.unsqueeze(1))
+         h2 = self.ffn(h2)
+         x = x + g2.unsqueeze(1) * h2
+         return x
+
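The block above leans on nn.MultiheadAttention's boolean-mask convention (True = the query may not attend to that position). A minimal standalone check of that convention, not part of this commit and assuming PyTorch >= 1.12:

import torch
import torch.nn as nn

attn = nn.MultiheadAttention(embed_dim=16, num_heads=4, batch_first=True)
x = torch.randn(2, 5, 16)                             # [B, L, D]
mask = torch.ones(5, 5, dtype=torch.bool).triu(1)     # True above the diagonal = disallowed
out, w = attn(x, x, x, attn_mask=mask, need_weights=True, average_attn_weights=True)
print(torch.allclose(w.triu(1), torch.zeros_like(w)))  # True: no weight ever lands on future positions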
+
+ class FinalLayer(nn.Module):
+     def __init__(self, dim, out_channels):
+         super().__init__()
+         self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+         self.linear = nn.Linear(dim, out_channels, bias=True)
+         self.adaLN = nn.Sequential(nn.SiLU(), nn.Linear(dim, 2 * dim, bias=True))
+         nn.init.constant_(self.adaLN[-1].weight, 0)
+         nn.init.constant_(self.adaLN[-1].bias, 0)
+         nn.init.constant_(self.linear.weight, 0)
+         nn.init.constant_(self.linear.bias, 0)
+
+     def forward(self, x, c):
+         shift, scale = self.adaLN(c).chunk(2, dim=-1)
+         x = modulate(self.norm(x), shift.unsqueeze(1), scale.unsqueeze(1))
+         return self.linear(x)
+
+
+ # --------------- Temporal DiT (sequence-aware, causal) ---------------
+ class TemporalDiTAdaLN(nn.Module):
+     """
+     DiT-like denoiser that:
+       - operates on [B, L, C]
+       - uses **causal** attention (each position sees only <= t)
+       - accepts (B, L) via set_sequence_layout for flatten↔sequence reshaping
+       - returns all positions but we usually **read only the last token** for streaming
+     """
+     def __init__(self, in_channels, model_channels, out_channels, z_channels, depth, n_heads=8,
+                  mlp_ratio=4.0, grad_checkpointing=False):
+         super().__init__()
+         self.in_channels = in_channels
+         self.model_channels = model_channels
+         self.out_channels = out_channels
+         self.z_channels = z_channels
+         self.depth = depth
+         self.n_heads = n_heads
+         self.grad_checkpointing = grad_checkpointing
+
+         self.time_embed = TimestepEmbedder(model_channels)
+         self.cond_embed = nn.Linear(z_channels, model_channels)
+         self.input_proj = nn.Linear(in_channels, model_channels)
+         self.pos = SinPos1D(model_channels)
+
+         self.blocks = nn.ModuleList([
+             TemporalDiTBlock(model_channels, n_heads=n_heads, mlp_ratio=mlp_ratio)
+             for _ in range(depth)
+         ])
+         self.final = FinalLayer(model_channels, out_channels)
+
+         self._seq_B = None
+         self._seq_L = None
+
+         self._init_weights()
+
+     def _init_weights(self):
+         def _xav(m):
+             if isinstance(m, nn.Linear):
+                 nn.init.xavier_uniform_(m.weight)
+                 if m.bias is not None: nn.init.constant_(m.bias, 0)
+         self.apply(_xav)
+         nn.init.normal_(self.time_embed.mlp[0].weight, std=0.02)
+         nn.init.normal_(self.time_embed.mlp[2].weight, std=0.02)
+
+     def set_sequence_layout(self, B, L):
+         self._seq_B = int(B)
+         self._seq_L = int(L)
+
+     def _flatten_to_seq(self, x_flat, c_flat):
+         if self._seq_B is None or self._seq_L is None:
+             B, L = x_flat.shape[0], 1
+         else:
+             B, L = self._seq_B, self._seq_L
+             assert B * L == x_flat.shape[0], f"set_sequence_layout({B},{L}) mismatch"
+         x = x_flat.view(B, L, -1)
+         c = c_flat.view(B, L, -1)
+         return x, c
+
+     @staticmethod
+     def _causal_mask(L, device):
+         # True where masked (disallowed)
+         m = torch.ones(L, L, device=device, dtype=torch.bool).triu(1)
+         # MultiheadAttention expects float mask with -inf where we mask.
+         # But newer PyTorch also supports bool with True=mask. We'll pass bool here.
+         return m
+
+     def forward(self, x_flat, t, c, cfg_scale: float = 1.0):  # `c` so that model_kwargs=dict(c=...) binds by keyword
+         x, c = self._flatten_to_seq(x_flat, c)  # [B, L, C], [B, L, Cz]
+         B, L, _ = x.shape
+
+         x = self.input_proj(x)
+         pos = self.pos(L, x.device, x.dtype)
+         x = x + pos.unsqueeze(0)
+
+         # pool cond to a single AdaLN vector per batch (like DiT)
+         t_emb = self.time_embed(t).view(B, L, -1).mean(dim=1)  # [B, D]
+         c_emb = self.cond_embed(c).mean(dim=1)  # [B, D]
+         y = t_emb + c_emb
+
+         causal_mask = self._causal_mask(L, x.device)
+
+         if self.grad_checkpointing and not torch.jit.is_scripting():
+             for blk in self.blocks:
+                 x = checkpoint(blk, x, y, causal_mask)
+         else:
+             for blk in self.blocks:
+                 x = blk(x, y, causal_mask)
+
+         out = self.final(x, y)  # [B, L, out_channels]
+         return out.view(B * L, -1)
+
+     def forward_with_cfg(self, x, t, c, cfg_scale):
+         half = x[: len(x) // 2]
+         combined = torch.cat([half, half], dim=0)
+         model_out = self.forward(combined, t, c, cfg_scale=cfg_scale)
+         eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
+         cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
+         guided = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
+         eps = torch.cat([guided, guided], dim=0)
+         return torch.cat([eps, rest], dim=1)
+
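For reference, forward_with_cfg above implements the standard classifier-free guidance rule: the batch is laid out as [conditional half; unconditional half] sharing the same noise, and the guided prediction is eps_uncond + cfg_scale * (eps_cond - eps_uncond), so cfg_scale = 1 reduces to the plain conditional prediction. A tiny numeric illustration with made-up values:

import torch

cond_eps = torch.tensor([1.0, 2.0])
uncond_eps = torch.tensor([0.5, 1.0])
cfg_scale = 4.0
guided = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
print(guided)  # tensor([2.5000, 5.0000]) -- pushed beyond the conditional prediction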
+
+ # --------------- Wrapper (same training API) + streaming helpers ---------------
+ class DiffLoss(nn.Module):
+     """
+     Diffusion loss with **causal, streamable** temporal DiT denoiser.
+     Training API unchanged; plus:
+       - set_sequence_layout(B, L)
+       - sample_next_token(z_seq, temperature=1.0, cfg=1.0) -> [B, C] (last token)
+     """
+     def __init__(self, target_channels, z_channels, depth, width, num_sampling_steps,
+                  grad_checkpointing=False, learn_sigma=False, n_heads=8, mlp_ratio=4.0):
+         super().__init__()
+         self.in_channels = target_channels
+         self.learn_sigma = learn_sigma
+
+         self.net = TemporalDiTAdaLN(
+             in_channels=target_channels,
+             model_channels=width,
+             out_channels=target_channels * 2 if learn_sigma else target_channels,
+             z_channels=z_channels,
+             depth=depth,
+             n_heads=n_heads,
+             mlp_ratio=mlp_ratio,
+             grad_checkpointing=grad_checkpointing
+         )
+
+         self.train_diffusion = create_diffusion(timestep_respacing="", noise_schedule="cosine")
+         self.gen_diffusion = create_diffusion(timestep_respacing=num_sampling_steps, noise_schedule="cosine")
+
+         # cached (B,L) for flatten↔sequence
+         self._B = None
+         self._L = None
+
+     # --- layout for flatten<->sequence ---
+     def set_sequence_layout(self, B, L):
+         self._B, self._L = int(B), int(L)
+         self.net.set_sequence_layout(B, L)
+
+     # --- training ---
+     def forward(self, target, z, mask=None):
+         t = torch.randint(0, self.train_diffusion.num_timesteps, (target.shape[0],), device=target.device)
+         loss_dict = self.train_diffusion.training_losses(self.net, target, t, dict(c=z))
+         loss, pred_xstart = loss_dict["loss"], loss_dict["pred_xstart"]
+         if mask is not None:
+             loss = (loss * mask).sum() / mask.sum()
+         return loss.mean(), pred_xstart
+
+     # --- full sequence sampling (kept for compatibility) ---
+     def sample(self, z, temperature=1.0, cfg=1.0):
+         if cfg != 1.0:
+             noise = torch.randn(z.shape[0] // 2, self.in_channels, device=z.device)
+             noise = torch.cat([noise, noise], dim=0)
+             sample_fn = self.net.forward_with_cfg
+             kwargs = dict(c=z, cfg_scale=cfg)
+         else:
+             noise = torch.randn(z.shape[0], self.in_channels, device=z.device)
+             sample_fn = self.net.forward
+             kwargs = dict(c=z)
+
+         return self.gen_diffusion.p_sample_loop(
+             sample_fn, noise.shape, noise, clip_denoised=False, model_kwargs=kwargs,
+             progress=False, temperature=temperature
+         )
+
+     # --- STREAMING: sample only the **last token** of current window ---
+     @torch.no_grad()
+     def sample_next_token(self, z_seq, temperature=1.0, cfg=1.0):
+         """
+         z_seq: [B, L, Cz] AR conditions for the current streaming window (history + 1 step).
+         Call set_sequence_layout(B, L) first.
+         Returns: next_token: [B, C] (the last position's denoised sample).
+         Mechanism: denoise **entire window** with causal attention and read the last index only.
+         """
+         assert self._B is not None and self._L is not None, "Call set_sequence_layout(B, L) first."
+         B, L, Cz = z_seq.shape
+         assert B == self._B and L == self._L, "z_seq shape must match set_sequence_layout."
+
+         z_flat = z_seq.reshape(B * L, Cz)
+
+         if cfg != 1.0:
+             noise = torch.randn((B * L) // 2, self.in_channels, device=z_seq.device)
+             noise = torch.cat([noise, noise], dim=0)
+             sample_fn = self.net.forward_with_cfg
+             kwargs = dict(c=z_flat, cfg_scale=cfg)
+         else:
+             noise = torch.randn(B * L, self.in_channels, device=z_seq.device)
+             sample_fn = self.net.forward
+             kwargs = dict(c=z_flat)
+
+         x = self.gen_diffusion.p_sample_loop(
+             sample_fn, noise.shape, noise, clip_denoised=False, model_kwargs=kwargs,
+             progress=False, temperature=temperature
+         )  # [B*L, C]
+
+         x_seq = x.view(B, L, self.in_channels)
+         return x_seq[:, -1, :]  # last token only
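A minimal sketch of how the streaming path is meant to be driven. Illustrative only: it assumes models.diffusion.create_diffusion is importable from this repo, and the batch size, window length, and channel widths below are made-up values.

import torch
from models.diffloss import DiffLoss

B, L, C, Cz = 2, 8, 16, 768                    # window of 8 tokens, 16-dim latents, 768-dim conditions
head = DiffLoss(target_channels=C, z_channels=Cz, depth=6, width=512,
                num_sampling_steps='50', n_heads=4, mlp_ratio=2.0)

z_seq = torch.randn(B, L, Cz)                  # AR conditions for the current window
head.set_sequence_layout(B, L)                 # tell the denoiser how to un-flatten [B*L, ...]
next_tok = head.sample_next_token(z_seq, temperature=1.0, cfg=1.0)
print(next_tok.shape)                          # torch.Size([2, 16])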
models/llama_model.py ADDED
@@ -0,0 +1,1894 @@
1
+
2
+ import math
3
+ from dataclasses import dataclass
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch.nn import functional as F
7
+ from typing_extensions import Self
8
+ from typing import Optional
9
+ from transformers.modeling_utils import PreTrainedModel
10
+ from torch.distributions import Categorical
11
+
12
+
13
+ @dataclass
14
+ class LLaMAHFConfig:
15
+ block_size: int = 156
16
+ n_layer: int = 32
17
+ n_head: int = 32
18
+ n_kv_head: Optional[int] = None
19
+ n_embd: int = 4096
20
+ rope_base: int = 500000
21
+ T5_xxl_dim: int = 768
22
+
23
+ @classmethod
24
+ def from_name(cls, name: str) -> Self:
25
+ return cls(**llama_configs[name])
26
+
27
+
28
+ llama_configs = {
29
+ "Normal_size": dict(n_layer=12, n_head=12, n_embd=768)
30
+ }
31
+
32
+
33
+ class LLaMAHF(nn.Module):
34
+ def __init__(self, config: LLaMAHFConfig, num_diffusion_head_layers=6, n_diffusion_heads=4, input_token_dim=16, device=torch.device('cuda'), width=512) -> None:
35
+ super().__init__()
36
+ assert config.block_size is not None
37
+ self.config = config
38
+
39
+ cond_dim = config.T5_xxl_dim
40
+
41
+ self.transformer = nn.ModuleDict(
42
+ dict(
43
+ wte=nn.Linear(input_token_dim, config.n_embd), # vector tokens -> embeddings
44
+ cond_embed=nn.Linear(cond_dim, config.n_embd), # text feature -> context emb
45
+ h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
46
+ ln_f=RMSNorm(config.n_embd),
47
+ )
48
+ )
49
+
50
+ target_channels = input_token_dim
51
+ from models.diffloss import DiffLoss
52
+ self.diff_loss = DiffLoss(
53
+ target_channels=target_channels,
54
+ z_channels=config.n_embd,
55
+ width=width,
56
+ depth=num_diffusion_head_layers,
57
+ num_sampling_steps='50',
58
+ grad_checkpointing=False,
59
+ n_heads=n_diffusion_heads,
60
+ mlp_ratio=2.0
61
+ ).to(device)
62
+
63
+ self.out_proj = nn.Linear(config.n_embd, config.n_embd)
64
+ self.use_out_proj = True
65
+
66
+ # --- Persistent prompt cache & BOS token ---
67
+ self._prompt_cached = False
68
+ self._prompt_bsz = None
69
+ self.bos = nn.Parameter(torch.zeros(1, 1, config.n_embd))
70
+
71
+ # === Needed by several sampling/forward paths ===
72
+ # projects raw text features when they are concatenated as tokens
73
+ self.llama_proj = nn.Linear(config.T5_xxl_dim, config.n_embd)
74
+ # special boundary-of-motion token used in forward_babel
75
+ self.BOM_tag = nn.Parameter(torch.zeros(1, 1, config.n_embd))
76
+
77
+ # (Optional) only if sample_for_eval_classification() is used:
78
+ # self.classify_head = nn.Linear(config.n_embd, num_classes)
79
+
80
+
81
+
82
+ @torch.no_grad()
83
+ def set_prompt(self, feature: torch.Tensor):
84
+ """
85
+ Precompute and cache cross-attention K/V for the current prompt (feature).
86
+ Call this ONCE when you switch prompt (e.g., 'walk' -> 'crawl').
87
+ """
88
+ context = self._prepare_context(feature)
89
+ if context is None:
90
+ raise ValueError("set_prompt: feature cannot be None")
91
+
92
+ self._prompt_bsz = context.size(0)
93
+ for blk in self.transformer.h:
94
+ blk.set_context_cache(context)
95
+ self._prompt_cached = True
96
+
97
+ @torch.no_grad()
98
+ def clear_prompt(self):
99
+ for blk in self.transformer.h:
100
+ blk.clear_context_cache()
101
+ self._prompt_cached = False
102
+ self._prompt_bsz = None
103
+
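A minimal sketch of the intended prompt-cache workflow. Illustrative only: model is an LLaMAHF instance, and t5_encoder / device are hypothetical stand-ins for the caller's text encoder (returning T5_xxl_dim-wide features) and device.

feat = torch.from_numpy(t5_encoder.encode("a person walks forward")).float().to(device)
model.set_prompt(feat)                    # cache cross-attention K/V for this prompt once
logits = model.forward([], feature=None)  # later calls reuse the cached prompt K/V
model.clear_prompt()                      # drop the cache before switching prompts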
104
+ def _prepare_context(self, feature: Optional[torch.Tensor], batch_size: Optional[int] = None) -> Optional[torch.Tensor]:
105
+ if feature is None:
106
+ return None
107
+ if not torch.is_tensor(feature):
108
+ feature = torch.as_tensor(
109
+ feature,
110
+ dtype=self.transformer.cond_embed.weight.dtype,
111
+ device=self.transformer.cond_embed.weight.device,
112
+ )
113
+ else:
114
+ feature = feature.to(
115
+ dtype=self.transformer.cond_embed.weight.dtype,
116
+ device=self.transformer.cond_embed.weight.device,
117
+ )
118
+
119
+ if feature.dim() == 1:
120
+ feature = feature.unsqueeze(0)
121
+
122
+ context = self.transformer.cond_embed(feature)
123
+ if context.dim() == 2:
124
+ context = context.unsqueeze(1)
125
+
126
+ if batch_size is not None and context.size(0) != batch_size:
127
+ if context.size(0) == 1:
128
+ context = context.expand(batch_size, -1, -1)
129
+ else:
130
+ raise ValueError(
131
+ f"Condition batch ({context.size(0)}) does not match token batch ({batch_size})."
132
+ )
133
+ return context
134
+
135
+ def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
136
+ """Tie or clone module weights depending of whether we are using TorchScript or not"""
137
+ output_embeddings.weight = input_embeddings.weight
138
+
139
+ if getattr(output_embeddings, "bias", None) is not None:
140
+ output_embeddings.bias.data = nn.functional.pad(
141
+ output_embeddings.bias.data,
142
+ (
143
+ 0,
144
+ output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],
145
+ ),
146
+ "constant",
147
+ 0,
148
+ )
149
+ if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
150
+ output_embeddings.out_features = input_embeddings.num_embeddings
151
+
152
+ def get_input_embeddings(self):
153
+ return self.transformer.wte
154
+
155
+ def set_input_embeddings(self, value):
156
+ self.transformer.wte = value
157
+
158
+ def get_output_embeddings(self):
159
+ return self.out_proj
160
+
161
+ def set_output_embeddings(self, new_embeddings):
162
+ self.out_proj = new_embeddings
163
+
164
+ def _init_weights(self, module: nn.Module) -> None:
165
+ if isinstance(module, nn.Linear):
166
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02 / math.sqrt(2 * self.config.n_layer))
167
+ elif isinstance(module, nn.Embedding):
168
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02 / math.sqrt(2 * self.config.n_layer))
169
+
170
+
171
+
172
+ def forward_sample(self, idx: torch.Tensor, clip_feature: torch.Tensor, y_mask) -> torch.Tensor:
173
+
174
+ text_length = clip_feature.shape[1]
175
+ context = self._prepare_context(clip_feature)
176
+ if len(idx) == 0:
177
+ x = self.llama_proj(clip_feature)[:, :int(y_mask[0].sum()), :]
178
+ else:
179
+ _, t = idx.size()
180
+ assert (
181
+ t <= self.config.block_size
182
+ ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
183
+ # forward the LLaMA model itself
184
+ x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
185
+ x = torch.cat((self.llama_proj(clip_feature)[:, :int(y_mask[0].sum()), :],x), dim=1)
186
+
187
+ if context is not None and context.size(0) != x.size(0):
188
+ if context.size(0) == 1:
189
+ context = context.expand(x.size(0), -1, -1)
190
+ else:
191
+ raise ValueError("Conditioning batch size does not match token batch size.")
192
+
193
+ for block in self.transformer.h:
194
+ x = block(x, context=context)
195
+ x = self.transformer.ln_f(x)
196
+ logits = x
197
+ return logits
198
+
199
+
200
+
201
+ def sample_for_eval_CFG(self, text, length=196, tokenize_model=None, device=torch.device('cuda'), unit_length=4, cfg=4.0):
202
+ max_token_len = length // unit_length
203
+
204
+ # Prepare conditioned prompt once and cache it
205
+ feat_text = torch.from_numpy(tokenize_model.encode(text)).float().to(device)
206
+ self.set_prompt(feat_text) # <-- persist until you change it
207
+
208
+ # Prepare empty/uncond prompt once and cache it too
209
+ empty_feat_text = torch.from_numpy(tokenize_model.encode('')).float().unsqueeze(0).to(device)
210
+
211
+ # We'll flip between two caches: cond and uncond
212
+ def _use_cond_cache():
213
+ self.set_prompt(feat_text)
214
+
215
+ def _use_uncond_cache():
216
+ self.set_prompt(empty_feat_text)
217
+
218
+ xs = None
219
+ for k in range(max_token_len):
220
+ x = [] if k == 0 else xs
221
+
222
+ # conditioned next-step
223
+ _use_cond_cache()
224
+ conditions = self.forward(x, feature=None)[:, -1, :]
225
+
226
+ # unconditioned next-step
227
+ _use_uncond_cache()
228
+ empty_conditions = self.forward(x, feature=None)[:, -1, :]
229
+
230
+ temperature = 1.0
231
+ if cfg != 1:
232
+ mix_conditions = torch.cat([conditions, empty_conditions], dim=0)
233
+ sampled_token_latent = self.diff_loss.sample(mix_conditions, temperature=temperature, cfg=cfg)
234
+ scaled_logits, _ = sampled_token_latent.chunk(2, dim=0)
235
+ else:
236
+ scaled_logits = self.diff_loss.sample(conditions, temperature=temperature, cfg=1)
237
+
238
+ scaled_logits = scaled_logits.unsqueeze(0)
239
+ xs = scaled_logits if k == 0 else torch.cat((xs, scaled_logits), dim=1)
240
+
241
+ # re-enable the conditioned prompt cache for whatever comes next
242
+ self.set_prompt(feat_text)
243
+ return xs
244
+
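Illustrative call pattern for the sampler above; t5_encoder is a hypothetical text encoder exposing encode(text) -> np.ndarray, which is the interface sample_for_eval_CFG assumes of tokenize_model.

motion_latents = model.sample_for_eval_CFG(
    "a person walks in a circle",
    length=196,                   # the loop runs length // unit_length steps
    tokenize_model=t5_encoder,
    device=torch.device('cuda'),
    unit_length=4,
    cfg=4.0,
)
print(motion_latents.shape)       # [1, length // unit_length, input_token_dim]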
245
+
246
+
247
+ # For inference, sampling can stop early once the distance between the current token and the reference end token falls below the threshold.
248
+ def sample_for_eval_CFG_inference(self, text, length=312, tokenizer=None, device=torch.device('cuda'),
249
+ unit_length=4, reference_end_latent=None, threshold=0.1, cfg=4.0, temperature=1.0):
250
+ max_token_len = length // unit_length
251
+ feat_text = torch.from_numpy(tokenizer.encode(text)).float().to(device)
252
+ empty_feat_text = torch.from_numpy(tokenizer.encode('')).float().unsqueeze(0).to(device)
253
+
254
+ def _use_cond(): self.set_prompt(feat_text)
255
+ def _use_uncond(): self.set_prompt(empty_feat_text)
256
+
257
+ xs = None
258
+ for k in range(max_token_len):
259
+ x = [] if k == 0 else xs
260
+
261
+ _use_cond()
262
+ conditions = self.forward_inference(x, feature=None)[:, -1, :]
263
+
264
+ _use_uncond()
265
+ empty_conditions = self.forward(x, feature=None)[:, -1, :]
266
+
267
+ mix = torch.cat([conditions, empty_conditions], dim=0)
268
+ sampled = self.diff_loss.sample(mix, temperature=temperature, cfg=cfg)
269
+ scaled_logits, _ = sampled.chunk(2, dim=0) if cfg != 1 else (sampled, None)
270
+ scaled_logits = scaled_logits.unsqueeze(0)
271
+
272
+ if reference_end_latent is not None:
273
+ dist = torch.sqrt(torch.sum((scaled_logits - reference_end_latent)**2))
274
+ if dist < threshold: break
275
+
276
+ xs = scaled_logits if k == 0 else torch.cat((xs, scaled_logits), dim=1)
277
+
278
+ # leave the cond cache active
279
+ self.set_prompt(feat_text)
280
+ return xs
281
+
282
+
283
+
284
+ def sample_for_eval_CFG_inference2(self, feat_clip_text, empty_feat_clip_text, if_categorial=False, length=312, clip_model=None, device=torch.device('cuda'), tokenizer='clip', unit_length=4, reference_end_token=None, threshold=3, cfg=4.5, temperature=1.0):
285
+
286
+ import clip
287
+ max_token_len = length // unit_length
288
+
289
+ for k in range(max_token_len):
290
+ if k == 0:
291
+ x = []
292
+ else:
293
+ x = xs
294
+
295
+ try:
296
+ conditions = self.forward(x, feat_clip_text)
297
+ except Exception:  # retry with an explicit batch dimension on the text feature
298
+ conditions = self.forward(x, feat_clip_text.unsqueeze(0))
299
+
300
+
301
+ conditions = conditions[:, -1, :]
302
+
303
+
304
+
305
+ empty_conditions = self.forward(x, empty_feat_clip_text)
306
+ empty_conditions = empty_conditions[:, -1, :]
307
+
308
+ mix_conditions = torch.cat([conditions, empty_conditions], dim=0)
309
+ sampled_token_latent = self.diff_loss.sample(mix_conditions, temperature=temperature, cfg=cfg)
310
+
311
+ # chunk
312
+ if cfg != 1:
313
+ scaled_logits, _ = sampled_token_latent.chunk(2, dim=0)
314
+ else:
315
+ scaled_logits = sampled_token_latent
316
+
317
+ scaled_logits = scaled_logits.unsqueeze(0)
318
+
319
+ if reference_end_token is not None:
320
+ distance_l2 = torch.sqrt(torch.sum((scaled_logits - reference_end_token)**2))
321
+ print(distance_l2)
322
+ if distance_l2 < threshold:
323
+ break
324
+
325
+ if k == 0:
326
+ xs = scaled_logits
327
+ else:
328
+ xs = torch.cat((xs, scaled_logits), dim=1)
329
+
330
+ return xs
331
+
332
+ def sample_for_eval_CFG_inference_next_one(self, current_token=[], feat_clip_text=None, empty_feat_clip_text=None, if_categorial=False, length=312, clip_model=None, device=torch.device('cuda'), tokenizer='clip', unit_length=4, reference_end_token=None, threshold=3, cfg=4.5, temperature=1.0):
333
+
334
+ import clip
335
+ max_token_len = length // unit_length
336
+
337
+
338
+ for k in range(1):
339
+
340
+ if current_token == []:
341
+ x = []
342
+ else:
343
+ x = torch.cat(current_token, dim=1)
344
+
345
+
346
+ try:
347
+ conditions = self.forward(x, feat_clip_text)
348
+ except Exception:  # retry with an explicit batch dimension on the text feature
349
+ conditions = self.forward(x, feat_clip_text.unsqueeze(0))
350
+
351
+
352
+ conditions = conditions[:, -1, :]
353
+
354
+
355
+ empty_conditions = self.forward(x, empty_feat_clip_text)
356
+ empty_conditions = empty_conditions[:, -1, :]
357
+
358
+ mix_conditions = torch.cat([conditions, empty_conditions], dim=0)
359
+ sampled_token_latent = self.diff_loss.sample(mix_conditions, temperature=temperature, cfg=cfg)
360
+
361
+ # chunk
362
+ if cfg != 1:
363
+ scaled_logits, _ = sampled_token_latent.chunk(2, dim=0)
364
+ else:
365
+ scaled_logits = sampled_token_latent
366
+
367
+
368
+ scaled_logits = scaled_logits.unsqueeze(0)
369
+
370
+
371
+ if k == 0:
372
+ xs = scaled_logits
373
+ else:
374
+ xs = torch.cat((xs, scaled_logits), dim=1)
375
+
376
+ return xs
377
+
378
+
379
+ def sample_for_eval_CFG_babel(self, A_text, B_text, A_motion, if_categorial=False, length=6400, clip_model=None, device=torch.device('cuda'), tokenizer='clip', unit_length=4, reference_end_token=None, cfg=7.0, threshold=3):
380
+
381
+ import clip
382
+ B_token_length = length // unit_length - A_motion.shape[0]
383
+
384
+ if tokenizer == 'clip':
385
+ A_text = clip.tokenize(A_text, truncate=True).to(device)
386
+ A_feat_clip_text = clip_model.encode_text(A_text).float()
387
+ B_text = clip.tokenize(B_text, truncate=True).to(device)
388
+ B_feat_clip_text = clip_model.encode_text(B_text).float()
389
+ elif tokenizer == 't5-xxl':
390
+ A_feat_clip_text = torch.from_numpy(clip_model.encode(A_text)).float()
391
+ A_feat_clip_text = A_feat_clip_text.to(device)
392
+ B_feat_clip_text = torch.from_numpy(clip_model.encode(B_text)).float()
393
+ B_feat_clip_text = B_feat_clip_text.to(device)
394
+
395
+ A_text_embeddings = self.transformer.cond_embed(A_feat_clip_text).unsqueeze(0)
396
+ B_text_embeddings = self.transformer.cond_embed(B_feat_clip_text).unsqueeze(0)
397
+
398
+ A_motion = A_motion.unsqueeze(0)
399
+ A_motion_embeddings = self.transformer.wte(A_motion)
400
+ B_motion = torch.tensor([]).to(device)
401
+
402
+ for k in range(B_token_length):
403
+ if k == 0:
404
+ x = torch.cat([A_text_embeddings, A_motion_embeddings, B_text_embeddings], dim=1)
405
+ else:
406
+ x = xs
407
+
408
+
409
+ conditions = self.forward_babel_eval(x)
410
+ conditions = conditions[:, -1, :]
411
+
412
+ empty_clip_text = ''
413
+ if tokenizer == 'clip':
414
+ empty_text = clip.tokenize(empty_clip_text, truncate=True).to(device)
415
+ empty_feat_clip_text = clip_model.encode_text(empty_text).float()
416
+ elif tokenizer == 't5-xxl':
417
+ empty_feat_clip_text = torch.from_numpy(clip_model.encode(empty_clip_text)).float()
418
+ empty_feat_clip_text = empty_feat_clip_text.unsqueeze(0)
419
+ empty_feat_clip_text = empty_feat_clip_text.to(device)
420
+
421
+ empty_feat_clip_text_embedding = self.transformer.cond_embed(empty_feat_clip_text).unsqueeze(0)
422
+
423
+ if k == 0:
424
+ empty_input = torch.cat([empty_feat_clip_text_embedding, A_motion_embeddings, empty_feat_clip_text_embedding], dim=1)
425
+ empty_conditions = self.forward_babel_eval(empty_input)
426
+ else:
427
+ B_motion_embeddings = self.transformer.wte(B_motion)
428
+ empty_input = torch.cat([empty_feat_clip_text_embedding, A_motion_embeddings, empty_feat_clip_text_embedding, B_motion_embeddings], dim=1)
429
+ empty_conditions = self.forward_babel_eval(empty_input)
430
+
431
+ empty_conditions = empty_conditions[:, -1, :]
432
+ temperature = 1.0
433
+
434
+ mix_conditions = torch.cat([conditions, empty_conditions], dim=0)
435
+ sampled_token_latent = self.diff_loss.sample(mix_conditions, temperature=temperature, cfg=cfg)
436
+
437
+ # chunk
438
+ if cfg != 1:
439
+ scaled_logits, _ = sampled_token_latent.chunk(2, dim=0)
440
+ else:
441
+ scaled_logits = sampled_token_latent
442
+
443
+
444
+ scaled_logits = scaled_logits.unsqueeze(0)
445
+
446
+
447
+ B_motion = torch.cat((B_motion, scaled_logits), dim=1)
448
+
449
+ scaled_logits_embedding = self.transformer.wte(scaled_logits)
450
+ xs = torch.cat((x, scaled_logits_embedding), dim=1)
451
+
452
+
453
+ return xs, B_motion
454
+
455
+ def sample_for_eval_CFG_babel_inference(self, A_text, B_text, A_motion, if_categorial=False, length=6400, clip_model=None, device=torch.device('cuda'), tokenizer='clip', unit_length=4, reference_end_token=None, cfg=7.0, threshold=3):
456
+
457
+ import clip
458
+ B_token_length = length // unit_length - A_motion.shape[0]
459
+
460
+ if tokenizer == 'clip':
461
+ A_text = clip.tokenize(A_text, truncate=True).to(device)
462
+ A_feat_clip_text = clip_model.encode_text(A_text).float()
463
+ B_text = clip.tokenize(B_text, truncate=True).to(device)
464
+ B_feat_clip_text = clip_model.encode_text(B_text).float()
465
+ elif tokenizer == 't5-xxl':
466
+ A_feat_clip_text = torch.from_numpy(clip_model.encode(A_text)).float()
467
+ A_feat_clip_text = A_feat_clip_text.to(device)
468
+ B_feat_clip_text = torch.from_numpy(clip_model.encode(B_text)).float()
469
+ B_feat_clip_text = B_feat_clip_text.to(device)
470
+
471
+ A_text_embeddings = self.transformer.cond_embed(A_feat_clip_text).unsqueeze(0)
472
+ A_text_embeddings = A_text_embeddings.unsqueeze(0)
473
+ B_text_embeddings = self.transformer.cond_embed(B_feat_clip_text).unsqueeze(0)
474
+ B_text_embeddings = B_text_embeddings.unsqueeze(0)
475
+
476
+ A_motion = A_motion.unsqueeze(0)
477
+ A_motion_embeddings = self.transformer.wte(A_motion)
478
+ B_motion = torch.tensor([]).to(device)
479
+
480
+ attention_weights = []
481
+
482
+ for k in range(B_token_length):
483
+ if k == 0:
484
+ x = torch.cat([A_text_embeddings, A_motion_embeddings, B_text_embeddings], dim=1)
485
+
486
+ else:
487
+ x = xs
488
+
489
+
490
+
491
+ conditions = self.forward_babel_eval(x, return_attention=False)
492
+ conditions = conditions[:, -1, :]
493
+
494
+ empty_clip_text = ''
495
+ if tokenizer == 'clip':
496
+ empty_text = clip.tokenize(empty_clip_text, truncate=True).to(device)
497
+ empty_feat_clip_text = clip_model.encode_text(empty_text).float()
498
+ elif tokenizer == 't5-xxl':
499
+ empty_feat_clip_text = torch.from_numpy(clip_model.encode(empty_clip_text)).float()
500
+ empty_feat_clip_text = empty_feat_clip_text.unsqueeze(0)
501
+ empty_feat_clip_text = empty_feat_clip_text.to(device)
502
+
503
+ empty_feat_clip_text_embedding = self.transformer.cond_embed(empty_feat_clip_text).unsqueeze(0)
504
+
505
+ if k == 0:
506
+ empty_input = torch.cat([empty_feat_clip_text_embedding, A_motion_embeddings, empty_feat_clip_text_embedding], dim=1)
507
+ empty_conditions = self.forward_babel_eval(empty_input)
508
+ else:
509
+ B_motion_embeddings = self.transformer.wte(B_motion)
510
+ empty_input = torch.cat([empty_feat_clip_text_embedding, A_motion_embeddings, empty_feat_clip_text_embedding, B_motion_embeddings], dim=1)
511
+ empty_conditions = self.forward_babel_eval(empty_input)
512
+
513
+ empty_conditions = empty_conditions[:, -1, :]
514
+ temperature = 1.0
515
+
516
+ mix_conditions = torch.cat([conditions, empty_conditions], dim=0)
517
+ sampled_token_latent = self.diff_loss.sample(mix_conditions, temperature=temperature, cfg=cfg)
518
+
519
+ # chunk
520
+ if cfg != 1:
521
+ scaled_logits, _ = sampled_token_latent.chunk(2, dim=0)
522
+ else:
523
+ scaled_logits = sampled_token_latent
524
+
525
+ scaled_logits = scaled_logits.unsqueeze(0)
526
+
527
+ if reference_end_token is not None:
528
+ distance_l2 = torch.sqrt(torch.sum((scaled_logits - reference_end_token)**2))
529
+ print(distance_l2)
530
+ if distance_l2 < threshold:
531
+ break
532
+
533
+ B_motion = torch.cat((B_motion, scaled_logits), dim=1)
534
+
535
+ scaled_logits_embedding = self.transformer.wte(scaled_logits)
536
+ xs = torch.cat((x, scaled_logits_embedding), dim=1)
537
+
538
+
539
+
540
+ return xs, B_motion
541
+
542
+
543
+ def sample_for_eval_CFG_babel_inference_new(self, B_text, A_motion, if_categorial=False, length=78, clip_model=None, device=torch.device('cuda'), tokenizer='clip', unit_length=4, reference_end_token=None, cfg=4.5, threshold=3):
544
+
545
+ import clip
546
+ B_token_length = length // unit_length
547
+
548
+ if tokenizer == 'clip':
549
+ B_text = clip.tokenize(B_text, truncate=True).to(device)
+ B_feat_clip_text = clip_model.encode_text(B_text).float()
553
+ elif tokenizer == 't5-xxl':
554
+ B_feat_clip_text = torch.from_numpy(clip_model.encode(B_text)).float()
555
+ B_feat_clip_text = B_feat_clip_text.to(device)
556
+
557
+ empty_clip_text = ''
558
+ if tokenizer == 'clip':
559
+ empty_text = clip.tokenize(empty_clip_text, truncate=True).to(device)
560
+ empty_feat_clip_text = clip_model.encode_text(empty_text).float()
561
+ elif tokenizer == 't5-xxl':
562
+ empty_feat_clip_text = torch.from_numpy(clip_model.encode(empty_clip_text)).float()
563
+ empty_feat_clip_text = empty_feat_clip_text.unsqueeze(0)
564
+ empty_feat_clip_text = empty_feat_clip_text.to(device)
565
+
566
+ B_text_embeddings = self.transformer.cond_embed(B_feat_clip_text).unsqueeze(0)
567
+
568
+ A_motion = A_motion.unsqueeze(0)
569
+ A_motion_embeddings = self.transformer.wte(A_motion)
570
+ B_motion = torch.tensor([]).to(device)
571
+
572
+
573
+ attention_weights = []
574
+
575
+ for k in range(B_token_length):
576
+ if k == 0:
577
+ x = torch.cat([B_text_embeddings, A_motion_embeddings], dim=1)
578
+ else:
579
+ x = xs
580
+
581
+ conditions = self.forward_babel_eval(x, return_attention=False)
582
+ conditions = conditions[:, -1, :]
583
+
584
+
585
+ empty_feat_clip_text_embedding = self.transformer.cond_embed(empty_feat_clip_text).unsqueeze(0)
586
+
587
+ if k == 0:
588
+ empty_input = torch.cat([empty_feat_clip_text_embedding, A_motion_embeddings], dim=1)
589
+
590
+ empty_conditions = self.forward_babel_eval(empty_input)
591
+ else:
592
+ B_motion_embeddings = self.transformer.wte(B_motion)
593
+ empty_input = torch.cat([empty_feat_clip_text_embedding, A_motion_embeddings, B_motion_embeddings], dim=1)
594
+ empty_conditions = self.forward_babel_eval(empty_input)
595
+
596
+ empty_conditions = empty_conditions[:, -1, :]
597
+ temperature = 1.0
598
+
599
+ mix_conditions = torch.cat([conditions, empty_conditions], dim=0)
600
+ sampled_token_latent = self.diff_loss.sample(mix_conditions, temperature=temperature, cfg=cfg)
601
+
602
+ # chunk
603
+ if cfg != 1:
604
+ scaled_logits, _ = sampled_token_latent.chunk(2, dim=0)
605
+ else:
606
+ scaled_logits = sampled_token_latent
607
+
608
+ scaled_logits = scaled_logits.unsqueeze(0)
609
+
610
+ if reference_end_token is not None:
611
+ distance_l2 = torch.sqrt(torch.sum((scaled_logits - reference_end_token)**2))
612
+ print(distance_l2)
613
+ if distance_l2 < threshold:
614
+ break
615
+
616
+ B_motion = torch.cat((B_motion, scaled_logits), dim=1)
617
+
618
+ scaled_logits_embedding = self.transformer.wte(scaled_logits)
619
+ xs = torch.cat((x, scaled_logits_embedding), dim=1)
620
+
621
+
622
+
623
+ return xs, B_motion
624
+
625
+
626
+ def sample_for_eval_CFG_babel_inference_new_demo(self, B_text, A_motion, if_categorial=False, length=312, clip_model=None, device=torch.device('cuda'), tokenizer='clip', unit_length=4, reference_end_token=None, cfg=4.5, threshold=3, temperature=1.0):
627
+
628
+ import clip
629
+ B_token_length = length // unit_length - A_motion.shape[0]
630
+
631
+ if tokenizer == 'clip':
632
+ B_text = clip.tokenize(B_text, truncate=True).to(device)
+ B_feat_clip_text = clip_model.encode_text(B_text).float()
636
+ elif tokenizer == 't5-xxl':
637
+ B_feat_clip_text = torch.from_numpy(clip_model.encode(B_text)).float()
638
+ B_feat_clip_text = B_feat_clip_text.to(device)
639
+
640
+ empty_clip_text = ''
641
+ if tokenizer == 'clip':
642
+ empty_text = clip.tokenize(empty_clip_text, truncate=True).to(device)
643
+ empty_feat_clip_text = clip_model.encode_text(empty_text).float()
644
+ elif tokenizer == 't5-xxl':
645
+ empty_feat_clip_text = torch.from_numpy(clip_model.encode(empty_clip_text)).float()
646
+ empty_feat_clip_text = empty_feat_clip_text.unsqueeze(0)
647
+ empty_feat_clip_text = empty_feat_clip_text.to(device)
648
+
649
+ B_text_embeddings = self.transformer.cond_embed(B_feat_clip_text).unsqueeze(0)
650
+ B_text_embeddings = B_text_embeddings.unsqueeze(0)
651
+
652
+ A_motion = A_motion.unsqueeze(0)
653
+ A_motion_embeddings = self.transformer.wte(A_motion)
654
+ B_motion = torch.tensor([]).to(device)
655
+
656
+ # store the attention weights from every layer
657
+ attention_weights = []
658
+
659
+ for k in range(B_token_length):
660
+ if k == 0:
661
+ x = torch.cat([B_text_embeddings, A_motion_embeddings], dim=1)
662
+
663
+ else:
664
+ x = xs
665
+
666
+
667
+ conditions = self.forward_babel_eval(x, return_attention=False)
668
+ conditions = conditions[:, -1, :]
669
+
670
+
671
+ empty_feat_clip_text_embedding = self.transformer.cond_embed(empty_feat_clip_text).unsqueeze(0)
672
+
673
+ if k == 0:
674
+ empty_input = torch.cat([empty_feat_clip_text_embedding, A_motion_embeddings], dim=1)
675
+ empty_conditions = self.forward_babel_eval(empty_input)
676
+ else:
677
+ B_motion_embeddings = self.transformer.wte(B_motion)
678
+ empty_input = torch.cat([empty_feat_clip_text_embedding, A_motion_embeddings, B_motion_embeddings], dim=1)
679
+ empty_conditions = self.forward_babel_eval(empty_input)
680
+
681
+ empty_conditions = empty_conditions[:, -1, :]
682
+
683
+ mix_conditions = torch.cat([conditions, empty_conditions], dim=0)
684
+ sampled_token_latent = self.diff_loss.sample(mix_conditions, temperature=temperature, cfg=cfg)
685
+
686
+ # chunk
687
+ if cfg != 1:
688
+ scaled_logits, _ = sampled_token_latent.chunk(2, dim=0)
689
+ else:
690
+ scaled_logits = sampled_token_latent
691
+
692
+ scaled_logits = scaled_logits.unsqueeze(0)
693
+
694
+ if reference_end_token is not None:
695
+ distance_l2 = torch.sqrt(torch.sum((scaled_logits - reference_end_token)**2))
696
+ print(distance_l2)
697
+ if distance_l2 < threshold and k > 10:
698
+ break
699
+
700
+ B_motion = torch.cat((B_motion, scaled_logits), dim=1)
701
+
702
+ scaled_logits_embedding = self.transformer.wte(scaled_logits)
703
+ xs = torch.cat((x, scaled_logits_embedding), dim=1)
704
+
705
+
706
+
707
+ return xs, B_motion
708
+
709
+ def sample_for_eval_CFG_babel_inference_two_forward(self, B_text, A_motion, if_categorial=False, length=312, clip_model=None, device=torch.device('cuda'), tokenizer='clip', unit_length=4, reference_end_token=None, cfg=4.5, threshold=3, temperature=1.0):
710
+ """
711
+ Inference loop that mimics the "Two-Forward" training strategy.
712
+ This version correctly performs two full passes over the entire sequence.
713
+ """
714
+ import clip
715
+ B_token_length = length // unit_length - A_motion.shape[0]
716
+
717
+ if tokenizer == 't5-xxl':
718
+ B_feat_clip_text = torch.from_numpy(clip_model.encode(B_text)).float().to(device)
719
+ else:
720
+ raise NotImplementedError("Only t5-xxl is supported for this function.")
721
+ empty_feat_clip_text = torch.from_numpy(clip_model.encode('')).float().unsqueeze(0).to(device)
722
+
723
+ # --- Create 3D embeddings [batch, seq, dim] ---
724
+ B_text_embeddings = self.transformer.cond_embed(B_feat_clip_text).unsqueeze(0).unsqueeze(0)
725
+ empty_text_embeddings = self.transformer.cond_embed(empty_feat_clip_text).unsqueeze(0) # This is [1, 1, 768]
726
+
727
+ A_motion_embeddings = self.transformer.wte(A_motion.unsqueeze(0))
728
+
729
+ # === 1. First Forward Pass (Generate Rough Draft) ===
730
+ rough_motion_tokens = A_motion
731
+ for k in range(B_token_length):
732
+ current_rough_embeddings = self.transformer.wte(rough_motion_tokens.unsqueeze(0))
733
+
734
+ # Conditioned
735
+ x_cond = torch.cat([B_text_embeddings, current_rough_embeddings], dim=1)
736
+ conditions = self.forward_babel_eval(x_cond, return_attention=False)[:, -1, :]
737
+
738
+ # Unconditioned
739
+ x_uncond = torch.cat([empty_text_embeddings, current_rough_embeddings], dim=1)
740
+ empty_conditions = self.forward_babel_eval(x_uncond, return_attention=False)[:, -1, :]
741
+
742
+ # Sample a rough prediction for the next token
743
+ mix_conditions = torch.cat([conditions, empty_conditions], dim=0)
744
+ pred_xstart_rough = self.diff_loss.sample(mix_conditions, temperature=temperature, cfg=cfg)
745
+ if cfg != 1:
746
+ pred_xstart_rough, _ = pred_xstart_rough.chunk(2, dim=0)
747
+
748
+ rough_motion_tokens = torch.cat([rough_motion_tokens, pred_xstart_rough], dim=0)
749
+
750
+ # === 2. Second Forward Pass (Generate Refined Motion) ===
751
+ # Now we have the full rough draft. We use it as the input for the second pass.
752
+ refined_motion_tokens = A_motion
753
+ for k in range(B_token_length):
754
+ # The input to the transformer is the full rough sequence
755
+ rough_embeddings = self.transformer.wte(rough_motion_tokens.unsqueeze(0))
756
+
757
+ # Conditioned
758
+ x_cond_refined = torch.cat([B_text_embeddings, rough_embeddings], dim=1)
759
+ # We take the condition corresponding to the token we want to predict
760
+ conditions_refined = self.forward_babel_eval(x_cond_refined, return_attention=False)[:, A_motion.shape[0] + k, :]
761
+
762
+ # Unconditioned
763
+ x_uncond_refined = torch.cat([empty_text_embeddings, rough_embeddings], dim=1)
764
+ empty_conditions_refined = self.forward_babel_eval(x_uncond_refined, return_attention=False)[:, A_motion.shape[0] + k, :]
765
+
766
+ # Sample the final, refined token
767
+ mix_conditions_refined = torch.cat([conditions_refined, empty_conditions_refined], dim=0)
768
+ final_token, _ = self.diff_loss.sample(mix_conditions_refined, temperature=temperature, cfg=cfg).chunk(2, dim=0)
769
+
770
+ # Append the refined token to our final output history
771
+ refined_motion_tokens = torch.cat([refined_motion_tokens, final_token], dim=0)
772
+
773
+ # IMPORTANT: For the next step, we must update the "rough draft" with our new refined token
774
+ # This mimics the training where the input is a mix of GT and predictions.
775
+ # Here, it's a mix of the initial rough draft and the new refined tokens.
776
+ rough_motion_tokens[A_motion.shape[0] + k] = final_token.squeeze(0)
777
+
778
+ # Return only the newly generated tokens (B_motion)
779
+ B_motion = refined_motion_tokens[A_motion.shape[0]:, :].unsqueeze(0)
780
+ return None, B_motion
781
+
782
+
783
+ #--------------Test classification head--------------------
784
+ def sample_for_eval_classification(self, clip_text, if_categorial=False, length=196, clip_model=None, device=torch.device('cuda'), tokenizer='clip', unit_length=4):
785
+
786
+ import clip
787
+
788
+
789
+ for k in range(51):
790
+ if k == 0:
791
+ x = []
792
+ else:
793
+ x = xs
794
+
795
+ if tokenizer == 'clip':
796
+ text = clip.tokenize(clip_text, truncate=True).to(device)
797
+
798
+ feat_clip_text = clip_model.encode_text(text).float()
799
+ elif tokenizer == 't5-xxl':
800
+ feat_clip_text = torch.from_numpy(clip_model.module.encode(clip_text)).float()
801
+
802
+ conditions = self.forward(x, feat_clip_text)
803
+ conditions = conditions[:, -1, :]
804
+
805
+ empty_clip_text = ''
806
+ if tokenizer == 'clip':
807
+ empty_text = clip.tokenize(empty_clip_text, truncate=True).to(device)
808
+ empty_feat_clip_text = clip_model.encode_text(empty_text).float()
809
+ elif tokenizer == 't5-xxl':
810
+ empty_feat_clip_text = torch.from_numpy(clip_model.module.encode(empty_clip_text)).float()
811
+ empty_feat_clip_text = empty_feat_clip_text.unsqueeze(0)
812
+ empty_feat_clip_text = empty_feat_clip_text.to(device)
813
+
814
+ empty_conditions = self.forward(x, empty_feat_clip_text)
815
+ empty_conditions = empty_conditions[:, -1, :]
816
+
817
+ temperature = 1.0
818
+ cfg = 7.5
819
+
820
+ mix_conditions = torch.cat([conditions, empty_conditions], dim=0)
821
+ sampled_token_latent = self.diff_loss.sample(mix_conditions, temperature=temperature, cfg=cfg)
822
+
823
+ # chunk
824
+ if cfg != 1:
825
+ scaled_logits, _ = sampled_token_latent.chunk(2, dim=0)
826
+ else:
827
+ scaled_logits = sampled_token_latent
828
+
829
+
830
+ prediction_logits = self.classify_head(conditions)
831
+ probs = torch.sigmoid(prediction_logits)
832
+ predicted_classes = torch.argmax(probs, dim=-1)
833
+
834
+
835
+ scaled_logits = scaled_logits.unsqueeze(0)
836
+
837
+ if k == 0:
838
+ xs = scaled_logits
839
+ else:
840
+ xs = torch.cat((xs, scaled_logits), dim=1)
841
+
842
+ if predicted_classes == 1:
843
+ break
844
+
845
+ return xs
846
+
847
+
848
+ #--------------------Test CFG-----------------------
849
+ def sample_for_eval_CFG_test(self, clip_text, if_categorial=False, length=196, clip_model=None, cfg=1, device=torch.device('cuda'), tokenizer='clip', unit_length=4):
850
+
851
+ import clip
852
+ max_token_len = length // unit_length
853
+
854
+
855
+ for k in range(max_token_len):
856
+ if k == 0:
857
+ x = []
858
+ else:
859
+ x = xs
860
+
861
+
862
+ if cfg != 1:
863
+ if tokenizer == 'clip':
864
+ text = clip.tokenize(clip_text, truncate=True).to(device)
865
+
866
+ feat_clip_text = clip_model.encode_text(text).float()
867
+ elif tokenizer == 't5-xxl':
868
+ feat_clip_text = torch.from_numpy(clip_model.module.encode(clip_text)).float()
869
+
870
+ conditions = self.forward(x, feat_clip_text)
871
+
872
+ conditions = conditions[:, -1, :]
873
+ empty_clip_text = ''
874
+ if tokenizer == 'clip':
875
+ empty_text = clip.tokenize(empty_clip_text, truncate=True).to(device)
876
+ empty_feat_clip_text = clip_model.encode_text(empty_text).float()
877
+ elif tokenizer == 't5-xxl':
878
+ empty_feat_clip_text = torch.from_numpy(clip_model.module.encode(empty_clip_text)).float()
879
+ empty_feat_clip_text = empty_feat_clip_text.unsqueeze(0)
880
+ empty_feat_clip_text = empty_feat_clip_text.to(device)
881
+
882
+ empty_conditions = self.forward(x, empty_feat_clip_text)
883
+ empty_conditions = empty_conditions[:, -1, :]
884
+ temperature = 1.0
885
+
886
+
887
+ mix_conditions = torch.cat([conditions, empty_conditions], dim=0)
888
+ sampled_token_latent = self.diff_loss.sample(mix_conditions, temperature=temperature, cfg=cfg)
889
+
890
+ # chunk
891
+ scaled_logits, _ = sampled_token_latent.chunk(2, dim=0)
892
+
893
+ else:
894
+ if tokenizer == 'clip':
895
+ text = clip.tokenize(clip_text, truncate=True).to(device)
896
+ feat_clip_text = clip_model.encode_text(text).float()
897
+ elif tokenizer == 't5-xxl':
898
+ feat_clip_text = torch.from_numpy(clip_model.module.encode(clip_text)).float()
899
+ feat_clip_text = feat_clip_text.to(device)
900
+
901
+
902
+ conditions = self.forward(x, feat_clip_text)
903
+
904
+ conditions = conditions[:, -1, :]
905
+ temperature = 1.0
906
+ sampled_token_latent = self.diff_loss.sample(conditions, temperature=temperature, cfg=cfg)
907
+ scaled_logits = sampled_token_latent
908
+
909
+ scaled_logits = scaled_logits.unsqueeze(0)
910
+
911
+ if k == 0:
912
+ xs = scaled_logits
913
+ else:
914
+ xs = torch.cat((xs, scaled_logits), dim=1)
915
+
916
+ return xs
917
+ #--------------------------------------------------
918
+
919
+ def forward_discrete(self, idx: torch.Tensor, clip_feature: torch.Tensor, use_cache=False, past_key_values=None) -> torch.Tensor:
920
+ """
921
+ Vector-token path: idx must be shape [B, T, input_token_dim].
922
+ If you want discrete IDs instead, you must switch wte to nn.Embedding.
923
+ """
924
+ context = None
925
+ if idx.numel() == 0:
926
+ context = self._prepare_context(clip_feature)
927
+ token_embeddings = context
928
+ if token_embeddings is None:
929
+ raise ValueError("Conditioning features are required when no motion tokens are provided.")
930
+ else:
931
+ b, t, _ = idx.size()
932
+ assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
933
+ token_embeddings = self.transformer.wte(idx) # Linear -> [B, T, n_embd]
934
+ context = self._prepare_context(clip_feature, batch_size=b)
935
+ if context is not None:
936
+ token_embeddings = torch.cat([context, token_embeddings], dim=1)
937
+
938
+ x = token_embeddings
939
+
940
+ if use_cache and past_key_values is None:
941
+ past_key_values = [None] * len(self.transformer.h)
942
+
943
+ for i, block in enumerate(self.transformer.h):
944
+ if use_cache:
945
+ last_past = past_key_values[i]
946
+ x, presents = block(x, context=context, last_past=last_past, use_cache=use_cache)
947
+ past_key_values[i] = list(presents)
948
+ else:
949
+ x = block(x, context=context)
950
+
951
+ x = self.transformer.ln_f(x)
952
+ logits = self.out_proj(x)
953
+ return logits
954
+
955
+
956
+ def forward(self, idx: torch.Tensor, feature: Optional[torch.Tensor]) -> torch.Tensor:
957
+ """
958
+ If self._prompt_cached is True, we DO NOT concat context each call.
959
+ Instead, blocks read the cached prompt KV.
960
+ Otherwise we embed and concat context as before.
961
+ """
962
+ context = None
963
+ if len(idx) == 0:
964
+ if self._prompt_cached:
965
+ if self._prompt_bsz is None:
966
+ raise ValueError("Prompt cache set but batch size unknown.")
967
+ b = self._prompt_bsz
968
+ token_embeddings = torch.empty(b, 0, self.config.n_embd, device=self.bos.device, dtype=self.bos.dtype)
969
+ else:
970
+ context = self._prepare_context(feature)
971
+ token_embeddings = context
972
+ if token_embeddings is None:
973
+ raise ValueError("Conditioning features are required when no motion tokens are provided.")
974
+ else:
975
+ b, t, c = idx.size()
976
+ idx = idx.float()
977
+ assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
978
+ token_embeddings = self.transformer.wte(idx)
979
+ if not self._prompt_cached:
980
+ context = self._prepare_context(feature, batch_size=b)
981
+ if context is not None:
982
+ token_embeddings = torch.cat([context, token_embeddings], dim=1)
983
+
984
+ # Always prepend BOS scene token
985
+ bos = self.bos.expand(token_embeddings.size(0), 1, -1)
986
+ x = torch.cat([bos, token_embeddings], dim=1)
987
+
988
+ # blocks: if context is None -> use cached prompt kv (if set)
989
+ for block in self.transformer.h:
990
+ x = block(x, context=context)
991
+ x = self.transformer.ln_f(x)
992
+ logits = self.out_proj(x)
993
+ return logits
994
+
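A hedged sketch of how the two prompt modes of forward differ in practice (model and feat_text are assumed to exist; output widths follow the configured n_embd):

# (a) uncached: the text feature is embedded and concatenated as context on every call
h = model.forward([], feature=feat_text)   # -> [B, 1 + n_ctx, n_embd] (BOS + context positions)

# (b) cached: set the prompt once, then pass feature=None while streaming
model.set_prompt(feat_text)
h0 = model.forward([], feature=None)       # only the BOS token flows through -> [B, 1, n_embd]
z0 = h0[:, -1, :]                          # conditioning vector handed to the diffusion head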
995
+
996
+ def forward_inference(self, idx: torch.Tensor, feature: Optional[torch.Tensor]) -> torch.Tensor:
997
+ context = None
998
+ if len(idx) == 0:
999
+ if self._prompt_cached:
1000
+ if self._prompt_bsz is None:
1001
+ raise ValueError("Prompt cache set but batch size unknown.")
1002
+ b = self._prompt_bsz
1003
+ token_embeddings = torch.empty(b, 0, self.config.n_embd, device=self.bos.device, dtype=self.bos.dtype)
1004
+ else:
1005
+ context = self._prepare_context(feature)
1006
+ token_embeddings = context
1007
+ if token_embeddings is None:
1008
+ raise ValueError("Conditioning features are required when no motion tokens are provided.")
1009
+ else:
1010
+ b, t, c = idx.size()
1011
+ idx = idx.float()
1012
+ assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
1013
+ token_embeddings = self.transformer.wte(idx)
1014
+ if not self._prompt_cached:
1015
+ context = self._prepare_context(feature, batch_size=b)
1016
+ if context is not None:
1017
+ token_embeddings = torch.cat([context, token_embeddings], dim=1)
1018
+
1019
+ x = token_embeddings
1020
+ if len(x.shape) == 2:
1021
+ x = x.unsqueeze(0)
1022
+
1023
+ # prepend BOS
1024
+ bos = self.bos.expand(x.size(0), 1, -1)
1025
+ x = torch.cat([bos, x], dim=1)
1026
+
1027
+ if context is not None and context.size(0) != x.size(0):
1028
+ if context.size(0) == 1:
1029
+ context = context.expand(x.size(0), -1, -1)
1030
+ else:
1031
+ raise ValueError("Conditioning batch size does not match token batch size.")
1032
+
1033
+ for block in self.transformer.h:
1034
+ x = block(x, context=context)
1035
+ x = self.transformer.ln_f(x)
1036
+ logits = self.out_proj(x)
1037
+ return logits
1038
+
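+ # Usage sketch (illustrative only, not called anywhere): rolling forward_inference out
+ # autoregressively. `model`, `feat` ([B, clip_dim]), `token_dim` and the sampling head
+ # `sample_next` are placeholders, not names defined in this file; the model returns one
+ # output vector per position, and the last position is what conditions the next token.
+ #
+ #   tokens = torch.empty(B, 0, token_dim, device=feat.device)
+ #   for _ in range(max_steps):
+ #       out = model.forward_inference(tokens, feat)     # [B, 1 + S + T, out_dim]
+ #       nxt = sample_next(out[:, -1])                   # e.g. a diffusion head on the last position
+ #       tokens = torch.cat([tokens, nxt.unsqueeze(1)], dim=1)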
1039
+
1040
+ def babel_long(self, idx: torch.Tensor, clip_feature: torch.Tensor, use_cache=False, past_key_values=None, num_subseq=None, length=None) -> torch.Tensor:
1041
+
1042
+ b, t, c = idx.size()
1043
+ idx = idx.float()
1044
+ idx = self.transformer.wte(idx)
1045
+ assert (
1046
+ t <= self.config.block_size
1047
+ ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
1048
+ for i in range(b):
1049
+ length_i = length[i][:num_subseq[i]]
1050
+ clip_feature_i = clip_feature[i][:num_subseq[i]]
1051
+
1052
+ pointer = 0
1053
+ for j in range(num_subseq[i]):
1054
+ if j > 0:
1055
+ pointer += length_i[j].item()
1056
+ pointer += 1
1057
+ pointer = int(pointer)
1058
+
1059
+ clip_feature_i_j = self.transformer.cond_embed(clip_feature_i[j].unsqueeze(0)).unsqueeze(1)
1060
+ idx[i] = torch.cat([idx[i][:pointer].unsqueeze(0), clip_feature_i_j, idx[i][pointer:-1].unsqueeze(0)], dim=1)[0]
1061
+
1062
+ x = idx
1063
+
1064
+ context = None
1065
+
1066
+
1067
+ if use_cache:
1068
+ if past_key_values is None:
1069
+ past_key_values = [None] * len(self.transformer.h)
1070
+
1071
+
1072
+ for i,block in enumerate(self.transformer.h):
1073
+ if use_cache:
1074
+ last_past = past_key_values[i]
1075
+ x, presents = block(x, context=context, last_past=last_past, use_cache=use_cache)
1076
+ past_key_values[i] = list(presents)
1077
+ else:
1078
+ x = block(x, context=context)
1079
+ x = self.transformer.ln_f(x)
1080
+
1081
+ logits = self.out_proj(x)
1082
+ return logits
1083
+
1084
+
1085
+ def forward_babel_eval(self, x, return_attention=False) -> torch.Tensor:
1086
+ layer_attentions = []
1087
+ context = None
1088
+ for block in self.transformer.h:
1089
+ if return_attention:
1090
+ x, att = block(x, context=context, return_attention=True)
1091
+ layer_attentions.append(att)
1092
+ else:
1093
+ x = block(x, context=context)
1094
+
1095
+ x = self.transformer.ln_f(x)
1096
+ if self.use_out_proj:
1097
+ logits = self.out_proj(x)
1098
+ else:
1099
+ logits = x
1100
+
1101
+ if return_attention:
1102
+ return logits, layer_attentions
1103
+ return logits
1104
+
1105
+ def forward_babel(self, idx: torch.Tensor, clip_feature: torch.Tensor, A_token_length) -> torch.Tensor:
1106
+ context = None
1107
+ if len(idx) == 0: # inference
1108
+ context = self._prepare_context(clip_feature)
1109
+ token_embeddings = context
1110
+
1111
+ else:
1112
+ b, t, c = idx.size()
1113
+ idx = idx.float()
1114
+ assert (
1115
+ t <= self.config.block_size
1116
+ ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
1117
+
1118
+
1119
+
1120
+ A_feature = clip_feature[:, 0, :]
1121
+ B_feature = clip_feature[:, 1, :]
1122
+
1123
+
1124
+ A_text_embeddings = self.transformer.cond_embed(A_feature).unsqueeze(1)
1125
+ B_text_embeddings = self.transformer.cond_embed(B_feature).unsqueeze(1)
1126
+ context = torch.cat([A_text_embeddings, B_text_embeddings], dim=1)
1127
+
1128
+ token_embeddings = torch.zeros(b, self.config.block_size, self.config.n_embd).to(idx.device)
1129
+ for i in range(b):
1130
+ A_idx = idx[i, :A_token_length[i].item(), :]
1131
+ B_idx = idx[i, A_token_length[i].item():-2, :]
1132
+ token_embeddings[i, :, :] = torch.cat([A_text_embeddings[i], self.BOM_tag, self.transformer.wte(A_idx), B_text_embeddings[i], self.BOM_tag, self.transformer.wte(B_idx)], dim=0)  # token_embeddings[i]: (block_size, n_embd)
1133
+
1134
+ x = token_embeddings
1135
+ if context is not None and context.size(0) != x.size(0):
1136
+ if context.size(0) == 1:
1137
+ context = context.expand(x.size(0), -1, -1)
1138
+ else:
1139
+ raise ValueError("Conditioning batch size does not match token batch size.")
1140
+ for block in self.transformer.h:
1141
+ x = block(x, context=context)
1142
+ x = self.transformer.ln_f(x)
1143
+
1144
+ if self.use_out_proj:
1145
+ logits = self.out_proj(x)
1146
+ else:
1147
+ logits = x
1148
+
1149
+
1150
+ return logits
1151
+
1152
+ def forward_babel2(self, idx: torch.Tensor, clip_feature: torch.Tensor) -> torch.Tensor:
1153
+ context = None
1154
+ if idx.numel() == 0: # inference with only context
1155
+ context = self._prepare_context(clip_feature)
1156
+ token_embeddings = context
1157
+ else:
1158
+ b, t, c = idx.size()
1159
+ idx = idx.float()
1160
+ assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
1161
+
1162
+ B_feature = clip_feature # [B, D] or [B, 1, D]
1163
+ B_text_embeddings = self.transformer.cond_embed(B_feature) # [B, D] -> [B, D]
1164
+ if B_text_embeddings.dim() == 2:
1165
+ B_text_embeddings = B_text_embeddings.unsqueeze(1) # [B, 1, D]
1166
+ context = B_text_embeddings # [B, 1, D]
1167
+
1168
+ idx_embeddings = self.transformer.wte(idx) # [B, T, D]
1169
+ token_embeddings = torch.cat([B_text_embeddings, idx_embeddings], dim=1) # [B, 1+T, D]
1170
+
1171
+ x = token_embeddings
1172
+ if context is not None:
1173
+ if context.dim() == 2:
1174
+ context = context.unsqueeze(1)
1175
+ if context.size(0) != x.size(0):
1176
+ if context.size(0) == 1:
1177
+ context = context.expand(x.size(0), -1, -1)
1178
+ else:
1179
+ raise ValueError("Conditioning batch size does not match token batch size.")
1180
+
1181
+ for block in self.transformer.h:
1182
+ x = block(x, context=context)
1183
+ x = self.transformer.ln_f(x)
1184
+
1185
+ logits = self.out_proj(x) if self.use_out_proj else x
1186
+ return logits
1187
+
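+ # Shape walk-through for forward_babel2 (illustrative): a single text condition per sequence.
+ #   clip_feature: [B, D_clip] (or [B, 1, D_clip]) --cond_embed--> context [B, 1, n_embd]
+ #   idx:          [B, T, C] continuous motion tokens --wte--> [B, T, n_embd]
+ #   blocks see:   cat([context, motion]) = [B, 1 + T, n_embd] (no BOS is prepended here)
+ #   output:       [B, 1 + T, out_dim] after out_proj (or n_embd if use_out_proj is False)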
1188
+
1189
+ def resize_token_embeddings(
1190
+ self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, using_old_initilization: bool = False
1191
+ ) -> nn.Embedding:
1192
+ """
1193
+ Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
1194
+
1195
+ Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
1196
+
1197
+ Arguments:
1198
+ new_num_tokens (`int`, *optional*):
1199
+ The new number of tokens in the embedding matrix. Increasing the size will add newly initialized
1200
+ vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
1201
+ returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
1202
+ pad_to_multiple_of (`int`, *optional*):
1203
+ If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
1204
+ `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.
1205
+
1206
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
1207
+ `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
1208
+ details about this, or help on choosing the correct value for resizing, refer to this guide:
1209
+ https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
1210
+
1211
+ Return:
1212
+ `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
1213
+ """
1214
+ model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
1215
+ if new_num_tokens is None and pad_to_multiple_of is None:
1216
+ return model_embeds
1217
+
1218
+ # Update base model and current model config
1219
+ self.config.vocab_size = model_embeds.weight.shape[0]
1220
+ self.vocab_size = model_embeds.weight.shape[0]
1221
+
1222
+ # Tie weights again if needed
1223
+ # self.tie_weights()
1224
+
1225
+ return model_embeds
1226
+
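+ # Usage sketch (illustrative): pad the vocabulary to a Tensor-Core-friendly multiple.
+ #   emb = model.resize_token_embeddings(new_num_tokens=None, pad_to_multiple_of=64)
+ #   assert emb.weight.shape[0] % 64 == 0
+ # config.vocab_size is updated to the resized embedding's row count as a side effect.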
1227
+ def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None):
1228
+ old_embeddings = self.get_input_embeddings()
1229
+ new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of)
1230
+ old_embeddings_requires_grad = old_embeddings.weight.requires_grad
1231
+ new_embeddings.requires_grad_(old_embeddings_requires_grad)
1232
+ self.set_input_embeddings(new_embeddings)
1233
+
1234
+ # Update new_num_tokens with the actual size of new_embeddings
+ if pad_to_multiple_of is not None:
+ new_num_tokens = new_embeddings.weight.shape[0]
+
+ # word embeddings and the lm head are not tied in this model, so resize the lm head as well
+ if self.get_output_embeddings() is not None:
+ old_lm_head = self.get_output_embeddings()
+ new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens)
1252
+ old_lm_head_requires_grad = old_lm_head.weight.requires_grad
1253
+ new_lm_head.requires_grad_(old_lm_head_requires_grad)
1254
+ self.set_output_embeddings(new_lm_head)
1255
+
1256
+ return self.get_input_embeddings()
1257
+
1258
+ def _get_resized_embeddings(
1259
+ self,
1260
+ old_embeddings: nn.Embedding,
1261
+ new_num_tokens: Optional[int] = None,
1262
+ pad_to_multiple_of: Optional[int] = None,
1263
+ ) -> nn.Embedding:
1264
+ """
1265
+ Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
1266
+ initialized vectors at the end. Reducing the size will remove vectors from the end
1267
+
1268
+ Args:
1269
+ old_embeddings (`torch.nn.Embedding`):
1270
+ Old embeddings to be resized.
1271
+ new_num_tokens (`int`, *optional*):
1272
+ New number of tokens in the embedding matrix.
1273
+
1274
+ Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
1275
+ vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
1276
+ `torch.nn.Embedding` module of the model without doing anything.
1277
+ pad_to_multiple_of (`int`, *optional*):
1278
+ If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
1279
+ `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.
1280
+
1281
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
1282
+ `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
1283
+ details about this, or help on choosing the correct value for resizing, refer to this guide:
1284
+ https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
1285
+
1286
+
1287
+ Return:
1288
+ `torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if
1289
+ `new_num_tokens` is `None`
1290
+ """
1291
+
1292
+ if pad_to_multiple_of is not None:
1293
+ if not isinstance(pad_to_multiple_of, int):
1294
+ raise ValueError(
1295
+ f"Asking to pad the embedding matrix to a multiple of `{pad_to_multiple_of}`, which is not and integer. Please make sure to pass an integer"
1296
+ )
1297
+ if new_num_tokens is None:
1298
+ new_num_tokens = old_embeddings.weight.shape[0]
1299
+ new_num_tokens = ((new_num_tokens + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
1300
+ else:
1301
+ print(
1302
+ "You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding"
1303
+ f" dimension will be {new_num_tokens}. This might induce some performance reduction as *Tensor Cores* will not be available."
1304
+ " For more details about this, or help on choosing the correct value for resizing, refer to this guide:"
1305
+ " https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc"
1306
+ )
1307
+
1308
+ if new_num_tokens is None:
1309
+ return old_embeddings
1310
+
1311
+ old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
1319
+
1320
+ if old_num_tokens == new_num_tokens:
1322
+ return old_embeddings
1323
+
1324
+ if not isinstance(old_embeddings, nn.Embedding):
1325
+ raise TypeError(
1326
+ f"Old embeddings are of type {type(old_embeddings)}, which is not an instance of {nn.Embedding}. You"
1327
+ " should either use a different resize function or make sure that `old_embeddings` are an instance of"
1328
+ f" {nn.Embedding}."
1329
+ )
1330
+
1331
+ # Build new embeddings
1332
+
1333
+ # When using DeepSpeed ZeRO-3, we shouldn't create new embeddings with DeepSpeed init
1334
+ # because the shape of the new embedding layer is used across various modeling files
1335
+ # as well as to update config vocab size. Shape will be 0 when using DeepSpeed init leading
1336
+ # to errors when training.
1337
+ new_embeddings = nn.Embedding(
1338
+ new_num_tokens,
1339
+ old_embedding_dim,
1340
+ device=old_embeddings.weight.device,
1341
+ dtype=old_embeddings.weight.dtype,
1342
+ )
1343
+
1344
+ # initialize all new embeddings (in particular added tokens)
1345
+ self._init_weights(new_embeddings)
1346
+
1347
+ # Copy token embeddings from the previous weights
1348
+
1349
+ # numbers of tokens to copy
1350
+ n = min(old_num_tokens, new_num_tokens)
1351
+
1352
+ new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :]
1361
+
1362
+ return new_embeddings
1363
+
1364
+
1365
+ def _get_resized_lm_head(
1366
+ self, old_lm_head: nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False
1367
+ ) -> nn.Linear:
1368
+ """
1369
+ Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized
1370
+ vectors at the end. Reducing the size will remove vectors from the end
1371
+
1372
+ Args:
1373
+ old_lm_head (`torch.nn.Linear`):
1374
+ Old lm head linear layer to be resized.
1375
+ new_num_tokens (`int`, *optional*):
1376
+ New number of tokens in the linear matrix.
1377
+
1378
+ Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
+ vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
+ `torch.nn.Linear` module of the model without doing anything.
+ transposed (`bool`, *optional*, defaults to `False`):
+ Whether `old_lm_head` is transposed or not. If `True`, `old_lm_head.size()` is `lm_head_dim,
+ vocab_size`, else `vocab_size, lm_head_dim`.
1383
+
1384
+ Return:
1385
+ `torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if `new_num_tokens` is
1386
+ `None`
1387
+ """
1388
+ if new_num_tokens is None:
1389
+ return old_lm_head
1390
+
1391
+ old_num_tokens, old_lm_head_dim = (
+ old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size()
+ )
1403
+
1404
+ if old_num_tokens == new_num_tokens:
1406
+ return old_lm_head
1407
+
1408
+ if not isinstance(old_lm_head, nn.Linear):
1409
+ raise TypeError(
1410
+ f"Old language model head is of type {type(old_lm_head)}, which is not an instance of {nn.Linear}. You"
1411
+ " should either use a different resize function or make sure that `old_lm_head` are an instance of"
1412
+ f" {nn.Linear}."
1413
+ )
1414
+
1415
+ # Build new lm head
1416
+ new_lm_head_shape = (old_lm_head_dim, new_num_tokens) if not transposed else (new_num_tokens, old_lm_head_dim)
1417
+ has_new_lm_head_bias = old_lm_head.bias is not None
1418
+
1419
+ # When using DeepSpeed ZeRO-3, we shouldn't create new embeddings with DeepSpeed init
1420
+ # because the shape of the new embedding layer is used across various modeling files
1421
+ # as well as to update config vocab size. Shape will be 0 when using DeepSpeed init leading
1422
+ # to errors when training.
1423
+ new_lm_head = nn.Linear(
1424
+ *new_lm_head_shape,
1425
+ bias=has_new_lm_head_bias,
1426
+ device=old_lm_head.weight.device,
1427
+ dtype=old_lm_head.weight.dtype,
1428
+ )
1429
+
1430
+ # initialize new lm head (in particular added tokens)
1431
+ self._init_weights(new_lm_head)
1432
+
1433
+ num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
1434
+
1435
+ self._copy_lm_head_original_to_resized(
+ new_lm_head, old_lm_head, num_tokens_to_copy, transposed, has_new_lm_head_bias
+ )
1448
+
1449
+ return new_lm_head
1450
+
1451
+ def _copy_lm_head_original_to_resized(
1452
+ self, new_lm_head, old_lm_head, num_tokens_to_copy, transposed, has_new_lm_head_bias
1453
+ ):
1454
+ # Copy old lm head weights to new lm head
1455
+ if not transposed:
1456
+ new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[:num_tokens_to_copy, :]
1457
+ else:
1458
+ new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[:, :num_tokens_to_copy]
1459
+
1460
+ # Copy bias weights to new lm head
1461
+ if has_new_lm_head_bias:
1462
+ new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy]
1463
+
1464
+ @classmethod
1465
+ def from_name(cls, name: str) -> Self:
1466
+ return cls(LLaMAHFConfig.from_name(name))
1467
+
1468
+
1469
+ class Block(nn.Module):
1470
+ def __init__(self, config: LLaMAHFConfig) -> None:
1471
+ super().__init__()
1472
+ self.rms_1 = RMSNorm(config.n_embd)
1473
+ self.attn = CausalSelfAttention(config)
1474
+ self.rms_cross = RMSNorm(config.n_embd)
1475
+ self.cross_attn = CrossAttention(config)
1476
+ self.rms_2 = RMSNorm(config.n_embd)
1477
+ self.mlp = MLP(config)
1478
+ # cached prompt kv (precomputed by set_prompt)
1479
+ self._ctx_k_repeat = None
1480
+ self._ctx_v_repeat = None
1481
+ self._ctx_bsz = None
1482
+
1483
+ @torch.no_grad()
1484
+ def set_context_cache(self, context: torch.Tensor):
1485
+ # Precompute KV for cross attention and repeat across kv groups
1486
+ B, S, D = context.shape
1487
+ ca = self.cross_attn
1488
+ k = ca.k_proj(context).view(B, S, ca.n_kv_head, ca.head_dim).transpose(1, 2)
1489
+ v = ca.v_proj(context).view(B, S, ca.n_kv_head, ca.head_dim).transpose(1, 2)
1490
+ k = ca.k_norm(k)
1491
+ # repeat K/V to match heads
1492
+ self._ctx_k_repeat = repeat_kv(k, ca.num_kv_groups) # [B, n_head, S, d]
1493
+ self._ctx_v_repeat = repeat_kv(v, ca.num_kv_groups) # [B, n_head, S, d]
1494
+ self._ctx_bsz = B
1495
+
1496
+ @torch.no_grad()
1497
+ def clear_context_cache(self):
1498
+ self._ctx_k_repeat = None
1499
+ self._ctx_v_repeat = None
1500
+ self._ctx_bsz = None
1501
+
1502
+ def _cross_attend_cached(self, x: torch.Tensor):
1503
+ # x: [B, T, D]
1504
+ if self._ctx_k_repeat is None or self._ctx_v_repeat is None:
1505
+ return x # no-op if no cached prompt
1506
+ B, T, _ = x.size()
1507
+ if self._ctx_bsz is not None and self._ctx_bsz != B:
1508
+ # different batch: ignore cache (or you could raise)
1509
+ return x
1510
+ ca = self.cross_attn
1511
+ q = ca.q_proj(x).view(B, T, ca.n_head, ca.head_dim).transpose(1, 2)
1512
+ q = ca.q_norm(q)
1513
+ y = F.scaled_dot_product_attention(
1514
+ q, self._ctx_k_repeat, self._ctx_v_repeat,
1515
+ attn_mask=None, dropout_p=0.0, is_causal=False, scale=ca.softmax_scale,
1516
+ )
1517
+ y = y.transpose(1, 2).contiguous().view(B, T, ca.n_head * ca.head_dim)
1518
+ return x + ca.o_proj(y)
1519
+
1520
+ def forward(
1521
+ self,
1522
+ x: torch.Tensor,
1523
+ context: Optional[torch.Tensor] = None,
1524
+ last_past=None,
1525
+ use_cache: bool = False,
1526
+ return_attention: bool = False,
1527
+ ) -> torch.Tensor:
1528
+ present = None
1529
+ # self-attn
1530
+ if use_cache:
1531
+ if return_attention:
1532
+ attn_output, attn = self.attn.forward_attn(self.rms_1(x), last_past, use_cache)  # forward_attn returns no KV cache, so `present` stays None on this path
1533
+ else:
1534
+ attn_output, present = self.attn(self.rms_1(x), last_past, use_cache)
1535
+ x = x + attn_output
1536
+ else:
1537
+ if return_attention:
1538
+ attn_output, attn = self.attn.forward_attn(self.rms_1(x))
1539
+ else:
1540
+ attn_output = self.attn(self.rms_1(x))
1541
+ x = x + attn_output
1542
+
1543
+ # cross-attn: prefer live context if provided; else use cached prompt kv
1544
+ if context is not None:
1545
+ x = x + self.cross_attn(self.rms_cross(x), context)
1546
+ else:
1547
+ x = self._cross_attend_cached(self.rms_cross(x))
1548
+
1549
+ # mlp
1550
+ x = x + self.mlp(self.rms_2(x))
1551
+
1552
+ if use_cache:
1553
+ if return_attention:
1554
+ return x, present, attn
1555
+ else:
1556
+ return x, present
1557
+ else:
1558
+ if return_attention:
1559
+ return x, attn
1560
+ else:
1561
+ return x
1562
+
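+ # Minimal sketch (defined for illustration, never called) of the cached-prompt path in
+ # Block: precompute the cross-attention KV for a fixed prompt once, then run the block
+ # with context=None so _cross_attend_cached() is used on every subsequent step.
+ def _example_block_prompt_cache(block: "Block", prompt: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
+     block.set_context_cache(prompt)   # prompt: [B, S, n_embd], cached as head-repeated K/V
+     y = block(x, context=None)        # x: [B, T, n_embd]; cross-attention reads the cache
+     block.clear_context_cache()       # drop the cache when the prompt changes
+     return y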
1563
+
1564
+
1565
+ class CausalSelfAttention(nn.Module):
1566
+ def __init__(self, config: LLaMAHFConfig) -> None:
1567
+ super().__init__()
1568
+ assert config.n_embd % config.n_head == 0
1569
+
1570
+ self.n_head = config.n_head
1571
+ self.n_kv_head = config.n_kv_head or max(1, config.n_head // 4)
1572
+ assert self.n_head % self.n_kv_head == 0, "n_head must be divisible by n_kv_head"
1573
+ self.head_dim = config.n_embd // config.n_head
1574
+ self.block_size = config.block_size
1575
+ self.rope_base = config.rope_base
1576
+ self.rope_cache = None
1577
+ self.num_kv_groups = self.n_head // self.n_kv_head
1578
+
1579
+ self.q_proj = nn.Linear(config.n_embd, self.n_head * self.head_dim, bias=False)
1580
+ self.k_proj = nn.Linear(config.n_embd, self.n_kv_head * self.head_dim, bias=False)
1581
+ self.v_proj = nn.Linear(config.n_embd, self.n_kv_head * self.head_dim, bias=False)
1582
+ self.o_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
1583
+
1584
+ self.q_norm = RMSNorm(self.head_dim)
1585
+ self.k_norm = RMSNorm(self.head_dim)
1586
+
1587
+ self.softmax_scale = self.head_dim ** -0.5
1588
+
1589
+ def forward(self, x: torch.Tensor, last_past=None, use_cache=False) -> torch.Tensor:
1590
+ B, T, _ = x.size()
1591
+
1592
+ q = self.q_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
1593
+ k = self.k_proj(x).view(B, T, self.n_kv_head, self.head_dim).transpose(1, 2)
1594
+ v = self.v_proj(x).view(B, T, self.n_kv_head, self.head_dim).transpose(1, 2)
1595
+
1596
+ q = self.q_norm(q)
1597
+ k = self.k_norm(k)
1598
+
1599
+ if (
1600
+ self.rope_cache is None
1601
+ or self.rope_cache.dtype != x.dtype
1602
+ or self.rope_cache.device != x.device
1603
+ ):
1604
+ self.rope_cache = build_rope_cache(
1605
+ seq_len=self.block_size,
1606
+ n_elem=self.head_dim,
1607
+ dtype=x.dtype,
1608
+ device=x.device,
1609
+ base=self.rope_base,
1610
+ )
1611
+
1612
+ q = apply_rope(q, self.rope_cache)
1613
+ k = apply_rope(k, self.rope_cache)
1614
+
1615
+ if use_cache:
1616
+ if last_past is not None:
1617
+ past_key, past_value = last_past
1618
+ k = torch.cat([past_key, k], dim=-2)
1619
+ v = torch.cat([past_value, v], dim=-2)
1620
+ present = (k, v)
1621
+ else:
1622
+ present = None
1623
+
1624
+ k_repeat = repeat_kv(k, self.num_kv_groups)
1625
+ v_repeat = repeat_kv(v, self.num_kv_groups)
1626
+
1627
+ y = F.scaled_dot_product_attention(
1628
+ q,
1629
+ k_repeat,
1630
+ v_repeat,
1631
+ attn_mask=None,
1632
+ dropout_p=0.0,
1633
+ is_causal=True,
1634
+ scale=self.softmax_scale,
1635
+ )
1636
+
1637
+ y = y.transpose(1, 2).contiguous().view(B, T, self.n_head * self.head_dim)
1638
+ y = self.o_proj(y)
1639
+
1640
+ if use_cache:
1641
+ return y, present
1642
+ return y
1643
+
1644
+ def forward_attn(self, x: torch.Tensor, last_past=None, use_cache=False) -> torch.Tensor:
1645
+ B, T, _ = x.size()
1646
+
1647
+ q = self.q_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
1648
+ k = self.k_proj(x).view(B, T, self.n_kv_head, self.head_dim).transpose(1, 2)
1649
+ v = self.v_proj(x).view(B, T, self.n_kv_head, self.head_dim).transpose(1, 2)
1650
+
1651
+ q = self.q_norm(q)
1652
+ k = self.k_norm(k)
1653
+
1654
+ if (
1655
+ self.rope_cache is None
1656
+ or self.rope_cache.dtype != x.dtype
1657
+ or self.rope_cache.device != x.device
1658
+ ):
1659
+ self.rope_cache = build_rope_cache(
1660
+ seq_len=self.block_size,
1661
+ n_elem=self.head_dim,
1662
+ dtype=x.dtype,
1663
+ device=x.device,
1664
+ base=self.rope_base,
1665
+ )
1666
+
1667
+ q = apply_rope(q, self.rope_cache)
1668
+ k = apply_rope(k, self.rope_cache)
1669
+
1670
+ if use_cache:
1671
+ if last_past is not None:
1672
+ past_key, past_value = last_past
1673
+ k = torch.cat([past_key, k], dim=-2)
1674
+ v = torch.cat([past_value, v], dim=-2)
1675
+
1676
+ k_repeat = repeat_kv(k, self.num_kv_groups)
+ v_repeat = repeat_kv(v, self.num_kv_groups)
+
+ att = torch.matmul(q, k_repeat.transpose(-2, -1)) * self.softmax_scale
+ # apply a causal mask (bottom-right aligned so cached keys stay visible), matching the causal SDPA path in forward()
+ T_k = k_repeat.size(-2)
+ causal_mask = torch.ones(T, T_k, dtype=torch.bool, device=x.device).tril(diagonal=T_k - T)
+ att = att.masked_fill(~causal_mask, float("-inf"))
+ att = F.softmax(att, dim=-1)
1681
+
1682
+ y = torch.matmul(att, v_repeat)
1683
+ y = y.transpose(1, 2).contiguous().view(B, T, self.n_head * self.head_dim)
1684
+ y = self.o_proj(y)
1685
+
1686
+ return y, att
1687
+
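+ # Minimal sketch (defined for illustration, never called) of incremental decoding with the
+ # KV cache above: run the prefix once with use_cache=True, then feed one new position and
+ # pass the returned (k, v) back in as last_past. Note that apply_rope indexes positions
+ # from 0 on every call, so a cached single-step call may need an explicit position offset
+ # in practice.
+ def _example_incremental_attention(attn: "CausalSelfAttention", prefix: torch.Tensor, step: torch.Tensor):
+     y, present = attn(prefix, last_past=None, use_cache=True)       # prefix: [B, T, n_embd]
+     y_new, present = attn(step, last_past=present, use_cache=True)  # step:   [B, 1, n_embd]
+     return y_new, present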
1688
+
1689
+ class CrossAttention(nn.Module):
1690
+ def __init__(self, config: LLaMAHFConfig) -> None:
1691
+ super().__init__()
1692
+ assert config.n_embd % config.n_head == 0
1693
+
1694
+ self.n_head = config.n_head
1695
+ self.n_kv_head = config.n_kv_head or max(1, config.n_head // 4)
1696
+ assert self.n_head % self.n_kv_head == 0, "n_head must be divisible by n_kv_head"
1697
+ self.head_dim = config.n_embd // config.n_head
1698
+ self.num_kv_groups = self.n_head // self.n_kv_head
1699
+
1700
+ self.q_proj = nn.Linear(config.n_embd, self.n_head * self.head_dim, bias=False)
1701
+ self.k_proj = nn.Linear(config.n_embd, self.n_kv_head * self.head_dim, bias=False)
1702
+ self.v_proj = nn.Linear(config.n_embd, self.n_kv_head * self.head_dim, bias=False)
1703
+ self.o_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
1704
+
1705
+ self.q_norm = RMSNorm(self.head_dim)
1706
+ self.k_norm = RMSNorm(self.head_dim)
1707
+
1708
+ self.softmax_scale = self.head_dim ** -0.5
1709
+
1710
+ def forward(self, x: torch.Tensor, context: torch.Tensor) -> torch.Tensor:
1711
+ B, T, _ = x.size()
1712
+ _, S, _ = context.size()
1713
+
1714
+ q = self.q_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
1715
+ k = self.k_proj(context).view(B, S, self.n_kv_head, self.head_dim).transpose(1, 2)
1716
+ v = self.v_proj(context).view(B, S, self.n_kv_head, self.head_dim).transpose(1, 2)
1717
+
1718
+ q = self.q_norm(q)
1719
+ k = self.k_norm(k)
1720
+
1721
+ k_repeat = repeat_kv(k, self.num_kv_groups)
1722
+ v_repeat = repeat_kv(v, self.num_kv_groups)
1723
+
1724
+ y = F.scaled_dot_product_attention(
1725
+ q,
1726
+ k_repeat,
1727
+ v_repeat,
1728
+ attn_mask=None,
1729
+ dropout_p=0.0,
1730
+ is_causal=False,
1731
+ scale=self.softmax_scale,
1732
+ )
1733
+
1734
+ y = y.transpose(1, 2).contiguous().view(B, T, self.n_head * self.head_dim)
1735
+ return self.o_proj(y)
1736
+
1737
+
1738
+ def repeat_kv(hidden_states: torch.Tensor, num_groups: int) -> torch.Tensor:
1739
+ if num_groups == 1:
1740
+ return hidden_states
1741
+ bsz, n_kv, seq_len, head_dim = hidden_states.shape
1742
+ hidden_states = hidden_states.unsqueeze(2).expand(bsz, n_kv, num_groups, seq_len, head_dim)
1743
+ return hidden_states.reshape(bsz, n_kv * num_groups, seq_len, head_dim)
1744
+
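+ # Example of the grouped-query expansion performed by repeat_kv: with 2 KV heads and
+ # 8 query heads (num_groups = 4), each KV head is repeated 4 times along the head axis.
+ #   kv = torch.randn(1, 2, 10, 64)              # [B, n_kv_head, S, head_dim]
+ #   assert repeat_kv(kv, 4).shape == (1, 8, 10, 64)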
1745
+
1746
+ class LengthCausalSelfAttention(nn.Module):
1747
+ def __init__(self, config: LLaMAHFConfig) -> None:
1748
+ super().__init__()
1749
+ assert config.n_embd % config.n_head == 0
1750
+
1751
+ self.n_head = config.n_head
1752
+ self.n_kv_head = config.n_kv_head or max(1, config.n_head // 4)
1753
+ assert self.n_head % self.n_kv_head == 0, "n_head must be divisible by n_kv_head"
1754
+ self.head_dim = config.n_embd // config.n_head
1755
+ self.block_size = config.block_size
1756
+ self.rope_base = config.rope_base
1757
+ self.rope_cache = None
1758
+ self.num_kv_groups = self.n_head // self.n_kv_head
1759
+
1760
+ self.q_proj = nn.Linear(config.n_embd, self.n_head * self.head_dim, bias=False)
1761
+ self.k_proj = nn.Linear(config.n_embd, self.n_kv_head * self.head_dim, bias=False)
1762
+ self.v_proj = nn.Linear(config.n_embd, self.n_kv_head * self.head_dim, bias=False)
1763
+ self.o_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
1764
+
1765
+ self.q_norm = RMSNorm(self.head_dim)
1766
+ self.k_norm = RMSNorm(self.head_dim)
1767
+
1768
+ self.softmax_scale = self.head_dim ** -0.5
1769
+
1770
+ def forward(self, x: torch.Tensor, y_mask: torch.Tensor) -> torch.Tensor:
1771
+ B, T, _ = x.size()
1772
+
1773
+ q = self.q_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
1774
+ k = self.k_proj(x).view(B, T, self.n_kv_head, self.head_dim).transpose(1, 2)
1775
+ v = self.v_proj(x).view(B, T, self.n_kv_head, self.head_dim).transpose(1, 2)
1776
+
1777
+ q = self.q_norm(q)
1778
+ k = self.k_norm(k)
1779
+
1780
+ if (
1781
+ self.rope_cache is None
1782
+ or self.rope_cache.dtype != x.dtype
1783
+ or self.rope_cache.device != x.device
1784
+ ):
1785
+ self.rope_cache = build_rope_cache(
1786
+ seq_len=self.block_size,
1787
+ n_elem=self.head_dim,
1788
+ dtype=x.dtype,
1789
+ device=x.device,
1790
+ base=self.rope_base,
1791
+ )
1792
+
1793
+ q = apply_rope(q, self.rope_cache)
1794
+ k = apply_rope(k, self.rope_cache)
1795
+
1796
+ attn_mask = torch.ones(T, T, dtype=torch.bool, device=x.device)
1797
+ attn_mask = torch.tril(attn_mask)
1798
+ attn_mask = attn_mask.unsqueeze(0).expand(B, -1, -1)
1799
+
1800
+ text_mask = y_mask.unsqueeze(2) * y_mask.unsqueeze(1)
1801
+ text_mask = F.pad(text_mask, (0, T - y_mask.shape[1], 0, T - y_mask.shape[1]), mode='constant', value=0)
1802
+ attn_mask = torch.logical_or(attn_mask, text_mask)
1803
+
1804
+ k_repeat = repeat_kv(k, self.num_kv_groups)
1805
+ v_repeat = repeat_kv(v, self.num_kv_groups)
1806
+
1807
+ y = F.scaled_dot_product_attention(
1808
+ q,
1809
+ k_repeat,
1810
+ v_repeat,
1811
+ attn_mask=attn_mask.unsqueeze(1),
1812
+ dropout_p=0.0,
1813
+ is_causal=False,
1814
+ scale=self.softmax_scale,
1815
+ )
1816
+
1817
+ y = y.transpose(1, 2).contiguous().view(B, T, self.n_head * self.head_dim)
1818
+ y = self.o_proj(y)
1819
+
1820
+ return y
1821
+
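+ # Sketch (defined for illustration, never called) of the mask built in
+ # LengthCausalSelfAttention.forward: causal over the full sequence, with the text-prompt
+ # positions marked by y_mask ([B, L], 0/1) additionally allowed to attend to each other
+ # bidirectionally.
+ def _example_length_mask(T: int, y_mask: torch.Tensor) -> torch.Tensor:
+     causal = torch.tril(torch.ones(T, T, dtype=torch.bool, device=y_mask.device))
+     text = y_mask.unsqueeze(2) * y_mask.unsqueeze(1)                        # [B, L, L]
+     text = F.pad(text, (0, T - text.size(-1), 0, T - text.size(-2)), value=0)
+     return torch.logical_or(causal.unsqueeze(0), text.bool())               # [B, T, T]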
1822
+
1823
+ class MLP(nn.Module):
1824
+ def __init__(self, config: LLaMAHFConfig) -> None:
1825
+ super().__init__()
1826
+ hidden_dim = 4 * config.n_embd
1827
+ n_hidden = int(2 * hidden_dim / 3)
1828
+ N = 256
1829
+ # round n_hidden up to the nearest multiple of N
1830
+ n_hidden = ((n_hidden - 1) // N) * N + N
1831
+
1832
+ self.c_fc1 = nn.Linear(config.n_embd, n_hidden, bias=False)
1833
+ self.c_fc2 = nn.Linear(config.n_embd, n_hidden, bias=False)
1834
+ self.c_proj = nn.Linear(n_hidden, config.n_embd, bias=False)
1835
+
1836
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1837
+
1838
+ x = F.silu(self.c_fc1(x)) * self.c_fc2(x)
1839
+ x = self.c_proj(x)
1840
+ return x
1841
+
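+ # Hidden-size arithmetic for the SwiGLU MLP above (worked example): the hidden width is
+ # 2/3 of 4*n_embd, rounded up to a multiple of 256. For n_embd = 1024: 4*1024 = 4096,
+ # int(2*4096/3) = 2730, rounded up to 2816.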
1842
+
1843
+ class RMSNorm(nn.Module):
1844
+ """Root Mean Square Layer Normalization.
1845
+
1846
+ Derived from https://github.com/bzhangGo/rmsnorm/blob/master/rmsnorm_torch.py. BSD 3-Clause License:
1847
+ https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE.
1848
+ """
1849
+
1850
+ def __init__(self, size: int, dim: int = -1, eps: float = 1e-5) -> None:
1851
+ super().__init__()
1852
+ self.scale = nn.Parameter(torch.ones(size))
1853
+ self.eps = eps
1854
+ self.dim = dim
1855
+
1856
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1857
+ # NOTE: the original RMSNorm paper implementation is not equivalent
1858
+ # norm_x = x.norm(2, dim=self.dim, keepdim=True)
1859
+ # rms_x = norm_x * d_x ** (-1. / 2)
1860
+ # x_normed = x / (rms_x + self.eps)
1861
+ norm_x = torch.mean(x * x, dim=self.dim, keepdim=True)
1862
+ x_normed = x * torch.rsqrt(norm_x + self.eps)
1863
+ return self.scale * x_normed
1864
+
1865
+
1866
+ def build_rope_cache(seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000) -> torch.Tensor:
1867
+ """
1868
+ Rotary-position cache with safe dtype handling.
1869
+ """
1870
+ theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=dtype, device=device) / n_elem))
1871
+ seq_idx = torch.arange(seq_len, dtype=dtype, device=device)
1872
+ idx_theta = torch.outer(seq_idx, theta)
1873
+
1874
+ # cast to float32 for torch.polar when needed
1875
+ dtypes_requiring_casting = [torch.float16, torch.bfloat16, torch.int8]
1876
+ working_dtype = torch.float32 if dtype in dtypes_requiring_casting else dtype
1877
+ complex_dtype = torch.complex64  # complex32 support in torch is too limited to rely on
1878
+
1879
+ cache = torch.polar(torch.ones_like(idx_theta, dtype=working_dtype, device=device),
1880
+ idx_theta.to(working_dtype)).to(complex_dtype)
1881
+ return cache
1882
+
1883
+
1884
+ def apply_rope(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
1885
+ x = x.transpose(1, 2)
1886
+
1887
+ # truncate to support variable sizes
1888
+ T = x.size(1)
1889
+ rope_cache = rope_cache[:T]
1890
+ # cast because `view_as_complex` does not support 16 bit tensors
1891
+ xc = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
1892
+ rope_cache = rope_cache.view(1, xc.size(1), 1, xc.size(3))
1893
+ x_out = torch.view_as_real(xc * rope_cache).flatten(3)
1894
+ return x_out.transpose(1, 2).type_as(x)
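+
+
+ # Small self-check sketch (defined for illustration, never called) for the RoPE helpers
+ # above: build a cache once for the maximum sequence length, then rotate a query/key
+ # tensor laid out as [B, n_head, T, head_dim]; the shape is preserved.
+ def _example_rope_roundtrip(B: int = 2, n_head: int = 4, T: int = 8, head_dim: int = 32) -> torch.Tensor:
+     q = torch.randn(B, n_head, T, head_dim)
+     cache = build_rope_cache(seq_len=16, n_elem=head_dim, dtype=q.dtype, device=q.device)
+     q_rot = apply_rope(q, cache)      # positions 0..T-1 rotated, dtype/shape unchanged
+     assert q_rot.shape == q.shape
+     return q_rot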