File size: 16,263 Bytes

"""Continuous latent reasoning model.

Sequence layout (no <think>/</think> tokens — latent positions are inputs_embeds):

    [ x_tokens ; z_1, ..., z_K ; y_tokens ]
       ^^^^^^^    ^^^^^^^^^^^^   ^^^^^^^^
       discrete    continuous     discrete
                   (W_proj of      (gold answer
                    prev hidden)   during training)

Gradient flow: full backprop through z_t = W_proj(h_{t-1}). No sampling,
no torch.no_grad() in the latent path. The y-row attention mask blocks
attention to x columns so the latent is the only information channel.
"""
from __future__ import annotations

import math
from dataclasses import dataclass
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F

NEG = -1e9


@dataclass
class BLTConfig:
    base_model: str = "Qwen/Qwen2.5-1.5B-Instruct"
    use_lora: bool = True
    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.05
    lora_target_modules: tuple = ("q_proj", "k_proj", "v_proj", "o_proj")
    K_latents: int = 4
    block_y_to_x: bool = True
    block_z_to_x: bool = False  # close the z→x architectural leak path (see build_blt_mask)
    proj_init_scale: float = 0.02
    dtype: str = "bfloat16"
    attn_impl: str = "eager"  # required for 4D additive mask
    gradient_checkpointing: bool = False  # trade compute for activation memory; needed for 7B


def build_base(cfg: BLTConfig):
    """Load tokenizer + base CausalLM, optionally wrap with LoRA."""
    from transformers import AutoModelForCausalLM, AutoTokenizer

    dtype = {"bfloat16": torch.bfloat16, "float16": torch.float16, "float32": torch.float32}[cfg.dtype]
    tok = AutoTokenizer.from_pretrained(cfg.base_model, trust_remote_code=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        cfg.base_model,
        torch_dtype=dtype,
        attn_implementation=cfg.attn_impl,
        trust_remote_code=True,
    )
    model.config.use_cache = False
    if getattr(cfg, "gradient_checkpointing", False):
        # Must enable BEFORE peft wrap; peft propagates the flag to the base model.
        # use_reentrant=False avoids the deprecation warning and is recommended for
        # modern HF + custom attention masks (our 4D mask path is non-trivial).
        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
        # For HF checkpointing to actually propagate grads through inputs_embeds
        # (which we use for the latent loop), we need to make inputs require grad.
        # peft handles this via enable_input_require_grads on the wrapped model.
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
    if cfg.use_lora:
        from peft import LoraConfig, get_peft_model, TaskType
        lcfg = LoraConfig(
            r=cfg.lora_r, lora_alpha=cfg.lora_alpha, lora_dropout=cfg.lora_dropout,
            bias="none", task_type=TaskType.CAUSAL_LM,
            target_modules=list(cfg.lora_target_modules),
        )
        model = get_peft_model(model, lcfg)
        model.print_trainable_parameters()
        if getattr(cfg, "gradient_checkpointing", False) and hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
    return model, tok


class LatentProjector(nn.Module):
    """Maps last-layer hidden state to next-step input embedding.

    Two variants, selected via `use_mlp`:
    * Linear (default, original): a single d→d linear layer, bias=False.
    * MLP: d → (hidden_mult·d) → d with GELU. More expressive non-linear
      compression — necessary if the single linear projection bottlenecks
      latent informativeness. Output bias is zeroed at init so the first
      forward is near-zero, mimicking the Linear variant's startup.

    `init_scale` controls the std of all weight initializations.
    """
    def __init__(self, d_model: int, init_scale: float = 0.02,
                 use_mlp: bool = False, hidden_mult: int = 4):
        super().__init__()
        self.use_mlp = use_mlp
        if use_mlp:
            d_hidden = d_model * hidden_mult
            self.proj = nn.Sequential(
                nn.Linear(d_model, d_hidden, bias=True),
                nn.GELU(),
                nn.Linear(d_hidden, d_model, bias=True),
            )
            nn.init.normal_(self.proj[0].weight, mean=0.0, std=init_scale)
            nn.init.zeros_(self.proj[0].bias)
            nn.init.normal_(self.proj[2].weight, mean=0.0, std=init_scale)
            nn.init.zeros_(self.proj[2].bias)
        else:
            self.proj = nn.Linear(d_model, d_model, bias=False)
            nn.init.normal_(self.proj.weight, mean=0.0, std=init_scale)

    def forward(self, h: torch.Tensor) -> torch.Tensor:
        return self.proj(h)


def _get_input_embeddings(model) -> nn.Module:
    """Returns the input embedding layer, working through PEFT wrap."""
    inner = model.get_base_model() if hasattr(model, "get_base_model") else model
    return inner.get_input_embeddings()


def _get_lm_head(model) -> nn.Module:
    inner = model.get_base_model() if hasattr(model, "get_base_model") else model
    return inner.get_output_embeddings()


def build_blt_mask(
    B: int, P: int, K: int, L_y: int, device, dtype, *,
    block_y_to_x: bool,
    block_z_to_x: bool = False,
) -> torch.Tensor:
    """4D additive attention mask [B, 1, T, T] with T = P + K + L_y.

    - Lower-triangular causal everywhere.
    - If block_y_to_x: y rows (positions [P+K, P+K+L_y)) cannot attend to
      x cols (positions [0, P)).
    - If block_z_to_x: z rows (positions [P, P+K)) ALSO cannot attend to x.
      This closes the architectural "leak" path where z hidden states in
      pass 2 could attend to x and deliver x-info to y bypassing z's input
      content. With block_z_to_x=True, z hidden states depend only on z
      input embeddings + z self-attention. The z input (= π(h_{t-1}) from
      pass 1) becomes the *only* carrier of x→y information, forcing z's
      input value to actually matter at inference.
    """
    T = P + K + L_y
    # Start with full -inf, fill 0 where attention is allowed.
    add = torch.full((B, 1, T, T), NEG, device=device, dtype=dtype)
    # Causal: allow j <= i
    row = torch.arange(T, device=device).unsqueeze(1)  # [T, 1]
    col = torch.arange(T, device=device).unsqueeze(0)  # [1, T]
    causal = (col <= row)                              # [T, T] bool
    add[:, 0, :, :] = torch.where(causal, torch.zeros_like(add[0, 0]), torch.full_like(add[0, 0], NEG))
    if block_y_to_x and P > 0 and L_y > 0:
        # zero-out y→x by re-applying NEG to the y-row × x-col block.
        add[:, 0, P + K : P + K + L_y, 0:P] = NEG
    if block_z_to_x and P > 0 and K > 0:
        # zero-out z→x: z rows cannot attend to x cols.
        add[:, 0, P : P + K, 0:P] = NEG
    return add


def forward_with_latent(
    model,
    x_ids: torch.Tensor,       # [B, P]
    x_attn: torch.Tensor,      # [B, P]  1=keep, 0=pad (left-padded)
    y_ids: Optional[torch.Tensor],   # [B, L_y]  None at inference
    projector: LatentProjector,
    K: int,
    *,
    block_y_to_x: bool = True,
    block_z_to_x: bool = False,
    return_z: bool = True,
):
    """Run [x; z_1..z_K; y] in two passes:

      pass-1 (KV-cached, with grad):  iteratively build z_1..z_K from
                                       the running last-layer hidden state.
      pass-2 (single full forward):    [embed(x); z_1..z_K; embed(y)] with
                                       custom 4D mask blocking y→x. Returns
                                       logits for y positions.

    Returns:
      logits_y : [B, L_y, V]   (None if y_ids is None)
      z        : [B, K, d]     latent vectors (with grad)
      h_last_y : [B, L_y, d]   last-layer hidden states at y positions (None if y is None)
    """
    inner = model.get_base_model() if hasattr(model, "get_base_model") else model
    embed_in = inner.get_input_embeddings()
    lm_head = inner.get_output_embeddings()
    device = x_ids.device
    dtype = embed_in.weight.dtype
    B, P = x_ids.shape

    # ---- Pass 1: iterative z construction with KV cache, grad retained ----
    # Initial forward over x to produce the running last-position hidden state.
    # We use the underlying base model (`inner.model`) for hidden-state access.
    base_lm = inner  # e.g., Qwen2ForCausalLM
    transformer = base_lm.model  # Qwen2Model

    x_embeds = embed_in(x_ids)
    out0 = transformer(
        inputs_embeds=x_embeds,
        attention_mask=x_attn,
        use_cache=True,
        return_dict=True,
    )
    past = out0.past_key_values
    # Grab last-token hidden state, accounting for left-pad: use the last
    # non-pad position. Since we left-pad, the last position is always real.
    h_prev = out0.last_hidden_state[:, -1, :]   # [B, d]

    z_list: List[torch.Tensor] = []
    cur_attn = x_attn
    for t in range(K):
        z_t = projector(h_prev)                        # [B, d]
        z_list.append(z_t)
        cur_attn = torch.cat(
            [cur_attn, torch.ones(B, 1, device=device, dtype=cur_attn.dtype)], dim=1
        )
        out_t = transformer(
            inputs_embeds=z_t.unsqueeze(1),
            attention_mask=cur_attn,
            past_key_values=past,
            use_cache=True,
            return_dict=True,
        )
        past = out_t.past_key_values
        h_prev = out_t.last_hidden_state[:, -1, :]

    z = torch.stack(z_list, dim=1)        # [B, K, d]

    if y_ids is None:
        return None, z, None

    # ---- Pass 2: full forward with custom mask, no past_kv ----
    y_embeds = embed_in(y_ids)
    L_y = y_ids.size(1)
    # Cast z to the embedding dtype to match.
    full_embeds = torch.cat([x_embeds, z.to(y_embeds.dtype), y_embeds], dim=1)
    full_4d = build_blt_mask(B, P, K, L_y, device=device, dtype=full_embeds.dtype,
                              block_y_to_x=block_y_to_x, block_z_to_x=block_z_to_x)

    # We also need to respect x pad columns (left-pad → kv positions in x
    # that are pad should be masked from EVERYTHING, including latents).
    if (x_attn == 0).any():
        # Build a 1D mask of pad columns: True where pad.
        pad_cols = (x_attn == 0)             # [B, P]
        pad_kv = torch.cat([pad_cols, torch.zeros(B, K + L_y, device=device, dtype=torch.bool)], dim=1)
        # Broadcast: for each (b), set add[b, 0, :, j] = NEG where pad_kv[b, j].
        full_4d = full_4d.clone()
        full_4d.masked_fill_(pad_kv[:, None, None, :], NEG)

    out2 = transformer(
        inputs_embeds=full_embeds,
        attention_mask=full_4d,
        use_cache=False,
        return_dict=True,
    )
    h_full = out2.last_hidden_state            # [B, T, d]
    # logits over y *predictions*: position t predicts token t+1, so for the
    # y-segment we read logits at positions [P+K-1, P+K+L_y-1) and compare
    # with y_ids[:, :L_y].
    logits_all = lm_head(h_full)               # [B, T, V]
    pred_slice = logits_all[:, P + K - 1 : P + K - 1 + L_y, :]   # [B, L_y, V]
    h_last_y = h_full[:, P + K : P + K + L_y, :]                 # [B, L_y, d]

    return pred_slice, z, h_last_y


@torch.no_grad()
def generate_with_latent(
    model,
    tokenizer,
    projector: LatentProjector,
    x_ids: torch.Tensor,         # [B, P]
    x_attn: torch.Tensor,
    K: int,
    *,
    block_y_to_x: bool = True,
    max_new_tokens: int = 256,
    temperature: float = 0.0,
    eos_token_id: Optional[int] = None,
    override_z: Optional[torch.Tensor] = None,  # [B, K, d] forced latents (ablation)
):
    """Greedy / temperature decoding with the latent loop.

    override_z: if provided, skip the latent-loop pass and use these latents
                directly. For ablations: random-z (gaussian noise), zero-z
                (K=0), shuffled-z, etc.
    """
    inner = model.get_base_model() if hasattr(model, "get_base_model") else model
    transformer = inner.model
    embed_in = inner.get_input_embeddings()
    lm_head = inner.get_output_embeddings()
    device = x_ids.device
    B, P = x_ids.shape
    eos = eos_token_id if eos_token_id is not None else tokenizer.eos_token_id

    x_embeds = embed_in(x_ids)

    # ---- z (computed or overridden) ----
    if override_z is not None:
        K_eff = override_z.size(1)
        z = override_z
        # Still need to "consume" x and the latents through the transformer
        # to build past_kv used for answer generation. Do a single forward.
        full_embeds = torch.cat([x_embeds, z.to(x_embeds.dtype)], dim=1)
        cur_attn = torch.cat(
            [x_attn, torch.ones(B, K_eff, device=device, dtype=x_attn.dtype)], dim=1
        )
        # Build a 4D mask: causal + x-pads masked
        T0 = P + K_eff
        add = torch.full((B, 1, T0, T0), NEG, device=device, dtype=x_embeds.dtype)
        row = torch.arange(T0, device=device).unsqueeze(1)
        col = torch.arange(T0, device=device).unsqueeze(0)
        causal = (col <= row)
        add[:, 0, :, :] = torch.where(causal, torch.zeros_like(add[0, 0]),
                                       torch.full_like(add[0, 0], NEG))
        if (x_attn == 0).any():
            pad_kv = torch.cat([(x_attn == 0),
                                torch.zeros(B, K_eff, device=device, dtype=torch.bool)], dim=1)
            add.masked_fill_(pad_kv[:, None, None, :], NEG)
        out0 = transformer(inputs_embeds=full_embeds, attention_mask=add,
                           use_cache=True, return_dict=True)
        past = out0.past_key_values
        h_last = out0.last_hidden_state[:, -1, :]
    else:
        K_eff = K
        out0 = transformer(inputs_embeds=x_embeds, attention_mask=x_attn,
                           use_cache=True, return_dict=True)
        past = out0.past_key_values
        h_prev = out0.last_hidden_state[:, -1, :]
        cur_attn = x_attn
        for t in range(K):
            z_t = projector(h_prev)
            cur_attn = torch.cat([cur_attn, torch.ones(B, 1, device=device, dtype=cur_attn.dtype)], dim=1)
            out_t = transformer(inputs_embeds=z_t.unsqueeze(1), attention_mask=cur_attn,
                                past_key_values=past, use_cache=True, return_dict=True)
            past = out_t.past_key_values
            h_prev = out_t.last_hidden_state[:, -1, :]
        h_last = h_prev

    # ---- Answer phase: autoregressive decoding.  ----
    # When block_y_to_x is on, we need y rows to not attend to the first P kv
    # positions. With KV cache + eager, we pass a 2D attn mask over kv-length
    # where the x portion is 0. This zeroes out x in additive form.
    # NB: We zero x but keep latent + prior y at 1.
    gen_ids = []
    last_logits = lm_head(h_last)   # [B, V]

    # Build a base attn mask for y queries: 0 over x, 1 over latents, 1 over prior y.
    # Sequence length grows by 1 each step.
    y_kv_base = torch.cat(
        [torch.zeros(B, P, device=device, dtype=cur_attn.dtype) if block_y_to_x else x_attn,
         torch.ones(B, K_eff, device=device, dtype=cur_attn.dtype)],
        dim=1,
    )

    done = torch.zeros(B, dtype=torch.bool, device=device)
    for step in range(max_new_tokens):
        if temperature <= 0.0:
            nxt = last_logits.argmax(dim=-1)
        else:
            probs = torch.softmax(last_logits.float() / max(temperature, 1e-6), dim=-1)
            nxt = torch.multinomial(probs, num_samples=1).squeeze(-1)
        nxt = torch.where(done, torch.full_like(nxt, tokenizer.pad_token_id), nxt)
        gen_ids.append(nxt)
        new_done = done | (nxt == eos)
        if bool(new_done.all().item()):
            done = new_done
            break
        done = new_done

        y_emb = embed_in(nxt.unsqueeze(-1))    # [B, 1, d]
        y_kv_base = torch.cat([y_kv_base, torch.ones(B, 1, device=device, dtype=y_kv_base.dtype)], dim=1)
        out = transformer(inputs_embeds=y_emb, attention_mask=y_kv_base,
                          past_key_values=past, use_cache=True, return_dict=True)
        past = out.past_key_values
        last_logits = lm_head(out.last_hidden_state[:, -1, :])

    return torch.stack(gen_ids, dim=1)        # [B, L_gen]