from __future__ import annotations

import math
import os
from contextlib import nullcontext
from itertools import chain
from typing import Any, Dict, Optional, Sequence, Tuple

import torch
import torch.nn.functional as F
from torch import Tensor, nn

from instruct_particulate.utils.inference_utils import (
    axis_point_to_plucker_torch,
    estimate_prismatic_limit_torch,
    estimate_revolute_limit_torch,
    fit_axis_to_closest_points_torch,
)
from instruct_particulate.utils.partfield_feature_utils import (
    PARTFIELD_FEATURE_DIM,
    PartFieldFeatureExtractor,
)
from instruct_particulate.utils.text_embedding_utils import (
    encode_clip_text_prompts,
    load_clip_text_encoder,
)


def _make_silu_mlp(
    input_dim: int,
    hidden_dim: int,
    output_dim: int,
    *,
    bias: bool = True,
) -> nn.Sequential:
    """Build a two-layer perceptron with a SiLU non-linearity in between.

    Args:
        input_dim: Feature size expected by the first linear layer.
        hidden_dim: Width of the hidden representation.
        output_dim: Feature size produced by the second linear layer.
        bias: Whether both linear layers carry a bias term.

    Returns:
        ``nn.Sequential(Linear, SiLU, Linear)``.
    """
    # Layers are created in application order so parameter initialisation
    # consumes the RNG in the same sequence as the layers appear.
    expand = nn.Linear(input_dim, hidden_dim, bias=bias)
    activation = nn.SiLU()
    contract = nn.Linear(hidden_dim, output_dim, bias=bias)
    return nn.Sequential(expand, activation, contract)


# Lower bound for a part AABB half-extent on the overparametrized axis decode
# path -- presumably guards against degenerate (near-zero) extents; the use
# site is not in this part of the file, TODO confirm.
_OVERPARAM_AXIS_AABB_HALF_EXTENT_MIN = 1e-4
# Canonical joint decode type names (see `_normalize_joint_decode_type`) that
# belong to the "plain" decoder family.
_PLAIN_JOINT_DECODE_TYPES = frozenset({"plain", "plain+fm"})
# Canonical joint decode type names that belong to the "overparametrized"
# decoder family.
_OVERPARAM_JOINT_DECODE_TYPES = frozenset(
    {"overparametrized", "overparam+dir", "overparam+singledir"}
)


def _normalize_joint_decode_type(joint_decode_type: str) -> str:
    """Map a joint decode type spelling onto its canonical name.

    The input is stringified and lower-cased, then matched against the known
    alias groups. A spelling that matches no group is returned lower-cased and
    otherwise unchanged, so validation is left to the caller.
    """
    lowered = str(joint_decode_type).lower()
    # (canonical name, accepted spellings) in the order they are checked.
    alias_groups = (
        (
            "plain+fm",
            ("plain+flowmatching", "plain+flow-matching", "plain+fm"),
        ),
        (
            "overparametrized",
            ("overparameterization", "overparameterized", "overparam"),
        ),
        (
            "overparam+dir",
            (
                "overparameterization+dir",
                "overparameterized+dir",
                "overparametrized+dir",
                "overparam+dir",
            ),
        ),
        (
            "overparam+singledir",
            (
                "overparameterization+singledir",
                "overparameterized+singledir",
                "overparametrized+singledir",
                "overparam+singledir",
                "overparameterization+single-dir",
                "overparameterized+single-dir",
                "overparametrized+single-dir",
                "overparam+single-dir",
            ),
        ),
    )
    for canonical, spellings in alias_groups:
        if lowered in spellings:
            return canonical
    return lowered


def _normalize_joint_fm_prediction_type(prediction_type: str) -> str:
    """Map a flow-matching prediction type spelling onto ``"x"`` or ``"v"``.

    Accepts the bare letter and the ``<letter>pred``, ``<letter>-pred`` and
    ``<letter>_pred`` spellings, case-insensitively. Anything else is returned
    lower-cased so the caller can reject it.
    """
    lowered = str(prediction_type).lower()
    for canonical in ("x", "v"):
        spellings = (
            canonical,
            f"{canonical}pred",
            f"{canonical}-pred",
            f"{canonical}_pred",
        )
        if lowered in spellings:
            return canonical
    return lowered


def _normalize_overparam_closest_axis_space(closest_axis_space: str) -> str:
    """Map a closest-axis space spelling onto ``"world"`` or ``"part_aabb"``.

    Matching is case-insensitive. A spelling that is not a known alias is
    returned lower-cased so the caller can validate it.
    """
    canonical_by_alias = {
        "world": "world",
        "sample": "world",
        "global": "world",
        "global-space": "world",
        "world-space": "world",
        "part_aabb": "part_aabb",
        "part-aabb": "part_aabb",
        "aabb": "part_aabb",
        "local_aabb": "part_aabb",
        "local-aabb": "part_aabb",
    }
    lowered = str(closest_axis_space).lower()
    return canonical_by_alias.get(lowered, lowered)


def _coerce_joint_fm_state_stat(
    values: Optional[Sequence[float]],
    *,
    default_value: float,
    name: str,
) -> Tensor:
    """Turn an optional 8-element statistic into a validated float32 tensor.

    Args:
        values: Eight numbers, or ``None`` to use ``default_value`` for all
            eight entries.
        default_value: Fill value used when ``values`` is ``None``.
        name: Name used in error messages. When it ends with ``"_std"`` the
            entries must additionally be strictly positive.

    Returns:
        A float32 tensor of shape ``(8,)``.

    Raises:
        ValueError: If the length is not 8, an entry is not finite, or a
            ``*_std`` statistic has a non-positive entry.
    """
    expected_length = 8
    raw_values = [default_value] * expected_length if values is None else values
    if len(raw_values) != expected_length:
        raise ValueError(f"{name} must have length 8, got {len(raw_values)}")

    stat = torch.tensor(list(map(float, raw_values)), dtype=torch.float32)
    if not bool(torch.isfinite(stat).all()):
        raise ValueError(f"{name} must contain only finite values, got {raw_values!r}")
    # Standard deviations are used as divisors/scales, so zero is rejected too.
    if name.endswith("_std") and bool((stat <= 0.0).any()):
        raise ValueError(f"{name} must contain only positive values, got {raw_values!r}")
    return stat


def _gather_joint_link_latents(
    *,
    link_latents: Tensor,
    joint_connections: Tensor,
) -> Tuple[Tensor, Tensor]:
    """Look up the parent and child link latent of every joint.

    Args:
        link_latents: Link latents gathered along dim 1; indexed as
            ``(batch, link, feature)`` by the gather below.
        joint_connections: Integer tensor whose last axis holds
            ``(parent_index, child_index)``. Negative indices are clamped to 0
            so the gather stays in range (presumably padding rows -- callers
            are expected to mask them; TODO confirm).

    Returns:
        ``(parent_latents, child_latents)``, one latent row per joint.
    """
    feature_dim = link_latents.shape[-1]
    gathered = []
    for endpoint in (0, 1):  # 0 -> parent column, 1 -> child column
        link_ids = joint_connections[..., endpoint].clamp_min(0)
        index = link_ids.unsqueeze(-1).expand(-1, -1, feature_dim)
        gathered.append(link_latents.gather(dim=1, index=index))
    parent_latents, child_latents = gathered
    return parent_latents, child_latents


def _build_joint_motion_condition_inputs(
    *,
    parent_latents: Tensor,
    child_latents: Tensor,
    motion_type: str,
    revolute_embedding: Tensor,
    prismatic_embedding: Tensor,
) -> Tensor:
    """Concatenate a motion-type embedding with parent and child latents.

    Args:
        parent_latents: Per-joint parent latents; the type embedding is
            broadcast to this shape, so it is treated as 3-D with the feature
            axis last.
        child_latents: Per-joint child latents, same shape as
            ``parent_latents``.
        motion_type: ``"revolute"`` or ``"prismatic"``; selects the embedding.
        revolute_embedding: Embedding used for revolute joints.
        prismatic_embedding: Embedding used for prismatic joints.

    Returns:
        ``[type_embedding, parent_latents, child_latents]`` joined along the
        last axis.

    Raises:
        ValueError: If the latent shapes differ or ``motion_type`` is unknown.
    """
    if parent_latents.shape != child_latents.shape:
        raise ValueError(
            "parent_latents and child_latents must share the same shape, "
            f"got {tuple(parent_latents.shape)} and {tuple(child_latents.shape)}"
        )
    if motion_type not in ("revolute", "prismatic"):
        raise ValueError(
            "motion_type must be 'revolute' or 'prismatic', "
            f"got {motion_type!r}"
        )

    selected = revolute_embedding if motion_type == "revolute" else prismatic_embedding
    # Match the latents' device/dtype, then tile the single embedding vector
    # across every joint position.
    selected = selected.to(device=parent_latents.device, dtype=parent_latents.dtype)
    tiled = selected.view(1, 1, -1).expand_as(parent_latents)
    return torch.cat((tiled, parent_latents, child_latents), dim=-1)


class SwiGLUFeedForward(nn.Module):
    """Gated feed-forward layer: ``out_proj(dropout(value * silu(gate)))``.

    Args:
        dim: Input and output feature size.
        hidden_dim: Width of the gated hidden representation. When omitted (or
            0) it defaults to ``int(dim * multiplier)``.
        multiplier: Expansion factor used to derive the default hidden width.
        dropout: Dropout probability applied to the gated hidden activations.
    """

    def __init__(
        self,
        dim: int,
        hidden_dim: Optional[int] = None,
        multiplier: float = 4.0,
        dropout: float = 0.0,
    ):
        super().__init__()
        width = hidden_dim or int(dim * multiplier)
        # A single projection produces the value and gate halves together.
        self.in_proj = nn.Linear(dim, 2 * width)
        self.out_proj = nn.Linear(width, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the gated MLP along the last axis of ``x``."""
        projected = self.in_proj(x)
        value, gate = torch.chunk(projected, 2, dim=-1)
        gated = self.dropout(value * F.silu(gate))
        return self.out_proj(gated)


class FrequencyMLPEmbedder(nn.Module):
    """Fourier-feature encoder followed by an RMSNorm and a small SiLU MLP.

    Each input coordinate is multiplied by a bank of frequencies and passed
    through sine and cosine; the raw coordinates are optionally appended. The
    encoded vector is normalised and projected to ``output_dim``.

    Args:
        output_dim: Size of the produced embedding.
        num_frequencies: Number of frequencies per coordinate (must be > 0).
            With a single frequency the bank is ``[pi]``; otherwise the
            frequencies are log-spaced from ``pi`` to ``max_frequency * pi``.
        input_dim: Size of the last axis of the inputs (must be > 0).
        hidden_dim: Hidden width of the MLP; defaults to ``output_dim``.
        max_frequency: Largest frequency multiplier (must be > 0).
        include_raw: Whether to append the raw coordinates to the encoding.

    Raises:
        ValueError: If ``num_frequencies``, ``input_dim`` or ``max_frequency``
            is not positive.
    """

    def __init__(
        self,
        output_dim: int,
        num_frequencies: int,
        input_dim: int = 3,
        hidden_dim: Optional[int] = None,
        max_frequency: float = 32.0,
        include_raw: bool = True,
    ):
        super().__init__()
        if num_frequencies <= 0:
            raise ValueError(f"num_frequencies must be positive, got {num_frequencies}")
        if input_dim <= 0:
            raise ValueError(f"input_dim must be positive, got {input_dim}")
        if max_frequency <= 0.0:
            raise ValueError(f"max_frequency must be positive, got {max_frequency}")

        self.input_dim = input_dim
        self.include_raw = include_raw

        if num_frequencies > 1:
            log_spaced = torch.linspace(
                0.0,
                math.log(max_frequency),
                steps=num_frequencies,
                dtype=torch.float32,
            )
            frequencies = torch.exp(log_spaced) * math.pi
        else:
            frequencies = torch.tensor([math.pi], dtype=torch.float32)
        # Non-persistent: the bank is fully determined by the constructor
        # arguments, so it is kept out of the state dict.
        self.register_buffer("frequencies", frequencies, persistent=False)

        # sin + cos per frequency, plus one slot for the raw value if kept.
        features_per_coordinate = 2 * num_frequencies + int(include_raw)
        encoded_dim = input_dim * features_per_coordinate
        self.input_norm = nn.RMSNorm(encoded_dim)
        self.mlp = _make_silu_mlp(
            input_dim=encoded_dim,
            hidden_dim=hidden_dim or output_dim,
            output_dim=output_dim,
        )

    def forward(self, x: Tensor) -> Tensor:
        """Embed ``x`` (last axis of size ``input_dim``) into ``output_dim`` features.

        Raises:
            ValueError: If the last axis of ``x`` is not ``input_dim``.
        """
        if x.shape[-1] != self.input_dim:
            raise ValueError(
                f"expected inputs with last dimension {self.input_dim}, got {tuple(x.shape)}"
            )

        # The trigonometric features are always computed in float32 and only
        # cast to the MLP's parameter dtype afterwards.
        coordinates = x.float()
        phases = coordinates.unsqueeze(-1) * self.frequencies
        features = [torch.cat((phases.sin(), phases.cos()), dim=-1).flatten(start_dim=-2)]
        if self.include_raw:
            features.append(coordinates)
        encoded = torch.cat(features, dim=-1)
        encoded = encoded.to(dtype=self.mlp[0].weight.dtype)
        return self.mlp(self.input_norm(encoded))


class TimestepEmbedder(nn.Module):
    """Sinusoidal embedding of scalar timesteps followed by a SiLU MLP.

    Args:
        hidden_dim: Hidden and output width of the MLP (must be > 0).
        frequency_embedding_dim: Size of the sinusoidal feature vector fed to
            the MLP (must be > 0).
        max_period: Controls the lowest frequency of the sinusoids (must be
            > 0).

    Raises:
        ValueError: If any of the three arguments is not positive.
    """

    def __init__(
        self,
        hidden_dim: int,
        *,
        frequency_embedding_dim: int = 256,
        max_period: float = 10_000.0,
    ):
        super().__init__()
        if hidden_dim <= 0:
            raise ValueError(f"hidden_dim must be positive, got {hidden_dim}")
        if frequency_embedding_dim <= 0:
            raise ValueError(
                f"frequency_embedding_dim must be positive, got {frequency_embedding_dim}"
            )
        if max_period <= 0.0:
            raise ValueError(f"max_period must be positive, got {max_period}")
        self.frequency_embedding_dim = int(frequency_embedding_dim)
        self.max_period = float(max_period)
        self.mlp = _make_silu_mlp(
            input_dim=self.frequency_embedding_dim,
            hidden_dim=hidden_dim,
            output_dim=hidden_dim,
        )

    def _frequency_embedding(self, t: Tensor) -> Tensor:
        """Return ``[cos(t * f), sin(t * f)]`` features for each timestep.

        The frequencies are ``max_period ** (-i / half_dim)`` for
        ``i in range(half_dim)``. An odd ``frequency_embedding_dim`` is padded
        with one trailing zero column; a dimension of 1 passes ``t`` through
        unchanged as a single column.
        """
        num_frequencies = self.frequency_embedding_dim // 2
        if num_frequencies == 0:
            return t.unsqueeze(-1)

        exponents = torch.arange(num_frequencies, device=t.device, dtype=torch.float32)
        exponents = exponents / num_frequencies
        frequencies = torch.exp(exponents * -math.log(self.max_period))
        phases = t.float().unsqueeze(-1) * frequencies
        embedding = torch.cat((phases.cos(), phases.sin()), dim=-1)
        if self.frequency_embedding_dim % 2 != 0:
            zero_column = torch.zeros_like(embedding[:, :1])
            embedding = torch.cat((embedding, zero_column), dim=-1)
        return embedding

    def forward(self, t: Tensor) -> Tensor:
        """Embed a 1-D tensor of timesteps into ``hidden_dim`` features.

        Raises:
            ValueError: If ``t`` is not one-dimensional.
        """
        if t.ndim != 1:
            raise ValueError(f"expected a 1D timestep tensor, got shape {tuple(t.shape)}")
        # Cast the float32 sinusoids to whatever dtype the MLP runs in.
        mlp_dtype = self.mlp[0].weight.dtype
        sinusoids = self._frequency_embedding(t).to(dtype=mlp_dtype)
        return self.mlp(sinusoids)


class SDPASelfAttention(nn.Module):
    """Multi-head self-attention implemented with PyTorch SDPA.

    The input is projected to queries, keys and values by one fused linear
    layer, queries and keys are optionally RMS-normalised per head, attention
    is computed with ``F.scaled_dot_product_attention`` and the heads are
    projected back to ``model_dim``.

    Args:
        model_dim: Feature size of the input and output tokens.
        num_heads: Number of attention heads.
        head_dim: Per-head feature size. Defaults to ``model_dim // num_heads``.
        attn_dropout: Dropout probability on attention weights (training only).
        proj_dropout: Dropout probability on the output projection.
        qk_norm: Whether to RMS-normalise queries and keys per head.
        qkv_bias: Whether the fused QKV projection has a bias.

    Raises:
        ValueError: If ``head_dim`` is omitted and ``model_dim`` is not
            divisible by ``num_heads``.
    """

    def __init__(
        self,
        model_dim: int,
        num_heads: int,
        head_dim: Optional[int] = None,
        attn_dropout: float = 0.0,
        proj_dropout: float = 0.0,
        qk_norm: bool = True,
        qkv_bias: bool = True,
    ):
        super().__init__()
        if head_dim is None:
            if model_dim % num_heads != 0:
                raise ValueError(
                    f"model_dim ({model_dim}) must be divisible by num_heads ({num_heads}) "
                    "when head_dim is not specified"
                )
            head_dim = model_dim // num_heads

        self.model_dim = model_dim
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.inner_dim = num_heads * head_dim
        self.attn_dropout = attn_dropout

        self.qkv_proj = nn.Linear(model_dim, 3 * self.inner_dim, bias=qkv_bias)
        self.out_proj = nn.Linear(self.inner_dim, model_dim)
        self.out_dropout = nn.Dropout(proj_dropout)

        self.q_norm = nn.RMSNorm(head_dim, eps=1e-6) if qk_norm else nn.Identity()
        self.k_norm = nn.RMSNorm(head_dim, eps=1e-6) if qk_norm else nn.Identity()

    def _reshape_heads(self, x: Tensor) -> Tensor:
        """Reshape ``(batch, seq, inner_dim)`` to ``(batch, heads, seq, head_dim)``."""
        batch_size, seq_len, _ = x.shape
        x = x.view(batch_size, seq_len, self.num_heads, self.head_dim)
        return x.transpose(1, 2)

    def forward(self, x: Tensor, *, mask: Tensor | None = None) -> Tensor:
        """Attend over the tokens of ``x``.

        Args:
            x: Tokens of shape ``(batch, seq, model_dim)``.
            mask: Optional boolean ``(batch, seq)`` tensor, ``True`` for tokens
                that take part in attention. A pair ``(query, key)`` is
                attended only when both tokens are valid.

        Returns:
            Tensor of shape ``(batch, seq, model_dim)``. Query rows with no
            admissible key (i.e. masked tokens) get a zero attention output,
            so after the output projection they carry only its bias; the
            result is always finite for finite inputs.
        """
        qkv = self.qkv_proj(x)
        q, k, v = qkv.chunk(3, dim=-1)
        q = self.q_norm(self._reshape_heads(q))
        k = self.k_norm(self._reshape_heads(k))
        v = self._reshape_heads(v)

        attn_mask = None
        empty_rows = None
        if mask is not None:
            attn_mask = mask[:, None, :, None] & mask[:, None, None, :]
            # A row with every key masked makes the softmax ill-defined: some
            # SDPA backends return NaN for it, and that NaN leaks into the
            # out_proj weight gradient (0 * NaN) even if the caller zeroes the
            # row afterwards. Let such rows attend to everything so the kernel
            # stays finite, then zero their output explicitly below.
            empty_rows = ~attn_mask.any(dim=-1, keepdim=True)
            attn_mask = attn_mask | empty_rows

        attn_output = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=attn_mask,
            dropout_p=self.attn_dropout if self.training else 0.0,
        )
        if empty_rows is not None:
            attn_output = attn_output.masked_fill(empty_rows, 0)
        attn_output = attn_output.transpose(1, 2).contiguous().view(
            x.shape[0],
            x.shape[1],
            self.inner_dim,
        )
        return self.out_dropout(self.out_proj(attn_output))


class SDPACrossAttention(nn.Module):
    """Multi-head cross-attention implemented with PyTorch SDPA.

    Queries come from ``query`` and keys/values from ``context``. Queries and
    keys are optionally RMS-normalised per head before
    ``F.scaled_dot_product_attention`` is applied.

    Args:
        model_dim: Feature size of query, context and output tokens.
        num_heads: Number of attention heads.
        head_dim: Per-head feature size. Defaults to ``model_dim // num_heads``.
        attn_dropout: Dropout probability on attention weights (training only).
        proj_dropout: Dropout probability on the output projection.
        qk_norm: Whether to RMS-normalise queries and keys per head.
        q_bias: Whether the query projection has a bias.
        kv_bias: Whether the fused key/value projection has a bias.

    Raises:
        ValueError: If ``head_dim`` is omitted and ``model_dim`` is not
            divisible by ``num_heads``.
    """

    def __init__(
        self,
        model_dim: int,
        num_heads: int,
        head_dim: Optional[int] = None,
        attn_dropout: float = 0.0,
        proj_dropout: float = 0.0,
        qk_norm: bool = True,
        q_bias: bool = True,
        kv_bias: bool = True,
    ):
        super().__init__()
        if head_dim is None:
            if model_dim % num_heads != 0:
                raise ValueError(
                    f"model_dim ({model_dim}) must be divisible by num_heads ({num_heads}) "
                    "when head_dim is not specified"
                )
            head_dim = model_dim // num_heads

        self.model_dim = model_dim
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.inner_dim = num_heads * head_dim
        self.attn_dropout = attn_dropout

        self.q_proj = nn.Linear(model_dim, self.inner_dim, bias=q_bias)
        self.kv_proj = nn.Linear(model_dim, 2 * self.inner_dim, bias=kv_bias)
        self.out_proj = nn.Linear(self.inner_dim, model_dim)
        self.out_dropout = nn.Dropout(proj_dropout)

        self.q_norm = nn.RMSNorm(head_dim, eps=1e-6) if qk_norm else nn.Identity()
        self.k_norm = nn.RMSNorm(head_dim, eps=1e-6) if qk_norm else nn.Identity()

    def _reshape_heads(self, x: Tensor) -> Tensor:
        """Reshape ``(batch, seq, inner_dim)`` to ``(batch, heads, seq, head_dim)``."""
        batch_size, seq_len, _ = x.shape
        x = x.view(batch_size, seq_len, self.num_heads, self.head_dim)
        return x.transpose(1, 2)

    def forward(
        self,
        query: Tensor,
        context: Tensor,
        *,
        query_mask: Tensor | None = None,
        context_mask: Tensor | None = None,
    ) -> Tensor:
        """Attend from ``query`` tokens to ``context`` tokens.

        Args:
            query: Tokens of shape ``(batch, query_len, model_dim)``.
            context: Tokens of shape ``(batch, context_len, model_dim)``.
            query_mask: Optional boolean ``(batch, query_len)`` tensor, ``True``
                for valid queries. Treated as all-valid when only
                ``context_mask`` is given.
            context_mask: Optional boolean ``(batch, context_len)`` tensor,
                ``True`` for valid context tokens. Treated as all-valid when
                only ``query_mask`` is given.

        Returns:
            Tensor of shape ``(batch, query_len, model_dim)``. Query rows with
            no admissible context token (masked queries, or samples whose
            whole context is masked) get a zero attention output, so after the
            output projection they carry only its bias; the result is always
            finite for finite inputs.
        """
        q = self.q_norm(self._reshape_heads(self.q_proj(query)))
        k, v = self.kv_proj(context).chunk(2, dim=-1)
        k = self.k_norm(self._reshape_heads(k))
        v = self._reshape_heads(v)

        attn_mask = None
        empty_rows = None
        if query_mask is not None or context_mask is not None:
            if query_mask is None:
                query_mask = torch.ones(query.shape[:2], device=query.device, dtype=torch.bool)
            if context_mask is None:
                context_mask = torch.ones(
                    context.shape[:2],
                    device=context.device,
                    dtype=torch.bool,
                )
            attn_mask = query_mask[:, None, :, None] & context_mask[:, None, None, :]
            # A row with every key masked makes the softmax ill-defined: some
            # SDPA backends return NaN for it, and that NaN leaks into the
            # out_proj weight gradient (0 * NaN) even if the caller zeroes the
            # row afterwards. Let such rows attend to everything so the kernel
            # stays finite, then zero their output explicitly below.
            empty_rows = ~attn_mask.any(dim=-1, keepdim=True)
            attn_mask = attn_mask | empty_rows

        attn_output = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=attn_mask,
            dropout_p=self.attn_dropout if self.training else 0.0,
        )
        if empty_rows is not None:
            attn_output = attn_output.masked_fill(empty_rows, 0)
        attn_output = attn_output.transpose(1, 2).contiguous().view(
            query.shape[0],
            query.shape[1],
            self.inner_dim,
        )
        return self.out_dropout(self.out_proj(attn_output))


class CrossAttentionBlock(nn.Module):
    """Pre-norm residual block: cross-attention followed by a gated MLP.

    Query and context are RMS-normalised separately before attention; the
    feed-forward branch has its own RMSNorm. When a query mask is supplied,
    masked query rows are reset to zero after each residual branch.

    Args:
        model_dim: Token feature size.
        num_heads: Number of attention heads.
        head_dim: Optional per-head size forwarded to the attention layer.
        attn_dropout: Dropout on attention weights.
        proj_dropout: Dropout on the attention output projection.
        ffn_multiplier: Hidden-width multiplier of the feed-forward branch.
        ffn_dropout: Dropout inside the feed-forward branch.
        qk_norm: Whether queries/keys are normalised inside attention.
        norm_eps: Epsilon of the block's RMSNorm layers.
    """

    def __init__(
        self,
        model_dim: int,
        num_heads: int,
        head_dim: Optional[int] = None,
        attn_dropout: float = 0.0,
        proj_dropout: float = 0.0,
        ffn_multiplier: float = 4.0,
        ffn_dropout: float = 0.0,
        qk_norm: bool = True,
        norm_eps: float = 1e-6,
    ):
        super().__init__()
        attention_kwargs = dict(
            model_dim=model_dim,
            num_heads=num_heads,
            head_dim=head_dim,
            attn_dropout=attn_dropout,
            proj_dropout=proj_dropout,
            qk_norm=qk_norm,
        )
        self.query_norm = nn.RMSNorm(model_dim, eps=norm_eps)
        self.context_norm = nn.RMSNorm(model_dim, eps=norm_eps)
        self.attn = SDPACrossAttention(**attention_kwargs)
        self.ffn_norm = nn.RMSNorm(model_dim, eps=norm_eps)
        self.ffn = SwiGLUFeedForward(
            dim=model_dim,
            multiplier=ffn_multiplier,
            dropout=ffn_dropout,
        )

    @staticmethod
    def _zero_masked_rows(tokens: Tensor, valid: Tensor | None) -> Tensor:
        """Return ``tokens`` with rows where ``valid`` is False set to zero."""
        if valid is None:
            return tokens
        return tokens.masked_fill(~valid.unsqueeze(-1), 0)

    def forward(
        self,
        query: Tensor,
        context: Tensor,
        *,
        query_mask: Tensor | None = None,
        context_mask: Tensor | None = None,
    ) -> Tensor:
        """Update ``query`` tokens by attending to ``context``.

        Args:
            query: Query tokens.
            context: Context tokens providing keys and values.
            query_mask: Optional boolean mask over query tokens (True = keep).
            context_mask: Optional boolean mask over context tokens.

        Returns:
            The updated query tokens, with masked rows zeroed when
            ``query_mask`` is given.
        """
        attended = self.attn(
            self.query_norm(query),
            self.context_norm(context),
            query_mask=query_mask,
            context_mask=context_mask,
        )
        query = self._zero_masked_rows(query + attended, query_mask)
        refined = self.ffn(self.ffn_norm(query))
        return self._zero_masked_rows(query + refined, query_mask)


class SelfAttentionBlock(nn.Module):
    """Pre-norm residual block: self-attention followed by a gated MLP.

    Each branch is preceded by its own RMSNorm. When a mask is supplied,
    masked rows are reset to zero after each residual branch.

    Args:
        model_dim: Token feature size.
        num_heads: Number of attention heads.
        head_dim: Optional per-head size forwarded to the attention layer.
        attn_dropout: Dropout on attention weights.
        proj_dropout: Dropout on the attention output projection.
        ffn_multiplier: Hidden-width multiplier of the feed-forward branch.
        ffn_dropout: Dropout inside the feed-forward branch.
        qk_norm: Whether queries/keys are normalised inside attention.
        norm_eps: Epsilon of the block's RMSNorm layers.
    """

    def __init__(
        self,
        model_dim: int,
        num_heads: int,
        head_dim: Optional[int] = None,
        attn_dropout: float = 0.0,
        proj_dropout: float = 0.0,
        ffn_multiplier: float = 4.0,
        ffn_dropout: float = 0.0,
        qk_norm: bool = True,
        norm_eps: float = 1e-6,
    ):
        super().__init__()
        attention_kwargs = dict(
            model_dim=model_dim,
            num_heads=num_heads,
            head_dim=head_dim,
            attn_dropout=attn_dropout,
            proj_dropout=proj_dropout,
            qk_norm=qk_norm,
        )
        self.attn_norm = nn.RMSNorm(model_dim, eps=norm_eps)
        self.attn = SDPASelfAttention(**attention_kwargs)
        self.ffn_norm = nn.RMSNorm(model_dim, eps=norm_eps)
        self.ffn = SwiGLUFeedForward(
            dim=model_dim,
            multiplier=ffn_multiplier,
            dropout=ffn_dropout,
        )

    def forward(self, x: Tensor, *, mask: Tensor | None = None) -> Tensor:
        """Run both residual branches over ``x``.

        Args:
            x: Input tokens.
            mask: Optional boolean mask over tokens (True = keep).

        Returns:
            Updated tokens, with masked rows zeroed when ``mask`` is given.
        """
        padded_rows = None if mask is None else ~mask.unsqueeze(-1)
        # (norm, branch) pairs applied in order, each as a residual update.
        branches = (
            (self.attn_norm, lambda hidden: self.attn(hidden, mask=mask)),
            (self.ffn_norm, self.ffn),
        )
        for norm, branch in branches:
            x = x + branch(norm(x))
            if padded_rows is not None:
                x = x.masked_fill(padded_rows, 0)
        return x


class ShapeLatentEncoder(nn.Module):
    """Summarise point tokens into a fixed number of learned shape latents.

    A bank of ``num_shape_latents`` learned vectors acts as the queries of a
    single cross-attention block over the point tokens; the result is
    RMS-normalised.

    Args:
        model_dim: Feature size of point tokens and latents.
        num_shape_latents: Number of learned latent queries.
        num_heads: Number of attention heads.
        head_dim: Optional per-head size forwarded to the attention block.
        attn_dropout: Dropout on attention weights.
        proj_dropout: Dropout on the attention output projection.
        ffn_multiplier: Hidden-width multiplier of the feed-forward branch.
        ffn_dropout: Dropout inside the feed-forward branch.
        qk_norm: Whether queries/keys are normalised inside attention.
        norm_eps: Epsilon of the RMSNorm layers.
    """

    def __init__(
        self,
        model_dim: int,
        num_shape_latents: int,
        num_heads: int,
        head_dim: Optional[int] = None,
        attn_dropout: float = 0.0,
        proj_dropout: float = 0.0,
        ffn_multiplier: float = 4.0,
        ffn_dropout: float = 0.0,
        qk_norm: bool = True,
        norm_eps: float = 1e-6,
    ):
        super().__init__()
        # Allocated uninitialised; reset_parameters() fills it at the end.
        self.shape_latents = nn.Parameter(torch.empty(num_shape_latents, model_dim))
        block_kwargs = dict(
            model_dim=model_dim,
            num_heads=num_heads,
            head_dim=head_dim,
            attn_dropout=attn_dropout,
            proj_dropout=proj_dropout,
            ffn_multiplier=ffn_multiplier,
            ffn_dropout=ffn_dropout,
            qk_norm=qk_norm,
            norm_eps=norm_eps,
        )
        self.block = CrossAttentionBlock(**block_kwargs)
        self.output_norm = nn.RMSNorm(model_dim, eps=norm_eps)
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Initialise the latent bank from a truncated normal (std 0.02)."""
        nn.init.trunc_normal_(self.shape_latents, std=0.02)

    def forward(self, point_tokens: Tensor) -> Tensor:
        """Encode ``point_tokens`` into ``(batch, num_shape_latents, model_dim)``.

        The latent bank is cast to the tokens' dtype and shared (expanded, not
        copied) across the batch before cross-attending to the point tokens.
        """
        latent_bank = self.shape_latents.to(dtype=point_tokens.dtype)
        queries = latent_bank.unsqueeze(0).expand(point_tokens.shape[0], -1, -1)
        return self.output_norm(self.block(queries, point_tokens))


class SegmentationDecoder(nn.Module):
    """Scores every query token against every (padded) link token.

    Two scoring modes are supported: ``"dot"`` (inner product of the latents)
    and ``"mlp"`` (a SiLU MLP applied to the concatenated link/query pair).
    Logits of invalid links are set to ``-inf``.

    Args:
        model_dim: Feature size of query and link latents.
        decode_type: ``"dot"`` or ``"mlp"``.
        bias: Whether the MLP layers carry a bias (``"mlp"`` mode only).
        query_chunk_size: Optional maximum number of queries scored at once
            in ``"mlp"`` mode, to bound the size of the pairwise tensor.

    Raises:
        ValueError: If ``decode_type`` is unknown or ``query_chunk_size`` is
            set but not positive.
    """

    def __init__(
        self,
        model_dim: int,
        decode_type: str = "dot",
        bias: bool = True,
        query_chunk_size: Optional[int] = None,
    ):
        super().__init__()
        self.model_dim = model_dim
        self.decode_type = str(decode_type)
        if self.decode_type not in {"dot", "mlp"}:
            raise ValueError(
                f"decode_type must be 'dot' or 'mlp', got {decode_type!r}"
            )
        if query_chunk_size is None:
            self.query_chunk_size = None
        else:
            if int(query_chunk_size) <= 0:
                raise ValueError(f"query_chunk_size must be positive when set, got {query_chunk_size}")
            self.query_chunk_size = int(query_chunk_size)

        # The dot-product mode has no learned parameters.
        if self.decode_type == "mlp":
            self.decoder = _make_silu_mlp(
                input_dim=model_dim * 2,
                hidden_dim=model_dim * 4,
                output_dim=1,
                bias=bias,
            )

    def _decode_mlp_logits(
        self,
        query_latents: Tensor,
        link_latents: Tensor,
    ) -> Tensor:
        """Score all (query, link) pairs with the MLP.

        Builds a ``(batch, queries, links, 2 * model_dim)`` tensor with the
        link latent first and the query latent second, and returns logits of
        shape ``(batch, queries, links)``.
        """
        num_queries = query_latents.shape[1]
        num_links = link_latents.shape[1]
        links_per_query = link_latents.unsqueeze(1).expand(-1, num_queries, -1, -1)
        queries_per_link = query_latents.unsqueeze(2).expand(-1, -1, num_links, -1)
        pairs = torch.cat((links_per_query, queries_per_link), dim=-1)
        return self.decoder(pairs).squeeze(-1)

    def forward(
        self,
        query_latents: Tensor,
        link_latents: Tensor,
        link_valid_flag: Tensor,
    ) -> Tensor:
        """Return ``(batch, queries, links)`` logits with invalid links at ``-inf``.

        Args:
            query_latents: Query latents, ``(batch, queries, model_dim)``.
            link_latents: Link latents, ``(batch, links, model_dim)``.
            link_valid_flag: Boolean ``(batch, links)`` tensor; False entries
                are masked out in the logits.
        """
        invalid_links = ~link_valid_flag[:, None]
        if self.decode_type == "dot":
            logits = torch.matmul(query_latents, link_latents.transpose(-1, -2))
            return logits.masked_fill(invalid_links, float("-inf"))

        chunk_size = self.query_chunk_size
        if chunk_size is None or query_latents.shape[1] <= chunk_size:
            logits = self._decode_mlp_logits(query_latents, link_latents)
        else:
            # Score the queries in slices so the pairwise tensor stays small.
            logits = torch.cat(
                [
                    self._decode_mlp_logits(query_slice, link_latents)
                    for query_slice in query_latents.split(chunk_size, dim=1)
                ],
                dim=1,
            )
        return logits.masked_fill(invalid_links, float("-inf"))


class JointDecoderPlain(nn.Module):
    """Regresses an 8-value motion state per joint from parent/child latents.

    One shared MLP is conditioned on a learned motion-type embedding
    (revolute or prismatic) concatenated with the parent and child link
    latents. The first six output channels are returned as the axis
    parameters and the last two as the range parameters.

    Args:
        model_dim: Feature size of link latents and type embeddings.
        bias: Whether the MLP layers carry a bias.
    """

    motion_state_dim = 8

    def __init__(
        self,
        model_dim: int,
        bias: bool = True,
    ):
        super().__init__()
        self.model_dim = model_dim
        # Allocated uninitialised; reset_parameters() fills them at the end.
        self.revolute_embedding = nn.Parameter(torch.empty(model_dim))
        self.prismatic_embedding = nn.Parameter(torch.empty(model_dim))
        # Input is [type embedding, parent latent, child latent].
        self.decoder = _make_silu_mlp(
            input_dim=model_dim * 3,
            hidden_dim=model_dim * 4,
            output_dim=self.motion_state_dim,
            bias=bias,
        )
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Initialise both type embeddings from a truncated normal (std 0.02)."""
        for embedding in (self.revolute_embedding, self.prismatic_embedding):
            nn.init.trunc_normal_(embedding, std=0.02)

    def _predict_motion_parameters(
        self,
        *,
        parent_latents: Tensor,
        child_latents: Tensor,
        active_mask: Tensor,
        motion_type: str,
    ) -> Tuple[Tensor, Tensor]:
        """Decode one motion type and zero the rows that are not active.

        Returns:
            ``(axis_parameters, range_parameters)``: channels ``0:6`` and
            ``6:8`` of the decoded motion state.
        """
        condition = _build_joint_motion_condition_inputs(
            parent_latents=parent_latents,
            child_latents=child_latents,
            motion_type=motion_type,
            revolute_embedding=self.revolute_embedding,
            prismatic_embedding=self.prismatic_embedding,
        )
        inactive_rows = ~active_mask.unsqueeze(-1)
        motion_state = self.decoder(condition).masked_fill(inactive_rows, 0)
        axis_parameters = motion_state[..., :6]
        range_parameters = motion_state[..., 6:8]
        return axis_parameters, range_parameters

    def predict(
        self,
        link_latents: Tensor,
        joint_connections: Tensor,
        joint_valid_flag: Tensor,
        is_revolute: Tensor,
        is_prismatic: Tensor,
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """Decode padded joint tensors for both motion types.

        A joint row is kept for a motion type only when it is valid and
        flagged with that type; every other row is zero in that type's
        outputs.

        Returns:
            ``(revolute_axis, prismatic_axis, revolute_range, prismatic_range)``.
        """
        parent_latents, child_latents = _gather_joint_link_latents(
            link_latents=link_latents,
            joint_connections=joint_connections,
        )
        revolute_axis, revolute_range = self._predict_motion_parameters(
            parent_latents=parent_latents,
            child_latents=child_latents,
            active_mask=joint_valid_flag & is_revolute,
            motion_type="revolute",
        )
        prismatic_axis, prismatic_range = self._predict_motion_parameters(
            parent_latents=parent_latents,
            child_latents=child_latents,
            active_mask=joint_valid_flag & is_prismatic,
            motion_type="prismatic",
        )
        return revolute_axis, prismatic_axis, revolute_range, prismatic_range

class JointDecoderPlainFlowMatching(nn.Module):
    """Predicts plain joint FM states with configurable x/v parameterization."""

    motion_state_dim = 8

    def __init__(
        self,
        model_dim: int,
        *,
        hidden_dim: Optional[int] = None,
        prediction_type: str = "v",
        time_embedding_dim: int = 256,
        inference_steps: int = 100,
        time_scale: float = 1000.0,
        sigma_min: float = 0.0,
        rescale_t: float = 1.0,
        cfg_scale: float = 1.0,
        revolute_state_mean: Optional[Sequence[float]] = None,
        revolute_state_std: Optional[Sequence[float]] = None,
        prismatic_state_mean: Optional[Sequence[float]] = None,
        prismatic_state_std: Optional[Sequence[float]] = None,
        bias: bool = True,
    ):
        """Build the flow-matching joint decoder.

        Args:
            model_dim: Feature size of link latents and type embeddings.
            hidden_dim: Width of the internal network; defaults to
                ``model_dim``.
            prediction_type: ``"x"`` or ``"v"`` (aliases are normalised).
            time_embedding_dim: Size of the sinusoidal timestep features.
            inference_steps: Default number of sampling steps (must be > 0).
            time_scale: Stored as a float for use elsewhere in the class.
            sigma_min: Stored as a float; used by the noise-scale schedule.
            rescale_t: Stored as a float; used to warp the sampling timesteps.
            cfg_scale: Default guidance scale used during sampling.
            revolute_state_mean: Optional 8 per-channel means (default 0).
            revolute_state_std: Optional 8 per-channel stds (default 1).
            prismatic_state_mean: Optional 8 per-channel means (default 0).
            prismatic_state_std: Optional 8 per-channel stds (default 1).
            bias: Whether the linear layers carry a bias.

        Raises:
            ValueError: If ``inference_steps`` is not positive,
                ``prediction_type`` is unknown, or a state statistic is
                rejected by ``_coerce_joint_fm_state_stat``.
        """
        super().__init__()
        self.model_dim = model_dim
        self.hidden_dim = int(hidden_dim or model_dim)
        self.inference_steps = int(inference_steps)
        if self.inference_steps <= 0:
            raise ValueError(f"inference_steps must be positive, got {inference_steps}")
        canonical_prediction_type = _normalize_joint_fm_prediction_type(prediction_type)
        if canonical_prediction_type not in {"x", "v"}:
            raise ValueError(
                "prediction_type must be 'x' or 'v', "
                f"got {canonical_prediction_type!r}"
            )
        self.prediction_type = canonical_prediction_type
        self.time_scale = float(time_scale)
        self.sigma_min = float(sigma_min)
        self.rescale_t = float(rescale_t)
        self.cfg_scale = float(cfg_scale)

        # Normalisation statistics are persistent buffers so they are saved
        # with the checkpoint; registration order fixes the state-dict order.
        state_statistics = (
            ("revolute_state_mean", revolute_state_mean, 0.0),
            ("revolute_state_std", revolute_state_std, 1.0),
            ("prismatic_state_mean", prismatic_state_mean, 0.0),
            ("prismatic_state_std", prismatic_state_std, 1.0),
        )
        for stat_name, stat_values, stat_default in state_statistics:
            self.register_buffer(
                stat_name,
                _coerce_joint_fm_state_stat(
                    stat_values,
                    default_value=stat_default,
                    name=stat_name,
                ),
                persistent=True,
            )

        width = self.hidden_dim
        # Allocated uninitialised; reset_parameters() fills them at the end.
        self.revolute_embedding = nn.Parameter(torch.empty(model_dim))
        self.prismatic_embedding = nn.Parameter(torch.empty(model_dim))
        # Input is [type embedding, parent latent, child latent].
        self.condition_projector = _make_silu_mlp(
            input_dim=model_dim * 3,
            hidden_dim=width * 2,
            output_dim=width,
            bias=bias,
        )
        self.state_projector = nn.Linear(self.motion_state_dim, width, bias=bias)
        self.time_embedder = TimestepEmbedder(
            width,
            frequency_embedding_dim=time_embedding_dim,
        )
        self.input_norm = nn.RMSNorm(width)
        self.residual_block_1 = _make_silu_mlp(
            input_dim=width,
            hidden_dim=width * 4,
            output_dim=width,
            bias=bias,
        )
        self.residual_block_2 = _make_silu_mlp(
            input_dim=width,
            hidden_dim=width * 4,
            output_dim=width,
            bias=bias,
        )
        self.output_norm = nn.RMSNorm(width)
        self.output_projector = nn.Linear(width, self.motion_state_dim, bias=bias)
        self.reset_parameters()

    def reset_parameters(self) -> None:
        nn.init.trunc_normal_(self.revolute_embedding, std=0.02)
        nn.init.trunc_normal_(self.prismatic_embedding, std=0.02)

    def _prepare_t_sequence(
        self,
        *,
        device: torch.device,
        steps: int | None = None,
        dtype: torch.dtype = torch.float32,
    ) -> Tensor:
        num_steps = self.inference_steps if steps is None else int(steps)
        if num_steps <= 0:
            raise ValueError(f"steps must be positive, got {num_steps}")
        t_sequence = torch.linspace(
            0.0,
            1.0,
            num_steps + 1,
            device=device,
            dtype=torch.float32,
        )
        if self.rescale_t:
            t_sequence = t_sequence / (
                1.0 + (self.rescale_t - 1.0) * (1.0 - t_sequence)
            )
        return t_sequence.to(dtype=dtype)

    def _motion_state_mean_and_std(
        self,
        *,
        motion_type: str,
        device: torch.device,
        dtype: torch.dtype,
    ) -> Tuple[Tensor, Tensor]:
        if motion_type == "revolute":
            mean = self.revolute_state_mean
            std = self.revolute_state_std
        elif motion_type == "prismatic":
            mean = self.prismatic_state_mean
            std = self.prismatic_state_std
        else:
            raise ValueError(
                "motion_type must be 'revolute' or 'prismatic', "
                f"got {motion_type!r}"
            )
        safe_std = std.to(device=device, dtype=dtype).clamp_min(1.0e-6)
        return mean.to(device=device, dtype=dtype), safe_std

    def _unnormalize_motion_state(
        self,
        motion_state: Tensor,
        *,
        motion_type: str,
    ) -> Tensor:
        mean, std = self._motion_state_mean_and_std(
            motion_type=motion_type,
            device=motion_state.device,
            dtype=motion_state.dtype,
        )
        return motion_state * std + mean

    def _sample_motion_state(
        self,
        *,
        condition_embeddings: Tensor,
        steps: int | None = None,
        cfg_scale: float | None = None,
    ) -> Tensor:
        num_samples = condition_embeddings.shape[0]
        if num_samples == 0:
            return condition_embeddings.new_zeros((0, self.motion_state_dim))
        guidance_scale = self.cfg_scale if cfg_scale is None else float(cfg_scale)
        motion_state = torch.randn(
            (num_samples, self.motion_state_dim),
            device=condition_embeddings.device,
            dtype=condition_embeddings.dtype,
        )
        t_sequence = self._prepare_t_sequence(
            device=condition_embeddings.device,
            steps=steps,
            dtype=condition_embeddings.dtype,
        )
        for t0, t1 in zip(t_sequence[:-1], t_sequence[1:]):
            time_vector = motion_state.new_full((num_samples,), t0)
            if guidance_scale == 1.0:
                velocity = self.predict_velocity(
                    x_t=motion_state,
                    t=time_vector,
                    condition_embeddings=condition_embeddings,
                )
            else:
                velocity = self._predict_guided_velocity(
                    x_t=motion_state,
                    t=time_vector,
                    condition_embeddings=condition_embeddings,
                    cfg_scale=guidance_scale,
                )
            motion_state = motion_state + (t1 - t0).to(dtype=motion_state.dtype) * velocity
        return motion_state

    def _bridge_noise_scale(self, t: Tensor) -> Tensor:
        return 1.0 - (1.0 - self.sigma_min) * t

    def _predict_model_output(
        self,
        *,
        x_t: Tensor,
        t: Tensor,
        condition_embeddings: Tensor,
    ) -> Tensor:
        if x_t.ndim != 2:
            raise ValueError(f"x_t must be rank-2, got shape {tuple(x_t.shape)}")
        if x_t.shape[-1] != self.motion_state_dim:
            raise ValueError(
                f"x_t last dim must be {self.motion_state_dim}, got {tuple(x_t.shape)}"
            )
        if condition_embeddings.ndim != 2:
            raise ValueError(
                "condition_embeddings must be rank-2, "
                f"got shape {tuple(condition_embeddings.shape)}"
            )
        if condition_embeddings.shape[0] != x_t.shape[0]:
            raise ValueError(
                "condition_embeddings batch dim must match x_t, "
                f"got {tuple(condition_embeddings.shape)} and {tuple(x_t.shape)}"
            )
        if t.ndim != 1 or t.shape[0] != x_t.shape[0]:
            raise ValueError(
                "t must be a vector matching x_t batch size, "
                f"got {tuple(t.shape)} and {tuple(x_t.shape)}"
            )
        x_hidden = self.state_projector(x_t.to(dtype=self.state_projector.weight.dtype))
        time_hidden = self.time_embedder(t * self.time_scale)
        condition_embeddings = condition_embeddings.to(dtype=x_hidden.dtype)
        hidden = self.input_norm(x_hidden + time_hidden + condition_embeddings)
        hidden = hidden + self.residual_block_1(hidden)
        hidden = hidden + self.residual_block_2(hidden)
        return self.output_projector(self.output_norm(hidden))

    def _predicted_clean_state_to_velocity(
        self,
        *,
        predicted_clean_state: Tensor,
        x_t: Tensor,
        t: Tensor,
    ) -> Tensor:
        bridge_noise_scale = self._bridge_noise_scale(t).unsqueeze(-1).to(
            dtype=predicted_clean_state.dtype,
        )
        bridge_noise_scale = bridge_noise_scale.clamp_min(
            torch.finfo(predicted_clean_state.dtype).eps
        )
        x_t = x_t.to(dtype=predicted_clean_state.dtype)
        return (
            predicted_clean_state - (1.0 - self.sigma_min) * x_t
        ) / bridge_noise_scale

    def predict_velocity(
        self,
        *,
        x_t: Tensor,
        t: Tensor,
        condition_embeddings: Tensor,
    ) -> Tensor:
        """Predict the velocity at state ``x_t`` and time ``t``.

        When ``prediction_type`` is ``"v"`` the network output is returned
        unchanged. For any other prediction type the output is treated as a
        clean-state prediction and converted to a velocity.
        """
        network_output = self._predict_model_output(
            x_t=x_t,
            t=t,
            condition_embeddings=condition_embeddings,
        )
        if self.prediction_type != "v":
            return self._predicted_clean_state_to_velocity(
                predicted_clean_state=network_output,
                x_t=x_t,
                t=t,
            )
        return network_output

    def _predict_guided_velocity(
        self,
        *,
        x_t: Tensor,
        t: Tensor,
        condition_embeddings: Tensor,
        cfg_scale: float | None = None,
    ) -> Tensor:
        guidance_scale = self.cfg_scale if cfg_scale is None else float(cfg_scale)
        if not math.isfinite(guidance_scale) or guidance_scale < 0.0:
            raise ValueError(
                "cfg_scale must be a finite non-negative number, "
                f"got {guidance_scale!r}"
            )
        conditional_velocity = self.predict_velocity(
            x_t=x_t,
            t=t,
            condition_embeddings=condition_embeddings,
        )
        unconditional_velocity = self.predict_velocity(
            x_t=x_t,
            t=t,
            condition_embeddings=torch.zeros_like(condition_embeddings),
        )
        return unconditional_velocity + guidance_scale * (
            conditional_velocity - unconditional_velocity
        )

    def predict(
        self,
        link_latents: Tensor,
        joint_connections: Tensor,
        joint_valid_flag: Tensor,
        is_revolute: Tensor,
        is_prismatic: Tensor,
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """Sample joint axes and ranges for every active joint.

        For each motion type, joints that are both valid and flagged with that
        type are sampled via ``_sample_motion_state`` and un-normalized. The
        first six channels of the sampled state are written to the axis output
        and channels ``6:8`` to the range output. All other joint slots stay
        at zero.

        Args:
            link_latents: Link latents; also determines the dtype and device
                of the returned tensors.
            joint_connections: Joint connectivity forwarded to
                ``_gather_joint_link_latents``.
            joint_valid_flag: Boolean mask of shape ``(batch, max_joints)``.
            is_revolute: Boolean mask selecting revolute joints.
            is_prismatic: Boolean mask selecting prismatic joints.

        Returns:
            ``(revolute_axis, prismatic_axis, revolute_range, prismatic_range)``
            with trailing dimensions ``6, 6, 2, 2`` respectively.
        """
        parent_latents, child_latents = _gather_joint_link_latents(
            link_latents=link_latents,
            joint_connections=joint_connections,
        )
        batch_size, max_joints = joint_valid_flag.shape
        motion_flags = dict(revolute=is_revolute, prismatic=is_prismatic)
        motion_outputs: Dict[str, Tensor] = {}
        for motion_type in ("revolute", "prismatic"):
            axis_output = link_latents.new_zeros((batch_size, max_joints, 6))
            range_output = link_latents.new_zeros((batch_size, max_joints, 2))
            motion_outputs[f"{motion_type}_axis"] = axis_output
            motion_outputs[f"{motion_type}_range"] = range_output
            active_mask = joint_valid_flag & motion_flags[motion_type]
            if not torch.any(active_mask):
                continue
            condition_embeddings = self.condition_projector(
                _build_joint_motion_condition_inputs(
                    parent_latents=parent_latents,
                    child_latents=child_latents,
                    motion_type=motion_type,
                    revolute_embedding=self.revolute_embedding,
                    prismatic_embedding=self.prismatic_embedding,
                )
            )[active_mask]
            motion_state = self._sample_motion_state(
                condition_embeddings=condition_embeddings,
            )
            motion_state = self._unnormalize_motion_state(
                motion_state,
                motion_type=motion_type,
            )
            # The sampled state follows the dtype of the projected condition,
            # which need not equal the dtype of ``link_latents`` (for example
            # under mixed precision). Masked assignment requires matching
            # dtypes, so align before writing into the output buffers.
            motion_state = motion_state.to(dtype=axis_output.dtype)
            axis_output[active_mask] = motion_state[..., :6]
            range_output[active_mask] = motion_state[..., 6:8]
        return (
            motion_outputs["revolute_axis"],
            motion_outputs["prismatic_axis"],
            motion_outputs["revolute_range"],
            motion_outputs["prismatic_range"],
        )

class JointDecoderOverParametrized(nn.Module):
    """Per-query decoder for over-parameterized joint supervision targets.

    Each query is decoded twice by the same MLP, once with the learned
    revolute type embedding and once with the prismatic one, from the
    concatenation of type embedding, query latent, parent-link latent and
    assigned-link latent.
    """

    def __init__(
        self,
        model_dim: int,
        output_dim: int = 9,
        bias: bool = True,
    ):
        super().__init__()
        self.model_dim = model_dim
        self.output_dim = int(output_dim)
        output_dim_supported = self.output_dim == 9 or self.output_dim == 12
        if not output_dim_supported:
            raise ValueError(
                f"JointDecoderOverParametrized output_dim must be 9 or 12, got {output_dim}"
            )
        self.revolute_embedding = nn.Parameter(torch.empty(model_dim))
        self.prismatic_embedding = nn.Parameter(torch.empty(model_dim))
        # Input is four model_dim-wide pieces: type, query, parent, child.
        concatenated_width = model_dim * 4
        self.joint_decoder = _make_silu_mlp(
            input_dim=concatenated_width,
            hidden_dim=concatenated_width,
            output_dim=self.output_dim,
            bias=bias,
        )
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Re-initialize both joint-type embeddings (truncated normal, std 0.02)."""
        for type_embedding in (self.revolute_embedding, self.prismatic_embedding):
            nn.init.trunc_normal_(type_embedding, std=0.02)

    @staticmethod
    def _lookup_parent_link_ids(
        *,
        batch_size: int,
        num_links: int,
        joint_connections: Tensor,
        like: Tensor,
    ) -> Tensor:
        """Build a ``(batch_size, num_links)`` table of parent link ids.

        Every link starts as its own parent. Each joint with a non-negative
        child id then overwrites the child's entry with the joint's parent id.
        """
        own_ids = torch.arange(num_links, device=like.device, dtype=like.dtype)
        # Clone because ``expand`` returns a view that must not be written to.
        parent_table = own_ids.view(1, -1).expand(batch_size, -1).clone()
        raw_child_ids = joint_connections[..., 1]
        joint_is_set = raw_child_ids >= 0
        safe_child_ids = raw_child_ids.clamp_min(0)
        sample_ids = torch.arange(
            batch_size,
            device=joint_connections.device,
        ).unsqueeze(1).expand_as(safe_child_ids)
        parent_table[
            sample_ids[joint_is_set],
            safe_child_ids[joint_is_set],
        ] = joint_connections[..., 0][joint_is_set]
        return parent_table

    def forward(
        self,
        query_latents: Tensor,
        link_latents: Tensor,
        assigned_link_ids: Tensor,
        joint_connections: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """Decodes `(closest_axis_point, low_pose_point, high_pose_point)` per query.

        Returns:
            ``(revolute_output, prismatic_output)``, the decoder output for
            each joint type.
        """
        batch_size, num_links = link_latents.shape[:2]
        latent_width = link_latents.shape[-1]
        parent_link_ids = self._lookup_parent_link_ids(
            batch_size=batch_size,
            num_links=num_links,
            joint_connections=joint_connections,
            like=assigned_link_ids,
        )

        child_gather_index = assigned_link_ids.unsqueeze(-1).expand(-1, -1, latent_width)
        child_latents = link_latents.gather(dim=1, index=child_gather_index)
        query_parent_ids = parent_link_ids.gather(dim=1, index=assigned_link_ids)
        parent_gather_index = query_parent_ids.unsqueeze(-1).expand(-1, -1, latent_width)
        parent_latents = link_latents.gather(dim=1, index=parent_gather_index)

        def decode_for_type(type_embedding: Tensor) -> Tensor:
            # Broadcast the single type vector over every query slot.
            type_tokens = type_embedding.to(
                device=child_latents.device,
                dtype=child_latents.dtype,
            ).view(1, 1, -1).expand_as(child_latents)
            decoder_input = torch.cat(
                (type_tokens, query_latents, parent_latents, child_latents),
                dim=-1,
            )
            return self.joint_decoder(decoder_input)

        revolute_output = decode_for_type(self.revolute_embedding)
        prismatic_output = decode_for_type(self.prismatic_embedding)
        return revolute_output, prismatic_output


class JointDecoderSingleDirection(nn.Module):
    """Predicts a single axis direction per joint from parent/child link latents.

    One shared MLP decodes both motion types; learned revolute and prismatic
    embeddings are passed to the condition-input builder to distinguish them.
    """

    direction_dim = 3

    def __init__(
        self,
        model_dim: int,
        bias: bool = True,
    ):
        super().__init__()
        self.model_dim = model_dim
        self.revolute_embedding = nn.Parameter(torch.empty(model_dim))
        self.prismatic_embedding = nn.Parameter(torch.empty(model_dim))
        self.decoder = _make_silu_mlp(
            input_dim=3 * model_dim,
            hidden_dim=4 * model_dim,
            output_dim=self.direction_dim,
            bias=bias,
        )
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Re-initialize both joint-type embeddings (truncated normal, std 0.02)."""
        for type_embedding in (self.revolute_embedding, self.prismatic_embedding):
            nn.init.trunc_normal_(type_embedding, std=0.02)

    def _predict_directions(
        self,
        *,
        parent_latents: Tensor,
        child_latents: Tensor,
        active_mask: Tensor,
        motion_type: str,
    ) -> Tensor:
        """Decode directions for one motion type and zero the inactive joints."""
        decoder_inputs = _build_joint_motion_condition_inputs(
            parent_latents=parent_latents,
            child_latents=child_latents,
            motion_type=motion_type,
            revolute_embedding=self.revolute_embedding,
            prismatic_embedding=self.prismatic_embedding,
        )
        raw_directions = self.decoder(decoder_inputs)
        inactive_joints = ~active_mask.unsqueeze(-1)
        return raw_directions.masked_fill(inactive_joints, 0)

    def predict(
        self,
        *,
        link_latents: Tensor,
        joint_connections: Tensor,
        joint_valid_flag: Tensor,
        is_revolute: Tensor,
        is_prismatic: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """Return ``(revolute_directions, prismatic_directions)``.

        A joint contributes to a motion type only when it is valid and
        flagged with that type; every other slot is zero.
        """
        parent_latents, child_latents = _gather_joint_link_latents(
            link_latents=link_latents,
            joint_connections=joint_connections,
        )
        revolute_directions = self._predict_directions(
            parent_latents=parent_latents,
            child_latents=child_latents,
            active_mask=joint_valid_flag & is_revolute,
            motion_type="revolute",
        )
        prismatic_directions = self._predict_directions(
            parent_latents=parent_latents,
            child_latents=child_latents,
            active_mask=joint_valid_flag & is_prismatic,
            motion_type="prismatic",
        )
        return revolute_directions, prismatic_directions


class Particulate2Encoder(nn.Module):
    """Encoder backbone for articulated point-cloud understanding."""

    def __init__(
        self,
        model_dim: int = 768,
        num_heads: int = 12,
        head_dim: Optional[int] = None,
        num_shape_latents: int = 256,
        num_attention_blocks: int = 6,
        coordinate_num_frequencies: int = 32,
        normal_num_frequencies: int = 32,
        coordinate_max_frequency: float = 64.0,
        normal_max_frequency: float = 16.0,
        link_text_feature_dim: int = 768,
        embedding_hidden_dim: Optional[int] = None,
        attn_dropout: float = 0.0,
        proj_dropout: float = 0.0,
        ffn_multiplier: float = 4.0,
        ffn_dropout: float = 0.0,
        qk_norm: bool = True,
        norm_eps: float = 1e-6,
        clip_model_name: str = "openai/clip-vit-large-patch14",
        compute_link_text_embeddings_on_the_fly: bool = False,
        use_text_conditioning: bool = True,
        clip_text_batch_size: int = 64,
        dropout_all_normals_prob: float = 0.3,
        dropout_all_point_prompts_prob: float = 0.0,
        dropout_individual_point_prompt_prob: float = 0.0,
        dropout_individual_text_conditioning_prob: float = 0.0,
        use_pretrained_features_shape: bool = False,
        use_pretrained_features_query: bool = False,
        use_pretrained_features_point_prompt: bool = False,
        pretrained_semantic_point_feature_dim: int = PARTFIELD_FEATURE_DIM,
    ):
        """Build the embedders, projectors and attention stacks.

        All four ``dropout_*_prob`` arguments are converted to ``float`` and
        must lie in ``[0, 1]``. The pretrained-feature norm and projector are
        only created when at least one ``use_pretrained_features_*`` flag is
        set; otherwise both attributes are ``None``.

        Raises:
            ValueError: If ``clip_text_batch_size`` is not positive, if any
                dropout probability is outside ``[0, 1]``, or if
                ``use_text_conditioning`` is false while a point-prompt or
                text-conditioning dropout probability is non-zero.
        """
        super().__init__()

        def checked_probability(name: str, value: float) -> float:
            # Shared validation so every dropout probability fails the same way.
            probability = float(value)
            if not 0.0 <= probability <= 1.0:
                raise ValueError(f"{name} must be in [0, 1], got {value}")
            return probability

        self.model_dim = model_dim
        self.link_text_feature_dim = link_text_feature_dim
        self.clip_model_name = clip_model_name
        self.compute_link_text_embeddings_on_the_fly = bool(compute_link_text_embeddings_on_the_fly)
        self.use_text_conditioning = bool(use_text_conditioning)
        if int(clip_text_batch_size) <= 0:
            raise ValueError(f"clip_text_batch_size must be positive, got {clip_text_batch_size}")
        self.clip_text_batch_size = int(clip_text_batch_size)
        self.dropout_all_normals_prob = checked_probability(
            "dropout_all_normals_prob",
            dropout_all_normals_prob,
        )
        # Previously stored unvalidated; now checked like the other probabilities.
        self.dropout_all_point_prompts_prob = checked_probability(
            "dropout_all_point_prompts_prob",
            dropout_all_point_prompts_prob,
        )
        self.dropout_individual_point_prompt_prob = checked_probability(
            "dropout_individual_point_prompt_prob",
            dropout_individual_point_prompt_prob,
        )
        self.dropout_individual_text_conditioning_prob = checked_probability(
            "dropout_individual_text_conditioning_prob",
            dropout_individual_text_conditioning_prob,
        )
        if not self.use_text_conditioning and (
            self.dropout_all_point_prompts_prob != 0.0
            or self.dropout_individual_point_prompt_prob != 0.0
            or self.dropout_individual_text_conditioning_prob != 0.0
        ):
            raise ValueError(
                "use_text_conditioning=False requires "
                "dropout_all_point_prompts_prob=0 and "
                "dropout_individual_point_prompt_prob=0 and "
                "dropout_individual_text_conditioning_prob=0 to avoid ambiguous "
                "text-free vs. point-prompt-dropped training examples"
            )
        self.use_pretrained_features_shape = bool(use_pretrained_features_shape)
        self.use_pretrained_features_query = bool(use_pretrained_features_query)
        self.use_pretrained_features_point_prompt = bool(use_pretrained_features_point_prompt)
        self.pretrained_semantic_point_feature_dim = int(pretrained_semantic_point_feature_dim)
        # Created lazily by ``_get_partfield_feature_extractor``.
        self._partfield_feature_extractor: Optional[PartFieldFeatureExtractor] = None
        self.text_tokenizer: Any = None
        self.text_model: Optional[nn.Module] = None
        # Keyword arguments shared by every attention block and the shape encoder.
        attn_block_kwargs = {
            "model_dim": model_dim,
            "num_heads": num_heads,
            "head_dim": head_dim,
            "attn_dropout": attn_dropout,
            "proj_dropout": proj_dropout,
            "ffn_multiplier": ffn_multiplier,
            "ffn_dropout": ffn_dropout,
            "qk_norm": qk_norm,
            "norm_eps": norm_eps,
        }

        self.coordinate_embedder = FrequencyMLPEmbedder(
            output_dim=model_dim,
            num_frequencies=coordinate_num_frequencies,
            input_dim=3,
            hidden_dim=embedding_hidden_dim,
            max_frequency=coordinate_max_frequency,
        )
        self.normal_embedder = FrequencyMLPEmbedder(
            output_dim=model_dim,
            num_frequencies=normal_num_frequencies,
            input_dim=3,
            hidden_dim=embedding_hidden_dim,
            max_frequency=normal_max_frequency,
        )
        self.shape_encoder = ShapeLatentEncoder(
            num_shape_latents=num_shape_latents,
            **attn_block_kwargs,
        )
        # Falls back to model_dim when no explicit hidden width is given.
        text_hidden_dim = embedding_hidden_dim or model_dim
        self.link_text_input_norm = nn.RMSNorm(link_text_feature_dim, eps=norm_eps)
        self.link_text_projector = _make_silu_mlp(
            input_dim=link_text_feature_dim,
            hidden_dim=text_hidden_dim,
            output_dim=model_dim,
        )
        if (
            self.use_pretrained_features_shape
            or self.use_pretrained_features_query
            or self.use_pretrained_features_point_prompt
        ):
            self.pretrained_feature_input_norm = nn.RMSNorm(
                self.pretrained_semantic_point_feature_dim,
                eps=norm_eps,
            )
            self.pretrained_feature_projector = _make_silu_mlp(
                input_dim=self.pretrained_semantic_point_feature_dim,
                hidden_dim=text_hidden_dim,
                output_dim=model_dim,
            )
        else:
            self.pretrained_feature_input_norm = None
            self.pretrained_feature_projector = None
        self.link_to_shape_cross_attn = nn.ModuleList(
            [
                CrossAttentionBlock(**attn_block_kwargs)
                for _ in range(num_attention_blocks)
            ]
        )
        self.link_self_attn = nn.ModuleList(
            [
                SelfAttentionBlock(**attn_block_kwargs)
                for _ in range(num_attention_blocks)
            ]
        )
        self.query_to_shape_cross_attn = nn.ModuleList(
            [
                CrossAttentionBlock(**attn_block_kwargs)
                for _ in range(num_attention_blocks)
            ]
        )
        self.query_to_link_cross_attn = nn.ModuleList(
            [
                CrossAttentionBlock(**attn_block_kwargs)
                for _ in range(num_attention_blocks)
            ]
        )
        self.no_point_prompt_embedding = nn.Parameter(torch.zeros(model_dim))
        self.no_text_conditioning_embedding = nn.Parameter(torch.zeros(model_dim))
        self.link_output_norm = nn.RMSNorm(model_dim, eps=norm_eps)
        self.query_output_norm = nn.RMSNorm(model_dim, eps=norm_eps)

    def encode_shape(
        self,
        shape_points: Tensor,
        shape_point_normals: Tensor,
        pretrained_features: Tensor | None = None,
        drop_normal_mask: Tensor | None = None,
    ) -> Tensor:
        """Embed the shape points and pass the tokens through the shape encoder.

        Args:
            shape_points: Shape point coordinates.
            shape_point_normals: Normals aligned with ``shape_points``.
            pretrained_features: Optional per-point features forwarded to the
                point-token embedding.
            drop_normal_mask: Optional per-sample mask forwarded to the
                point-token embedding.

        Returns:
            The output of ``self.shape_encoder`` on the embedded point tokens.
        """
        embedded_points = self._embed_point_tokens(
            shape_points,
            shape_point_normals,
            pretrained_features=pretrained_features,
            drop_normal_mask=drop_normal_mask,
        )
        shape_latents = self.shape_encoder(embedded_points)
        return shape_latents

    def encode_links(
        self,
        link_point_prompts: Tensor | None,
        link_point_prompt_normals: Tensor | None,
        link_valid_flag: Tensor,
        link_point_prompt_dropout_eligible: Tensor | None = None,
        forced_no_point_prompt_mask: Tensor | None = None,
        drop_normal_mask: Tensor | None = None,
        link_point_prompt_pretrained_features: Tensor | None = None,
        link_text_prompts: Optional[Sequence[Sequence[str]]] = None,
        link_text_embeddings: Tensor | None = None,
    ) -> Tensor:
        """Build link latents for valid links; padded slots remain zero.

        Each valid link's latent is the sum of its point-prompt features and
        its projected text features. If prompt tensors are omitted, valid
        links use the learned `no_point_prompt_embedding` rather than
        point-derived features. Links selected by
        `forced_no_point_prompt_mask` also use the no-prompt embedding, even
        in eval mode. With text conditioning disabled, an all-zero text
        feature is projected instead of real text features.

        Returns:
            Tensor of shape ``(batch_size, max_links, model_dim)``.
        """
        text_enabled = self.use_text_conditioning
        batch_size, max_links = link_valid_flag.shape
        no_point_prompt_mask = self._resolve_no_point_prompt_mask(
            link_point_prompts=link_point_prompts,
            link_valid_flag=link_valid_flag,
            link_point_prompt_dropout_eligible=link_point_prompt_dropout_eligible,
            forced_no_point_prompt_mask=forced_no_point_prompt_mask,
        )
        prompt_features = self._resolve_valid_link_point_features(
            link_point_prompts=link_point_prompts,
            link_point_prompt_normals=link_point_prompt_normals,
            link_valid_flag=link_valid_flag,
            no_point_prompt_mask=no_point_prompt_mask,
            drop_normal_mask=drop_normal_mask,
            link_point_prompt_pretrained_features=link_point_prompt_pretrained_features,
        )

        if text_enabled:
            raw_text_features = self._resolve_valid_link_text_features(
                link_text_prompts=link_text_prompts,
                link_text_embeddings=link_text_embeddings,
                link_valid_flag=link_valid_flag,
            )
        else:
            # One all-zero text row per valid link.
            valid_link_count = int(link_valid_flag.sum().item())
            raw_text_features = torch.zeros(
                (valid_link_count, self.link_text_feature_dim),
                device=link_valid_flag.device,
                dtype=self.link_text_projector[0].weight.dtype,
            )
        text_tokens = self.link_text_projector(
            self.link_text_input_norm(raw_text_features)
        )

        if text_enabled:
            dropout_mask = self._sample_text_conditioning_dropout_mask(
                link_valid_flag,
                no_point_prompt_mask=no_point_prompt_mask,
            )
            # Restrict the padded (B, L) mask to the rows of valid links.
            dropped_rows = dropout_mask[link_valid_flag].unsqueeze(-1)
            fallback_tokens = self.no_text_conditioning_embedding.to(
                device=text_tokens.device,
                dtype=text_tokens.dtype,
            ).unsqueeze(0).expand_as(text_tokens)
            text_tokens = torch.where(dropped_rows, fallback_tokens, text_tokens)

        # ``.to`` is a no-op when the dtypes already agree.
        prompt_features = prompt_features.to(dtype=text_tokens.dtype)
        fused_features = prompt_features + text_tokens
        link_latents = text_tokens.new_zeros((batch_size, max_links, self.model_dim))
        link_latents[link_valid_flag] = fused_features
        return link_latents

    def _run_attention_blocks(
        self,
        shape_latents: Tensor,
        link_latents: Tensor,
        query_latents: Tensor,
        link_valid_flag: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        for (
            link_to_shape_cross_attn,
            link_self_attn,
            query_to_shape_cross_attn,
            query_to_link_cross_attn,
        ) in zip(
            self.link_to_shape_cross_attn,
            self.link_self_attn,
            self.query_to_shape_cross_attn,
            self.query_to_link_cross_attn,
            strict=True,
        ):
            link_latents = link_to_shape_cross_attn(
                link_latents,
                shape_latents,
                query_mask=link_valid_flag,
            )
            link_latents = link_self_attn(link_latents, mask=link_valid_flag)
            query_latents = query_to_shape_cross_attn(query_latents, shape_latents)
            query_latents = query_to_link_cross_attn(
                query_latents,
                link_latents,
                context_mask=link_valid_flag,
            )

        # Residual blocks can write into padded rows unless they are zeroed again.
        link_latents = self.link_output_norm(link_latents)
        link_latents = link_latents.masked_fill(~link_valid_flag.unsqueeze(-1), 0)
        query_latents = self.query_output_norm(query_latents)
        return link_latents, query_latents

    def forward(
        self,
        shape_points: Tensor | None = None,
        shape_point_normals: Tensor | None = None,
        query_points: Tensor | None = None,
        query_point_normals: Tensor | None = None,
        link_point_prompts: Tensor | None = None,
        link_point_prompt_normals: Tensor | None = None,
        link_valid_flag: Tensor | None = None,
        link_point_prompt_dropout_eligible: Tensor | None = None,
        link_text_prompts: Sequence[Sequence[str]] | None = None,
        link_text_embeddings: Tensor | None = None,
    ) -> Dict[str, Any]:
        """Encode shape, link and query inputs into latent tokens.

        Args:
            shape_points: Shape points, `(B, Ns, 3)`.
            shape_point_normals: Shape normals, `(B, Ns, 3)`.
            query_points: Query points, `(B, Nq, 3)`.
            query_point_normals: Query normals, `(B, Nq, 3)`.
            link_point_prompts: Optional link prompt points, `(B, L, 3)`.
                If omitted together with `link_point_prompt_normals`, valid
                link slots use the learned no-prompt embedding.
            link_point_prompt_normals: Optional normals aligned with
                `link_point_prompts`, `(B, L, 3)`.
            link_valid_flag: Boolean `(B, L)` tensor marking which padded link
                slots are valid.
            link_point_prompt_dropout_eligible: Optional boolean `(B, L)`
                tensor marking the valid links whose point prompt and prompt
                normal may be dropped during training.
            link_text_prompts: Per-link text prompts as a batch list of string
                lists; sample `i` has length `sum(link_valid_flag[i])`.
            link_text_embeddings: Optional padded language features,
                `(B, L, D_lang)`.

        Returns:
            Dict with keys `"shape_latents"`, `"query_latents"` and
            `"link_latents"`.

        Raises:
            ValueError: If only one of the prompt/normal pair is given, if a
                required shape/query tensor or `link_valid_flag` is missing,
                or if text conditioning is enabled and neither text input is
                supplied.
        """
        prompts_given = link_point_prompts is not None
        prompt_normals_given = link_point_prompt_normals is not None
        if prompts_given != prompt_normals_given:
            raise ValueError(
                "link_point_prompts and link_point_prompt_normals must both be provided or both be None"
            )

        required_inputs = (
            shape_points,
            shape_point_normals,
            query_points,
            query_point_normals,
            link_valid_flag,
        )
        if any(value is None for value in required_inputs):
            raise ValueError(
                "forward requires shape/query tensors and link_valid_flag"
            )

        no_text_source = link_text_prompts is None and link_text_embeddings is None
        if self.use_text_conditioning and no_text_source:
            raise ValueError(
                "forward requires either link_text_prompts or link_text_embeddings "
                "when use_text_conditioning=True"
            )

        pretrained = self._compute_pretrained_point_features(
            shape_points=shape_points,
            query_points=query_points,
            link_point_prompts=link_point_prompts,
        )
        shape_features, query_features, prompt_features = pretrained
        # One mask is sampled per call and shared by shape, links and queries.
        drop_normal_mask = self._sample_all_normals_dropout_mask(
            batch_size=int(shape_points.shape[0]),
            device=shape_points.device,
        )
        shape_latents = self.encode_shape(
            shape_points,
            shape_point_normals,
            pretrained_features=shape_features,
            drop_normal_mask=drop_normal_mask,
        )
        initial_link_latents = self.encode_links(
            link_point_prompts=link_point_prompts,
            link_point_prompt_normals=link_point_prompt_normals,
            link_valid_flag=link_valid_flag,
            link_point_prompt_dropout_eligible=link_point_prompt_dropout_eligible,
            drop_normal_mask=drop_normal_mask,
            link_point_prompt_pretrained_features=prompt_features,
            link_text_prompts=link_text_prompts,
            link_text_embeddings=link_text_embeddings,
        )
        initial_query_latents = self._embed_point_tokens(
            query_points,
            query_point_normals,
            pretrained_features=query_features,
            drop_normal_mask=drop_normal_mask,
        )
        refined_links, refined_queries = self._run_attention_blocks(
            shape_latents=shape_latents,
            link_latents=initial_link_latents,
            query_latents=initial_query_latents,
            link_valid_flag=link_valid_flag,
        )

        outputs: Dict[str, Any] = {
            "shape_latents": shape_latents,
            "query_latents": refined_queries,
            "link_latents": refined_links,
        }
        return outputs

    def _embed_point_tokens(
        self,
        points: Tensor,
        normals: Tensor,
        *,
        pretrained_features: Tensor | None = None,
        drop_normal_mask: Tensor | None = None,
    ) -> Tensor:
        point_tokens = self.coordinate_embedder(points)
        normal_tokens = self.normal_embedder(normals)
        if drop_normal_mask is not None:
            if drop_normal_mask.ndim != 1 or drop_normal_mask.shape[0] != points.shape[0]:
                raise ValueError(
                    "drop_normal_mask must have shape (B,), "
                    f"got {tuple(drop_normal_mask.shape)} for points {tuple(points.shape)}"
                )
            drop_normal_mask = drop_normal_mask.to(
                device=normal_tokens.device,
                dtype=torch.bool,
            )
            mask_shape = (drop_normal_mask.shape[0],) + (1,) * (normal_tokens.ndim - 1)
            normal_tokens = normal_tokens.masked_fill(drop_normal_mask.view(mask_shape), 0)
        if point_tokens.dtype != normal_tokens.dtype:
            point_tokens = point_tokens.to(dtype=normal_tokens.dtype)
        point_tokens = point_tokens + normal_tokens
        if pretrained_features is None:
            return point_tokens
        if (
            self.pretrained_feature_input_norm is None
            or self.pretrained_feature_projector is None
        ):
            raise ValueError(
                "Received pretrained_features but pretrained feature projection is disabled"
            )
        pretrained_features = pretrained_features.to(
            dtype=self.pretrained_feature_projector[0].weight.dtype
        )
        projected_pretrained_features = self.pretrained_feature_projector(
            self.pretrained_feature_input_norm(pretrained_features)
        )
        if point_tokens.dtype != projected_pretrained_features.dtype:
            point_tokens = point_tokens.to(dtype=projected_pretrained_features.dtype)
        return point_tokens + projected_pretrained_features

    def _get_partfield_feature_extractor(self) -> PartFieldFeatureExtractor:
        if self._partfield_feature_extractor is None:
            self._partfield_feature_extractor = PartFieldFeatureExtractor()
        return self._partfield_feature_extractor

    def _compute_pretrained_point_features(
        self,
        *,
        shape_points: Tensor,
        query_points: Tensor,
        link_point_prompts: Tensor | None = None,
    ) -> tuple[Tensor | None, Tensor | None, Tensor | None]:
        needs_prompt_features = (
            self.use_pretrained_features_point_prompt
            and link_point_prompts is not None
        )
        if not (
            self.use_pretrained_features_shape
            or self.use_pretrained_features_query
            or needs_prompt_features
        ):
            return None, None, None
        decode_segments: list[tuple[str, Tensor]] = []
        if self.use_pretrained_features_query:
            decode_segments.append(("query", query_points))
        if needs_prompt_features:
            if link_point_prompts.shape[0] != shape_points.shape[0]:
                raise ValueError(
                    "link_point_prompts batch dimension must match shape_points when "
                    "use_pretrained_features_point_prompt=True, "
                    f"got {tuple(link_point_prompts.shape)} and {tuple(shape_points.shape)}"
                )
            decode_segments.append(("prompt", link_point_prompts))

        decode_query_points = (
            torch.cat([points for _, points in decode_segments], dim=1)
            if decode_segments
            else None
        )
        extractor = self._get_partfield_feature_extractor()
        shape_features, combined_decode_features = extractor.extract(
            encode_points=shape_points,
            decode_shape_points=shape_points if self.use_pretrained_features_shape else None,
            decode_query_points=decode_query_points,
        )
        query_features = None
        prompt_features = None
        if combined_decode_features is not None:
            if len(decode_segments) == 1:
                decoded_features = {decode_segments[0][0]: combined_decode_features}
            else:
                decoded_features: dict[str, Tensor] = {}
                offset = 0
                for name, points in decode_segments:
                    count = points.shape[1]
                    decoded_features[name] = combined_decode_features[:, offset : offset + count]
                    offset += count
            query_features = decoded_features.get("query")
            prompt_features = decoded_features.get("prompt")
        return shape_features, query_features, prompt_features

    def _sample_point_prompt_dropout_mask(
        self,
        link_valid_flag: Tensor,
        link_point_prompt_dropout_eligible: Tensor | None = None,
    ) -> Tensor:
        if not self.training or (
            self.dropout_all_point_prompts_prob == 0.0
            and self.dropout_individual_point_prompt_prob == 0.0
        ):
            return torch.zeros_like(link_valid_flag)

        batch_size = link_valid_flag.shape[0]
        device = link_valid_flag.device
        if link_point_prompt_dropout_eligible is None:
            eligible_link_mask = link_valid_flag
        else:
            if link_point_prompt_dropout_eligible.shape != link_valid_flag.shape:
                raise ValueError(
                    "link_point_prompt_dropout_eligible must match link_valid_flag shape, "
                    f"got {tuple(link_point_prompt_dropout_eligible.shape)} and "
                    f"{tuple(link_valid_flag.shape)}"
                )
            eligible_link_mask = link_valid_flag & link_point_prompt_dropout_eligible
        drop_mask = torch.zeros_like(link_valid_flag)
        drop_all_samples = torch.zeros(batch_size, device=device, dtype=torch.bool)

        if self.dropout_all_point_prompts_prob > 0.0:
            drop_all_samples = (
                torch.rand(batch_size, device=device) < self.dropout_all_point_prompts_prob
            )
            drop_mask[drop_all_samples] = eligible_link_mask[drop_all_samples]

        if self.dropout_individual_point_prompt_prob > 0.0:
            keep_individual_dropout = ~drop_all_samples
            individual_drop_mask = (
                torch.rand(link_valid_flag.shape, device=device)
                < self.dropout_individual_point_prompt_prob
            )
            drop_mask[keep_individual_dropout] = (
                individual_drop_mask[keep_individual_dropout]
                & eligible_link_mask[keep_individual_dropout]
            )

        return drop_mask

    def _sample_all_normals_dropout_mask(
        self,
        *,
        batch_size: int,
        device: torch.device,
    ) -> Tensor:
        if not self.training or self.dropout_all_normals_prob == 0.0:
            return torch.zeros((batch_size,), device=device, dtype=torch.bool)
        return torch.rand(batch_size, device=device) < self.dropout_all_normals_prob

    def _resolve_no_point_prompt_mask(
        self,
        *,
        link_point_prompts: Tensor | None,
        link_valid_flag: Tensor,
        link_point_prompt_dropout_eligible: Tensor | None = None,
        forced_no_point_prompt_mask: Tensor | None = None,
    ) -> Tensor:
        if forced_no_point_prompt_mask is not None:
            if forced_no_point_prompt_mask.shape != link_valid_flag.shape:
                raise ValueError(
                    "forced_no_point_prompt_mask must match link_valid_flag shape, "
                    f"got {tuple(forced_no_point_prompt_mask.shape)} and "
                    f"{tuple(link_valid_flag.shape)}"
                )
            forced_no_point_prompt_mask = (
                forced_no_point_prompt_mask.to(
                    device=link_valid_flag.device,
                    dtype=torch.bool,
                )
                & link_valid_flag
            )
        if link_point_prompts is None:
            no_point_prompt_mask = link_valid_flag.clone()
        else:
            no_point_prompt_mask = self._sample_point_prompt_dropout_mask(
                link_valid_flag,
                link_point_prompt_dropout_eligible=link_point_prompt_dropout_eligible,
            )
        if forced_no_point_prompt_mask is not None:
            no_point_prompt_mask = no_point_prompt_mask | forced_no_point_prompt_mask
        return no_point_prompt_mask & link_valid_flag

    def _sample_text_conditioning_dropout_mask(
        self,
        link_valid_flag: Tensor,
        *,
        no_point_prompt_mask: Tensor | None = None,
    ) -> Tensor:
        if (
            not self.training
            or self.dropout_individual_text_conditioning_prob == 0.0
        ):
            return torch.zeros_like(link_valid_flag)
        if no_point_prompt_mask is None:
            text_dropout_eligible_mask = link_valid_flag
        else:
            if no_point_prompt_mask.shape != link_valid_flag.shape:
                raise ValueError(
                    "no_point_prompt_mask must match link_valid_flag shape, "
                    f"got {tuple(no_point_prompt_mask.shape)} and "
                    f"{tuple(link_valid_flag.shape)}"
                )
            text_dropout_eligible_mask = link_valid_flag & ~no_point_prompt_mask.to(
                device=link_valid_flag.device,
                dtype=torch.bool,
            )
        return (
            torch.rand(link_valid_flag.shape, device=link_valid_flag.device)
            < self.dropout_individual_text_conditioning_prob
        ) & text_dropout_eligible_mask

    def _resolve_valid_link_point_features(
        self,
        link_point_prompts: Tensor | None,
        link_point_prompt_normals: Tensor | None,
        link_valid_flag: Tensor,
        no_point_prompt_mask: Tensor | None = None,
        drop_normal_mask: Tensor | None = None,
        link_point_prompt_pretrained_features: Tensor | None = None,
    ) -> Tensor:
        if no_point_prompt_mask is not None:
            if no_point_prompt_mask.shape != link_valid_flag.shape:
                raise ValueError(
                    "no_point_prompt_mask must match link_valid_flag shape, "
                    f"got {tuple(no_point_prompt_mask.shape)} and "
                    f"{tuple(link_valid_flag.shape)}"
                )
            no_point_prompt_mask = (
                no_point_prompt_mask.to(
                    device=link_valid_flag.device,
                    dtype=torch.bool,
                )
                & link_valid_flag
            )

        if link_point_prompts is None:
            if link_point_prompt_pretrained_features is not None:
                raise ValueError(
                    "link_point_prompt_pretrained_features requires link_point_prompts"
                )
            no_prompt_embedding = self.no_point_prompt_embedding.to(
                device=link_valid_flag.device,
                dtype=self.no_point_prompt_embedding.dtype,
            )
            return no_prompt_embedding.view(1, 1, -1).expand(*link_valid_flag.shape, -1)[link_valid_flag]

        point_features = self._embed_point_tokens(
            link_point_prompts,
            link_point_prompt_normals,
            pretrained_features=link_point_prompt_pretrained_features,
            drop_normal_mask=drop_normal_mask,
        )[link_valid_flag]
        if no_point_prompt_mask is None:
            no_point_prompt_mask = torch.zeros_like(link_valid_flag)
        dropped_valid_links = no_point_prompt_mask[link_valid_flag]
        return torch.where(
            dropped_valid_links.unsqueeze(-1),
            self.no_point_prompt_embedding.to(
                device=point_features.device,
                dtype=point_features.dtype,
            ).unsqueeze(0).expand_as(point_features),
            point_features,
        )

    def _resolve_valid_link_text_features(
        self,
        link_text_prompts: Optional[Sequence[Sequence[str]]],
        link_text_embeddings: Tensor | None,
        link_valid_flag: Tensor,
    ) -> Tensor:
        """Returns text features flattened in the same order as `link_valid_flag`."""
        if link_text_embeddings is not None:
            return link_text_embeddings[link_valid_flag].to(
                dtype=self.link_text_projector[0].weight.dtype
            )

        if link_text_prompts is None:
            flattened_prompts = [""] * int(link_valid_flag.sum().item())
        else:
            flattened_prompts = list(chain.from_iterable(link_text_prompts))
        return self._encode_link_text_prompts(
            flattened_prompts,
            dtype=self.link_text_projector[0].weight.dtype,
        )

    def _ensure_text_model_loaded(self) -> None:
        if self.text_model is not None and self.text_tokenizer is not None:
            return

        cache_dir = os.environ.get("HF_HOME")
        self.text_tokenizer, self.text_model = load_clip_text_encoder(
            self.clip_model_name,
            device=self.link_text_projector[0].weight.device,
            expected_embedding_dim=self.link_text_feature_dim,
            cache_dir=cache_dir,
        )

    @torch.no_grad()
    def _encode_link_text_prompts(
        self,
        prompts: Sequence[str],
        dtype: torch.dtype,
    ) -> Tensor:
        if len(prompts) == 0:
            return torch.zeros(
                (0, self.link_text_feature_dim),
                device=self.link_text_projector[0].weight.device,
                dtype=dtype,
            )
        if not self.compute_link_text_embeddings_on_the_fly:
            raise ValueError(
                "Missing link_text_embeddings but compute_link_text_embeddings_on_the_fly=False. "
                "Provide dataset-side text embeddings or enable on-the-fly CLIP text encoding."
            )
        self._ensure_text_model_loaded()
        return encode_clip_text_prompts(
            prompts,
            tokenizer=self.text_tokenizer,
            text_model=self.text_model,
            batch_size=self.clip_text_batch_size,
            output_device=self.link_text_projector[0].weight.device,
            output_dtype=dtype,
        )


class Particulate2ArticulationModel(nn.Module):
    """Encoder, decoders, and training losses for articulation prediction."""

    def __init__(
        self,
        encoder: Optional[Particulate2Encoder] = None,
        *,
        segmentation_decode_type: str = "mlp",
        segmentation_query_chunk_size: Optional[int] = None,
        joint_decode_type: str = "overparameterization",
        joint_fm_hidden_dim: Optional[int] = None,
        joint_fm_prediction_type: str = "v",
        joint_fm_time_embedding_dim: int = 256,
        joint_fm_inference_steps: int = 100,
        joint_fm_time_scale: float = 1000.0,
        joint_fm_sigma_min: float = 0.0,
        joint_fm_rescale_t: float = 1.0,
        joint_fm_cfg_scale: float = 1.0,
        revolute_joint_fm_state_mean: Optional[Sequence[float]] = None,
        revolute_joint_fm_state_std: Optional[Sequence[float]] = None,
        prismatic_joint_fm_state_mean: Optional[Sequence[float]] = None,
        prismatic_joint_fm_state_std: Optional[Sequence[float]] = None,
        joint_fm_training_time_mean: float = 0.0,
        joint_fm_training_time_std: float = 1.0,
        use_ancestor_context_for_segmentation: bool = False,
        ancestor_context_decay: float = 0.5,
        segmentation_bias: bool = True,
        joint_decoder_bias: bool = True,
        segmentation_cross_entropy_weight: float = 1.0,
        segmentation_dice_weight: float = 1.0,
        revolute_joint_axis_l1_weight: float = 1.0,
        prismatic_joint_axis_l1_weight: float = 1.0,
        revolute_joint_range_l1_weight: float = 1.0,
        prismatic_joint_range_l1_weight: float = 1.0,
        revolute_joint_fm_weight: float = 1.0,
        prismatic_joint_fm_weight: float = 1.0,
        revolute_overparam_axis_l1_weight: float = 1.0,
        revolute_overparam_point_l1_weight: float = 1.0,
        prismatic_overparam_axis_l1_weight: float = 1.0,
        prismatic_overparam_point_l1_weight: float = 1.0,
        revolute_overparam_direction_weight: float = 1.0,
        prismatic_overparam_direction_weight: float = 1.0,
        overparam_closest_axis_space: str = "world",
        dice_smoothing: float = 1e-6,
        **encoder_kwargs: Any,
    ):
        """Build the encoder, the segmentation decoder and the joint decoder(s).

        Args:
            encoder: Prebuilt encoder.  When ``None`` a ``Particulate2Encoder`` is
                constructed from ``encoder_kwargs``.
            segmentation_decode_type, segmentation_bias,
            segmentation_query_chunk_size: Forwarded to ``SegmentationDecoder``.
            joint_decode_type: Decode mode (or one of its aliases); normalised by
                ``_normalize_joint_decode_type`` and required to resolve to a
                plain or over-parameterized mode.
            joint_fm_hidden_dim, joint_fm_prediction_type,
            joint_fm_time_embedding_dim, joint_fm_inference_steps,
            joint_fm_time_scale, joint_fm_sigma_min, joint_fm_rescale_t,
            joint_fm_cfg_scale, revolute_joint_fm_state_mean/std,
            prismatic_joint_fm_state_mean/std: Forwarded to
                ``JointDecoderPlainFlowMatching``; only used when the decode
                mode resolves to ``'plain+fm'``.
            joint_fm_training_time_mean, joint_fm_training_time_std: Stored as
                floats on the instance; not otherwise used in this constructor.
            use_ancestor_context_for_segmentation: When true, an ancestor-context
                projector and gate are created; otherwise both are ``None``.
            ancestor_context_decay: Finite value in ``[0, 1]``.
            joint_decoder_bias: ``bias`` flag for whichever joint decoder(s) are built.
            *_weight, dice_smoothing: Stored as floats on the instance
                (presumably consumed by the loss code elsewhere in the class --
                not visible here).
            overparam_closest_axis_space: Normalised by
                ``_normalize_overparam_closest_axis_space``; must resolve to
                ``'world'`` or ``'part_aabb'``.
            **encoder_kwargs: Constructor arguments for ``Particulate2Encoder``;
                only allowed when ``encoder`` is ``None``.

        Raises:
            ValueError: If both ``encoder`` and ``encoder_kwargs`` are given, if
                ``joint_decode_type`` or ``overparam_closest_axis_space`` does
                not resolve to a supported value, or if
                ``ancestor_context_decay`` is non-finite or outside ``[0, 1]``.
        """
        super().__init__()
        if encoder is not None and encoder_kwargs:
            raise ValueError("Pass either an encoder instance or encoder kwargs, not both")

        self.encoder = encoder if encoder is not None else Particulate2Encoder(**encoder_kwargs)
        # Collapse aliases to a canonical mode name before validating.  Note the
        # error message below therefore reports the normalised value, not the
        # caller's original string.
        joint_decode_type = _normalize_joint_decode_type(joint_decode_type)
        if joint_decode_type not in {*_PLAIN_JOINT_DECODE_TYPES, *_OVERPARAM_JOINT_DECODE_TYPES}:
            raise ValueError(
                "joint_decode_type must be 'plain', 'plain+fm', 'overparametrized', "
                "'overparameterized', 'overparameterization', 'overparam+dir', or "
                "'overparam+singledir', "
                f"got {joint_decode_type!r}"
            )
        self.joint_decode_type = joint_decode_type
        # Convenience flags derived once from the canonical decode mode.
        self.plain_flow_matching_enabled = self.joint_decode_type == "plain+fm"
        self.overparam_predicts_query_axis_direction = self.joint_decode_type == "overparam+dir"
        self.overparam_predicts_single_axis_direction = (
            self.joint_decode_type == "overparam+singledir"
        )
        self.overparam_uses_axis_direction = (
            self.overparam_predicts_query_axis_direction
            or self.overparam_predicts_single_axis_direction
        )
        # Same normalise-then-validate pattern for the closest-axis coordinate space.
        overparam_closest_axis_space = _normalize_overparam_closest_axis_space(
            overparam_closest_axis_space
        )
        if overparam_closest_axis_space not in {"world", "part_aabb"}:
            raise ValueError(
                "overparam_closest_axis_space must be 'world', 'sample', 'part_aabb', or "
                f"'local_aabb', got {overparam_closest_axis_space!r}"
            )
        self.overparam_closest_axis_space = overparam_closest_axis_space
        self.overparam_closest_axis_uses_part_aabb = (
            self.overparam_closest_axis_space == "part_aabb"
        )
        self.use_ancestor_context_for_segmentation = bool(
            use_ancestor_context_for_segmentation
        )
        self.ancestor_context_decay = float(ancestor_context_decay)
        # Reject NaN/inf first so the range check below only sees real numbers.
        if not math.isfinite(self.ancestor_context_decay):
            raise ValueError(
                "ancestor_context_decay must be finite, "
                f"got {ancestor_context_decay!r}"
            )
        if self.ancestor_context_decay < 0.0 or self.ancestor_context_decay > 1.0:
            raise ValueError(
                "ancestor_context_decay must be in [0, 1], "
                f"got {ancestor_context_decay!r}"
            )
        # Submodules are created in a fixed order; keep it stable, since it
        # determines registration order and the order of parameter initialisation.
        self.segmentation_decoder = SegmentationDecoder(
            model_dim=self.encoder.model_dim,
            decode_type=segmentation_decode_type,
            bias=segmentation_bias,
            query_chunk_size=segmentation_query_chunk_size,
        )
        # Only populated for the 'overparam+singledir' mode (see below).
        self.joint_direction_decoder: JointDecoderSingleDirection | None = None
        if self.joint_decode_type == "plain":
            self.joint_decoder = JointDecoderPlain(
                model_dim=self.encoder.model_dim,
                bias=joint_decoder_bias,
            )
        elif self.plain_flow_matching_enabled:
            self.joint_decoder = JointDecoderPlainFlowMatching(
                model_dim=self.encoder.model_dim,
                hidden_dim=joint_fm_hidden_dim,
                prediction_type=joint_fm_prediction_type,
                time_embedding_dim=joint_fm_time_embedding_dim,
                inference_steps=joint_fm_inference_steps,
                time_scale=joint_fm_time_scale,
                sigma_min=joint_fm_sigma_min,
                rescale_t=joint_fm_rescale_t,
                cfg_scale=joint_fm_cfg_scale,
                revolute_state_mean=revolute_joint_fm_state_mean,
                revolute_state_std=revolute_joint_fm_state_std,
                prismatic_state_mean=prismatic_joint_fm_state_mean,
                prismatic_state_std=prismatic_joint_fm_state_std,
                bias=joint_decoder_bias,
            )
        else:
            # All over-parameterized modes share this decoder; the output width
            # is 12 for 'overparam+dir' and 9 otherwise.
            self.joint_decoder = JointDecoderOverParametrized(
                model_dim=self.encoder.model_dim,
                output_dim=12 if self.overparam_predicts_query_axis_direction else 9,
                bias=joint_decoder_bias,
            )
            if self.overparam_predicts_single_axis_direction:
                self.joint_direction_decoder = JointDecoderSingleDirection(
                    model_dim=self.encoder.model_dim,
                    bias=joint_decoder_bias,
                )
        if self.use_ancestor_context_for_segmentation:
            # Bias-free linear projection plus a SiLU MLP gate that maps a
            # 2 * model_dim input to a single scalar.
            self.ancestor_context_projector = nn.Linear(
                self.encoder.model_dim,
                self.encoder.model_dim,
                bias=False,
            )
            self.ancestor_context_gate = _make_silu_mlp(
                input_dim=self.encoder.model_dim * 2,
                hidden_dim=self.encoder.model_dim,
                output_dim=1,
            )
        else:
            self.ancestor_context_projector = None
            self.ancestor_context_gate = None
        # Scalar hyper-parameters, coerced to plain Python floats.
        self.segmentation_cross_entropy_weight = float(segmentation_cross_entropy_weight)
        self.segmentation_dice_weight = float(segmentation_dice_weight)
        self.revolute_joint_axis_l1_weight = float(revolute_joint_axis_l1_weight)
        self.prismatic_joint_axis_l1_weight = float(prismatic_joint_axis_l1_weight)
        self.revolute_joint_range_l1_weight = float(revolute_joint_range_l1_weight)
        self.prismatic_joint_range_l1_weight = float(prismatic_joint_range_l1_weight)
        self.revolute_joint_fm_weight = float(revolute_joint_fm_weight)
        self.prismatic_joint_fm_weight = float(prismatic_joint_fm_weight)
        self.revolute_overparam_axis_l1_weight = float(revolute_overparam_axis_l1_weight)
        self.revolute_overparam_point_l1_weight = float(revolute_overparam_point_l1_weight)
        self.prismatic_overparam_axis_l1_weight = float(prismatic_overparam_axis_l1_weight)
        self.prismatic_overparam_point_l1_weight = float(prismatic_overparam_point_l1_weight)
        self.revolute_overparam_direction_weight = float(revolute_overparam_direction_weight)
        self.prismatic_overparam_direction_weight = float(prismatic_overparam_direction_weight)
        self.joint_fm_training_time_mean = float(joint_fm_training_time_mean)
        self.joint_fm_training_time_std = float(joint_fm_training_time_std)
        self.dice_smoothing = float(dice_smoothing)

    def decode_segmentation(
        self,
        query_latents: Tensor,
        link_latents: Tensor,
        link_valid_flag: Tensor,
    ) -> Tensor:
        """Run the segmentation decoder on the given query and link latents."""
        decoder_inputs = {
            "query_latents": query_latents,
            "link_latents": link_latents,
            "link_valid_flag": link_valid_flag,
        }
        return self.segmentation_decoder(**decoder_inputs)

    def decode_joint_parameters(
        self,
        *,
        link_latents: Tensor,
        joint_connections: Tensor,
        joint_valid_flag: Tensor,
        is_revolute: Tensor,
        is_prismatic: Tensor,
        query_latents: Tensor | None = None,
        query_points: Tensor | None = None,
        assigned_link_ids: Tensor | None = None,
        decoded_motion_points: Tuple[Tensor, Tensor] | None = None,
        decoded_axis_directions: Tuple[Tensor, Tensor] | None = None,
        decoded_motion_points_are_world: bool = False,
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """Decode joint axes and ranges according to the configured decode mode.

        In the ``'plain'`` and ``'plain+fm'`` modes the call is delegated to the
        joint decoder's ``predict`` using only link-level inputs.  In the
        over-parameterized modes, query-wise motion points are decoded (unless
        supplied), converted to world coordinates (unless
        ``decoded_motion_points_are_world`` is set) and then reduced to per-joint
        parameters by ``_recover_overparam_joint_parameters`` with autocast
        disabled.

        Raises:
            ValueError: In over-parameterized modes, if ``query_points`` or
                ``assigned_link_ids`` is missing, or if neither
                ``decoded_motion_points`` nor ``query_latents`` is provided.
        """
        if self.joint_decode_type == "plain" or self.plain_flow_matching_enabled:
            # Both plain variants expose the same predict() interface.
            return self.joint_decoder.predict(
                link_latents=link_latents,
                joint_connections=joint_connections,
                joint_valid_flag=joint_valid_flag,
                is_revolute=is_revolute,
                is_prismatic=is_prismatic,
            )

        if query_points is None or assigned_link_ids is None:
            raise ValueError(
                "over-parameterized joint decoding requires query_points and assigned_link_ids"
            )

        if decoded_motion_points is None:
            if query_latents is None:
                raise ValueError(
                    "over-parameterized joint decoding requires query_latents when decoded_motion_points are not provided"
                )
            decoded_motion_points = self._decode_joint_motion_points(
                query_latents=query_latents,
                link_latents=link_latents,
                assigned_link_ids=assigned_link_ids,
                joint_connections=joint_connections,
            )

        if self.overparam_predicts_single_axis_direction and decoded_axis_directions is None:
            decoded_axis_directions = self._decode_joint_axis_directions(
                link_latents=link_latents,
                joint_connections=joint_connections,
                joint_valid_flag=joint_valid_flag,
                is_revolute=is_revolute,
                is_prismatic=is_prismatic,
            )

        revolute_motion_points = decoded_motion_points[0]
        prismatic_motion_points = decoded_motion_points[1]
        if not decoded_motion_points_are_world:
            revolute_motion_points = self._convert_overparam_motion_points_to_world_coordinates(
                motion_points=revolute_motion_points,
                query_points=query_points,
                assigned_link_ids=assigned_link_ids,
            )
            prismatic_motion_points = self._convert_overparam_motion_points_to_world_coordinates(
                motion_points=prismatic_motion_points,
                query_points=query_points,
                assigned_link_ids=assigned_link_ids,
            )

        if decoded_axis_directions is None:
            revolute_axis_directions = None
            prismatic_axis_directions = None
        else:
            revolute_axis_directions = decoded_axis_directions[0]
            prismatic_axis_directions = decoded_axis_directions[1]

        # Run the parameter recovery with autocast disabled; fall back to a
        # no-op context when autocast cannot be constructed for this device.
        try:
            recovery_context = torch.autocast(
                device_type=query_points.device.type, enabled=False
            )
        except (RuntimeError, TypeError, ValueError):
            recovery_context = nullcontext()
        with recovery_context:
            return self._recover_overparam_joint_parameters(
                query_points=query_points,
                assigned_link_ids=assigned_link_ids,
                joint_connections=joint_connections,
                joint_valid_flag=joint_valid_flag,
                is_revolute=is_revolute,
                is_prismatic=is_prismatic,
                revolute_motion_points=revolute_motion_points,
                prismatic_motion_points=prismatic_motion_points,
                revolute_axis_directions=revolute_axis_directions,
                prismatic_axis_directions=prismatic_axis_directions,
            )

    def _decode_joint_motion_points(
        self,
        *,
        query_latents: Tensor,
        link_latents: Tensor,
        assigned_link_ids: Tensor,
        joint_connections: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """Run the over-parameterized joint decoder and return its pair of outputs.

        Raises:
            ValueError: If the model is not configured for an over-parameterized
                joint decode mode.
        """
        if self.joint_decode_type in _OVERPARAM_JOINT_DECODE_TYPES:
            decoder_inputs = {
                "query_latents": query_latents,
                "link_latents": link_latents,
                "assigned_link_ids": assigned_link_ids,
                "joint_connections": joint_connections,
            }
            return self.joint_decoder(**decoder_inputs)
        raise ValueError(
            "_decode_joint_motion_points is only available in over-parameterized joint decoding mode"
        )

    def _decode_joint_axis_directions(
        self,
        *,
        link_latents: Tensor,
        joint_connections: Tensor,
        joint_valid_flag: Tensor,
        is_revolute: Tensor,
        is_prismatic: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        if not self.overparam_predicts_single_axis_direction or self.joint_direction_decoder is None:
            raise ValueError(
                "_decode_joint_axis_directions is only available for joint_decode_type='overparam+singledir'"
            )
        return self.joint_direction_decoder.predict(
            link_latents=link_latents,
            joint_connections=joint_connections,
            joint_valid_flag=joint_valid_flag,
            is_revolute=is_revolute,
            is_prismatic=is_prismatic,
        )

    def _compute_query_link_aabb_parameters(
        self,
        *,
        query_points: Tensor,
        link_ids: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """Returns per-query AABB centers and half-extents for the assigned link ID."""
        if query_points.ndim != 3 or query_points.shape[-1] != 3:
            raise ValueError(
                f"query_points must have shape (B, Q, 3), got {tuple(query_points.shape)}"
            )
        if link_ids.shape != query_points.shape[:2]:
            raise ValueError(
                "link_ids must match query_points batch/query dims, "
                f"got {tuple(link_ids.shape)} and {tuple(query_points.shape)}"
            )
        query_points = query_points.float()
        centers = query_points.new_zeros(query_points.shape)
        half_extents = query_points.new_ones(query_points.shape)
        for batch_idx in range(query_points.shape[0]):
            batch_link_ids = link_ids[batch_idx]
            valid_mask = batch_link_ids >= 0
            if not bool(valid_mask.any().item()):
                continue
            unique_link_ids = torch.unique(batch_link_ids[valid_mask])
            for link_id in unique_link_ids.tolist():
                query_mask = batch_link_ids == int(link_id)
                link_query_points = query_points[batch_idx][query_mask]
                min_corner = link_query_points.min(dim=0).values
                max_corner = link_query_points.max(dim=0).values
                centers[batch_idx][query_mask] = 0.5 * (min_corner + max_corner)
                half_extents[batch_idx][query_mask] = (
                    0.5 * (max_corner - min_corner)
                ).clamp_min(_OVERPARAM_AXIS_AABB_HALF_EXTENT_MIN)
        return centers, half_extents

    def _denormalize_overparam_axis_points(
        self,
        *,
        axis_points: Tensor,
        query_points: Tensor,
        link_ids: Tensor,
    ) -> Tensor:
        """Converts normalized closest-axis points back into sample/world coordinates."""
        if not self.overparam_closest_axis_uses_part_aabb:
            return axis_points
        centers, half_extents = self._compute_query_link_aabb_parameters(
            query_points=query_points,
            link_ids=link_ids,
        )
        return axis_points.float() * half_extents + centers

    def _convert_overparam_motion_points_to_world_coordinates(
        self,
        *,
        motion_points: Tensor,
        query_points: Tensor,
        assigned_link_ids: Tensor,
    ) -> Tensor:
        """Returns motion-point predictions in sample/world coordinates."""
        if not self.overparam_closest_axis_uses_part_aabb:
            return motion_points
        closest_axis_points = self._denormalize_overparam_axis_points(
            axis_points=motion_points[..., :3],
            query_points=query_points,
            link_ids=assigned_link_ids,
        )
        return torch.cat((closest_axis_points, motion_points[..., 3:]), dim=-1)

    def _fit_revolute_joint_parameters(
        self,
        query_points: Tensor,
        revolute_motion_points: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """Fits one revolute axis and low/high angles from query-wise motion targets."""
        # This solver is small and numerically sensitive; keep it in fp32 even
        # when the surrounding network runs under AMP.
        query_points = query_points.float()
        revolute_motion_points = revolute_motion_points.float()
        if query_points.numel() == 0:
            zero_axis = revolute_motion_points.new_zeros(6)
            zero_range = revolute_motion_points.new_zeros(2)
            return zero_axis, zero_range

        closest_axis_points = revolute_motion_points[..., :3]
        low_points = revolute_motion_points[..., 3:6]
        high_points = revolute_motion_points[..., 6:9]
        direction_hint = (
            torch.linalg.cross(
                query_points - closest_axis_points,
                low_points - closest_axis_points,
                dim=-1,
            )
            + torch.linalg.cross(
                query_points - closest_axis_points,
                high_points - closest_axis_points,
                dim=-1,
            )
            + torch.linalg.cross(
                low_points - closest_axis_points,
                high_points - closest_axis_points,
                dim=-1,
            )
        ).mean(dim=0)
        axis_direction, axis_point = fit_axis_to_closest_points_torch(
            query_points,
            closest_axis_points,
            direction_hint=direction_hint,
        )
        revolute_axis = axis_point_to_plucker_torch(axis_direction, axis_point)
        low_limit = estimate_revolute_limit_torch(
            query_points,
            low_points,
            axis_direction,
            axis_point,
        )
        high_limit = estimate_revolute_limit_torch(
            query_points,
            high_points,
            axis_direction,
            axis_point,
        )
        return revolute_axis, torch.stack((low_limit, high_limit))

    def _aggregate_flip_invariant_axis_direction(
        self,
        predicted_directions: Tensor,
        *,
        sign_hint: Tensor | None = None,
    ) -> Tensor:
        """Returns one averaged unit direction while treating per-query flips as equivalent."""
        predicted_directions = predicted_directions.float()
        if predicted_directions.numel() == 0:
            return predicted_directions.new_zeros(3)

        direction_norms = torch.linalg.vector_norm(predicted_directions, dim=-1)
        valid_mask = direction_norms > 1e-8
        if not bool(valid_mask.any().item()):
            return predicted_directions.new_zeros(3)

        unit_directions = predicted_directions[valid_mask] / direction_norms[valid_mask].unsqueeze(-1)
        direction_covariance = unit_directions.transpose(0, 1) @ unit_directions
        _, eigenvectors = torch.linalg.eigh(direction_covariance)
        anchor_direction = eigenvectors[:, -1]
        alignment = torch.sign(unit_directions @ anchor_direction)
        alignment = torch.where(
            alignment == 0,
            torch.ones_like(alignment),
            alignment,
        )
        aligned_mean_direction = (unit_directions * alignment.unsqueeze(-1)).mean(dim=0)
        if float(torch.linalg.vector_norm(aligned_mean_direction).item()) <= 1e-8:
            axis_direction = F.normalize(anchor_direction, dim=0, eps=1e-8)
        else:
            axis_direction = F.normalize(aligned_mean_direction, dim=0, eps=1e-8)

        if sign_hint is not None:
            sign_hint = sign_hint.float()
            if float(torch.linalg.vector_norm(sign_hint).item()) > 1e-8:
                if float(torch.dot(axis_direction, sign_hint).item()) < 0.0:
                    axis_direction = -axis_direction
        return axis_direction

    def _fit_revolute_joint_parameters_with_direction(
        self,
        query_points: Tensor,
        revolute_motion_points: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """Fits one revolute axis from predicted axis directions plus point targets."""
        query_points = query_points.float()
        revolute_motion_points = revolute_motion_points.float()
        if query_points.numel() == 0:
            zero_axis = revolute_motion_points.new_zeros(6)
            zero_range = revolute_motion_points.new_zeros(2)
            return zero_axis, zero_range

        closest_axis_points = revolute_motion_points[..., :3]
        low_points = revolute_motion_points[..., 3:6]
        high_points = revolute_motion_points[..., 6:9]
        predicted_axis_directions = revolute_motion_points[..., 9:12]
        direction_sign_hint = (
            torch.linalg.cross(
                query_points - closest_axis_points,
                low_points - closest_axis_points,
                dim=-1,
            )
            + torch.linalg.cross(
                query_points - closest_axis_points,
                high_points - closest_axis_points,
                dim=-1,
            )
            + torch.linalg.cross(
                low_points - closest_axis_points,
                high_points - closest_axis_points,
                dim=-1,
            )
        ).mean(dim=0)
        axis_direction = self._aggregate_flip_invariant_axis_direction(
            predicted_axis_directions,
            sign_hint=direction_sign_hint,
        )
        if float(torch.linalg.vector_norm(axis_direction).item()) <= 1e-8:
            return self._fit_revolute_joint_parameters(
                query_points=query_points,
                revolute_motion_points=revolute_motion_points[..., :9],
            )

        axis_point = torch.quantile(closest_axis_points, 0.5, dim=0)
        low_limit = estimate_revolute_limit_torch(
            query_points,
            low_points,
            axis_direction,
            axis_point,
        )
        high_limit = estimate_revolute_limit_torch(
            query_points,
            high_points,
            axis_direction,
            axis_point,
        )
        if float(low_limit.item()) > float(high_limit.item()):
            axis_direction = -axis_direction
            low_limit = estimate_revolute_limit_torch(
                query_points,
                low_points,
                axis_direction,
                axis_point,
            )
            high_limit = estimate_revolute_limit_torch(
                query_points,
                high_points,
                axis_direction,
                axis_point,
            )
        revolute_axis = axis_point_to_plucker_torch(axis_direction, axis_point)
        return revolute_axis, torch.stack((low_limit, high_limit))

    def _fit_prismatic_joint_parameters(
        self,
        query_points: Tensor,
        prismatic_motion_points: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """Fits one prismatic axis and low/high displacements from query-wise targets."""
        # This solver is small and numerically sensitive; keep it in fp32 even
        # when the surrounding network runs under AMP.
        query_points = query_points.float()
        prismatic_motion_points = prismatic_motion_points.float()
        if query_points.numel() == 0:
            zero_axis = prismatic_motion_points.new_zeros(6)
            zero_range = prismatic_motion_points.new_zeros(2)
            return zero_axis, zero_range

        closest_axis_points = prismatic_motion_points[..., :3]
        low_points = prismatic_motion_points[..., 3:6]
        high_points = prismatic_motion_points[..., 6:9]
        direction_hint = (high_points - low_points).mean(dim=0)
        axis_direction, axis_point = fit_axis_to_closest_points_torch(
            query_points,
            closest_axis_points,
            direction_hint=direction_hint,
        )
        prismatic_axis = axis_point_to_plucker_torch(axis_direction, axis_point)
        low_limit = estimate_prismatic_limit_torch(
            query_points,
            low_points,
            axis_direction,
        )
        high_limit = estimate_prismatic_limit_torch(
            query_points,
            high_points,
            axis_direction,
        )
        return prismatic_axis, torch.stack((low_limit, high_limit))

    def _fit_prismatic_joint_parameters_with_direction(
        self,
        query_points: Tensor,
        prismatic_motion_points: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """Fits one prismatic axis from predicted axis directions plus point targets."""
        query_points = query_points.float()
        prismatic_motion_points = prismatic_motion_points.float()
        if query_points.numel() == 0:
            zero_axis = prismatic_motion_points.new_zeros(6)
            zero_range = prismatic_motion_points.new_zeros(2)
            return zero_axis, zero_range

        closest_axis_points = prismatic_motion_points[..., :3]
        low_points = prismatic_motion_points[..., 3:6]
        high_points = prismatic_motion_points[..., 6:9]
        predicted_axis_directions = prismatic_motion_points[..., 9:12]
        axis_direction = self._aggregate_flip_invariant_axis_direction(
            predicted_axis_directions,
            sign_hint=(high_points - low_points).mean(dim=0),
        )
        if float(torch.linalg.vector_norm(axis_direction).item()) <= 1e-8:
            return self._fit_prismatic_joint_parameters(
                query_points=query_points,
                prismatic_motion_points=prismatic_motion_points[..., :9],
            )

        axis_point = torch.quantile(closest_axis_points, 0.5, dim=0)
        low_limit = estimate_prismatic_limit_torch(
            query_points,
            low_points,
            axis_direction,
        )
        high_limit = estimate_prismatic_limit_torch(
            query_points,
            high_points,
            axis_direction,
        )
        if float(low_limit.item()) > float(high_limit.item()):
            axis_direction = -axis_direction
            low_limit = estimate_prismatic_limit_torch(
                query_points,
                low_points,
                axis_direction,
            )
            high_limit = estimate_prismatic_limit_torch(
                query_points,
                high_points,
                axis_direction,
            )
        prismatic_axis = axis_point_to_plucker_torch(axis_direction, axis_point)
        return prismatic_axis, torch.stack((low_limit, high_limit))

    def _fit_revolute_joint_parameters_with_single_direction(
        self,
        query_points: Tensor,
        revolute_motion_points: Tensor,
        axis_direction: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """Fits one revolute axis from a single predicted axis direction plus point targets."""
        query_points = query_points.float()
        revolute_motion_points = revolute_motion_points.float()
        axis_direction = axis_direction.float()
        if query_points.numel() == 0:
            zero_axis = revolute_motion_points.new_zeros(6)
            zero_range = revolute_motion_points.new_zeros(2)
            return zero_axis, zero_range

        if float(torch.linalg.vector_norm(axis_direction).item()) <= 1e-8:
            return self._fit_revolute_joint_parameters(
                query_points=query_points,
                revolute_motion_points=revolute_motion_points,
            )
        axis_direction = F.normalize(axis_direction, dim=0, eps=1e-8)

        closest_axis_points = revolute_motion_points[..., :3]
        low_points = revolute_motion_points[..., 3:6]
        high_points = revolute_motion_points[..., 6:9]
        axis_point = torch.quantile(closest_axis_points, 0.5, dim=0)
        low_limit = estimate_revolute_limit_torch(
            query_points,
            low_points,
            axis_direction,
            axis_point,
        )
        high_limit = estimate_revolute_limit_torch(
            query_points,
            high_points,
            axis_direction,
            axis_point,
        )
        if float(low_limit.item()) > float(high_limit.item()):
            axis_direction = -axis_direction
            low_limit = estimate_revolute_limit_torch(
                query_points,
                low_points,
                axis_direction,
                axis_point,
            )
            high_limit = estimate_revolute_limit_torch(
                query_points,
                high_points,
                axis_direction,
                axis_point,
            )
        revolute_axis = axis_point_to_plucker_torch(axis_direction, axis_point)
        return revolute_axis, torch.stack((low_limit, high_limit))

    def _fit_prismatic_joint_parameters_with_single_direction(
        self,
        query_points: Tensor,
        prismatic_motion_points: Tensor,
        axis_direction: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """Fits one prismatic axis from a single predicted axis direction plus point targets."""
        query_points = query_points.float()
        prismatic_motion_points = prismatic_motion_points.float()
        axis_direction = axis_direction.float()
        if query_points.numel() == 0:
            zero_axis = prismatic_motion_points.new_zeros(6)
            zero_range = prismatic_motion_points.new_zeros(2)
            return zero_axis, zero_range

        if float(torch.linalg.vector_norm(axis_direction).item()) <= 1e-8:
            return self._fit_prismatic_joint_parameters(
                query_points=query_points,
                prismatic_motion_points=prismatic_motion_points,
            )
        axis_direction = F.normalize(axis_direction, dim=0, eps=1e-8)

        closest_axis_points = prismatic_motion_points[..., :3]
        low_points = prismatic_motion_points[..., 3:6]
        high_points = prismatic_motion_points[..., 6:9]
        axis_point = torch.quantile(closest_axis_points, 0.5, dim=0)
        low_limit = estimate_prismatic_limit_torch(
            query_points,
            low_points,
            axis_direction,
        )
        high_limit = estimate_prismatic_limit_torch(
            query_points,
            high_points,
            axis_direction,
        )
        if float(low_limit.item()) > float(high_limit.item()):
            axis_direction = -axis_direction
            low_limit = estimate_prismatic_limit_torch(
                query_points,
                low_points,
                axis_direction,
            )
            high_limit = estimate_prismatic_limit_torch(
                query_points,
                high_points,
                axis_direction,
            )
        prismatic_axis = axis_point_to_plucker_torch(axis_direction, axis_point)
        return prismatic_axis, torch.stack((low_limit, high_limit))

    def _recover_overparam_joint_parameters(
        self,
        *,
        query_points: Tensor,
        assigned_link_ids: Tensor,
        joint_connections: Tensor,
        joint_valid_flag: Tensor,
        is_revolute: Tensor,
        is_prismatic: Tensor,
        revolute_motion_points: Tensor,
        prismatic_motion_points: Tensor,
        revolute_axis_directions: Tensor | None = None,
        prismatic_axis_directions: Tensor | None = None,
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """Recovers per-joint parameters by fitting each child link's query-wise targets."""
        query_points = query_points.float()
        revolute_motion_points = revolute_motion_points.float()
        prismatic_motion_points = prismatic_motion_points.float()
        batch_size, max_joints = joint_connections.shape[:2]
        revolute_axis = query_points.new_zeros((batch_size, max_joints, 6))
        prismatic_axis = query_points.new_zeros((batch_size, max_joints, 6))
        revolute_range = query_points.new_zeros((batch_size, max_joints, 2))
        prismatic_range = query_points.new_zeros((batch_size, max_joints, 2))

        child_link_ids = joint_connections[..., 1]
        for batch_idx in range(batch_size):
            for joint_idx in range(max_joints):
                query_mask = assigned_link_ids[batch_idx] == child_link_ids[batch_idx, joint_idx]
                joint_query_points = query_points[batch_idx][query_mask]
                if self.overparam_uses_axis_direction:
                    if self.joint_decode_type == "overparam+dir":
                        (
                            revolute_axis[batch_idx, joint_idx],
                            revolute_range[batch_idx, joint_idx],
                        ) = self._fit_revolute_joint_parameters_with_direction(
                            joint_query_points,
                            revolute_motion_points[batch_idx][query_mask],
                        )
                        (
                            prismatic_axis[batch_idx, joint_idx],
                            prismatic_range[batch_idx, joint_idx],
                        ) = self._fit_prismatic_joint_parameters_with_direction(
                            joint_query_points,
                            prismatic_motion_points[batch_idx][query_mask],
                        )
                    else:
                        if revolute_axis_directions is None or prismatic_axis_directions is None:
                            raise ValueError(
                                "overparam+singledir recovery requires per-joint axis directions"
                            )
                        (
                            revolute_axis[batch_idx, joint_idx],
                            revolute_range[batch_idx, joint_idx],
                        ) = self._fit_revolute_joint_parameters_with_single_direction(
                            joint_query_points,
                            revolute_motion_points[batch_idx][query_mask],
                            revolute_axis_directions[batch_idx, joint_idx],
                        )
                        (
                            prismatic_axis[batch_idx, joint_idx],
                            prismatic_range[batch_idx, joint_idx],
                        ) = self._fit_prismatic_joint_parameters_with_single_direction(
                            joint_query_points,
                            prismatic_motion_points[batch_idx][query_mask],
                            prismatic_axis_directions[batch_idx, joint_idx],
                        )
                else:
                    (
                        revolute_axis[batch_idx, joint_idx],
                        revolute_range[batch_idx, joint_idx],
                    ) = self._fit_revolute_joint_parameters(
                        joint_query_points,
                        revolute_motion_points[batch_idx][query_mask],
                    )
                    (
                        prismatic_axis[batch_idx, joint_idx],
                        prismatic_range[batch_idx, joint_idx],
                    ) = self._fit_prismatic_joint_parameters(
                        joint_query_points,
                        prismatic_motion_points[batch_idx][query_mask],
                    )

        revolute_mask = (joint_valid_flag & is_revolute).unsqueeze(-1)
        prismatic_mask = (joint_valid_flag & is_prismatic).unsqueeze(-1)
        return (
            revolute_axis.masked_fill(~revolute_mask, 0),
            prismatic_axis.masked_fill(~prismatic_mask, 0),
            revolute_range.masked_fill(~revolute_mask[..., :1], 0),
            prismatic_range.masked_fill(~prismatic_mask[..., :1], 0),
        )

    def _resolve_joint_decoding_link_ids(
        self,
        *,
        segmentation_logits: Tensor,
        link_ids: Tensor | None,
    ) -> Tensor:
        if link_ids is not None:
            return link_ids
        if self.training:
            raise ValueError(
                "forward requires link_ids when joint_decode_type uses over-parameterized decoding during training"
            )
        return segmentation_logits.argmax(dim=-1)

    def _build_parent_index(
        self,
        *,
        joint_connections: Tensor,
        joint_valid_flag: Tensor,
        num_links: int,
    ) -> Tensor:
        """Returns one parent-link index per link, using `-1` for roots/padding."""
        device = joint_connections.device
        parent_index = torch.full(
            (joint_connections.shape[0], num_links),
            fill_value=-1,
            dtype=joint_connections.dtype,
            device=device,
        )
        if joint_connections.shape[1] == 0 or num_links == 0:
            return parent_index

        parent_indices = joint_connections[..., 0]
        child_indices = joint_connections[..., 1]
        valid_joint_mask = (
            joint_valid_flag
            & (parent_indices >= 0)
            & (parent_indices < num_links)
            & (child_indices >= 0)
            & (child_indices < num_links)
        )
        if not bool(valid_joint_mask.any().item()):
            return parent_index

        valid_child_mask = torch.zeros(
            (joint_connections.shape[0], num_links),
            dtype=torch.bool,
            device=device,
        )
        valid_child_mask.scatter_(
            dim=1,
            index=child_indices.clamp_min(0),
            src=valid_joint_mask,
        )
        parent_values = torch.where(valid_joint_mask, parent_indices, torch.zeros_like(parent_indices))
        parent_index.scatter_(
            dim=1,
            index=child_indices.clamp_min(0),
            src=parent_values,
        )
        return parent_index.masked_fill(~valid_child_mask, -1)

    def _build_depth_decayed_ancestor_context(
        self,
        *,
        link_latents: Tensor,
        joint_connections: Tensor,
        joint_valid_flag: Tensor,
    ) -> Tensor:
        """Returns a depth-decayed ancestor average for each link latent."""
        if not self.use_ancestor_context_for_segmentation:
            return link_latents

        batch_size, num_links, _ = link_latents.shape
        if num_links == 0:
            return torch.zeros_like(link_latents)

        parent_index = self._build_parent_index(
            joint_connections=joint_connections,
            joint_valid_flag=joint_valid_flag,
            num_links=num_links,
        )
        valid_parent_mask = parent_index >= 0
        if self.ancestor_context_decay == 0.0:
            parent_latents = link_latents.gather(
                dim=1,
                index=parent_index.clamp_min(0).unsqueeze(-1).expand_as(link_latents),
            )
            return parent_latents * valid_parent_mask.unsqueeze(-1).to(dtype=link_latents.dtype)
        if not bool(valid_parent_mask.any().item()):
            return torch.zeros_like(link_latents)
        solve_dtype = (
            torch.float64
            if link_latents.device.type == "cpu"
            else torch.float32
        )
        parent_adjacency = torch.zeros(
            (batch_size, num_links, num_links),
            device=link_latents.device,
            dtype=solve_dtype,
        )
        parent_adjacency.scatter_(
            dim=2,
            index=parent_index.clamp_min(0).unsqueeze(-1),
            src=valid_parent_mask.unsqueeze(-1).to(dtype=solve_dtype),
        )
        system_matrix = (
            torch.eye(num_links, device=link_latents.device, dtype=solve_dtype).unsqueeze(0)
            - self.ancestor_context_decay * parent_adjacency
        )
        weighted_ancestor_matrix = torch.linalg.solve(system_matrix, parent_adjacency)
        normalization = weighted_ancestor_matrix.sum(dim=-1, keepdim=True)
        normalized_ancestor_matrix = weighted_ancestor_matrix / normalization.clamp_min(1.0)
        normalized_ancestor_matrix = normalized_ancestor_matrix.masked_fill(
            normalization <= 0.0,
            0.0,
        )
        return torch.matmul(
            normalized_ancestor_matrix.to(dtype=link_latents.dtype),
            link_latents,
        )

    def build_segmentation_link_latents(
        self,
        *,
        link_latents: Tensor,
        joint_connections: Tensor,
        joint_valid_flag: Tensor,
    ) -> Tensor:
        """Adds a gated ancestor-context term to the link latents used for segmentation.

        Args:
            link_latents: Per-link latents.
            joint_connections: Joint table used to derive each link's ancestors.
            joint_valid_flag: Joint validity mask.

        Returns:
            ``link_latents`` unchanged when
            ``use_ancestor_context_for_segmentation`` is false; otherwise
            ``link_latents + gate * projected_context``, where the gate is a
            sigmoid over the concatenation of the latents and the projected
            context.

        Raises:
            RuntimeError: If ancestor context is enabled but the projector or
                the gate module is ``None``.
        """
        if not self.use_ancestor_context_for_segmentation:
            return link_latents
        if self.ancestor_context_gate is None or self.ancestor_context_projector is None:
            raise RuntimeError(
                "ancestor-context projector and gate must exist when "
                "use_ancestor_context_for_segmentation=True"
            )

        raw_context = self._build_depth_decayed_ancestor_context(
            link_latents=link_latents,
            joint_connections=joint_connections,
            joint_valid_flag=joint_valid_flag,
        )
        context = self.ancestor_context_projector(raw_context)
        gate_input = torch.cat((link_latents, context), dim=-1)
        gate = torch.sigmoid(self.ancestor_context_gate(gate_input))
        return link_latents + gate * context

    def decode(
        self,
        *,
        query_latents: Tensor,
        query_points: Tensor,
        link_latents: Tensor,
        link_valid_flag: Tensor,
        joint_connections: Tensor,
        joint_valid_flag: Tensor,
        is_revolute: Tensor,
        is_prismatic: Tensor,
        link_ids: Tensor | None = None,
    ) -> Dict[str, Any]:
        """Decodes segmentation and joint predictions without reordering the joint inputs.

        Latents of invalid links are zeroed first. Segmentation is always
        decoded. Joint decoding depends on ``self.joint_decode_type``:

        * Plain types: joint parameters are decoded from the link latents, and
          only outside training mode; all motion-point entries stay ``None``.
        * All other types: per-query motion points are decoded for the resolved
          link assignment, converted to world coordinates, and joint parameters
          are recovered from them. The motion-point slices are returned too.

        Args:
            query_latents: Per-query latents.
            query_points: Query positions; echoed back in the result.
            link_latents: Per-link latents.
            link_valid_flag: Boolean mask of valid links.
            joint_connections: Joint table; echoed back in the result.
            joint_valid_flag: Joint validity mask; echoed back in the result.
            is_revolute: Revolute-joint mask; echoed back in the result.
            is_prismatic: Prismatic-joint mask; echoed back in the result.
            link_ids: Optional per-query link assignment for the
                over-parameterized path.

        Returns:
            Dictionary with the segmentation logits, the four joint-parameter
            tensors, the revolute/prismatic motion-point slices and axis
            directions, the link ids used for joint decoding, and the echoed
            inputs. Entries that do not apply to the active decode type are
            ``None``.
        """
        # Padded links must not leak into any decoder.
        link_latents = link_latents.masked_fill(~link_valid_flag.unsqueeze(-1), 0)
        segmentation_logits = self.decode_segmentation(
            query_latents=query_latents,
            link_latents=self.build_segmentation_link_latents(
                link_latents=link_latents,
                joint_connections=joint_connections,
                joint_valid_flag=joint_valid_flag,
            ),
            link_valid_flag=link_valid_flag,
        )

        # Every key is present from the start; branches below only fill in
        # the entries they actually produce.
        outputs: Dict[str, Any] = {
            "segmentation_logits": segmentation_logits,
            "revolute_axis": None,
            "prismatic_axis": None,
            "revolute_range": None,
            "prismatic_range": None,
            "revolute_closest_axis_points": None,
            "revolute_closest_axis_points_decoder": None,
            "revolute_low_points": None,
            "revolute_high_points": None,
            "revolute_axis_directions": None,
            "prismatic_closest_axis_points": None,
            "prismatic_closest_axis_points_decoder": None,
            "prismatic_low_points": None,
            "prismatic_high_points": None,
            "prismatic_axis_directions": None,
            "joint_decoding_link_ids": None,
            "joint_connections": joint_connections,
            "joint_valid_flag": joint_valid_flag,
            "is_revolute": is_revolute,
            "is_prismatic": is_prismatic,
            "query_points": query_points,
        }
        joint_inputs = {
            "link_latents": link_latents,
            "joint_connections": joint_connections,
            "joint_valid_flag": joint_valid_flag,
            "is_revolute": is_revolute,
            "is_prismatic": is_prismatic,
        }

        if self.joint_decode_type in _PLAIN_JOINT_DECODE_TYPES:
            if not self.training:
                revolute_axis, prismatic_axis, revolute_range, prismatic_range = (
                    self.decode_joint_parameters(**joint_inputs)
                )
                outputs.update(
                    revolute_axis=revolute_axis,
                    prismatic_axis=prismatic_axis,
                    revolute_range=revolute_range,
                    prismatic_range=prismatic_range,
                )
            return outputs

        # Over-parameterized decoding: per-query motion points first.
        assigned_link_ids = self._resolve_joint_decoding_link_ids(
            segmentation_logits=segmentation_logits,
            link_ids=link_ids,
        )
        revolute_decoder_points, prismatic_decoder_points = self._decode_joint_motion_points(
            query_latents=query_latents,
            link_latents=link_latents,
            assigned_link_ids=assigned_link_ids,
            joint_connections=joint_connections,
        )

        revolute_joint_axis_directions = None
        prismatic_joint_axis_directions = None
        if self.overparam_predicts_single_axis_direction:
            (
                revolute_joint_axis_directions,
                prismatic_joint_axis_directions,
            ) = self._decode_joint_axis_directions(**joint_inputs)

        revolute_world_points = self._convert_overparam_motion_points_to_world_coordinates(
            motion_points=revolute_decoder_points,
            query_points=query_points,
            assigned_link_ids=assigned_link_ids,
        )
        prismatic_world_points = self._convert_overparam_motion_points_to_world_coordinates(
            motion_points=prismatic_decoder_points,
            query_points=query_points,
            assigned_link_ids=assigned_link_ids,
        )

        if revolute_joint_axis_directions is None or prismatic_joint_axis_directions is None:
            decoded_axis_directions = None
        else:
            decoded_axis_directions = (
                revolute_joint_axis_directions,
                prismatic_joint_axis_directions,
            )
        revolute_axis, prismatic_axis, revolute_range, prismatic_range = (
            self.decode_joint_parameters(
                **joint_inputs,
                query_points=query_points,
                assigned_link_ids=assigned_link_ids,
                decoded_motion_points=(revolute_world_points, prismatic_world_points),
                decoded_axis_directions=decoded_axis_directions,
                decoded_motion_points_are_world=True,
            )
        )

        # Per-query directions live in channels 9:12 of the motion points;
        # single directions come from the dedicated per-joint decoder.
        if self.overparam_predicts_query_axis_direction:
            revolute_axis_directions = revolute_world_points[..., 9:12]
            prismatic_axis_directions = prismatic_world_points[..., 9:12]
        elif self.overparam_predicts_single_axis_direction:
            revolute_axis_directions = revolute_joint_axis_directions
            prismatic_axis_directions = prismatic_joint_axis_directions
        else:
            revolute_axis_directions = None
            prismatic_axis_directions = None

        outputs.update(
            revolute_axis=revolute_axis,
            prismatic_axis=prismatic_axis,
            revolute_range=revolute_range,
            prismatic_range=prismatic_range,
            revolute_closest_axis_points=revolute_world_points[..., :3],
            revolute_closest_axis_points_decoder=revolute_decoder_points[..., :3],
            revolute_low_points=revolute_world_points[..., 3:6],
            revolute_high_points=revolute_world_points[..., 6:9],
            revolute_axis_directions=revolute_axis_directions,
            prismatic_closest_axis_points=prismatic_world_points[..., :3],
            prismatic_closest_axis_points_decoder=prismatic_decoder_points[..., :3],
            prismatic_low_points=prismatic_world_points[..., 3:6],
            prismatic_high_points=prismatic_world_points[..., 6:9],
            prismatic_axis_directions=prismatic_axis_directions,
            joint_decoding_link_ids=assigned_link_ids,
        )
        return outputs

    def forward(
        self,
        shape_points: Tensor,
        shape_point_normals: Tensor,
        query_points: Tensor,
        query_point_normals: Tensor,
        link_point_prompts: Tensor | None,
        link_point_prompt_normals: Tensor | None,
        link_valid_flag: Tensor,
        joint_connections: Tensor,
        joint_valid_flag: Tensor,
        is_revolute: Tensor,
        is_prismatic: Tensor,
        link_point_prompt_dropout_eligible: Tensor | None = None,
        link_text_prompts: Sequence[Sequence[str]] | None = None,
        link_text_embeddings: Tensor | None = None,
        link_ids: Tensor | None = None,
    ) -> Dict[str, Any]:
        """Encodes inputs and decodes using padded joint tensors supplied by the caller."""
        encoder_output = self.encoder(
            shape_points=shape_points,
            shape_point_normals=shape_point_normals,
            query_points=query_points,
            query_point_normals=query_point_normals,
            link_point_prompts=link_point_prompts,
            link_point_prompt_normals=link_point_prompt_normals,
            link_valid_flag=link_valid_flag,
            link_point_prompt_dropout_eligible=link_point_prompt_dropout_eligible,
            link_text_prompts=link_text_prompts,
            link_text_embeddings=link_text_embeddings,
        )
        decoded_output = self.decode(
            query_latents=encoder_output["query_latents"],
            query_points=query_points,
            link_latents=encoder_output["link_latents"],
            link_valid_flag=link_valid_flag,
            joint_connections=joint_connections,
            joint_valid_flag=joint_valid_flag,
            is_revolute=is_revolute,
            is_prismatic=is_prismatic,
            link_ids=link_ids,
        )
        return {**encoder_output, **decoded_output}


# Names exported by a star-import of this module; everything else is treated
# as private to the module.
__all__ = [
    "Particulate2ArticulationModel",
    "Particulate2Encoder",
    "JointDecoderPlainFlowMatching",
    "JointDecoderOverParametrized",
    "JointDecoderSingleDirection",
    "JointDecoderPlain",
    "SegmentationDecoder",
]