import os
from typing import Any, Callable, Sequence
from warnings import warn

import attr
import torch
from tqdm import tqdm

from src.data.esm.sdk.api import (
    ESM3InferenceClient,
    ESMProtein,
    ESMProteinError,
    ESMProteinTensor,
    ForwardAndSampleOutput,
    ForwardTrackData,
    GenerationConfig,
    LogitsConfig,
    LogitsOutput,
    SamplingConfig,
    SamplingTrackConfig,
)
from src.data.esm.tokenization import (
    EsmTokenizerBase,
    TokenizerCollectionProtocol,
)
from src.data.esm.tokenization.function_tokenizer import (
    InterProQuantizedTokenizer,
)
from src.data.esm.utils.constants import esm3 as C
from src.data.esm.utils.misc import stack_variable_length_tensors
from src.data.esm.utils.noise_schedules import NOISE_SCHEDULE_REGISTRY
from src.data.esm.utils.sampling import (
    _BatchedESMProteinTensor,
    get_sampling_mask,
    sample_function_logits,
    sample_logits,
    sample_residue_annotation_logits,
    sample_sasa_logits,
)


def _trim_sequence_tensor_dataclass(o: Any, sequence_len: int):
    """Trim all tensors in an attrs dataclass along the sequence dimension.

    This util assumes every tensor in the input dataclass has a leading batch
    dimension, i.e. shape (B, L, ...); trimming keeps only the first
    ``sequence_len`` entries of dim 1.
    """
    assert attr.has(o.__class__)
    sliced = {}
    for k, v in attr.asdict(o, recurse=False).items():
        if v is None:
            sliced[k] = None
        elif isinstance(v, torch.Tensor):
            # Trim padding.
            sliced[k] = v[:, :sequence_len]
        elif isinstance(v, tuple) and all(isinstance(t, torch.Tensor) for t in v):
            # Trim padding for a tuple of tensors.
            # NOTE(review): this deliberately returns a list, not a tuple,
            # preserving the original behavior.
            sliced[k] = [t[:, :sequence_len] for t in v]
        elif attr.has(v.__class__):
            # Recursively slice the child attribute.
            sliced[k] = _trim_sequence_tensor_dataclass(v, sequence_len)
        else:
            # Otherwise, simply copy the entire data bit over.
            sliced[k] = v
    return attr.evolve(o, **sliced)


def _slice_tensor_dataclass(o: Any, i: int, keep_dim: bool = False) -> Any:
    """Take a slice out of any attrs-defined Tensor object along the batch dim.

    Args:
        o: input tensor object to be sliced.
        i: index of the row to be sliced.
        keep_dim: whether to keep the batch dim after slicing.
            For example, given a tensor of shape (5, 8), if keep_dim is True,
            return a sliced tensor of shape (1, 8). Return a tensor of shape
            (8,) instead if keep_dim is False. The default is False.
    """
    assert attr.has(o.__class__)
    sliced = {}
    for k, v in attr.asdict(o, recurse=False).items():
        if v is None:
            sliced[k] = None
        elif isinstance(v, torch.Tensor):
            # Select the i-th row of each tensor.
            row = v.select(0, i)
            if keep_dim:
                row = row.unsqueeze(0)
            sliced[k] = row
        elif attr.has(v.__class__):
            # Recursively slice the child attribute.
            sliced[k] = _slice_tensor_dataclass(v, i, keep_dim)
        else:
            # Otherwise, simply copy the entire data bit over.
            sliced[k] = v
    return attr.evolve(o, **sliced)


def iterative_sampling_raw(
    client: ESM3InferenceClient,
    proteins: list[ESMProtein],
    configs: list[GenerationConfig],
) -> list[ESMProtein | ESMProteinError]:
    """Encode raw proteins, batch-generate, and decode back to raw proteins.

    Failed generations are passed through as ESMProteinError entries so that
    the output list stays aligned with the input list.
    """
    # Keep structure tokens
    input_tokens = [client.encode(protein) for protein in proteins]

    output_tokens_list = client.batch_generate(input_tokens, configs)

    raw_proteins: list[ESMProtein | ESMProteinError] = []
    for output_tokens in output_tokens_list:
        if isinstance(output_tokens, ESMProteinTensor):
            raw_proteins.append(client.decode(output_tokens))
        elif isinstance(output_tokens, ESMProteinError):
            raw_proteins.append(output_tokens)
        else:
            raise ValueError(f"Unknown output type {type(output_tokens)}")

    for input_protein, raw_protein, config in zip(proteins, raw_proteins, configs):
        if isinstance(raw_protein, ESMProteinError):
            # If this generation errored out.
            continue
        if config.track not in ["function", "residue_annotations"]:
            # Function and residue annotation encoding/decoding is lossy.
            # There is no guarantee that decoding encoded tokens will yield
            # the same input, so restore the user-provided annotations.
            raw_protein.function_annotations = input_protein.function_annotations

    return raw_proteins


def _make_masked_inputs(
    track: str, sequence_length: int, tokenizers: TokenizerCollectionProtocol
):
    """Create an all-masked dummy input for ``track`` of the given length.

    Returns None when no dummy input can be constructed for the track.
    """
    get_tokenizer: Callable[[str], EsmTokenizerBase] = lambda s: getattr(tokenizers, s)
    has_tokenizer: Callable[[str], bool] = lambda s: hasattr(tokenizers, s)

    if track == "coordinates":
        dims = (sequence_length, 3, 3)
    elif track == "confidence":
        dims = (sequence_length,)
    elif track == "attention_mask":
        dims = (sequence_length,)
    elif track == "function":
        dims = (sequence_length, tokenizers.function.depth)
    elif track == "residue_annotations":
        dims = (sequence_length, C.MAX_RESIDUE_ANNOTATIONS)
    else:
        dims = (sequence_length,)

    if track == "coordinates":
        masked_tokens = torch.full(dims, torch.inf, dtype=torch.float)
    elif track == "confidence":
        # All-mask dummy input for confidence track.
        masked_tokens = torch.full(dims, 0.0)
    elif track == "attention_mask":
        masked_tokens = torch.full(dims, 1, dtype=torch.bool)
    elif has_tokenizer(track):
        masked_tokens = torch.full(
            dims, get_tokenizer(track).mask_token_id, dtype=torch.long
        )
        masked_tokens[0] = get_tokenizer(track).bos_token_id
        masked_tokens[-1] = get_tokenizer(track).eos_token_id
    else:
        # Does not know how to create the dummy all-masked input.
        return None

    return masked_tokens


def _stack_protein_tensors(
    input_tokens: list[ESMProteinTensor],
    sequence_lengths: list[int],
    tokenizers: TokenizerCollectionProtocol,
    device: str | torch.device,
) -> _BatchedESMProteinTensor:
    """Stack a list of (possibly partially-None) protein tensors into a batch.

    Missing tracks are replaced with all-masked dummy inputs where possible;
    tracks that are None for every prompt stay None in the batched output.
    """
    o = _BatchedESMProteinTensor()

    def _maybe_mock_input(fn, t, l):
        # Keep a user-provided tensor as is; otherwise try to create a
        # dummy all-masked input for this prompt.
        if t is not None:
            return t
        t = _make_masked_inputs(fn, l, tokenizers)
        if t is not None:
            t = t.to(device)
        return t

    def _stack_field(fn: str):
        tensors = [getattr(tokens, fn) for tokens in input_tokens]

        # Create all-mask mock inputs for any tensors that are None.
        tensors = [
            _maybe_mock_input(fn, t, l) for t, l in zip(tensors, sequence_lengths)
        ]

        # Handle any track that has all None as the input.
        # We can't meaningfully stack tensors in this case, so simply batch
        # them as None in _BatchedESMProteinTensor.
        if all([t is None for t in tensors]):
            setattr(o, fn, None)
            return

        if fn == "coordinates":
            mask_token_id = torch.inf
        else:
            mask_token_id = getattr(tokenizers, fn).pad_token_id

        setattr(
            o,
            fn,
            stack_variable_length_tensors(
                sequences=tensors,  # type: ignore
                constant_value=mask_token_id,
            ),
        )

    for f in attr.fields(ESMProteinTensor):
        # We do not batch potential_sequence_of_concern field.
        if f.name == "potential_sequence_of_concern":
            continue
        _stack_field(f.name)

    return o


def _get_masked_positions(
    track: str, tokens: torch.Tensor, mask_token_id: int
) -> torch.Tensor:
    """Return a boolean mask of positions that hold the mask token.

    For the function track a position counts as masked only when every depth
    slot equals the mask token. BOS/EOS positions are always excluded.
    """
    if track == "function":
        mask = torch.all(tokens == mask_token_id, dim=-1).to(tokens.device)
    else:
        mask = tokens == mask_token_id

    # Should not sample BOS and EOS positions.
    mask[..., 0] = False
    mask[..., -1] = False

    return mask


def _get_iterative_sampling_mask_for_prompt_and_step(
    cur_sampled: _BatchedESMProteinTensor,
    sequence_lengths: torch.Tensor,
    total_to_sample: torch.Tensor,
    step: int,
    entropy: ForwardTrackData,
    config: GenerationConfig,
    tokenizers: TokenizerCollectionProtocol,
) -> torch.Tensor:
    """Get sampling mask based on forward output and config.

    Raises:
        ValueError: when the track has no masked positions to sample, or the
            config requests an unknown sampling strategy.

    Returns:
        Boolean mask of positions to sample this step, shape (B, L) or
        (B, L, D) for the function track.
    """
    track_to_sample = config.track
    tokens = getattr(cur_sampled, track_to_sample)
    device = tokens.device

    shape = tokens.shape
    B, L = shape[0], shape[1]
    # TODO: figure out why we want this function to work with
    # _BatchedESMProteinTensor in the first place. Logics below
    # don't really work for batched tensors.
    assert B == 1

    sampling_mask = torch.ones((B, L), dtype=torch.bool, device=device)
    sampling_mask[:, 0] = False  # BOS
    # EOS and all padding tokens.
    sampling_mask &= (
        torch.arange(L).repeat(B, 1) < (sequence_lengths - 1).unsqueeze(-1)
    ).to(device)

    is_mask = _get_masked_positions(
        track_to_sample, tokens, getattr(tokenizers, track_to_sample).mask_token_id
    )
    if not is_mask.any().item():
        raise ValueError(f"Cannot sample {config.track} when input has no masks.")
    sampling_mask = sampling_mask & is_mask

    # Initialize schedule and masks
    decoding_schedule = NOISE_SCHEDULE_REGISTRY[config.schedule]

    # Calculate number of tokens to sample
    still_masked = torch.sum(sampling_mask).int()
    perc_masked_after_this_step = decoding_schedule(
        torch.tensor((step + 1) / config.num_steps)
    )
    num_tokens_masked_after_this_step = (
        # To avoid rounding errors, add a small epsilon.
        # NOTE: Tensor.round does not cast to int,
        # so it actually leads to rounding down.
        # e.g. tensor(67.0000).int() = 66
        perc_masked_after_this_step * total_to_sample + 0.1
    ).int()
    num_to_sample = still_masked - num_tokens_masked_after_this_step

    if config.strategy == "entropy":
        track_entropy: torch.Tensor = getattr(entropy, track_to_sample).to(
            device
        )  # (B, L) or (B, L, D)

        if track_to_sample == "function":
            track_entropy = track_entropy.sum(-1)  # (B, L, D) -> (B, L)

        track_entropy = track_entropy.masked_fill(
            ~sampling_mask, torch.finfo(track_entropy.dtype).max
        )
        _, indices = track_entropy.topk(num_to_sample, dim=-1, largest=False)
        is_top_k = torch.zeros((B, L), dtype=torch.bool, device=device).scatter(
            1, indices, True
        )
        where_to_sample = sampling_mask & is_top_k
    elif config.strategy == "random":
        # Skip B since we know there is only 1 prompt here.
        _, masked_indices = sampling_mask.nonzero(as_tuple=True)
        # Random shuffle the masked indices then select the first num_to_sample.
        rnd_indices = masked_indices[torch.randperm(len(masked_indices))][
            :num_to_sample
        ]
        rnd_mask = torch.zeros_like(sampling_mask)
        rnd_mask[:, rnd_indices] = True
        where_to_sample = sampling_mask & rnd_mask
    else:
        # BUG FIX: previously an unknown strategy fell through and raised an
        # opaque UnboundLocalError. Raise ValueError instead so callers that
        # already catch ValueError turn this into a per-prompt error.
        raise ValueError(f"Unknown sampling strategy {config.strategy}.")

    if track_to_sample == "function":
        where_to_sample = where_to_sample.unsqueeze(-1).expand(
            B, L, tokenizers.function.depth
        )  # (B, L) -> (B, L, D)

    return where_to_sample


def _get_non_special_tokens(
    protein: ESMProteinTensor, tokenizers: TokenizerCollectionProtocol
) -> int:
    """Count sequence positions that are not special tokens (MASK excluded)."""
    if protein.sequence is None:
        # There is no sequence to infer the number of tokens to decode.
        # So we assume the entire sequence minus bos and eos are for decoding.
        return len(protein) - 2

    mask = torch.ones_like(protein.sequence)
    for special_token in tokenizers.sequence.special_token_ids:
        if special_token == tokenizers.sequence.mask_token_id:
            continue  # MASK tokens need to be sampled.
        mask[protein.sequence == special_token] = 0

    return int(torch.sum(mask).item())


def _get_annealed_temperature(step: int, num_steps: int, initial_temperature: float):
    """Linearly anneal temperature over steps, squared, floored at 0.001**2."""
    step_ratio = step / max(1, (num_steps - 1))
    return max(initial_temperature - step_ratio, 0.001) ** 2


def iterative_sampling_tokens(
    client: ESM3InferenceClient,
    input_tokens: list[ESMProteinTensor],
    configs: list[GenerationConfig],
    tokenizers: TokenizerCollectionProtocol,
) -> Sequence[ESMProteinTensor | ESMProteinError]:
    """Iteratively decode masked positions for a batch of token prompts.

    Each prompt has its own GenerationConfig; sampling is performed per prompt
    inside a shared batched forward pass. Prompts that error out are returned
    as ESMProteinError entries in the same positions.
    """
    devices = set([t.device for t in input_tokens])
    if len(devices) > 1:
        raise AttributeError(f"Input tokens on multiple devices {devices}")

    sampled_tokens = [attr.evolve(tokens) for tokens in input_tokens]

    # Clear structure tokens if user would like to condition only on coordinates.
    for tokens, config in zip(sampled_tokens, configs):
        if config.condition_on_coordinates_only and tokens.coordinates is not None:
            tokens.structure = None

    # Total sequence lengths.
    sequence_lengths = [len(tokens) for tokens in sampled_tokens]

    # Figure out the number of tokens to be sampled for each prompt.
    total_to_sample = []
    for protein, config in zip(sampled_tokens, configs):
        track = config.track
        if getattr(protein, track) is None:
            # We need to sample the entire track.
            num_sampling_steps = _get_non_special_tokens(protein, tokenizers)
        else:
            masked = _get_masked_positions(
                track, getattr(protein, track), getattr(tokenizers, track).mask_token_id
            )
            num_sampling_steps = torch.sum(masked).item()

        total_to_sample.append(num_sampling_steps)

        # Users might over-specify the number of sampling steps for a given prompt
        # TODO: Give a warning about mismatched num_steps and number of masks.
        if (num_sampling_steps > 0) and (num_sampling_steps < config.num_steps):
            config.num_steps = int(num_sampling_steps)

    # Different prompts may ask for different number of decoding steps.
    # For now, we simply run the max number of steps.
    # TODO: return completed proteins as soon as they are finished sampling.
    max_num_steps = max([config.num_steps for config in configs])

    # Now stack the list to make a single batched ESMProteinTensor.
    batched_tokens = _stack_protein_tensors(
        sampled_tokens, sequence_lengths, tokenizers, devices.pop()
    )

    # Remember sampled prompts that have somehow errored out.
    errors: dict[int, ESMProteinError] = {}

    # Decode.
    # NOTE(review): any non-empty env value (including "0"/"false") disables
    # the progress bar, since bool() of a non-empty string is True.
    disable_tqdm = bool(os.environ.get("DISABLE_ITERATIVE_SAMPLING_TQDM", False))
    for t in tqdm(range(max_num_steps), disable=disable_tqdm):
        forward_out = _batch_forward(client, batched_tokens)

        # Sample each prompt individually, since their configuration may
        # be very different.
        # TODO: downstream utils work with batch dimension.
        # Group by sampling configurations and sample those prompts together.
        for i, config in enumerate(configs):  # B
            if i in errors:
                # This prompt has errored out in previous steps. Skip.
                continue
            if config.track in ["coordinates", "residue_annotations"]:
                errors[i] = ESMProteinError(
                    error_code=500,
                    error_msg=f"Iterative sampling {config.track} is not supported.",
                )
                continue
            if t >= config.num_steps:
                # Done sampling for this row.
                continue

            per_prompt_cur_sampled = _BatchedESMProteinTensor.from_protein_tensor(
                batched_tokens.slice(i)
            )
            per_prompt_forward_out: LogitsOutput = _slice_tensor_dataclass(
                forward_out, i, keep_dim=True
            )
            # Trim logits to proper sequence length for this prompt.
            per_prompt_forward_out = _trim_sequence_tensor_dataclass(
                per_prompt_forward_out,
                # Note(jungong) : we can not simply use sequence_lengths[i] here,
                # what we want is for the sequence length of the logits to match
                # that of the prompt, which may or may not be padded, depending on
                # whether the padding was done locally with the open source model
                # (where per_prompt_cur_sampled is already padded) or by
                # BatchedESM3ModelRunner (where per_prompt_cur_sampled is not padded).
                len(per_prompt_cur_sampled),
            )

            # Handle temperature annealing, since _sample_per_prompt() doesn't have
            # the concept of decoding steps.
            if config.temperature_annealing:
                temperature = _get_annealed_temperature(
                    t, config.num_steps, config.temperature
                )
            else:
                temperature = config.temperature

            track_sample_config = SamplingTrackConfig()
            track_sample_config.invalid_ids = config.invalid_ids
            track_sample_config.temperature = temperature
            track_sample_config.top_p = config.top_p
            sampling_config = SamplingConfig(**{config.track: track_sample_config})  # type: ignore

            # Sampling has to be done per-prompt, since sampling configs
            # are likely to be different for different prompts.
            per_prompt_forward_and_sample_output = _sample_per_prompt(
                per_prompt_cur_sampled,
                per_prompt_forward_out,
                sampling_config,
                tokenizers,
                decode_sasa_tokens=False,
            )

            # All positions sampled after _sample_per_prompt() above.
            # (B, L) & (B, L, D)
            per_prompt_new_sampled = per_prompt_forward_and_sample_output.protein_tensor

            # Find the positions we should sample this round.
            assert per_prompt_forward_and_sample_output.entropy is not None
            try:
                where_to_sample = _get_iterative_sampling_mask_for_prompt_and_step(
                    per_prompt_cur_sampled,
                    torch.tensor(sequence_lengths[i]),
                    torch.tensor(total_to_sample[i]),
                    t,
                    per_prompt_forward_and_sample_output.entropy,
                    config,
                    tokenizers,
                )
            except ValueError as e:
                errors[i] = ESMProteinError(error_code=500, error_msg=str(e))
                continue

            # BUG FIX: Tensor.to() is not in-place; the previous code discarded
            # the result, so the mask could stay on the wrong device.
            where_to_sample = where_to_sample.to(input_tokens[0].device)

            old_track_samples = getattr(per_prompt_cur_sampled, config.track)
            new_track_samples = getattr(per_prompt_new_sampled, config.track)

            # Iterative sampling by picking the tokens sampled this round
            # from new_track_samples into old_track_samples.
            new_track_samples = torch.where(
                where_to_sample, new_track_samples, old_track_samples
            )

            # Update the corresponding row with new data.
            getattr(batched_tokens, config.track)[i, ...] = new_track_samples[0]

    # Un-pack to a list of single ProteinTypes.
    output_tokens = [
        batched_tokens.slice(i, sequence_len=sequence_lengths[i])
        if i not in errors
        else errors[i]
        for i in range(len(input_tokens))
    ]

    # Do not update tracks that were not sampled (e.g. keep None instead of masks)
    for inputs, outputs, config in zip(input_tokens, output_tokens, configs):
        if isinstance(outputs, ESMProteinError):
            continue

        # First restore coordinates field.
        # We know coordinates can never be iteratively sampled.
        setattr(outputs, "coordinates", getattr(inputs, "coordinates"))

        # Maybe restore all the other fields.
        for f in attr.fields(SamplingConfig):
            if "embedding" in f.name or f.name == "return_hidden_states":
                continue
            if f.name != config.track:
                setattr(outputs, f.name, getattr(inputs, f.name))

    return output_tokens


def _batch_forward(client: ESM3InferenceClient, protein: _BatchedESMProteinTensor):
    """Run one forward pass requesting logits for all tracks plus embeddings."""
    return client.logits(
        protein,
        LogitsConfig(
            sequence=True,
            structure=True,
            secondary_structure=True,
            sasa=True,
            function=True,
            residue_annotations=True,
            return_embeddings=True,
        ),
    )


def _sample_per_prompt(
    protein: _BatchedESMProteinTensor,
    logits_output: LogitsOutput,
    sampling_config: SamplingConfig,
    tokenizers: TokenizerCollectionProtocol,
    decode_sasa_tokens: bool = True,
    mask_logits_of_invalid_ids: bool = True,
) -> ForwardAndSampleOutput:
    """Sample every configured track for a single prompt from forward logits.

    Tracks without a sampling config are passed through (cloned) unchanged.
    """
    assert logits_output.logits is not None

    def maybe_clone(x: torch.Tensor | None) -> torch.Tensor | None:
        return x.clone() if x is not None else None

    # Sampling
    tokens_dir = {}
    track_sampling_metadata_dir: dict[str, dict | None] = {}
    integer_sampling_tracks = ["sequence", "structure", "secondary_structure"]
    if not decode_sasa_tokens:
        integer_sampling_tracks.append("sasa")

    for track in integer_sampling_tracks:
        config = getattr(sampling_config, track)
        if config is None:
            tokens_dir[track] = maybe_clone(getattr(protein, track))
            continue
        tokenizer = getattr(tokenizers, track)
        valid_ids = (
            set(tokenizer.all_token_ids)
            - set(tokenizer.special_token_ids)
            - set(config.invalid_ids)
        )
        sampling_metadata = _sample_track(
            logits=getattr(logits_output.logits, track),
            tokens=getattr(protein, track),
            sampling_track_config=config,
            mask_idx=getattr(tokenizers, track).mask_token_id,
            valid_ids=list(valid_ids),
            mask_logits_of_invalid_ids=mask_logits_of_invalid_ids,
        )
        tokens_dir[track] = sampling_metadata.pop("sampled_tokens")  # (L,)
        track_sampling_metadata_dir[track] = sampling_metadata

    # Sample SASA separately (if needed)
    if decode_sasa_tokens:
        config = getattr(sampling_config, "sasa")
        track_sampling_metadata_dir["sasa"] = None

        if config is None:
            tokens_dir["sasa"] = maybe_clone(getattr(protein, "sasa"))
        else:
            if config.topk_logprobs > 0:
                warn("For SASA sampling, 'topk_logprobs' is expected to be 0.")

            assert logits_output.logits.sasa is not None
            assert protein.sasa is not None

            valid_ids = (
                set(tokenizers.sasa.all_token_ids)
                - set(tokenizers.sasa.special_token_ids)
                - set(config.invalid_ids)
            )
            sasa_logits = logits_output.logits.sasa
            sasa_value = sample_sasa_logits(
                sasa_logits,
                protein.sasa,
                sampling_track_config=config,
                mask_idx=tokenizers.sasa.mask_token_id,
                valid_ids=list(valid_ids),
                mask_logits_of_invalid_ids=mask_logits_of_invalid_ids,
            )
            tokens_dir["sasa"] = sasa_value

            probs = sasa_logits.softmax(dim=-1)
            # Note(tjia): sasa_logits can have -inf because of invalid ids, so
            # probs * sasa_logits.log_softmax(-1) is nan. We need to set
            # those positions to 0 to get the correct entropy value
            entropy = -(torch.nan_to_num(probs * sasa_logits.log_softmax(-1))).sum(-1)

            track_sampling_metadata_dir["sasa"] = {"entropy": entropy}

    # Sample function and residue annotations separately
    config = getattr(sampling_config, "function")
    function_logits = getattr(logits_output.logits, "function")

    if config is None or function_logits is None:
        tokens_dir["function"] = maybe_clone(getattr(protein, "function"))
        tokens_dir["residue_annotations"] = maybe_clone(
            getattr(protein, "residue_annotations")
        )
    else:
        if config.invalid_ids is not None and len(config.invalid_ids) > 0:
            warn("For function sampling, invalid_ids sampling config is not supported.")

        sampling_metadata = _sample_function_track(
            tokenizers.function,
            tokens=getattr(protein, "function"),
            logits=function_logits,
            sampling_track_config=config,
        )
        tokens_dir["function"] = sampling_metadata.pop("sampled_tokens")  # (L, D)
        track_sampling_metadata_dir["function"] = sampling_metadata

        sampled_tokens, _ = sample_residue_annotation_logits(
            logits=logits_output.residue_annotation_logits  # type: ignore
        )
        tokens_dir["residue_annotations"] = sampled_tokens  # (L, MAX_R)

    # Format output
    forward_and_sample_output_dir = {}
    forward_and_sample_output_dir["protein_tensor"] = ESMProteinTensor(**tokens_dir)
    for property in [
        "entropy",
        "prob",
        "logprob",
        "top_prob",
        "topk_logprob",
        "topk_tokens",
    ]:
        is_all_none = True
        forward_track_data_dir = {}
        for track in track_sampling_metadata_dir.keys():
            values = track_sampling_metadata_dir[track]
            if values is not None and values.get(property, None) is not None:
                forward_track_data_dir[track] = values.get(property, None)
                is_all_none = False
        if not is_all_none:
            forward_and_sample_output_dir[property] = ForwardTrackData(
                **forward_track_data_dir
            )
        else:
            forward_and_sample_output_dir[property] = None

    per_res_embed = (
        logits_output.embeddings  # type: ignore
        if sampling_config.return_per_residue_embeddings
        else None
    )
    mean_embedding = (
        # [B, L, D] -> [B, D]
        logits_output.embeddings.mean(dim=1)  # type: ignore
        if sampling_config.return_mean_embedding
        else None
    )

    return ForwardAndSampleOutput(
        per_residue_embedding=per_res_embed,
        mean_embedding=mean_embedding,
        **forward_and_sample_output_dir,
    )


def _sample_track(
    logits: torch.Tensor,
    tokens: torch.Tensor,
    sampling_track_config: SamplingTrackConfig,
    mask_idx: int,
    valid_ids: list[int],
    mask_logits_of_invalid_ids: bool = True,
) -> dict[str, torch.Tensor]:
    """Sample one integer track from logits. Works with batched inputs."""
    # Sample in all positions
    temperature = sampling_track_config.temperature
    # We have to trim the logits and sampled tokens at potentially padded slots
    # since the logits may be computed with a longer padded batch, while tokens
    # are the original input sequence.
    sampled_tokens = sample_logits(
        logits,
        temperature=temperature,
        valid_ids=valid_ids,
        top_p=sampling_track_config.top_p,
        mask_logits_of_invalid_ids=mask_logits_of_invalid_ids,
    )
    log_probs = logits.log_softmax(-1)
    sampling_mask = get_sampling_mask(tokens, sampling_track_config, mask_idx)
    sampled_tokens = torch.where(sampling_mask, sampled_tokens, tokens)

    return _compute_track_metadata(
        sampled_tokens,
        log_probs,
        sampling_mask,
        top_k=sampling_track_config.topk_logprobs,
    )


def _sample_function_track(
    function_tokenizer: InterProQuantizedTokenizer,
    tokens: torch.Tensor,
    logits: torch.Tensor,
    sampling_track_config: SamplingTrackConfig,
) -> dict[str, torch.Tensor]:
    """Sample the function track (multi-depth tokens). Works with batched inputs."""
    # Do not sample at BOS and EOS tokens
    sampling_mask = torch.ones_like(tokens, dtype=torch.bool)[..., 0]  # (B, L)
    sampling_mask[..., 0] = False
    sampling_mask[..., -1] = False

    sampled_tokens, logprobs = sample_function_logits(
        logits,
        function_tokenizer,
        top_p=sampling_track_config.top_p,
        temperature=sampling_track_config.temperature,
    )

    if sampling_track_config.only_sample_masked_tokens:
        is_mask = torch.all(
            tokens == function_tokenizer.mask_token_id, dim=-1
        )  # (B, L)
        sampling_mask = sampling_mask & is_mask

    sampled_tokens = torch.where(
        sampling_mask[..., None].expand_as(sampled_tokens), sampled_tokens, tokens
    )  # (B, L, D)

    # Set logprobs for non-sampled tokens to 0
    logprobs_null = torch.full_like(logprobs, -torch.inf)  # (B, L, D, V)
    logprobs_null = torch.scatter(
        logprobs_null, -1, tokens[..., None], torch.zeros_like(logprobs_null)[..., [0]]
    )
    logprobs = torch.where(
        sampling_mask[..., None, None].expand_as(logprobs), logprobs, logprobs_null
    )  # (B, L, D, V)

    function_metadata = _compute_track_metadata(
        sampled_tokens,
        logprobs,
        sampling_mask,
        top_k=sampling_track_config.topk_logprobs,
    )
    # Consider the entropy of the joint distribution of all function tokens
    # at each position.
    function_metadata["entropy"] = function_metadata["entropy"].sum(
        -1
    )  # (B, L, D) -> (B, L)
    return function_metadata


def _compute_track_metadata(
    sampled_tokens: torch.Tensor,
    log_probs: torch.Tensor,
    sampling_mask: torch.Tensor,
    top_k: int,
) -> dict:
    """Compute per-position sampling metadata. Works with batched inputs."""
    probs = torch.exp(log_probs)  # (B, L)
    entropy = torch.distributions.Categorical(logits=log_probs).entropy()  # (B, L)

    # Only compute probabilities for sampled tokens
    sampled_logprob = torch.zeros_like(sampled_tokens, dtype=log_probs.dtype)  # (B, L)

    if sampled_tokens.dim() > sampling_mask.dim():
        assert sampled_tokens.dim() == 3  # (B, L, D)
        assert sampling_mask.dim() == 2  # (B, L)
        sampling_mask = sampling_mask[..., None].expand_as(sampled_tokens)

    sampled_tokens_valid = sampled_tokens[sampling_mask]
    sampled_log_probs_valid = log_probs[sampling_mask, sampled_tokens_valid]
    sampled_logprob[sampling_mask] = sampled_log_probs_valid

    # Calculate extra metadata
    sampled_prob = torch.exp(sampled_logprob)
    top_prob = torch.max(probs, dim=-1).values
    topk_logprobs, topk_tokens = torch.topk(log_probs, top_k, dim=-1)
    topk_logprobs = None if top_k == 0 else topk_logprobs
    topk_tokens = None if top_k == 0 else topk_tokens

    return {
        "entropy": entropy,
        "sampled_tokens": sampled_tokens,
        "prob": sampled_prob,
        "logprob": sampled_logprob,
        "top_prob": top_prob,
        "topk_logprob": topk_logprobs,
        "topk_tokens": topk_tokens,
    }