ESM3_small / modeling_esm3.py

Upload modeling_esm3.py with huggingface_hub

3797259 verified 20 days ago

62.8 kB

	from __future__ import annotations

	"""
	Hugging Face compatible ESM3 wrapper.

	This module keeps Biohub's ESM3 implementation as the execution core and adds
	the FastPLMs conventions around it: AutoModel loading, sequence-only
	`input_ids` forwarding, and direct multimodal track arguments.
	"""

	import sys
	import math
	import functools
	import importlib
	import os
	from dataclasses import dataclass
	from enum import Enum
	from pathlib import Path
	from typing import Dict, List, Optional, Union

	import einops
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from einops import rearrange
	from huggingface_hub import snapshot_download
	from tokenizers import Tokenizer
	from tokenizers.models import BPE
	from tokenizers.processors import TemplateProcessing
	from transformers import PreTrainedModel, PreTrainedTokenizerFast, PretrainedConfig
	from transformers.modeling_outputs import ModelOutput

	try:
	from torch.nn.attention.flex_attention import (
	BlockMask,
	create_block_mask,
	flex_attention,
	)
	except ImportError:
	BlockMask = None
	create_block_mask = None
	flex_attention = None

	try:
	from fastplms.test_time_training import FastPLMTestTimeTrainingMixin
	except ImportError:
	pass # Running as HF Hub composite; shared definitions are above


	# Canonical key of the open small ESM3 checkpoint. It is the key used in
	# _ESM3_CHECKPOINT_SPECS and the default value of FastESM3Config.model_name.
	ESM3_OPEN_SMALL = "esm3_sm_open_v1"
	# Alternative spellings of the same checkpoint name.
	# NOTE(review): presumably consumed by the checkpoint-key resolver — confirm.
	ESM3_OPEN_SMALL_ALIASES = {
	"ESM3_small",
	"esm3_small",
	"esm3_sm_open_v1",
	"esm3-open-2024-03",
	"esm3-sm-open-v1",
	"esm3-open",
	}

	# Sequence-track special token ids. Each value is the index of the
	# corresponding entry in SEQUENCE_VOCAB below (0 -> "<cls>", 1 -> "<pad>",
	# 2 -> "<eos>", 31 -> chain break, 32 -> "<mask>").
	SEQUENCE_BOS_TOKEN = 0
	SEQUENCE_PAD_TOKEN = 1
	SEQUENCE_EOS_TOKEN = 2
	SEQUENCE_CHAINBREAK_TOKEN = 31
	SEQUENCE_MASK_TOKEN = 32

	# Structure-track special ids are numbered directly after the codebook
	# entries, i.e. they start at VQVAE_CODEBOOK_SIZE.
	VQVAE_CODEBOOK_SIZE = 4096
	STRUCTURE_MASK_TOKEN = VQVAE_CODEBOOK_SIZE
	STRUCTURE_EOS_TOKEN = VQVAE_CODEBOOK_SIZE + 1
	STRUCTURE_BOS_TOKEN = VQVAE_CODEBOOK_SIZE + 2
	STRUCTURE_PAD_TOKEN = VQVAE_CODEBOOK_SIZE + 3
	STRUCTURE_CHAINBREAK_TOKEN = VQVAE_CODEBOOK_SIZE + 4

	# Padding ids of the remaining tracks and the fixed trailing widths of the
	# residue-annotation and function-token tensors.
	SASA_PAD_TOKEN = 0
	SS8_PAD_TOKEN = 0
	INTERPRO_PAD_TOKEN = 0
	RESIDUE_PAD_TOKEN = 0
	MAX_RESIDUE_ANNOTATIONS = 16
	FUNCTION_TOKENS_DEPTH = 8

	# Sequence vocabulary; a token's id is its position in this list.
	SEQUENCE_VOCAB = [
	"<cls>",
	"<pad>",
	"<eos>",
	"<unk>",
	"L",
	"A",
	"G",
	"V",
	"S",
	"E",
	"R",
	"T",
	"I",
	"D",
	"P",
	"K",
	"Q",
	"N",
	"F",
	"Y",
	"M",
	"H",
	"W",
	"C",
	"X",
	"B",
	"U",
	"Z",
	"O",
	".",
	"-",
	# NOTE(review): "\|" is an invalid escape sequence and yields the two
	# characters backslash + pipe. It looks like a Markdown-escaped "|" —
	# confirm against upstream, and change it together with the tokenizer's
	# chain_break_token default so both stay in sync.
	"\|",
	"<mask>",
	]

	# Backend names accepted by resolve_attention_backend.
	_SUPPORTED_ATTENTION_BACKENDS = ("auto", "flex", "sdpa")
	# Cache slot for the torch.compile'd flex_attention callable; filled on
	# first use by _get_flex_attention_fn.
	_compiled_flex_attention = None


	class AttentionBackend(Enum):
	    """Attention implementations that an ESM3 attention layer can be asked to use.

	    The member values are the user-facing strings; ``AUTO`` is a request to
	    pick a concrete backend rather than a backend in its own right.
	    """

	    AUTO = "auto"
	    FLEX = "flex"
	    SDPA = "sdpa"


	def _get_flex_attention_fn():
	    """Return the flex-attention callable to use, or ``None`` if unavailable.

	    Returns:
	        ``None`` when ``flex_attention`` could not be imported; the raw
	        ``flex_attention`` when the flex module sets
	        ``_FLEX_ATTENTION_DISABLE_COMPILE_DEBUG``; otherwise a
	        ``torch.compile``'d wrapper that is built once and cached in the
	        module-level ``_compiled_flex_attention``.
	    """
	    global _compiled_flex_attention
	    if flex_attention is None:
	        return None

	    skip_compile = getattr(
	        torch.nn.attention.flex_attention,
	        "_FLEX_ATTENTION_DISABLE_COMPILE_DEBUG",
	        False,
	    )
	    if skip_compile:
	        return flex_attention

	    if _compiled_flex_attention is not None:
	        return _compiled_flex_attention

	    # First request: compile once and remember the result for later calls.
	    _compiled_flex_attention = torch.compile(flex_attention, dynamic=False)
	    return _compiled_flex_attention


	def resolve_attention_backend(requested_backend: str) -> AttentionBackend:
	    """Map a user-facing backend name to a concrete ``AttentionBackend``.

	    Args:
	        requested_backend: One of ``_SUPPORTED_ATTENTION_BACKENDS``
	            ("auto", "flex" or "sdpa").

	    Returns:
	        ``AttentionBackend.FLEX`` or ``AttentionBackend.SDPA``. "auto"
	        resolves to FLEX when ``flex_attention`` was importable and to SDPA
	        otherwise.

	    Raises:
	        AssertionError: If the name is not supported, or "flex" is requested
	            while flex attention is not available.
	    """
	    # Raise explicitly instead of using ``assert`` so that the validation
	    # still runs under ``python -O``; the exception type and messages are
	    # kept so existing callers see no difference.
	    if requested_backend not in _SUPPORTED_ATTENTION_BACKENDS:
	        raise AssertionError(
	            f"Unsupported ESM3 attention backend: {requested_backend}. "
	            f"Expected one of {_SUPPORTED_ATTENTION_BACKENDS}."
	        )

	    flex_available = flex_attention is not None
	    if requested_backend == AttentionBackend.AUTO.value:
	        return AttentionBackend.FLEX if flex_available else AttentionBackend.SDPA
	    if requested_backend == AttentionBackend.FLEX.value:
	        if not flex_available:
	            raise AssertionError("Flex Attention is not available in this environment.")
	        return AttentionBackend.FLEX
	    if requested_backend == AttentionBackend.SDPA.value:
	        return AttentionBackend.SDPA
	    # Only reachable if the supported-name tuple and the enum drift apart.
	    raise AssertionError(f"Unsupported ESM3 attention backend: {requested_backend}")

	# Per-checkpoint Hub repo id and architecture sizes, keyed by canonical
	# checkpoint name. The sizes listed for ESM3_OPEN_SMALL are the same values
	# FastESM3Config uses as its defaults.
	_ESM3_CHECKPOINT_SPECS = {
	ESM3_OPEN_SMALL: {
	"repo_id": "biohub/esm3-sm-open-v1",
	"hidden_size": 1536,
	"num_attention_heads": 24,
	"num_vector_heads": 256,
	"num_hidden_layers": 48,
	},
	}


	class FastESM3Config(PretrainedConfig):
	    """Configuration for the Hugging Face ESM3 wrapper.

	    Args:
	        vocab_size: Stored on the config as ``vocab_size``.
	        hidden_size: Model width; must be divisible by both
	            ``FUNCTION_TOKENS_DEPTH`` and ``num_attention_heads``.
	        num_attention_heads: Number of attention heads.
	        num_vector_heads: Number of heads for the geometric attention.
	        num_hidden_layers: Number of transformer blocks.
	        initializer_range: Stored on the config as ``initializer_range``.
	        attn_backend: Attention backend name ("auto", "flex" or "sdpa").
	        model_name: Checkpoint name; normalised through
	            ``_resolve_esm3_checkpoint_key`` before being stored.
	        **kwargs: Forwarded to ``PretrainedConfig``.

	    Raises:
	        AssertionError: If ``hidden_size`` violates a divisibility constraint.
	    """

	    model_type = "fast_esm3"

	    def __init__(
	        self,
	        vocab_size: int = 64,
	        hidden_size: int = 1536,
	        num_attention_heads: int = 24,
	        num_vector_heads: int = 256,
	        num_hidden_layers: int = 48,
	        initializer_range: float = 0.02,
	        attn_backend: str = "sdpa",
	        model_name: str = ESM3_OPEN_SMALL,
	        **kwargs,
	    ):
	        super().__init__(**kwargs)

	        # Width constraints required by the function-token embedding split
	        # and by the per-head split in attention.
	        assert hidden_size % FUNCTION_TOKENS_DEPTH == 0
	        assert hidden_size % num_attention_heads == 0

	        # Architecture sizes.
	        self.vocab_size = vocab_size
	        self.hidden_size = hidden_size
	        self.num_attention_heads = num_attention_heads
	        self.num_vector_heads = num_vector_heads
	        self.num_hidden_layers = num_hidden_layers

	        # Runtime / bookkeeping options.
	        self.initializer_range = initializer_range
	        self.attn_backend = attn_backend
	        self.model_name = _resolve_esm3_checkpoint_key(model_name)

	        # Set after super().__init__ so a value passed via kwargs cannot win.
	        self.tie_word_embeddings = False


	@dataclass
	class FastESM3Output(ModelOutput):
	    """Output container of the wrapper; every field is optional and defaults to ``None``.

	    ``loss``, ``logits`` and ``last_hidden_state`` follow the usual Hugging
	    Face naming, the ``*_logits`` fields carry one tensor per ESM3 track, and
	    ``embeddings`` / ``hidden_states`` / ``attentions`` carry intermediate
	    activations.
	    """

	    # Generic Hugging Face style fields.
	    loss: Optional[torch.Tensor] = None
	    logits: Optional[torch.Tensor] = None
	    last_hidden_state: Optional[torch.Tensor] = None

	    # Per-track prediction heads.
	    sequence_logits: Optional[torch.Tensor] = None
	    structure_logits: Optional[torch.Tensor] = None
	    secondary_structure_logits: Optional[torch.Tensor] = None
	    sasa_logits: Optional[torch.Tensor] = None
	    function_logits: Optional[torch.Tensor] = None
	    residue_logits: Optional[torch.Tensor] = None

	    # Intermediate activations.
	    embeddings: Optional[torch.Tensor] = None
	    hidden_states: Optional[tuple[torch.Tensor, ...]] = None
	    attentions: Optional[tuple[torch.Tensor, ...]] = None


	class EsmSequenceTokenizer(PreTrainedTokenizerFast):
	    """Fast tokenizer over ``SEQUENCE_VOCAB`` for the ESM3 sequence track.

	    The backend is a merge-free BPE model, so each vocabulary entry is its
	    own token. Encoded inputs are wrapped as ``<cls> ... <eos>`` by a
	    template post-processor.
	    """

	    model_input_names = ["input_ids", "attention_mask"]

	    def __init__(
	        self,
	        unk_token: str = "<unk>",
	        cls_token: str = "<cls>",
	        pad_token: str = "<pad>",
	        mask_token: str = "<mask>",
	        eos_token: str = "<eos>",
	        # NOTE(review): "\|" is backslash + pipe (invalid escape); it matches
	        # the SEQUENCE_VOCAB entry as written — confirm both against upstream.
	        chain_break_token: str = "\|",
	        **kwargs,
	    ):
	        # Token id == position in SEQUENCE_VOCAB.
	        vocab = {token: index for index, token in enumerate(SEQUENCE_VOCAB)}
	        backend = Tokenizer(BPE(vocab, merges=[], unk_token=unk_token))

	        # Kept on the instance so the chain_break_token property can read it.
	        self.cb_token = chain_break_token

	        backend.add_special_tokens(
	            [cls_token, pad_token, mask_token, eos_token, chain_break_token]
	        )

	        cls_id = backend.token_to_id("<cls>")
	        eos_id = backend.token_to_id("<eos>")
	        backend.post_processor = TemplateProcessing(
	            single="<cls> $A <eos>",
	            pair="<cls>:0 $A:0 <eos>:0 $B:1 <eos>:1",
	            special_tokens=[("<cls>", cls_id), ("<eos>", eos_id)],
	        )

	        super().__init__(
	            tokenizer_object=backend,
	            unk_token=unk_token,
	            cls_token=cls_token,
	            pad_token=pad_token,
	            mask_token=mask_token,
	            eos_token=eos_token,
	            additional_special_tokens=[chain_break_token],
	            **kwargs,
	        )

	    @property
	    def bos_token(self) -> str:
	        """The beginning-of-sequence token; an alias for ``cls_token``."""
	        return self.cls_token

	    @property
	    def bos_token_id(self) -> int:
	        """Id of the beginning-of-sequence token; an alias for ``cls_token_id``."""
	        return self.cls_token_id

	    @property
	    def chain_break_token(self) -> str:
	        """The chain-break token string given at construction."""
	        return self.cb_token

	    @property
	    def chain_break_token_id(self) -> int:
	        """Id of the chain-break token."""
	        resolved = self.convert_tokens_to_ids(self.chain_break_token)
	        assert isinstance(resolved, int)
	        return resolved

	    @property
	    def all_token_ids(self) -> list[int]:
	        """Every id from 0 up to (but excluding) ``vocab_size``."""
	        return list(range(self.vocab_size))

	    @property
	    def special_token_ids(self) -> list[int]:
	        """Ids of all special tokens; an alias for ``all_special_ids``."""
	        return self.all_special_ids


	@dataclass
	class FastESM3TokenizerCollection:
	    """Bundle of per-track tokenizers.

	    Only the sequence tokenizer is required; every other track defaults to
	    ``None``.
	    """

	    # Required sequence-track tokenizer.
	    sequence: EsmSequenceTokenizer
	    # Optional tokenizers for the remaining tracks.
	    structure: Optional[object] = None
	    secondary_structure: Optional[object] = None
	    sasa: Optional[object] = None
	    function: Optional[object] = None
	    residue_annotations: Optional[object] = None


	def rbf(values: torch.Tensor, v_min: float, v_max: float, n_bins: int = 16) -> torch.Tensor:
	    """Expand ``values`` into radial-basis features.

	    Args:
	        values: Tensor of any shape.
	        v_min: Position of the first basis centre.
	        v_max: Position of the last basis centre.
	        n_bins: Number of evenly spaced centres between ``v_min`` and ``v_max``.

	    Returns:
	        Tensor of shape ``values.shape + (n_bins,)`` holding
	        ``exp(-((value - centre) / width) ** 2)`` with
	        ``width = (v_max - v_min) / n_bins``.
	    """
	    grid = torch.linspace(
	        v_min, v_max, n_bins, device=values.device, dtype=values.dtype
	    )
	    # Prepend one singleton axis per input dimension so the centres
	    # broadcast against ``values[..., None]``.
	    grid = grid.view([1 for _ in values.shape] + [-1])
	    width = (v_max - v_min) / n_bins
	    scaled_offset = (values.unsqueeze(-1) - grid) / width
	    return torch.exp(-(scaled_offset**2))


	def RegressionHead(
	    d_model: int,
	    output_dim: int,
	    hidden_dim: Optional[int] = None,
	) -> nn.Module:
	    """Build a two-layer prediction head.

	    Args:
	        d_model: Input feature size.
	        output_dim: Output feature size.
	        hidden_dim: Width of the hidden layer; falls back to ``d_model``
	            when ``None``.

	    Returns:
	        ``nn.Sequential`` of Linear -> GELU -> LayerNorm -> Linear.
	    """
	    if hidden_dim is None:
	        hidden_dim = d_model
	    layers = [
	        nn.Linear(d_model, hidden_dim),
	        nn.GELU(),
	        nn.LayerNorm(hidden_dim),
	        nn.Linear(hidden_dim, output_dim),
	    ]
	    return nn.Sequential(*layers)


	def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
	    """Rotate pairs of features by 90 degrees for rotary embeddings.

	    Args:
	        x: Tensor whose last dimension has even size.
	        interleaved: If ``False`` the pairs are (first half, second half) of
	            the last dimension; if ``True`` they are (even index, odd index).

	    Returns:
	        Tensor shaped like ``x`` where each pair ``(a, b)`` becomes ``(-b, a)``.
	    """
	    if interleaved:
	        even, odd = x[..., ::2], x[..., 1::2]
	        # Stack as (..., d, 2) and merge the last two axes to re-interleave.
	        return torch.stack((-odd, even), dim=-1).flatten(-2)
	    front, back = x.chunk(2, dim=-1)
	    return torch.cat((-back, front), dim=-1)


	def apply_rotary_emb_torch(
	    x: torch.Tensor,
	    cos: torch.Tensor,
	    sin: torch.Tensor,
	    interleaved: bool = False,
	) -> torch.Tensor:
	    """Apply rotary position embeddings to the leading features of ``x``.

	    Args:
	        x: Input whose dimension 1 is the position axis and whose last
	            dimension holds the features (assumes a (batch, seq, heads, dim)
	            layout — TODO confirm with callers).
	        cos: Cosine table of shape (positions, rotary_dim / 2).
	        sin: Sine table of the same shape as ``cos``.
	        interleaved: Pairing convention forwarded to ``rotate_half``.

	    Returns:
	        Tensor shaped like ``x``; the first ``2 * cos.shape[-1]`` features are
	        rotated and the remaining features are passed through unchanged.
	    """
	    rotary_width = 2 * cos.shape[-1]
	    assert rotary_width <= x.shape[-1]

	    n_positions = x.size(1)
	    # Duplicate the half-width tables to full rotary width and add a
	    # singleton axis so they broadcast over dimension -2 of ``x``.
	    cos_full = einops.repeat(cos[:n_positions], "s d -> s 1 (2 d)")
	    sin_full = einops.repeat(sin[:n_positions], "s d -> s 1 (2 d)")

	    to_rotate = x[..., :rotary_width]
	    untouched = x[..., rotary_width:]
	    rotated = to_rotate * cos_full + rotate_half(to_rotate, interleaved) * sin_full
	    return torch.cat([rotated, untouched], dim=-1)


	class RotaryEmbedding(nn.Module):
	    """Rotary position embedding with a lazily built cos/sin cache.

	    Args:
	        dim: Number of features the rotation covers.
	        base: Base of the geometric frequency progression.
	        interleaved: Pairing convention passed to ``apply_rotary_emb_torch``.
	        scale_base: When not ``None`` a ``scale`` buffer is created; that
	            mode is rejected at cache-build time.
	        scaling_factor: Divisor applied to the position indices.
	        pos_idx_in_fp32: Build position indices and frequencies in float32.
	        device: Device used for the buffers created at construction.
	    """

	    def __init__(
	        self,
	        dim: int,
	        base: float = 10000.0,
	        interleaved: bool = False,
	        scale_base: Optional[float] = None,
	        scaling_factor: float = 1.0,
	        pos_idx_in_fp32: bool = True,
	        device: Optional[torch.device] = None,
	    ):
	        super().__init__()
	        self.dim = dim
	        self.base = float(base)
	        self.pos_idx_in_fp32 = pos_idx_in_fp32
	        self.interleaved = interleaved
	        self.scale_base = scale_base
	        self.scaling_factor = scaling_factor
	        self.device = device
	        # Cache state; filled on the first forward call.
	        self._seq_len_cached = 0
	        self._cos_cached = None
	        self._sin_cached = None
	        self.reset_parameters()

	    def reset_parameters(self) -> None:
	        """(Re)create the ``inv_freq`` and ``scale`` buffers."""
	        self.register_buffer(
	            "inv_freq", self._compute_inv_freq(self.device), persistent=False
	        )
	        even_idx = torch.arange(
	            0, self.dim, 2, device=self.device, dtype=torch.float32
	        )
	        if self.scale_base is not None:
	            scale = (even_idx + 0.4 * self.dim) / (1.4 * self.dim)
	        else:
	            scale = None
	        self.register_buffer("scale", scale)

	    def _compute_inv_freq(self, device: Optional[torch.device] = None) -> torch.Tensor:
	        """Return ``1 / base ** (2i / dim)`` for ``i`` in ``range(dim // 2)``."""
	        exponents = (
	            torch.arange(0, self.dim, 2, device=device, dtype=torch.float32)
	            / self.dim
	        )
	        return 1 / (self.base**exponents)

	    def _update_cos_sin_cache(
	        self,
	        seqlen: int,
	        device: Optional[torch.device] = None,
	        dtype: Optional[torch.dtype] = None,
	    ) -> None:
	        """Rebuild the cos/sin tables when the cached ones cannot be reused.

	        The cache is rebuilt when it is missing, too short, on another
	        device, of another dtype, or was created in inference mode while the
	        module is training.

	        Raises:
	            NotImplementedError: If a ``scale`` buffer is present.
	        """
	        cache_is_stale = (
	            seqlen > self._seq_len_cached
	            or self._cos_cached is None
	            or self._cos_cached.device != device
	            or self._cos_cached.dtype != dtype
	            or (self.training and self._cos_cached.is_inference())
	        )
	        if not cache_is_stale:
	            return

	        self._seq_len_cached = seqlen
	        if self.pos_idx_in_fp32:
	            position_dtype = torch.float32
	            inv_freq = self.inv_freq.to(torch.float32)
	        else:
	            position_dtype = self.inv_freq.dtype
	            inv_freq = self.inv_freq
	        positions = torch.arange(seqlen, device=device, dtype=position_dtype)
	        positions /= self.scaling_factor
	        angles = torch.outer(positions, inv_freq)

	        if self.scale is not None:
	            raise NotImplementedError("Scaled rotary embeddings are not used by ESM3.")
	        self._cos_cached = torch.cos(angles).to(dtype)
	        self._sin_cached = torch.sin(angles).to(dtype)

	    def forward(
	        self,
	        q: torch.Tensor,
	        k: torch.Tensor,
	        seqlen_offset: int = 0,
	    ) -> tuple[torch.Tensor, torch.Tensor]:
	        """Rotate ``q`` and ``k`` with the same position tables.

	        Args:
	            q: Queries; dimension 1 is used as the sequence length.
	            k: Keys, rotated with the same tables as ``q``.
	            seqlen_offset: Position of the first token of ``q``/``k``.

	        Returns:
	            The rotated ``(q, k)`` pair.
	        """
	        self._update_cos_sin_cache(
	            q.shape[1] + seqlen_offset, device=q.device, dtype=q.dtype
	        )
	        assert self._cos_cached is not None
	        assert self._sin_cached is not None
	        cos = self._cos_cached[seqlen_offset:]
	        sin = self._sin_cached[seqlen_offset:]
	        rotated_q = apply_rotary_emb_torch(q, cos, sin, self.interleaved)
	        rotated_k = apply_rotary_emb_torch(k, cos, sin, self.interleaved)
	        return rotated_q, rotated_k


	def fp32_autocast_context(device_type: str):
	    """Return a context manager that disables autocast for ``device_type``.

	    Args:
	        device_type: Device type string such as ``"cuda"`` or ``"cpu"``.

	    Returns:
	        A ``torch.autocast`` instance constructed with ``enabled=False``.
	    """
	    # A single call covers every device type, "cuda" included.
	    return torch.autocast(device_type=device_type, enabled=False)


	class RotationMatrix:
	    """Batch of 3x3 rotation matrices stored as a float32 tensor.

	    Args:
	        rots: Tensor of shape (..., 3, 3), or (..., 9) which is unflattened
	            to (..., 3, 3). The data is converted to float32.
	    """

	    def __init__(self, rots: torch.Tensor):
	        if rots.shape[-1] == 9:
	            rots = rots.unflatten(-1, (3, 3))
	        assert rots.shape[-1] == 3
	        assert rots.shape[-2] == 3
	        self._rots = rots.to(torch.float32)

	    @classmethod
	    def identity(cls, shape: tuple[int, ...], **tensor_kwargs) -> "RotationMatrix":
	        """Return identity rotations with batch shape ``shape``.

	        ``tensor_kwargs`` are forwarded to ``torch.eye``.
	        """
	        rots = torch.eye(3, **tensor_kwargs)
	        rots = rots.view(*[1 for _ in range(len(shape))], 3, 3)
	        rots = rots.expand(*shape, -1, -1)
	        return cls(rots)

	    def __getitem__(self, idx) -> "RotationMatrix":
	        """Index the batch dimensions; the trailing 3x3 axes are kept whole.

	        Accepts an int, ``None``, a slice, ``Ellipsis`` or an iterable of
	        such indices.
	        """
	        # A bare slice or Ellipsis is not iterable, so wrap it like an int
	        # instead of passing it to tuple(), which would raise TypeError.
	        if idx is None or idx is Ellipsis or isinstance(idx, (int, slice)):
	            indices = (idx,)
	        else:
	            indices = tuple(idx)
	        return RotationMatrix(self._rots[indices + (slice(None), slice(None))])

	    @property
	    def shape(self) -> torch.Size:
	        """Batch shape, i.e. the stored shape without the trailing (3, 3)."""
	        return self._rots.shape[:-2]

	    @property
	    def tensor(self) -> torch.Tensor:
	        """The rotations flattened to (..., 9)."""
	        return self._rots.flatten(-2)

	    @property
	    def device(self) -> torch.device:
	        """Device of the underlying tensor."""
	        return self._rots.device

	    def as_matrix(self) -> "RotationMatrix":
	        """Return ``self``; the data is already in matrix form."""
	        return self

	    def apply(self, p: torch.Tensor) -> torch.Tensor:
	        """Rotate the points ``p`` (last dimension 3) with autocast disabled."""
	        with fp32_autocast_context(self.device.type):
	            p = p.to(self._rots.dtype)
	            # Guard the rank first: an unbatched (3, 3) rotation has no
	            # dimension -3, and indexing shape[-3] would raise IndexError.
	            if self._rots.dim() >= 3 and self._rots.shape[-3] == 1:
	                # One shared rotation along this axis: use a plain matmul.
	                return p @ self._rots.transpose(-1, -2).squeeze(-3)
	            return torch.einsum("...ij,...j", self._rots, p)

	    def invert(self) -> "RotationMatrix":
	        """Return the inverse rotations, computed as the transpose."""
	        return RotationMatrix(self._rots.transpose(-1, -2))

	    @staticmethod
	    def from_graham_schmidt(
	        x_axis: torch.Tensor,
	        xy_plane: torch.Tensor,
	        eps: float = 1e-12,
	    ) -> "RotationMatrix":
	        """Build rotations by orthonormalising two vectors.

	        Args:
	            x_axis: Vector that defines the first column after normalisation.
	            xy_plane: Second vector; its component orthogonal to ``x_axis``
	                defines the second column.
	            eps: Added under the square roots to avoid division by zero.

	        Returns:
	            Rotations whose columns are the normalised x axis, the
	            orthonormalised second axis, and their cross product.
	        """
	        with fp32_autocast_context(x_axis.device.type):
	            e1 = xy_plane
	            denom = torch.sqrt((x_axis**2).sum(dim=-1, keepdim=True) + eps)
	            x_axis = x_axis / denom
	            # Remove the component of e1 along x_axis, then normalise.
	            dot = (x_axis * e1).sum(dim=-1, keepdim=True)
	            e1 = e1 - x_axis * dot
	            denom = torch.sqrt((e1**2).sum(dim=-1, keepdim=True) + eps)
	            e1 = e1 / denom
	            e2 = torch.cross(x_axis, e1, dim=-1)
	            return RotationMatrix(torch.stack([x_axis, e1, e2], dim=-1))


	@dataclass(frozen=True)
	class Affine3D:
	trans: torch.Tensor
	rot: RotationMatrix

	def __post_init__(self) -> None:
	assert self.trans.shape[:-1] == self.rot.shape

	def __getitem__(self, idx) -> "Affine3D":
	indices = (idx,) if isinstance(idx, int) or idx is None else tuple(idx)
	return Affine3D(
	trans=self.trans[indices + (slice(None),)],
	rot=self.rot[idx],
	)

	@property
	def shape(self) -> torch.Size:
	return self.trans.shape[:-1]

	@property
	def dtype(self) -> torch.dtype:
	return self.trans.dtype

	@property
	def device(self) -> torch.device:
	return self.trans.device

	@property
	def tensor(self) -> torch.Tensor:
	return torch.cat([self.rot.tensor, self.trans], dim=-1)

	def as_matrix(self) -> "Affine3D":
	return Affine3D(trans=self.trans, rot=self.rot.as_matrix())

	def apply(self, p: torch.Tensor) -> torch.Tensor:
	return self.rot.apply(p) + self.trans

	@staticmethod
	def from_tensor(t: torch.Tensor) -> "Affine3D":
	match t.shape[-1]:
	case 12:
	trans = t[..., -3:]
	rot = RotationMatrix(t[..., :-3].unflatten(-1, (3, 3)))
	case _:
	raise RuntimeError(
	f"Cannot detect rotation format from {t.shape[-1] - 3}-d flat vector"
	)
	return Affine3D(trans, rot)

	@staticmethod
	def from_graham_schmidt(
	neg_x_axis: torch.Tensor,
	origin: torch.Tensor,
	xy_plane: torch.Tensor,
	eps: float = 1e-10,
	) -> "Affine3D":
	x_axis = origin - neg_x_axis
	xy_plane = xy_plane - origin
	return Affine3D(
	trans=origin,
	rot=RotationMatrix.from_graham_schmidt(x_axis, xy_plane, eps),
	)


	def build_affine3d_from_coordinates(coords: torch.Tensor) -> tuple[Affine3D, torch.Tensor]:
	    """Build per-residue backbone frames from atom coordinates.

	    Args:
	        coords: Tensor unpacked as (batch, seq, atoms, xyz); the three atoms
	            are read in the order N, CA, C.

	    Returns:
	        ``(affine, coord_mask)``. ``coord_mask`` is a (batch, seq) boolean
	        tensor that is True where every coordinate of the residue is finite
	        and below 1e6. ``affine`` holds the frame built from each valid
	        residue; invalid residues receive a fallback frame derived from the
	        average valid backbone of their batch element (identity rotation if
	        that element has no valid residue).
	    """
	    max_supported_distance = 1e6
	    usable = torch.isfinite(coords) & (coords < max_supported_distance)
	    coord_mask = usable.all(dim=-1).all(dim=-1)

	    def atom3_to_backbone_affine(bb_positions: torch.Tensor) -> Affine3D:
	        n_atom, ca_atom, c_atom = bb_positions.unbind(dim=-2)
	        return Affine3D.from_graham_schmidt(c_atom, ca_atom, n_atom)

	    # Work on a float copy and zero out residues that failed the validity test.
	    coords = coords.clone().float()
	    coords[~coord_mask] = 0

	    # Mean N/CA/C positions over the valid residues of each batch element.
	    valid_count = coord_mask.sum(-1)[..., None, None]
	    summed = coords.masked_fill(~coord_mask[..., None, None], 0).sum(1)
	    mean_backbone = summed / (valid_count + 1e-8)
	    mean_frame = atom3_to_backbone_affine(mean_backbone.float()).as_matrix()

	    batch_size, seq_len, _, _ = coords.shape
	    fallback_rot = mean_frame.rot.tensor[..., None, :].expand(batch_size, seq_len, 9)
	    fallback_trans = mean_frame.trans[..., None, :].expand(batch_size, seq_len, 3)
	    identity_rot = RotationMatrix.identity(
	        (batch_size, seq_len),
	        dtype=torch.float32,
	        device=coords.device,
	        requires_grad=False,
	    )
	    # Batch elements without any valid residue fall back to the identity rotation.
	    fallback_rot = fallback_rot.where(
	        coord_mask.any(-1)[..., None, None],
	        identity_rot.tensor,
	    )
	    black_hole_affine = Affine3D(fallback_trans, RotationMatrix(fallback_rot))

	    # Per-residue frames, with invalid residues swapped for the fallback frame.
	    per_residue = atom3_to_backbone_affine(coords.float())
	    merged = per_residue.tensor.where(coord_mask[..., None], black_hole_affine.tensor)
	    return Affine3D.from_tensor(merged), coord_mask


	class MultiHeadAttention(nn.Module):
	    """Multi-head self-attention with rotary embeddings and optional QK layer norm.

	    Args:
	        d_model: Model width.
	        n_heads: Number of heads; the head size is ``d_model // n_heads``.
	        bias: Whether the linear layers and QK layer norms carry a bias.
	        qk_layernorm: Apply LayerNorm to queries and keys before rotation.
	        attn_backend: Backend name resolved by ``resolve_attention_backend``.
	    """

	    def __init__(
	        self,
	        d_model: int,
	        n_heads: int,
	        bias: bool = False,
	        qk_layernorm: bool = True,
	        attn_backend: str = "sdpa",
	    ):
	        super().__init__()
	        self.d_model = d_model
	        self.n_heads = n_heads
	        self.d_head = self.d_model // self.n_heads
	        self.scale = self.d_head**-0.5
	        self.attn_backend = resolve_attention_backend(attn_backend)

	        # Pre-norm fused with the joint Q/K/V projection.
	        self.layernorm_qkv = nn.Sequential(
	            nn.LayerNorm(d_model),
	            nn.Linear(d_model, d_model * 3, bias=bias),
	        )
	        self.out_proj = nn.Linear(d_model, d_model, bias=bias)

	        if qk_layernorm:
	            self.q_ln = nn.LayerNorm(d_model, bias=bias)
	            self.k_ln = nn.LayerNorm(d_model, bias=bias)
	        else:
	            self.q_ln = nn.Identity()
	            self.k_ln = nn.Identity()

	        self.rotary = RotaryEmbedding(d_model // n_heads)

	    def _apply_rotary(
	        self,
	        q: torch.Tensor,
	        k: torch.Tensor,
	    ) -> tuple[torch.Tensor, torch.Tensor]:
	        """Split into heads, apply the rotary embedding, and merge heads again."""
	        head_shape = (self.n_heads, self.d_head)
	        q_rot, k_rot = self.rotary(q.unflatten(-1, head_shape), k.unflatten(-1, head_shape))
	        return q_rot.flatten(-2, -1), k_rot.flatten(-2, -1)

	    def forward(
	        self,
	        x: torch.Tensor,
	        seq_id: Optional[torch.Tensor],
	        output_attentions: bool = False,
	    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
	        """Run self-attention over ``x``.

	        Args:
	            x: Input of shape (batch, seq, d_model).
	            seq_id: Optional (batch, seq) ids; positions attend only to
	                positions with an equal id. ``None`` disables masking.
	            output_attentions: When True the attention is computed manually
	                so the weights can be returned; the configured backend is
	                not used in that case.

	        Returns:
	            ``(output, attn_weights)`` where ``attn_weights`` is ``None``
	            unless ``output_attentions`` is True.
	        """
	        qkv = self.layernorm_qkv(x)
	        query, key, value = torch.chunk(qkv, 3, dim=-1)
	        query = self.q_ln(query).to(query.dtype)
	        key = self.k_ln(key).to(query.dtype)
	        query, key = self._apply_rotary(query, key)

	        def to_heads(t: torch.Tensor) -> torch.Tensor:
	            return einops.rearrange(t, pattern="b s (h d) -> b h s d", h=self.n_heads)

	        query, key, value = to_heads(query), to_heads(key), to_heads(value)

	        mask = None
	        if seq_id is not None:
	            # (batch, 1, seq, seq): True where query and key share a sequence id.
	            mask = (seq_id.unsqueeze(-1) == seq_id.unsqueeze(-2)).unsqueeze(1)

	        attn_weights = None
	        if output_attentions:
	            attn_scores = torch.einsum("bhld,bhsd->bhls", query, key) * self.scale
	            if mask is not None:
	                attn_scores = attn_scores.masked_fill(~mask, float("-inf"))
	            attn_weights = torch.softmax(attn_scores, dim=-1)
	            context = torch.einsum("bhls,bhsd->bhld", attn_weights, value)
	        elif self.attn_backend == AttentionBackend.FLEX:
	            block_mask = self._create_flex_block_mask(seq_id, query)
	            fn = _get_flex_attention_fn()
	            assert fn is not None, "Flex Attention is not available in this environment."
	            context = fn(query, key, value, block_mask=block_mask, scale=self.scale)
	        elif self.attn_backend == AttentionBackend.SDPA:
	            context = F.scaled_dot_product_attention(
	                query, key, value, attn_mask=mask, scale=self.scale
	            )
	        else:
	            raise AssertionError(f"Unsupported resolved ESM3 backend: {self.attn_backend}")

	        context = einops.rearrange(context, "b h s d -> b s (h d)")
	        return self.out_proj(context), attn_weights

	    @staticmethod
	    def _create_flex_block_mask(
	        seq_id: Optional[torch.Tensor],
	        query: torch.Tensor,
	    ) -> Optional["BlockMask"]:
	        """Build a flex-attention block mask from ``seq_id``.

	        Returns ``None`` when ``seq_id`` is ``None``. The mask is created
	        with a head dimension of 1 and the sequence length of ``query``.
	        """
	        if seq_id is None:
	            return None
	        assert create_block_mask is not None, (
	            "Flex Attention requested but torch.create_block_mask is unavailable."
	        )
	        batch_size, _, seq_len, _ = query.shape

	        def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
	            # Allow attention only within the same sequence id.
	            return seq_id[batch_idx, q_idx] == seq_id[batch_idx, kv_idx]

	        return create_block_mask(
	            mask_mod,
	            batch_size,
	            1,
	            seq_len,
	            seq_len,
	            device=query.device,
	        )


	class GeometricReasoningOriginalImpl(nn.Module):
	    """Frame-based geometric attention over per-residue 3-D vectors.

	    Args:
	        c_s: Width of the input and output features.
	        v_heads: Number of vector heads.
	        num_vector_messages: Number of value vectors per head.
	        mask_and_zero_frameless: Zero the attention output at positions
	            where ``affine_mask`` is False.
	        bias: Whether the layer norm and linear layers carry a bias.
	    """

	    def __init__(
	        self,
	        c_s: int,
	        v_heads: int,
	        num_vector_messages: int = 1,
	        mask_and_zero_frameless: bool = True,
	        bias: bool = False,
	    ):
	        super().__init__()
	        self.c_s = c_s
	        self.v_heads = v_heads
	        self.num_vector_messages = num_vector_messages
	        self.mask_and_zero_frameless = mask_and_zero_frameless

	        self.s_norm = nn.LayerNorm(c_s, bias=bias)

	        # Four 3-vectors per head (rotation q/k and distance q/k) plus the
	        # value vectors.
	        dim_proj = 4 * self.v_heads * 3 + self.v_heads * 3 * self.num_vector_messages
	        self.proj = nn.Linear(c_s, dim_proj, bias=bias)

	        channels_out = self.v_heads * 3 * self.num_vector_messages
	        self.out_proj = nn.Linear(channels_out, c_s, bias=bias)

	        # Per-head weights; passed through softplus in forward.
	        self.distance_scale_per_head = nn.Parameter(torch.zeros((self.v_heads)))
	        self.rotation_scale_per_head = nn.Parameter(torch.zeros((self.v_heads)))

	    def forward(
	        self,
	        s: torch.Tensor,
	        affine: Affine3D,
	        affine_mask: torch.Tensor,
	        sequence_id: Optional[torch.Tensor],
	        chain_id: torch.Tensor,
	    ) -> torch.Tensor:
	        """Compute the geometric attention update.

	        Args:
	            s: Input features of shape (batch, seq, c_s).
	            affine: Per-residue frames.
	            affine_mask: (batch, seq) boolean; False marks residues without
	                a usable frame.
	            sequence_id: Optional (batch, seq) ids; all zeros when ``None``.
	            chain_id: (batch, seq) chain ids; pairs on different chains are
	                masked out.

	        Returns:
	            Tensor of shape (batch, seq, c_s).
	        """
	        if sequence_id is None:
	            sequence_id = torch.zeros_like(s[..., 0], dtype=torch.int64)

	        # Additive bias: the same-sequence comparison is cast to float (1.0 /
	        # 0.0), then keys without a frame and cross-chain pairs are filled
	        # with the most negative finite value.
	        # NOTE(review): cross-sequence pairs only differ by that 1.0 offset —
	        # confirm this matches the reference implementation.
	        attn_bias = sequence_id.unsqueeze(-1) == sequence_id.unsqueeze(-2)
	        attn_bias = attn_bias.unsqueeze(1).float()
	        attn_bias = attn_bias.masked_fill(
	            ~affine_mask[:, None, None, :],
	            torch.finfo(attn_bias.dtype).min,
	        )
	        cross_chain = chain_id.unsqueeze(1) != chain_id.unsqueeze(2)
	        attn_bias = attn_bias.masked_fill(
	            cross_chain.unsqueeze(1),
	            torch.finfo(s.dtype).min,
	        )

	        normed = self.s_norm(s)
	        rot_width = self.v_heads * 2 * 3 + self.v_heads * 3 * self.num_vector_messages
	        dist_width = self.v_heads * 2 * 3
	        vec_rot, vec_dist = self.proj(normed).split([rot_width, dist_width], dim=-1)

	        # Rotation-only transform for the direction q/k and the values.
	        rotated = affine.rot[..., None].apply(
	            rearrange(vec_rot, "... (h c) -> ... h c", c=3)
	        )
	        query_rot, key_rot, value = rotated.split(
	            [self.v_heads, self.v_heads, self.v_heads * self.num_vector_messages],
	            dim=-2,
	        )
	        # Full rigid transform for the distance q/k.
	        placed = affine[..., None].apply(
	            rearrange(vec_dist, "... (h c) -> ... h c", c=3)
	        )
	        query_dist, key_dist = placed.chunk(2, dim=-2)

	        query_dist = rearrange(query_dist, "b s h d -> b h s 1 d")
	        key_dist = rearrange(key_dist, "b s h d -> b h 1 s d")
	        query_rot = rearrange(query_rot, "b s h d -> b h s d")
	        key_rot = rearrange(key_rot, "b s h d -> b h d s")
	        value = rearrange(
	            value,
	            "b s (h m) d -> b h s (m d)",
	            m=self.num_vector_messages,
	        )

	        distance_term = (query_dist - key_dist).norm(dim=-1) / math.sqrt(3)
	        rotation_term = query_rot.matmul(key_rot) / math.sqrt(3)
	        distance_term_weight = rearrange(
	            F.softplus(self.distance_scale_per_head), "h -> h 1 1"
	        )
	        rotation_term_weight = rearrange(
	            F.softplus(self.rotation_scale_per_head), "h -> h 1 1"
	        )
	        attn_weight = (
	            rotation_term * rotation_term_weight - distance_term * distance_term_weight
	        )

	        # Crop the bias from the front if it is larger than the score matrix.
	        n_query = attn_weight.size(2)
	        n_key = attn_weight.size(3)
	        offset_q = max(0, attn_bias.size(2) - n_query)
	        offset_k = max(0, attn_bias.size(3) - n_key)
	        attn_bias = attn_bias[:, :, offset_q:, offset_k:]
	        attn_weight = torch.softmax(attn_weight + attn_bias, dim=-1)

	        attn_out = attn_weight.matmul(value)
	        # Apply the inverse rotation of each residue's frame to the result.
	        attn_out = affine.rot[..., None].invert().apply(
	            rearrange(
	                attn_out,
	                "b h s (m d) -> b s (h m) d",
	                m=self.num_vector_messages,
	            )
	        )
	        attn_out = rearrange(
	            attn_out,
	            "b s (h m) d -> b s (h m d)",
	            m=self.num_vector_messages,
	        )
	        if self.mask_and_zero_frameless:
	            attn_out = attn_out.masked_fill(~affine_mask[..., None], 0.0)
	        attn_out = attn_out.to(self.out_proj.weight.dtype)
	        return self.out_proj(attn_out)


	def swiglu_correction_fn(expansion_ratio: float, d_model: int) -> int:
	    """Return the SwiGLU hidden width.

	    The width is ``expansion_ratio * d_model`` plus 255, floor-divided by
	    256 and multiplied by 256 again; for whole-number widths this rounds up
	    to the next multiple of 256.
	    """
	    raw_width = expansion_ratio * d_model
	    n_blocks = (raw_width + 255) // 256
	    return int(n_blocks * 256)


	class SwiGLU(nn.Module):
	    """SwiGLU activation: gates one half of the features with the other half."""

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Split the last dimension in two and return ``silu(first) * second``."""
	        gate, linear = torch.chunk(x, 2, dim=-1)
	        return F.silu(gate) * linear


	def swiglu_ln_ffn(d_model: int, expansion_ratio: float, bias: bool) -> nn.Module:
	    """Build a pre-norm SwiGLU feed-forward block.

	    Returns:
	        ``nn.Sequential`` of LayerNorm -> Linear (to twice the hidden width)
	        -> SwiGLU -> Linear (back to ``d_model``). The hidden width comes
	        from ``swiglu_correction_fn``.
	    """
	    hidden_width = swiglu_correction_fn(expansion_ratio, d_model)
	    # The up-projection is twice as wide because SwiGLU halves it again.
	    up_proj = nn.Linear(d_model, hidden_width * 2, bias=bias)
	    down_proj = nn.Linear(hidden_width, d_model, bias=bias)
	    return nn.Sequential(nn.LayerNorm(d_model), up_proj, SwiGLU(), down_proj)


	def gelu_ln_ffn(d_model: int, expansion_ratio: float, bias: bool) -> nn.Module:
	    """Build a pre-norm GELU feed-forward block.

	    Returns:
	        ``nn.Sequential`` of LayerNorm -> Linear -> GELU -> Linear with a
	        hidden width of ``int(expansion_ratio * d_model)``.
	    """
	    hidden_dim = int(expansion_ratio * d_model)
	    up_proj = nn.Linear(d_model, hidden_dim, bias=bias)
	    down_proj = nn.Linear(hidden_dim, d_model, bias=bias)
	    return nn.Sequential(nn.LayerNorm(d_model), up_proj, nn.GELU(), down_proj)


	class UnifiedTransformerBlock(nn.Module):
	    """Transformer block with optional plain and geometric attention plus an FFN.

	    Args:
	        d_model: Model width.
	        n_heads: Number of heads for the plain attention.
	        use_geom_attn: Add the geometric attention sub-layer.
	        use_plain_attn: Add the plain self-attention sub-layer.
	        v_heads: Vector heads for the geometric attention; required when
	            ``use_geom_attn`` is True.
	        bias: Whether the sub-layers carry biases.
	        expansion_ratio: FFN expansion ratio.
	        residue_scaling_factor: Every residual branch is divided by this.
	        mask_and_zero_frameless: Forwarded to the geometric attention.
	        qk_layernorm: Forwarded to the plain attention.
	        ffn_type: ``"swiglu"`` or ``"gelu"``.
	        attn_backend: Backend name for the plain attention.

	    Raises:
	        ValueError: If ``ffn_type`` is not recognised.
	    """

	    def __init__(
	        self,
	        d_model: int,
	        n_heads: int,
	        use_geom_attn: bool = False,
	        use_plain_attn: bool = True,
	        v_heads: Optional[int] = None,
	        bias: bool = False,
	        expansion_ratio: float = 4.0,
	        residue_scaling_factor: float = 1.0,
	        mask_and_zero_frameless: bool = False,
	        qk_layernorm: bool = True,
	        ffn_type: str = "swiglu",
	        attn_backend: str = "sdpa",
	    ):
	        super().__init__()
	        self.use_plain_attn = use_plain_attn
	        if self.use_plain_attn:
	            self.attn = MultiHeadAttention(
	                d_model,
	                n_heads,
	                bias=bias,
	                qk_layernorm=qk_layernorm,
	                attn_backend=attn_backend,
	            )

	        self.use_geom_attn = use_geom_attn
	        if self.use_geom_attn:
	            assert v_heads is not None
	            self.geom_attn = GeometricReasoningOriginalImpl(
	                c_s=d_model,
	                v_heads=v_heads,
	                bias=bias,
	                mask_and_zero_frameless=mask_and_zero_frameless,
	            )

	        if ffn_type == "swiglu":
	            self.ffn = swiglu_ln_ffn(d_model, expansion_ratio, bias)
	        elif ffn_type == "gelu":
	            self.ffn = gelu_ln_ffn(d_model, expansion_ratio, bias)
	        else:
	            raise ValueError(f"Unknown ffn_type: {ffn_type}")

	        self.scaling_factor = residue_scaling_factor

	    def forward(
	        self,
	        x: torch.Tensor,
	        sequence_id: Optional[torch.Tensor],
	        frames: Affine3D,
	        frames_mask: torch.Tensor,
	        chain_id: torch.Tensor,
	        output_attentions: bool = False,
	    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
	        """Apply the enabled sub-layers as scaled residual updates.

	        Returns:
	            ``(x, attn_weights)``; ``attn_weights`` comes from the plain
	            attention and is ``None`` when that sub-layer is disabled or did
	            not return weights.
	        """
	        attn_weights = None

	        if self.use_plain_attn:
	            attn_update, attn_weights = self.attn(
	                x, sequence_id, output_attentions=output_attentions
	            )
	            x = x + attn_update / self.scaling_factor

	        if self.use_geom_attn:
	            geom_update = self.geom_attn(x, frames, frames_mask, sequence_id, chain_id)
	            x = x + geom_update / self.scaling_factor

	        ffn_update = self.ffn(x) / self.scaling_factor
	        x = x + ffn_update
	        return x, attn_weights


	class TransformerStack(nn.Module):
	    """Stack of ``UnifiedTransformerBlock`` layers followed by a final LayerNorm.

	    Args:
	        d_model: Model width.
	        n_heads: Attention heads per block.
	        v_heads: Vector heads for blocks with geometric attention.
	        n_layers: Number of blocks.
	        n_layers_geom: The first ``n_layers_geom`` blocks use geometric attention.
	        scale_residue: Divide residual branches by ``sqrt(n_layers / 36)``
	            when True, by 1.0 otherwise.
	        mask_and_zero_frameless: Forwarded to every block.
	        bias: Forwarded to every block.
	        qk_layernorm: Forwarded to every block.
	        ffn_type: Forwarded to every block.
	        expansion_ratio: Forwarded to every block.
	        attn_backend: Forwarded to every block.
	    """

	    def __init__(
	        self,
	        d_model: int,
	        n_heads: int,
	        v_heads: Optional[int],
	        n_layers: int,
	        n_layers_geom: int = 1,
	        scale_residue: bool = True,
	        mask_and_zero_frameless: bool = False,
	        bias: bool = False,
	        qk_layernorm: bool = True,
	        ffn_type: str = "swiglu",
	        expansion_ratio: float = 8 / 3,
	        attn_backend: str = "sdpa",
	    ):
	        super().__init__()
	        # The same residual scale is shared by every block.
	        residual_scale = math.sqrt(n_layers / 36) if scale_residue else 1.0

	        layers = []
	        for index in range(n_layers):
	            layers.append(
	                UnifiedTransformerBlock(
	                    d_model,
	                    n_heads,
	                    v_heads=v_heads,
	                    use_geom_attn=index < n_layers_geom,
	                    residue_scaling_factor=residual_scale,
	                    expansion_ratio=expansion_ratio,
	                    mask_and_zero_frameless=mask_and_zero_frameless,
	                    bias=bias,
	                    qk_layernorm=qk_layernorm,
	                    ffn_type=ffn_type,
	                    attn_backend=attn_backend,
	                )
	            )
	        self.blocks = nn.ModuleList(layers)
	        self.norm = nn.LayerNorm(d_model, bias=False)

	    def forward(
	        self,
	        x: torch.Tensor,
	        sequence_id: Optional[torch.Tensor] = None,
	        affine: Optional[Affine3D] = None,
	        affine_mask: Optional[torch.Tensor] = None,
	        chain_id: Optional[torch.Tensor] = None,
	        output_attentions: bool = False,
	    ) -> tuple[
	        torch.Tensor,
	        torch.Tensor,
	        tuple[torch.Tensor, ...],
	        Optional[tuple[torch.Tensor, ...]],
	    ]:
	        """Run every block in order.

	        Args:
	            x: Input features; the last dimension is the feature axis.
	            sequence_id: Optional sequence ids forwarded to the blocks.
	            affine: Per-residue frames; must not be ``None``.
	            affine_mask: Frame validity mask; must not be ``None``.
	            chain_id: Chain ids; defaults to all ones with the leading
	                shape of ``x``.
	            output_attentions: Collect attention weights from the blocks.

	        Returns:
	            ``(normed, last, hidden_states, attentions)``: the output after
	            the final LayerNorm, the output before it, a tuple with the
	            output of every block, and a tuple of collected attention
	            weights (``None`` unless ``output_attentions`` is True).
	        """
	        leading_shape = list(x.shape[:-1])
	        if chain_id is None:
	            chain_id = torch.ones(size=leading_shape, dtype=torch.int64, device=x.device)
	        assert affine is not None
	        assert affine_mask is not None

	        per_layer_states = []
	        per_layer_attn = []
	        for block in self.blocks:
	            x, attn_weights = block(
	                x,
	                sequence_id,
	                affine,
	                affine_mask,
	                chain_id,
	                output_attentions=output_attentions,
	            )
	            per_layer_states.append(x)
	            if output_attentions and attn_weights is not None:
	                per_layer_attn.append(attn_weights)

	        attentions = tuple(per_layer_attn) if output_attentions else None
	        return self.norm(x), x, tuple(per_layer_states), attentions


	class EncodeInputs(nn.Module):
	    """Embed every input track and sum the results into one feature tensor.

	    Args:
	        d_model: Width of the summed embedding.
	    """

	    def __init__(self, d_model: int):
	        super().__init__()
	        self.sequence_embed = nn.Embedding(64, d_model)
	        # Both pLDDT inputs are expanded to 16 radial-basis features first.
	        self.plddt_projection = nn.Linear(16, d_model)
	        self.structure_per_res_plddt_projection = nn.Linear(16, d_model)
	        self.structure_tokens_embed = nn.Embedding(4096 + 5, d_model)
	        self.ss8_embed = nn.Embedding(8 + 3, d_model)
	        self.sasa_embed = nn.Embedding(16 + 3, d_model)
	        # Eight function-token slots, each embedded to d_model // 8 features
	        # and concatenated in forward.
	        self.function_embed = nn.ModuleList(
	            [nn.Embedding(260, d_model // 8, padding_idx=0) for _ in range(8)]
	        )
	        self.residue_embed = nn.EmbeddingBag(1478, d_model, mode="sum", padding_idx=0)

	    def forward(
	        self,
	        sequence_tokens: torch.Tensor,
	        structure_tokens: torch.Tensor,
	        average_plddt: torch.Tensor,
	        per_res_plddt: torch.Tensor,
	        ss8_tokens: torch.Tensor,
	        sasa_tokens: torch.Tensor,
	        function_tokens: torch.Tensor,
	        residue_annotation_tokens: torch.Tensor,
	    ) -> torch.Tensor:
	        """Return the sum of all track embeddings.

	        ``function_tokens`` is split along its last dimension (one slice per
	        embedding table) and ``residue_annotation_tokens`` is unpacked as
	        (batch, seq, annotations).
	        """
	        seq_part = self.sequence_embed(sequence_tokens)

	        def plddt_features(values: torch.Tensor) -> torch.Tensor:
	            return rbf(values, v_min=0.0, v_max=1.0, n_bins=16)

	        avg_plddt_part = self.plddt_projection(
	            plddt_features(average_plddt).to(self.plddt_projection.weight.dtype)
	        )
	        per_res_plddt_part = self.structure_per_res_plddt_projection(
	            plddt_features(per_res_plddt).to(
	                self.structure_per_res_plddt_projection.weight.dtype
	            )
	        )

	        structure_part = self.structure_tokens_embed(structure_tokens)
	        ss8_part = self.ss8_embed(ss8_tokens)
	        sasa_part = self.sasa_embed(sasa_tokens)

	        function_slices = [
	            table(tokens)
	            for table, tokens in zip(self.function_embed, function_tokens.unbind(-1))
	        ]
	        function_part = torch.cat(function_slices, -1)

	        # EmbeddingBag takes 2-D input, so fold batch and sequence together
	        # and unfold them again afterwards.
	        batch_size, seq_len, num_annotations = residue_annotation_tokens.shape
	        bagged = self.residue_embed(
	            rearrange(
	                residue_annotation_tokens,
	                "b l n -> (b l) n",
	                b=batch_size,
	                l=seq_len,
	                n=num_annotations,
	            )
	        )
	        residue_part = rearrange(bagged, "(b l) d -> b l d", b=batch_size, l=seq_len)

	        return (
	            seq_part
	            + avg_plddt_part
	            + per_res_plddt_part
	            + structure_part
	            + ss8_part
	            + sasa_part
	            + function_part
	            + residue_part
	        )


	@dataclass
	class ESM3CoreOutput:
	    """Raw outputs of the ESM3 core: per-track logits plus intermediate activations."""

	    # Per-track logits.
	    sequence_logits: torch.Tensor
	    structure_logits: torch.Tensor
	    secondary_structure_logits: torch.Tensor
	    sasa_logits: torch.Tensor
	    function_logits: torch.Tensor
	    residue_logits: torch.Tensor
	    # Intermediate activations.
	    embeddings: torch.Tensor
	    hidden_states: tuple[torch.Tensor, ...]
	    # Only this field is optional.
	    attentions: Optional[tuple[torch.Tensor, ...]] = None


	class OutputHeads(nn.Module):
	    """Prediction heads for every ESM3 track.

	    Args:
	        d_model: Width of the features fed to the heads.
	    """

	    def __init__(self, d_model: int):
	        super().__init__()
	        self.sequence_head = RegressionHead(d_model, 64)
	        self.structure_head = RegressionHead(d_model, 4096)
	        self.ss8_head = RegressionHead(d_model, 8 + 3)
	        self.sasa_head = RegressionHead(d_model, 16 + 3)
	        # One flat output that forward reshapes into 8 groups of 260 logits.
	        self.function_head = RegressionHead(d_model, 260 * 8)
	        self.residue_head = RegressionHead(d_model, 1478)

	    def forward(
	        self,
	        x: torch.Tensor,
	        embed: torch.Tensor,
	        hidden_states: tuple[torch.Tensor, ...],
	        attentions: Optional[tuple[torch.Tensor, ...]] = None,
	    ) -> ESM3CoreOutput:
	        """Apply every head to ``x`` and package the results.

	        ``embed``, ``hidden_states`` and ``attentions`` are passed through
	        unchanged into the returned ``ESM3CoreOutput``.
	        """
	        # Split the flat function logits into (..., 8, 260).
	        function_logits = rearrange(
	            self.function_head(x), "... (k v) -> ... k v", k=8
	        )
	        sequence_logits = self.sequence_head(x)
	        structure_logits = self.structure_head(x)
	        ss8_logits = self.ss8_head(x)
	        sasa_logits = self.sasa_head(x)
	        residue_logits = self.residue_head(x)
	        return ESM3CoreOutput(
	            sequence_logits=sequence_logits,
	            structure_logits=structure_logits,
	            secondary_structure_logits=ss8_logits,
	            sasa_logits=sasa_logits,
	            function_logits=function_logits,
	            residue_logits=residue_logits,
	            embeddings=embed,
	            hidden_states=hidden_states,
	            attentions=attentions,
	        )


	class ESM3Core(nn.Module):
	    """Input encoder, transformer stack and output heads wired together.

	    ``forward`` accepts any subset of the input tracks (keyword-only),
	    substitutes constant placeholder tensors for the missing ones, and
	    returns the ``ESM3CoreOutput`` produced by the output heads.
	    """

	    def __init__(
	        self,
	        d_model: int,
	        n_heads: int,
	        v_heads: int,
	        n_layers: int,
	        tokenizers: FastESM3TokenizerCollection,
	        attn_backend: str = "sdpa",
	    ):
	        """Build the three sub-modules and keep a handle on the tokenizers.

	        Args:
	            d_model: Width passed to the encoder, the stack and the heads.
	            n_heads: Forwarded to ``TransformerStack``.
	            v_heads: Forwarded to ``TransformerStack``.
	            n_layers: Forwarded to ``TransformerStack``.
	            tokenizers: Collection whose ``sequence.mask_token_id`` is used
	                as the placeholder for a missing sequence track.
	            attn_backend: Forwarded to ``TransformerStack``.
	        """
	        super().__init__()
	        self.encoder = EncodeInputs(d_model)
	        stack_options = {
	            "mask_and_zero_frameless": True,
	            "attn_backend": attn_backend,
	        }
	        self.transformer = TransformerStack(
	            d_model, n_heads, v_heads, n_layers, **stack_options
	        )
	        self.output_heads = OutputHeads(d_model)
	        self.tokenizers = tokenizers

	    def forward(
	        self,
	        *,
	        sequence_tokens: Optional[torch.Tensor] = None,
	        structure_tokens: Optional[torch.Tensor] = None,
	        ss8_tokens: Optional[torch.Tensor] = None,
	        sasa_tokens: Optional[torch.Tensor] = None,
	        function_tokens: Optional[torch.Tensor] = None,
	        residue_annotation_tokens: Optional[torch.Tensor] = None,
	        average_plddt: Optional[torch.Tensor] = None,
	        per_res_plddt: Optional[torch.Tensor] = None,
	        structure_coords: Optional[torch.Tensor] = None,
	        chain_id: Optional[torch.Tensor] = None,
	        sequence_id: Optional[torch.Tensor] = None,
	        output_attentions: Optional[bool] = None,
	    ) -> ESM3CoreOutput:
	        """Encode the given tracks, run the transformer and apply the heads.

	        At least one of ``sequence_tokens``, ``structure_tokens``,
	        ``ss8_tokens``, ``sasa_tokens``, ``structure_coords``,
	        ``function_tokens`` or ``residue_annotation_tokens`` must be given;
	        the first one present (in that order) supplies the sequence length
	        (its ``shape[1]``) and the device for every placeholder tensor.
	        Placeholders are created with a leading dimension of 1.

	        Args:
	            sequence_id: Passed through to the transformer unchanged.
	            output_attentions: Coerced with ``bool``; ``None`` means False.

	        Returns:
	            The ``ESM3CoreOutput`` built by ``self.output_heads``.

	        Raises:
	            ValueError: If none of the seven shape-defining inputs is given.
	        """
	        output_attentions = bool(output_attentions)

	        shape_donors = (
	            sequence_tokens,
	            structure_tokens,
	            ss8_tokens,
	            sasa_tokens,
	            structure_coords,
	            function_tokens,
	            residue_annotation_tokens,
	        )
	        for donor in shape_donors:
	            if donor is not None:
	                seq_len, device = donor.shape[1], donor.device
	                break
	        else:
	            raise ValueError("At least one of the inputs must be non-None")

	        def constant_track(fill_value, *trailing_dims, dtype=torch.long):
	            # Placeholder of shape (1, seq_len, *trailing_dims) on the donor's device.
	            return torch.full(
	                (1, seq_len, *trailing_dims),
	                fill_value,
	                dtype=dtype,
	                device=device,
	            )

	        sequence_mask_id = self.tokenizers.sequence.mask_token_id
	        if sequence_tokens is None:
	            sequence_tokens = constant_track(sequence_mask_id)
	        if ss8_tokens is None:
	            ss8_tokens = constant_track(SS8_PAD_TOKEN)
	        if sasa_tokens is None:
	            sasa_tokens = constant_track(SASA_PAD_TOKEN)
	        if average_plddt is None:
	            average_plddt = constant_track(1)
	        average_plddt = average_plddt.float()
	        if per_res_plddt is None:
	            per_res_plddt = constant_track(0)
	        per_res_plddt = per_res_plddt.float()
	        if chain_id is None:
	            chain_id = constant_track(0)

	        if residue_annotation_tokens is None:
	            residue_annotation_tokens = constant_track(
	                RESIDUE_PAD_TOKEN, MAX_RESIDUE_ANNOTATIONS
	            )
	        if function_tokens is None:
	            function_tokens = constant_track(
	                INTERPRO_PAD_TOKEN, FUNCTION_TOKENS_DEPTH
	            )
	        if structure_coords is None:
	            # All-NaN coordinates stand in for "no structure supplied".
	            structure_coords = constant_track(
	                float("nan"), 3, 3, dtype=torch.float
	            )

	        # Only the first three entries along the second-to-last axis are used.
	        structure_coords = structure_coords[..., :3, :]
	        affine, affine_mask = build_affine3d_from_coordinates(structure_coords)

	        if structure_tokens is None:
	            structure_tokens = constant_track(STRUCTURE_MASK_TOKEN)
	        # -1 entries become the structure mask token ...
	        structure_tokens = structure_tokens.masked_fill(
	            structure_tokens == -1, STRUCTURE_MASK_TOKEN
	        )
	        # ... and the sequence track's special tokens are mirrored onto
	        # the structure track at the same positions, in this order.
	        special_token_pairs = (
	            (SEQUENCE_BOS_TOKEN, STRUCTURE_BOS_TOKEN),
	            (SEQUENCE_PAD_TOKEN, STRUCTURE_PAD_TOKEN),
	            (SEQUENCE_EOS_TOKEN, STRUCTURE_EOS_TOKEN),
	            (SEQUENCE_CHAINBREAK_TOKEN, STRUCTURE_CHAINBREAK_TOKEN),
	        )
	        for sequence_special, structure_special in special_token_pairs:
	            structure_tokens = structure_tokens.masked_fill(
	                sequence_tokens == sequence_special, structure_special
	            )

	        encoded = self.encoder(
	            sequence_tokens,
	            structure_tokens,
	            average_plddt,
	            per_res_plddt,
	            ss8_tokens,
	            sasa_tokens,
	            function_tokens,
	            residue_annotation_tokens,
	        )
	        final_states, embedding, hidden_states, attentions = self.transformer(
	            encoded,
	            sequence_id,
	            affine,
	            affine_mask,
	            chain_id,
	            output_attentions=output_attentions,
	        )
	        return self.output_heads(
	            final_states,
	            embedding,
	            hidden_states=hidden_states,
	            attentions=attentions,
	        )


	def _resolve_esm3_checkpoint_key(model_name: str) -> str:
	    """Translate an accepted checkpoint alias into the canonical key.

	    Args:
	        model_name: Any member of ``ESM3_OPEN_SMALL_ALIASES``.

	    Returns:
	        ``ESM3_OPEN_SMALL`` for every accepted alias.

	    Raises:
	        ValueError: If ``model_name`` is not a known alias; the message
	            lists the accepted names in sorted order.
	    """
	    if model_name not in ESM3_OPEN_SMALL_ALIASES:
	        raise ValueError(
	            f"Unsupported ESM3 checkpoint {model_name}. "
	            f"Supported names: {sorted(ESM3_OPEN_SMALL_ALIASES)}"
	        )
	    return ESM3_OPEN_SMALL


	def parse_fasta(fasta_path: str) -> list[str]:
	    """Read the sequences of a FASTA file, discarding the header lines.

	    Blank lines are ignored, every line is stripped of surrounding
	    whitespace, and consecutive sequence lines are concatenated into one
	    record. A header that is not followed by any sequence line yields no
	    record.

	    Args:
	        fasta_path: Path of the FASTA file (read as UTF-8).

	    Returns:
	        The sequences in file order.

	    Raises:
	        AssertionError: If ``fasta_path`` does not exist.
	    """
	    assert os.path.exists(fasta_path), f"FASTA file does not exist: {fasta_path}"

	    records: list[str] = []
	    pending_chunks: list[str] = []

	    def flush_pending() -> None:
	        # Close the record being accumulated, if it has any content.
	        if pending_chunks:
	            records.append("".join(pending_chunks))
	            pending_chunks.clear()

	    with open(fasta_path, "r", encoding="utf-8") as fasta_file:
	        for raw_line in fasta_file:
	            text = raw_line.strip()
	            if not text:
	                continue
	            if text.startswith(">"):
	                # A header line ends whatever record came before it.
	                flush_pending()
	                continue
	            pending_chunks.append(text)

	    # The last record has no trailing header to close it.
	    flush_pending()
	    return records


	def _ensure_official_esm_on_path() -> None:
	for parent in Path(__file__).resolve().parents:
	candidate = parent / "official" / "esm"
	if (candidate / "esm" / "models" / "esm3.py").exists():
	candidate_str = str(candidate)
	if candidate_str not in sys.path:
	sys.path.insert(0, candidate_str)
	return


	def _make_structure_encoder(device: Union[torch.device, str]) -> nn.Module:
	    """Build the structure encoder via ``esm.pretrained``.

	    Args:
	        device: Passed straight to ``ESM3_structure_encoder_v0``.

	    Returns:
	        Whatever ``esm.pretrained.ESM3_structure_encoder_v0(device)`` returns.
	    """
	    # Make a vendored SDK checkout importable before importing from it.
	    _ensure_official_esm_on_path()
	    encoder_factory = importlib.import_module("esm.pretrained").ESM3_structure_encoder_v0
	    return encoder_factory(device)


	def _make_structure_decoder(device: Union[torch.device, str]) -> nn.Module:
	    """Build the structure decoder via ``esm.pretrained``.

	    Args:
	        device: Passed straight to ``ESM3_structure_decoder_v0``.

	    Returns:
	        Whatever ``esm.pretrained.ESM3_structure_decoder_v0(device)`` returns.
	    """
	    # Make a vendored SDK checkout importable before importing from it.
	    _ensure_official_esm_on_path()
	    decoder_factory = importlib.import_module("esm.pretrained").ESM3_structure_decoder_v0
	    return decoder_factory(device)


	def _make_function_decoder(device: Union[torch.device, str]) -> nn.Module:
	    """Build the function decoder via ``esm.pretrained``.

	    Args:
	        device: Passed straight to ``ESM3_function_decoder_v0``.

	    Returns:
	        Whatever ``esm.pretrained.ESM3_function_decoder_v0(device)`` returns.
	    """
	    # Make a vendored SDK checkout importable before importing from it.
	    _ensure_official_esm_on_path()
	    decoder_factory = importlib.import_module("esm.pretrained").ESM3_function_decoder_v0
	    return decoder_factory(device)


	def _build_official_esm3(config: FastESM3Config) -> nn.Module:
	    """Instantiate an ``ESM3Core`` whose sizes are taken from ``config``.

	    Args:
	        config: Supplies ``hidden_size``, ``num_attention_heads``,
	            ``num_vector_heads``, ``num_hidden_layers`` and ``attn_backend``.

	    Returns:
	        A new ``ESM3Core`` with a fresh ``EsmSequenceTokenizer`` wrapped in a
	        ``FastESM3TokenizerCollection``.
	    """
	    core_kwargs = dict(
	        d_model=config.hidden_size,
	        n_heads=config.num_attention_heads,
	        v_heads=config.num_vector_heads,
	        n_layers=config.num_hidden_layers,
	        tokenizers=FastESM3TokenizerCollection(sequence=EsmSequenceTokenizer()),
	        attn_backend=config.attn_backend,
	    )
	    return ESM3Core(**core_kwargs)


	class FastESM3PreTrainedModel(PreTrainedModel):
	    """Shared ``PreTrainedModel`` base for the ESM3 wrapper.

	    Provides weight initialisation, an ``attn_backend`` property that keeps
	    the config and every ``MultiHeadAttention`` module in sync, and a
	    constructor that loads the original ESM3 checkpoint from the Hub.
	    """

	    config_class = FastESM3Config
	    base_model_prefix = "esm3"
	    main_input_name = "input_ids"
	    supports_gradient_checkpointing = False
	    all_tied_weights_keys = {}

	    @classmethod
	    def is_remote_code(cls) -> bool:
	        """Always report this model as remote code."""
	        return True

	    def _init_weights(self, module: nn.Module) -> None:
	        """Initialise ``module`` unless one of its own parameters is already done.

	        Linear and embedding weights are drawn from a normal distribution
	        with std ``config.initializer_range``; linear biases and the
	        embedding padding row are zeroed; LayerNorm gets zero bias (when it
	        has one) and unit weight. Other module types are left untouched.
	        """
	        # A truthy ``_is_hf_initialized`` flag on any direct parameter means
	        # the module must not be re-initialised.
	        if any(
	            own_param.__dict__.get("_is_hf_initialized", False)
	            for own_param in module.parameters(recurse=False)
	        ):
	            return

	        if isinstance(module, nn.Linear):
	            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
	            if module.bias is not None:
	                nn.init.zeros_(module.bias)
	            return

	        if isinstance(module, nn.Embedding):
	            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
	            pad_row = module.padding_idx
	            if pad_row is not None:
	                with torch.no_grad():
	                    module.weight[pad_row].zero_()
	            return

	        if isinstance(module, nn.LayerNorm):
	            if module.bias is not None:
	                nn.init.zeros_(module.bias)
	            nn.init.ones_(module.weight)

	    @property
	    def attn_backend(self) -> str:
	        """Attention backend name stored on the config."""
	        return self.config.attn_backend

	    @attn_backend.setter
	    def attn_backend(self, backend: str) -> None:
	        """Switch the backend on the config and on every attention module."""
	        assert backend in _SUPPORTED_ATTENTION_BACKENDS, (
	            f"ESM3 currently supports only {_SUPPORTED_ATTENTION_BACKENDS}; got {backend}."
	        )
	        self.config.attn_backend = backend
	        resolved_backend = resolve_attention_backend(backend)
	        attention_modules = (
	            submodule
	            for submodule in self.modules()
	            if isinstance(submodule, MultiHeadAttention)
	        )
	        for attention in attention_modules:
	            attention.attn_backend = resolved_backend

	    @classmethod
	    def from_pretrained_esm(
	        cls,
	        model_name: str = ESM3_OPEN_SMALL,
	        device: Union[torch.device, str] = "cpu",
	        dtype: Optional[torch.dtype] = None,
	    ) -> "FastESM3Model":
	        """Create a ``FastESM3Model`` from the original ESM3 checkpoint.

	        Args:
	            model_name: Checkpoint alias, resolved by
	                ``_resolve_esm3_checkpoint_key``.
	            device: Device the weights are mapped to and the model moved to.
	            dtype: Optional dtype to cast the model to after loading.

	        Returns:
	            The loaded model in eval mode.
	        """
	        checkpoint_key = _resolve_esm3_checkpoint_key(model_name)
	        spec = _ESM3_CHECKPOINT_SPECS[checkpoint_key]
	        size_fields = (
	            "hidden_size",
	            "num_attention_heads",
	            "num_vector_heads",
	            "num_hidden_layers",
	        )
	        config = FastESM3Config(
	            **{field: spec[field] for field in size_fields},
	            model_name=checkpoint_key,
	        )
	        model = FastESM3Model(config)

	        # Only the single weight file is fetched from the repository.
	        snapshot_dir = snapshot_download(
	            repo_id=spec["repo_id"],
	            allow_patterns=["data/weights/esm3_sm_open_v1.pth"],
	        )
	        weights_file = Path(snapshot_dir) / "data" / "weights" / "esm3_sm_open_v1.pth"
	        # NOTE(review): torch.load unpickles the file; since it comes from a
	        # download, consider passing weights_only=True where supported.
	        state_dict = torch.load(weights_file, map_location=torch.device(device))

	        load_result = model.esm3.load_state_dict(state_dict, strict=True)
	        assert len(load_result.missing_keys) == 0, load_result.missing_keys
	        assert len(load_result.unexpected_keys) == 0, load_result.unexpected_keys

	        model = model.to(device)
	        if dtype is not None:
	            model = model.to(dtype=dtype)
	        model.eval()
	        return model


	class FastESM3Model(FastPLMTestTimeTrainingMixin, FastESM3PreTrainedModel):
	    """Hugging Face style ESM3 model.

	    ``forward`` runs the in-file ``ESM3Core`` (``self.esm3``). The SDK-style
	    entry points (``encode``, ``decode``, ``generate``, ``batch_generate``,
	    ``forward_and_sample`` and the non-tensor path of ``logits``) are
	    delegated to a lazily built official SDK model that receives a copy of
	    ``self.esm3``'s weights the first time it is needed.
	    """

	    config_class = FastESM3Config

	    def __init__(self, config: FastESM3Config, **kwargs):
	        """Create the tokenizer and the core model and set up test-time training."""
	        super().__init__(config, **kwargs)
	        self.tokenizer = EsmSequenceTokenizer()
	        self.esm3 = _build_official_esm3(config)
	        # Written through __dict__ so the SDK model stored here later is a
	        # plain attribute rather than a registered submodule.
	        self.__dict__["_official_sdk_model"] = None
	        self.init_ttt({"lora_target_replace_module": "MultiHeadAttention"})

	    @property
	    def device(self) -> torch.device:
	        """Device of the first parameter of the model."""
	        return next(self.parameters()).device

	    @property
	    def raw_model(self) -> nn.Module:
	        """The underlying ``ESM3Core`` instance."""
	        return self.esm3

	    def _get_official_sdk_model(self) -> nn.Module:
	        """Return the cached official SDK model, building it on first use.

	        The SDK model is created with this model's config sizes, loaded
	        with ``self.esm3.state_dict()`` and moved to this model's device and
	        parameter dtype. It is built once: changes made to ``self.esm3``
	        afterwards are not copied into the cached instance.
	        """
	        cached_model = self.__dict__["_official_sdk_model"]
	        if cached_model is not None:
	            return cached_model
	        _ensure_official_esm_on_path()
	        esm3_module = importlib.import_module("esm.models.esm3")
	        tokenization = importlib.import_module("esm.tokenization")

	        sdk_model = esm3_module.ESM3(
	            d_model=self.config.hidden_size,
	            n_heads=self.config.num_attention_heads,
	            v_heads=self.config.num_vector_heads,
	            n_layers=self.config.num_hidden_layers,
	            structure_encoder_fn=_make_structure_encoder,
	            structure_decoder_fn=_make_structure_decoder,
	            function_decoder_fn=_make_function_decoder,
	            tokenizers=tokenization.get_esm3_model_tokenizers(self.config.model_name),
	        )
	        load_result = sdk_model.load_state_dict(self.esm3.state_dict(), strict=True)
	        assert len(load_result.missing_keys) == 0, load_result.missing_keys
	        assert len(load_result.unexpected_keys) == 0, load_result.unexpected_keys
	        dtype = next(self.esm3.parameters()).dtype
	        sdk_model = sdk_model.to(self.device).to(dtype=dtype).eval()
	        self.__dict__["_official_sdk_model"] = sdk_model
	        return sdk_model

	    def get_input_embeddings(self) -> nn.Module:
	        """Return the sequence embedding module of the encoder."""
	        return self.esm3.encoder.sequence_embed

	    def set_input_embeddings(self, value: nn.Module) -> None:
	        """Replace the sequence embedding module of the encoder."""
	        self.esm3.encoder.sequence_embed = value

	    def tokenize_sequences(
	        self,
	        sequences: Union[str, list[str]],
	        padding: bool = True,
	        return_tensors: str = "pt",
	        device: Optional[Union[torch.device, str]] = None,
	        add_special_tokens: bool = True,
	    ) -> dict[str, torch.Tensor]:
	        """Tokenize ``sequences`` with the model's own tokenizer.

	        Args:
	            sequences: One sequence or a list of sequences.
	            padding: Forwarded to the tokenizer.
	            return_tensors: Forwarded to the tokenizer.
	            device: When given, every returned tensor is moved to it.
	            add_special_tokens: Forwarded to the tokenizer.

	        Returns:
	            The tokenizer's output as is when ``device`` is ``None``;
	            otherwise a plain dict of tensors on ``device``.
	        """
	        tokenized = self.tokenizer(
	            sequences,
	            padding=padding,
	            return_tensors=return_tensors,
	            add_special_tokens=add_special_tokens,
	        )
	        if device is None:
	            return tokenized
	        return {name: tensor.to(device) for name, tensor in tokenized.items()}

	    def forward_sequence(
	        self,
	        sequences: Union[str, list[str]],
	        device: Optional[Union[torch.device, str]] = None,
	        **kwargs,
	    ) -> FastESM3Output:
	        """Tokenize raw sequences and run ``forward`` on them.

	        Args:
	            sequences: One sequence or a list of sequences.
	            device: Device for the token tensors; defaults to ``self.device``.
	            **kwargs: Extra keyword arguments for ``forward``.

	        Returns:
	            The ``FastESM3Output`` of the forward pass.
	        """
	        if device is None:
	            device = self.device
	        tokenized = self.tokenize_sequences(sequences, device=device)
	        # Both mappings must be unpacked: passing them positionally would
	        # bind the token dict to ``input_ids`` and ``kwargs`` to
	        # ``attention_mask``.
	        return self(**tokenized, **kwargs)

	    def _embed(
	        self,
	        input_ids: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        hidden_state_index: int = -1,
	        store_all_hidden_states: bool = False,
	        **kwargs,
	    ) -> torch.Tensor:
	        """Run a forward pass and pick the requested representation.

	        Returns:
	            All hidden states stacked along dim 1 when
	            ``store_all_hidden_states`` is set; ``last_hidden_state`` when
	            ``hidden_state_index`` is -1; otherwise
	            ``hidden_states[hidden_state_index]``.
	        """
	        output = self(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            **kwargs,
	        )
	        if store_all_hidden_states:
	            assert output.hidden_states is not None, "store_all_hidden_states requires hidden states."
	            return torch.stack(tuple(output.hidden_states), dim=1)
	        if hidden_state_index == -1:
	            return output.last_hidden_state
	        assert output.hidden_states is not None, "hidden_state_index selection requires hidden states."
	        return output.hidden_states[hidden_state_index]

	    def _pool_embeddings(
	        self,
	        embeddings: torch.Tensor,
	        attention_mask: torch.Tensor,
	        pooling_types: list[str],
	    ) -> torch.Tensor:
	        """Pool per-position embeddings along dim 1.

	        Each entry of ``pooling_types`` ('mean', 'cls' or 'max') yields one
	        pooled tensor; the results are concatenated on the last axis in the
	        order given. 'mean' and 'max' honour ``attention_mask``; 'cls' takes
	        position 0.

	        Raises:
	            ValueError: On an unknown pooling type.
	        """
	        pooled = []
	        mask = attention_mask.to(dtype=embeddings.dtype).unsqueeze(-1)
	        for pooling_type in pooling_types:
	            if pooling_type == "mean":
	                pooled.append((embeddings * mask).sum(dim=1) / mask.sum(dim=1))
	            elif pooling_type == "cls":
	                pooled.append(embeddings[:, 0, :])
	            elif pooling_type == "max":
	                bool_mask = attention_mask.unsqueeze(-1).bool()
	                # Masked positions are pushed to -inf so they never win the max.
	                pooled.append(
	                    embeddings.masked_fill(~bool_mask, float("-inf")).max(dim=1).values
	                )
	            else:
	                raise ValueError(
	                    f"Unsupported ESM3 pooling type {pooling_type}. "
	                    "Supported values are 'mean', 'cls', and 'max'."
	                )
	        return torch.cat(pooled, dim=-1)

	    def embed_dataset(
	        self,
	        sequences: Optional[List[str]] = None,
	        tokenizer: Optional[PreTrainedTokenizerFast] = None,
	        batch_size: int = 2,
	        max_len: int = 512,
	        truncate: bool = True,
	        full_embeddings: bool = False,
	        embed_dtype: torch.dtype = torch.float32,
	        pooling_types: Optional[List[str]] = None,
	        num_workers: int = 0,
	        sql: bool = False,
	        save: bool = True,
	        sql_db_path: str = "embeddings.db",
	        save_path: str = "embeddings.pth",
	        fasta_path: Optional[str] = None,
	        padding: str = "longest",
	        hidden_state_index: int = -1,
	        store_all_hidden_states: bool = False,
	        **kwargs,
	    ) -> Dict[str, torch.Tensor]:
	        """Embed a collection of sequences and return them keyed by sequence.

	        Sequences from ``sequences`` and ``fasta_path`` are combined,
	        optionally cut to ``max_len`` characters, de-duplicated, and
	        processed longest first in batches. The model is put in eval mode
	        for the run and returned to training mode afterwards if it was
	        training, even when a batch raises.

	        Args:
	            sequences: Sequences to embed.
	            tokenizer: Tokenizer to use; defaults to ``self.tokenizer``.
	            batch_size: Number of sequences per forward pass.
	            max_len: Character limit applied when ``truncate`` is set; the
	                tokenizer is called with ``max_length=max_len + 2``.
	            truncate: Whether to cut sequences and enable tokenizer
	                truncation.
	            full_embeddings: Store per-position embeddings (padding
	                positions removed) instead of pooled vectors.
	            embed_dtype: Dtype of the stored embeddings.
	            pooling_types: Pooling modes for ``_pool_embeddings``; defaults
	                to ``["mean"]``.
	            num_workers: Ignored.
	            sql: Must be False; SQLite output is not supported.
	            save: Whether to ``torch.save`` the result to ``save_path``.
	            sql_db_path: Ignored.
	            save_path: Destination used when ``save`` is set.
	            fasta_path: Optional FASTA file whose sequences are appended.
	            padding: Padding strategy forwarded to the tokenizer.
	            hidden_state_index: Which representation ``_embed`` returns.
	            store_all_hidden_states: Keep every hidden state; requires
	                ``full_embeddings``.
	            **kwargs: Extra keyword arguments for the forward pass.

	        Returns:
	            Dict mapping each (possibly truncated) unique sequence to its
	            embedding tensor on CPU.
	        """
	        del num_workers, sql_db_path
	        assert not sql, "ESM3 embed_dataset currently supports .pth saves, not SQLite."
	        assert isinstance(hidden_state_index, int), "hidden_state_index must be an integer."
	        assert full_embeddings or not store_all_hidden_states, (
	            "store_all_hidden_states=True requires full_embeddings=True."
	        )
	        if pooling_types is None:
	            pooling_types = ["mean"]
	        if tokenizer is None:
	            tokenizer = self.tokenizer
	        if fasta_path is not None:
	            fasta_sequences = parse_fasta(fasta_path)
	            sequences = list(sequences or []) + fasta_sequences
	        assert sequences is not None and len(sequences) > 0, (
	            "Must provide at least one sequence via `sequences` or `fasta_path`."
	        )

	        unique_sequences = []
	        seen_sequences = set()
	        for sequence in sequences:
	            prepared_sequence = sequence[:max_len] if truncate else sequence
	            if prepared_sequence not in seen_sequences:
	                unique_sequences.append(prepared_sequence)
	                seen_sequences.add(prepared_sequence)
	        # Longest first, so each batch holds sequences of similar length.
	        unique_sequences = sorted(unique_sequences, key=len, reverse=True)

	        embeddings_by_sequence: Dict[str, torch.Tensor] = {}
	        was_training = self.training
	        self.eval()
	        try:
	            for batch_start in range(0, len(unique_sequences), batch_size):
	                batch_sequences = unique_sequences[batch_start : batch_start + batch_size]
	                tokenized = tokenizer(
	                    batch_sequences,
	                    padding=padding,
	                    truncation=truncate,
	                    max_length=max_len + 2,
	                    return_tensors="pt",
	                )
	                tokenized = {
	                    name: tensor.to(self.device) for name, tensor in tokenized.items()
	                }
	                with torch.inference_mode():
	                    residue_embeddings = self._embed(
	                        **tokenized,
	                        hidden_state_index=hidden_state_index,
	                        store_all_hidden_states=store_all_hidden_states,
	                        **kwargs,
	                    )
	                attention_mask = tokenized["attention_mask"]
	                if full_embeddings:
	                    batch_embeddings = residue_embeddings.to(embed_dtype).cpu()
	                    for sequence, embedding, mask in zip(
	                        batch_sequences,
	                        batch_embeddings,
	                        attention_mask.cpu(),
	                    ):
	                        if embedding.ndim == 3:
	                            # Stacked hidden states: the position axis is dim 1.
	                            embeddings_by_sequence[sequence] = embedding[:, mask.bool(), :]
	                        else:
	                            embeddings_by_sequence[sequence] = embedding[mask.bool()]
	                else:
	                    pooled_embeddings = self._pool_embeddings(
	                        residue_embeddings,
	                        attention_mask,
	                        pooling_types,
	                    )
	                    pooled_embeddings = pooled_embeddings.to(embed_dtype).cpu()
	                    for sequence, embedding in zip(batch_sequences, pooled_embeddings):
	                        embeddings_by_sequence[sequence] = embedding
	        finally:
	            # Restore the caller's training mode even if a batch failed.
	            if was_training:
	                self.train()

	        if save:
	            torch.save(embeddings_by_sequence, save_path)
	        return embeddings_by_sequence

	    def encode(self, input):
	        """Delegate to the official SDK model's ``encode``."""
	        return self._get_official_sdk_model().encode(input)

	    def decode(self, input):
	        """Delegate to the official SDK model's ``decode``."""
	        return self._get_official_sdk_model().decode(input)

	    def generate(self, input, config):
	        """Delegate to the official SDK model's ``generate``."""
	        return self._get_official_sdk_model().generate(input, config)

	    def batch_generate(self, inputs, configs):
	        """Delegate to the official SDK model's ``batch_generate``."""
	        return self._get_official_sdk_model().batch_generate(inputs, configs)

	    def _ttt_get_trainable_modules(self) -> list[nn.Module]:
	        """Return the modules exposed to test-time training: the core model."""
	        return [self.esm3]

	    def forward_and_sample(self, input, sampling_configuration):
	        """Delegate to the official SDK model's ``forward_and_sample``."""
	        return self._get_official_sdk_model().forward_and_sample(
	            input,
	            sampling_configuration,
	        )

	    def logits(self, input=None, config=None, **kwargs):
	        """Compute logits through ``forward`` or through the SDK model.

	        With no ``input`` the keyword arguments go to ``forward``; a tensor
	        ``input`` is used as ``sequence_tokens`` for ``forward``; any other
	        ``input`` is handed to the official SDK model's ``logits``, together
	        with ``config`` when one is given.
	        """
	        if input is None:
	            return self.forward(**kwargs)
	        if isinstance(input, torch.Tensor):
	            return self.forward(sequence_tokens=input, **kwargs)
	        if config is None:
	            return self._get_official_sdk_model().logits(input)
	        return self._get_official_sdk_model().logits(input, config)

	    def forward(
	        self,
	        input_ids: Optional[torch.Tensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        sequence_tokens: Optional[torch.Tensor] = None,
	        structure_tokens: Optional[torch.Tensor] = None,
	        ss8_tokens: Optional[torch.Tensor] = None,
	        sasa_tokens: Optional[torch.Tensor] = None,
	        function_tokens: Optional[torch.Tensor] = None,
	        residue_annotation_tokens: Optional[torch.Tensor] = None,
	        average_plddt: Optional[torch.Tensor] = None,
	        per_res_plddt: Optional[torch.Tensor] = None,
	        structure_coords: Optional[torch.Tensor] = None,
	        chain_id: Optional[torch.Tensor] = None,
	        sequence_id: Optional[torch.Tensor] = None,
	        labels: Optional[torch.Tensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	        **kwargs,
	    ) -> FastESM3Output:
	        """Run the core model on the supplied tracks.

	        ``input_ids`` is used as the sequence track when ``sequence_tokens``
	        is not given, and ``attention_mask`` (cast to bool) is used as
	        ``sequence_id`` when that is not given. ``output_hidden_states``,
	        ``return_dict`` and any extra keyword arguments are accepted and
	        discarded.

	        Args:
	            labels: Optional targets for a cross-entropy loss over the
	                sequence logits; entries equal to -100 are ignored.
	            output_attentions: Forwarded to the core model.

	        Returns:
	            ``FastESM3Output`` with the loss (or ``None``), every logits
	            track, the embeddings (also exposed as ``last_hidden_state``),
	            hidden states and attentions.
	        """
	        del output_hidden_states, return_dict, kwargs
	        if sequence_tokens is None:
	            sequence_tokens = input_ids
	        if sequence_id is None and attention_mask is not None:
	            sequence_id = attention_mask.to(dtype=torch.bool)

	        output = self.esm3(
	            sequence_tokens=sequence_tokens,
	            structure_tokens=structure_tokens,
	            ss8_tokens=ss8_tokens,
	            sasa_tokens=sasa_tokens,
	            function_tokens=function_tokens,
	            residue_annotation_tokens=residue_annotation_tokens,
	            average_plddt=average_plddt,
	            per_res_plddt=per_res_plddt,
	            structure_coords=structure_coords,
	            chain_id=chain_id,
	            sequence_id=sequence_id,
	            output_attentions=output_attentions,
	        )

	        loss = None
	        if labels is not None:
	            # Flatten to (positions, vocab) vs (positions,) for cross entropy.
	            loss = F.cross_entropy(
	                output.sequence_logits.view(-1, output.sequence_logits.shape[-1]),
	                labels.view(-1),
	                ignore_index=-100,
	            )

	        return FastESM3Output(
	            loss=loss,
	            logits=output.sequence_logits,
	            last_hidden_state=output.embeddings,
	            sequence_logits=output.sequence_logits,
	            structure_logits=output.structure_logits,
	            secondary_structure_logits=output.secondary_structure_logits,
	            sasa_logits=output.sasa_logits,
	            function_logits=output.function_logits,
	            residue_logits=output.residue_logits,
	            embeddings=output.embeddings,
	            hidden_states=output.hidden_states,
	            attentions=output.attentions,
	        )