Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

README.md +22 -22
__init__.py +5 -5
configuration_esmfold2.py +19 -19
modeling_esmfold2.py +257 -257
modeling_esmfold2_common.py +81 -81
modeling_esmfold2_experimental.py +31 -15

README.md CHANGED Viewed

@@ -221,27 +221,27 @@ with torch.inference_mode():
 decoded = model.input_builder.decode(output, features, chain_infos)
 ```
-Set `load_esmc=False` when loading if you want to provide precomputed `lm_hidden_states` manually or run folding-trunk tests without loading the 6B ESM++ backbone:
-```python
-model = AutoModel.from_pretrained(
-    "Synthyra/ESMFold2-Fast",
     trust_remote_code=True,
     load_esmc=False,
-).cuda().eval()
-```
-For FP8 LM inference, install `transformer_engine.pytorch` in a CUDA
-environment with FP8-capable hardware and load the shared FastPLMs ESM++
-backbone with:
-```python
-model = AutoModel.from_pretrained(
-    "Synthyra/ESMFold2-Fast",
-    trust_remote_code=True,
-    esmc_precision="fp8",
-).cuda().eval()
-```
-FP8 is inference-only for the ESMFold2 LM backbone. TTT remains a bf16/fp32
-path.

 decoded = model.input_builder.decode(output, features, chain_infos)
 ```
+Set `load_esmc=False` when loading if you want to provide precomputed `lm_hidden_states` manually or run folding-trunk tests without loading the 6B ESM++ backbone:
+```python
+model = AutoModel.from_pretrained(
+    "Synthyra/ESMFold2-Fast",
     trust_remote_code=True,
     load_esmc=False,
+).cuda().eval()
+```
+For FP8 LM inference, install Transformer Engine with PyTorch support (the
+`transformer_engine.pytorch` module) in a CUDA environment with FP8-capable
+hardware, then load the shared FastPLMs ESM++ backbone with:
+```python
+model = AutoModel.from_pretrained(
+    "Synthyra/ESMFold2-Fast",
+    trust_remote_code=True,
+    esmc_precision="fp8",
+).cuda().eval()
+```
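+FP8 is inference-only for the ESMFold2 LM backbone. Test-time training (TTT)
+remains a bf16/fp32 path and raises a `RuntimeError` if the backbone was
+loaded with `esmc_precision="fp8"`.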

__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
-from .configuration_esmfold2 import ESMFold2Config
-from .modeling_esmfold2_experimental import ESMFold2ExperimentalModel
-from .modeling_esmfold2 import ESMFold2Model
-__all__ = ["ESMFold2Config", "ESMFold2ExperimentalModel", "ESMFold2Model"]

+from .configuration_esmfold2 import ESMFold2Config
+from .modeling_esmfold2_experimental import ESMFold2ExperimentalModel
+from .modeling_esmfold2 import ESMFold2Model
+__all__ = ["ESMFold2Config", "ESMFold2ExperimentalModel", "ESMFold2Model"]

configuration_esmfold2.py CHANGED Viewed

@@ -201,19 +201,19 @@ class ESMFold2Config(PretrainedConfig):
             Number of trunk loops for iterative refinement.
         num_diffusion_samples (`int`, defaults to 8):
             Number of parallel structure predictions to generate.
-        lm_dropout (`float`, defaults to 0.0):
-            Dropout probability on LM pair embeddings. When > 0, dropout is
-            applied with ``training=True`` (including at inference) to match
-            the experimental training recipe used by binder design.
-        force_lm_dropout_during_inference (`bool`, defaults to False):
-            When True, apply ``lm_dropout`` even when ``model.eval()`` and
-            ``lm_dropout`` > 0. Binder-design loads set this to True.
-        lm_mask_pct (`float`, defaults to 0.0):
-            Fraction of LM residue tokens randomly replaced with the LM mask
-            token before running the PLM backbone.
-        disable_msa_features (`bool`, defaults to False):
-            When True, zero out MSA-derived ``profile`` and ``deletion_mean``
-            before the inputs embedder (experimental medium/large checkpoints).
         inputs (`InputsEmbedderConfig`):
             Configuration for the inputs embedder module.
         folding_trunk (`FoldingTrunkConfig`):
@@ -263,12 +263,12 @@ class ESMFold2Config(PretrainedConfig):
         # embedder.
         self.disable_msa_features: bool = kwargs.get("disable_msa_features", False)
         self.lm_dropout: float = kwargs.get("lm_dropout", 0.0)
-        self.force_lm_dropout_during_inference: bool = kwargs.get(
-            "force_lm_dropout_during_inference", False
-        )
-        self.lm_mask_pct: float = kwargs.get("lm_mask_pct", 0.0)
-        self.lm_d_model: int = kwargs.get("lm_d_model", 2560)
         self.lm_num_layers: int = kwargs.get("lm_num_layers", 80)
         # Backward-compatible field name; values now point to FastPLMs ESM++.
         raw_esmc_id = (

             Number of trunk loops for iterative refinement.
         num_diffusion_samples (`int`, defaults to 8):
             Number of parallel structure predictions to generate.
+        lm_dropout (`float`, defaults to 0.0):
+            Dropout probability on LM pair embeddings. When > 0, dropout is
+            applied with ``training=True`` (including at inference) to match
+            the experimental training recipe used by binder design.
+        force_lm_dropout_during_inference (`bool`, defaults to False):
+            When True and ``lm_dropout`` > 0, apply LM dropout even after
+            ``model.eval()``. Binder-design loads set this to True.
+        lm_mask_pct (`float`, defaults to 0.0):
+            Fraction of LM residue tokens randomly replaced with the LM mask
+            token before running the PLM backbone.
+        disable_msa_features (`bool`, defaults to False):
+            When True, zero out MSA-derived ``profile`` and ``deletion_mean``
+            before the inputs embedder (experimental medium/large checkpoints).
         inputs (`InputsEmbedderConfig`):
             Configuration for the inputs embedder module.
         folding_trunk (`FoldingTrunkConfig`):
         # embedder.
         self.disable_msa_features: bool = kwargs.get("disable_msa_features", False)
         self.lm_dropout: float = kwargs.get("lm_dropout", 0.0)
+        self.force_lm_dropout_during_inference: bool = kwargs.get(
+            "force_lm_dropout_during_inference", False
+        )
+        self.lm_mask_pct: float = kwargs.get("lm_mask_pct", 0.0)
+        self.lm_d_model: int = kwargs.get("lm_d_model", 2560)
         self.lm_num_layers: int = kwargs.get("lm_num_layers", 80)
         # Backward-compatible field name; values now point to FastPLMs ESM++.
         raw_esmc_id = (

modeling_esmfold2.py CHANGED Viewed

@@ -59,12 +59,12 @@ from .modeling_esmfold2_common import (
     TriangleMultiplicativeUpdate,
     _categorical_mean,
     _compute_intra_token_idx,
-    compute_lm_hidden_states,
-    gather_rep_atom_coords,
-    gather_token_to_atom,
-    maybe_apply_msa_column_masking,
-    maybe_subsample_msa,
-)
 from .esmfold2_affine3d import Affine3D as _FastPLMSESMFold2Affine3D
 from .esmfold2_aligner import Aligner as _FastPLMSESMFold2Aligner
 from .esmfold2_atom_indexer import AtomIndexer as _FastPLMSESMFold2AtomIndexer
@@ -699,27 +699,27 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
         self.post_init()
         self.init_ttt({"lora_target_replace_module": "MultiHeadAttention"})
-    def load_esmc(self, esmc_model_path: str, precision: str = "bf16") -> None:
-        """Load the FastPLMs ESM++ LM used as the ESMFold2 PLM backbone.
-        ``precision``: ``"bf16"`` (default), ``"fp32"``, or opt-in ``"fp8"``.
-        """
-        dtype_map = {
-            "bf16": torch.bfloat16,
-            "fp32": torch.float32,
-            "fp8": torch.bfloat16,
-        }
-        if precision not in dtype_map:
-            raise ValueError(f"precision must be one of {list(dtype_map)}, got {precision!r}")
-        if precision == "fp8" and not TE_AVAILABLE:
-            raise RuntimeError(
-                "esmc_precision='fp8' requires transformer_engine.pytorch."
-            )
-        dtype = dtype_map[precision]
-        esmc = _load_fastplms_esmplusplus_for_esmfold2(
-            esmc_model_path=esmc_model_path,
-            attn_backend=self.config.esmc_attn_backend,
             device=self.device,
             dtype=dtype,
         )
@@ -730,24 +730,24 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
         assert esmc.config.num_hidden_layers == self.config.lm_num_layers, (
             f"ESMFold2 expected lm_num_layers={self.config.lm_num_layers}, "
             f"but loaded ESM++ num_hidden_layers={esmc.config.num_hidden_layers}."
-        )
-        for p in esmc.parameters():
-            p.requires_grad_(False)
-        if precision == "fp8":
-            with torch.no_grad():
-                _convert_te_modules_to_fp8_inplace(esmc)
-        self._esmc_fp8 = precision == "fp8"
-        self._esmc = esmc
-        self._ttt_lm_head = None
-    def _ensure_ttt_lm_head(self) -> None:
-        assert self._esmc is not None, "ESMFold2 TTT requires load_esmc=True."
-        if self._esmc_fp8:
-            raise RuntimeError("ESMFold2 TTT is not supported with fp8 ESM++.")
-        if self._ttt_lm_head is not None:
-            return
         try:
             from fastplms.esm_plusplus.modeling_esm_plusplus import (
                 ESMplusplusConfig,
@@ -781,11 +781,11 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
         self._ttt_lm_head.requires_grad_(False)
         del mlm
-    def _ttt_get_trainable_modules(self) -> list[nn.Module]:
-        assert self._esmc is not None, "ESMFold2 TTT requires load_esmc=True."
-        if self._esmc_fp8:
-            raise RuntimeError("ESMFold2 TTT is not supported with fp8 ESM++.")
-        return [self._esmc]
     def _ttt_tokenize(
         self,
@@ -846,13 +846,13 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
         **kwargs,
     ) -> torch.Tensor:
         del kwargs
-        assert isinstance(batch, torch.Tensor), (
-            "ESMFold2 TTT expects input_ids tensors."
-        )
-        assert self._esmc is not None, "ESMFold2 TTT requires load_esmc=True."
-        if self._esmc_fp8:
-            raise RuntimeError("ESMFold2 TTT is not supported with fp8 ESM++.")
-        self._ensure_ttt_lm_head()
         assert self._ttt_lm_head is not None
         attention_mask = batch.ne(SEQUENCE_PAD_TOKEN)
         output = self._esmc(
@@ -947,30 +947,30 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
         if self.msa_encoder is not None:
             self.msa_encoder.set_chunk_size(chunk_size)
-    def _compute_lm_hidden_states(
-        self,
-        input_ids: Tensor,
-        asym_id: Tensor,
-        residue_index: Tensor,
-        mol_type: Tensor,
-        tok_mask: Tensor,
-        lm_mask_pct: float = 0.0,
-    ) -> Tensor:
-        assert self._esmc is not None
-        # fp8 TE kernels require prod(shape[:-1]) % 8 == 0.
-        pad_to = 8 if self._esmc_fp8 else None
-        with _lm_precision_context(self._esmc_fp8):
             return compute_lm_hidden_states(
                 self._esmc,
                 input_ids,
                 asym_id,
                 residue_index,
-                mol_type,
-                tok_mask,
-                pad_to_multiple=pad_to,
-                lm_mask_pct=lm_mask_pct,
-                mask_token_id=SEQUENCE_MASK_TOKEN,
-            )
     def _discretized_dynamics(self) -> tuple[Tensor, Tensor]:
         delta = F.softplus(self.parcae_log_delta)
@@ -985,17 +985,17 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
         return state.to(dtype=ref.dtype)
     def _run_one_loop(
-        self,
-        z: Tensor,
-        z_init: Tensor,
-        lm_z: Tensor | None,
-        _msa_inputs: dict | None,
-        pair_mask: Tensor,
-        a: Tensor,
-        b_mat: Tensor,
-        tok_mask: Tensor,
-        total_steps: int,
-    ) -> Tensor:
         # Helper method (not inline) so per-iter locals free on return —
         # otherwise leaks ~2 GB L²×c_z into distogram/sample scope.
         # training=True forces dropout under eval(), matching the per-loop
@@ -1025,49 +1025,49 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
             if lm_z_i is not None and self.lm_encoder is None:
                 z_inject_pair = z_inject_pair + lm_z_i.to(z_inject_pair.dtype)
-            if self.msa_encoder is not None and _msa_inputs is not None:
-                msa_i, mask_i, hd_i, dv_i = maybe_subsample_msa(
-                    _msa_inputs["msa"],
-                    _msa_inputs["msa_attention_mask"],
-                    _msa_inputs["has_deletion"],
-                    _msa_inputs["deletion_value"],
-                    max_depth=_msa_inputs["max_depth"],
-                    enabled=_msa_inputs["subsample_enabled"],
-                )
-                B_msa, M, L_msa = msa_i.shape
-                msa_oh = F.one_hot(
-                    msa_i.permute(0, 2, 1).long(), num_classes=NUM_RES_TYPES
-                ).float()
-                msa_attn = (
-                    mask_i.permute(0, 2, 1).float()
-                    if mask_i is not None
-                    else tok_mask[:, :, None].expand(-1, -1, M).float()
-                )
-                # Bias-free MSAEncoder.embed requires zeroed padding.
-                msa_oh = msa_oh * msa_attn.unsqueeze(-1)
-                hd = (
-                    hd_i.permute(0, 2, 1).float()
-                    if hd_i is not None
-                    else torch.zeros(B_msa, L_msa, M, device=msa_i.device)
-                )
-                dv = (
-                    dv_i.permute(0, 2, 1).float()
-                    if dv_i is not None
-                    else torch.zeros(B_msa, L_msa, M, device=msa_i.device)
-                )
-                msa_pair = self.msa_encoder(
-                    x_pair=z_inject_pair,
-                    x_inputs=_msa_inputs["x_inputs"],
-                    msa_oh=msa_oh,
-                    has_deletion=hd,
-                    deletion_value=dv,
-                    msa_attention_mask=msa_attn,
-                ).to(z_inject_pair.dtype)
-                z_inject_pair = (
-                    msa_pair
-                    if self.config.msa_encoder_overwrite
-                    else (z_inject_pair + msa_pair)
-                )
             if refined_lm_z is not None:
                 z_inject_pair = z_inject_pair + refined_lm_z.to(z_inject_pair.dtype)
@@ -1104,16 +1104,16 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
         deletion_value: Tensor | None = None,
         msa_attention_mask: Tensor | None = None,
         input_ids: Tensor | None = None,
-        lm_hidden_states: Tensor | None = None,
-        num_loops: int | None = None,
-        num_diffusion_samples: int | None = None,
-        num_sampling_steps: int | None = None,
-        lm_mask_pct: float | None = None,
-        msa_max_depth: int = 1024,
-        msa_column_mask_rate: float = 0.1,
-        msa_subsample_at_inference: bool = True,
-        **kwargs,
-    ) -> dict[str, Tensor]:
         tok_mask = token_attention_mask
         atm_mask = atom_attention_mask
         disto_idx = distogram_atom_idx
@@ -1196,19 +1196,19 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
                 lm_hidden_states is None
                 and input_ids is not None
                 and self._esmc is not None
-            ):
-                lm_hidden_states = self._compute_lm_hidden_states(
-                    input_ids,
-                    asym_id,
-                    residue_index,
-                    mol_type,
-                    tok_mask,
-                    lm_mask_pct=(
-                        self.config.lm_mask_pct
-                        if lm_mask_pct is None
-                        else lm_mask_pct
-                    ),
-                )
             lm_z: Tensor | None = None
             if lm_hidden_states is not None:
                 lm_z = self.language_model(lm_hidden_states.detach())
@@ -1222,35 +1222,35 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
             a = a.view(1, 1, 1, -1).to(device=z.device, dtype=z.dtype)
             b_mat = b.to(device=z.device, dtype=z.dtype)
-            _msa_inputs: dict | None = None
-            if self.msa_encoder is not None and msa is not None:
-                msa_attention_mask = maybe_apply_msa_column_masking(
-                    msa_attention_mask,
-                    msa_column_mask_rate,
-                )
-                _msa_inputs = dict(
-                    x_inputs=x_inputs,
-                    msa=msa,
-                    msa_attention_mask=msa_attention_mask,
-                    has_deletion=has_deletion,
-                    deletion_value=deletion_value,
-                    max_depth=msa_max_depth,
-                    subsample_enabled=msa_subsample_at_inference,
-                )
             # Method call (not inline loop) frees per-iter L²×c_z locals.
             z = self._run_one_loop(
-                z=z,
-                z_init=z_init,
-                lm_z=lm_z,
-                _msa_inputs=_msa_inputs,
-                pair_mask=pair_mask,
-                a=a,
-                b_mat=b_mat,
-                tok_mask=tok_mask,
-                total_steps=total_steps,
-            )
-            del z_init, lm_z, _msa_inputs, a, b_mat
             z = self.parcae_readout(z)
             z = self.parcae_coda(z, pair_attention_mask=pair_mask)
@@ -1362,38 +1362,38 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
             complex_id=complex_id,
         )
-    def _fold_protein_no_ttt(
-        self,
-        sequence: str,
-        *,
-        chain_id: str = "A",
-        msa: Any | None = None,
-        msa_path: str | Path | None = None,
-        msa_max_sequences: int | None = None,
-        num_loops: int = 3,
-        num_sampling_steps: int = 50,
-        num_diffusion_samples: int = 1,
-        seed: int | None = None,
-        complex_id: str = "pred",
-    ):
-        from .esmfold2_types import MSA, ProteinInput, StructurePredictionInput
-        assert not (
-            msa is not None and msa_path is not None
-        ), "Pass at most one of msa or msa_path."
-        if msa_path is not None:
-            msa = MSA.from_a3m(msa_path, max_sequences=msa_max_sequences)
-        if msa is not None:
-            query = str(msa.query).replace("-", "").upper()
-            assert query == sequence.upper(), (
-                f"MSA query does not match sequence: expected {sequence.upper()!r}, got {query!r}"
-            )
-        input = StructurePredictionInput(
-            sequences=[ProteinInput(id=chain_id, sequence=sequence, msa=msa)]
-        )
-        return self.fold(
-            input,
             num_loops=num_loops,
             num_sampling_steps=num_sampling_steps,
             num_diffusion_samples=num_diffusion_samples,
@@ -1442,15 +1442,15 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
     def fold_protein(
         self,
-        sequence: str,
-        *,
-        chain_id: str = "A",
-        msa: Any | None = None,
-        msa_path: str | Path | None = None,
-        msa_max_sequences: int | None = None,
-        num_loops: int = 3,
-        num_sampling_steps: int = 50,
-        num_diffusion_samples: int = 1,
         seed: int | None = None,
         complex_id: str = "pred",
         ttt: bool = False,
@@ -1458,57 +1458,57 @@ class ESMFold2Model(FastPLMTestTimeTrainingMixin, PreTrainedModel):
     ):
         if ttt:
             return self.fold_protein_ttt(
-                sequence=sequence,
-                chain_id=chain_id,
-                msa=msa,
-                msa_path=msa_path,
-                msa_max_sequences=msa_max_sequences,
-                num_loops=num_loops,
-                num_sampling_steps=num_sampling_steps,
-                num_diffusion_samples=num_diffusion_samples,
                 seed=seed,
                 complex_id=complex_id,
                 ttt_config=ttt_config,
             )
         return self._fold_protein_no_ttt(
-            sequence=sequence,
-            chain_id=chain_id,
-            msa=msa,
-            msa_path=msa_path,
-            msa_max_sequences=msa_max_sequences,
-            num_loops=num_loops,
-            num_sampling_steps=num_sampling_steps,
-            num_diffusion_samples=num_diffusion_samples,
             seed=seed,
             complex_id=complex_id,
         )
     def fold_protein_ttt(
         self,
-        sequence: str,
-        *,
-        chain_id: str = "A",
-        msa: Any | None = None,
-        msa_path: str | Path | None = None,
-        msa_max_sequences: int | None = None,
-        num_loops: int = 3,
-        num_sampling_steps: int = 50,
-        num_diffusion_samples: int = 1,
         seed: int | None = None,
         complex_id: str = "pred",
         ttt_config: TTTConfig | dict[str, Any] | None = None,
-    ):
-        assert self._esmc is not None, "ESMFold2 TTT requires load_esmc=True."
-        if self._esmc_fp8:
-            raise RuntimeError("ESMFold2 TTT is not supported with fp8 ESM++.")
-        fold_kwargs = {
-            "chain_id": chain_id,
-            "msa": msa,
-            "msa_path": msa_path,
-            "msa_max_sequences": msa_max_sequences,
-            "num_loops": num_loops,
-            "num_sampling_steps": num_sampling_steps,
-            "num_diffusion_samples": num_diffusion_samples,
             "seed": seed,
             "complex_id": complex_id,
         }

     TriangleMultiplicativeUpdate,
     _categorical_mean,
     _compute_intra_token_idx,
+    compute_lm_hidden_states,
+    gather_rep_atom_coords,
+    gather_token_to_atom,
+    maybe_apply_msa_column_masking,
+    maybe_subsample_msa,
+)
 from .esmfold2_affine3d import Affine3D as _FastPLMSESMFold2Affine3D
 from .esmfold2_aligner import Aligner as _FastPLMSESMFold2Aligner
 from .esmfold2_atom_indexer import AtomIndexer as _FastPLMSESMFold2AtomIndexer
         self.post_init()
         self.init_ttt({"lora_target_replace_module": "MultiHeadAttention"})
+    def load_esmc(self, esmc_model_path: str, precision: str = "bf16") -> None:
+        """Load the FastPLMs ESM++ LM used as the ESMFold2 PLM backbone.
+        ``precision``: ``"bf16"`` (default), ``"fp32"``, or opt-in ``"fp8"``.
+        """
+        dtype_map = {
+            "bf16": torch.bfloat16,
+            "fp32": torch.float32,
+            "fp8": torch.bfloat16,
+        }
+        if precision not in dtype_map:
+            raise ValueError(f"precision must be one of {list(dtype_map)}, got {precision!r}")
+        if precision == "fp8" and not TE_AVAILABLE:
+            raise RuntimeError(
+                "esmc_precision='fp8' requires transformer_engine.pytorch."
+            )
+        dtype = dtype_map[precision]
+        esmc = _load_fastplms_esmplusplus_for_esmfold2(
+            esmc_model_path=esmc_model_path,
+            attn_backend=self.config.esmc_attn_backend,
             device=self.device,
             dtype=dtype,
         )
         assert esmc.config.num_hidden_layers == self.config.lm_num_layers, (
             f"ESMFold2 expected lm_num_layers={self.config.lm_num_layers}, "
             f"but loaded ESM++ num_hidden_layers={esmc.config.num_hidden_layers}."
+        )
+        for p in esmc.parameters():
+            p.requires_grad_(False)
+        if precision == "fp8":
+            with torch.no_grad():
+                _convert_te_modules_to_fp8_inplace(esmc)
+        self._esmc_fp8 = precision == "fp8"
+        self._esmc = esmc
+        self._ttt_lm_head = None
+    def _ensure_ttt_lm_head(self) -> None:
+        assert self._esmc is not None, "ESMFold2 TTT requires load_esmc=True."
+        if self._esmc_fp8:
+            raise RuntimeError("ESMFold2 TTT is not supported with fp8 ESM++.")
+        if self._ttt_lm_head is not None:
+            return
         try:
             from fastplms.esm_plusplus.modeling_esm_plusplus import (
                 ESMplusplusConfig,
         self._ttt_lm_head.requires_grad_(False)
         del mlm
+    def _ttt_get_trainable_modules(self) -> list[nn.Module]:
+        assert self._esmc is not None, "ESMFold2 TTT requires load_esmc=True."
+        if self._esmc_fp8:
+            raise RuntimeError("ESMFold2 TTT is not supported with fp8 ESM++.")
+        return [self._esmc]
     def _ttt_tokenize(
         self,
         **kwargs,
     ) -> torch.Tensor:
         del kwargs
+        assert isinstance(batch, torch.Tensor), (
+            "ESMFold2 TTT expects input_ids tensors."
+        )
+        assert self._esmc is not None, "ESMFold2 TTT requires load_esmc=True."
+        if self._esmc_fp8:
+            raise RuntimeError("ESMFold2 TTT is not supported with fp8 ESM++.")
+        self._ensure_ttt_lm_head()
         assert self._ttt_lm_head is not None
         attention_mask = batch.ne(SEQUENCE_PAD_TOKEN)
         output = self._esmc(
         if self.msa_encoder is not None:
             self.msa_encoder.set_chunk_size(chunk_size)
+    def _compute_lm_hidden_states(
+        self,
+        input_ids: Tensor,
+        asym_id: Tensor,
+        residue_index: Tensor,
+        mol_type: Tensor,
+        tok_mask: Tensor,
+        lm_mask_pct: float = 0.0,
+    ) -> Tensor:
+        assert self._esmc is not None
+        # fp8 TE kernels require prod(shape[:-1]) % 8 == 0.
+        pad_to = 8 if self._esmc_fp8 else None
+        with _lm_precision_context(self._esmc_fp8):
             return compute_lm_hidden_states(
                 self._esmc,
                 input_ids,
                 asym_id,
                 residue_index,
+                mol_type,
+                tok_mask,
+                pad_to_multiple=pad_to,
+                lm_mask_pct=lm_mask_pct,
+                mask_token_id=SEQUENCE_MASK_TOKEN,
+            )
     def _discretized_dynamics(self) -> tuple[Tensor, Tensor]:
         delta = F.softplus(self.parcae_log_delta)
         return state.to(dtype=ref.dtype)
     def _run_one_loop(
+        self,
+        z: Tensor,
+        z_init: Tensor,
+        lm_z: Tensor | None,
+        _msa_inputs: dict | None,
+        pair_mask: Tensor,
+        a: Tensor,
+        b_mat: Tensor,
+        tok_mask: Tensor,
+        total_steps: int,
+    ) -> Tensor:
         # Helper method (not inline) so per-iter locals free on return —
         # otherwise leaks ~2 GB L²×c_z into distogram/sample scope.
         # training=True forces dropout under eval(), matching the per-loop
             if lm_z_i is not None and self.lm_encoder is None:
                 z_inject_pair = z_inject_pair + lm_z_i.to(z_inject_pair.dtype)
+            if self.msa_encoder is not None and _msa_inputs is not None:
+                msa_i, mask_i, hd_i, dv_i = maybe_subsample_msa(
+                    _msa_inputs["msa"],
+                    _msa_inputs["msa_attention_mask"],
+                    _msa_inputs["has_deletion"],
+                    _msa_inputs["deletion_value"],
+                    max_depth=_msa_inputs["max_depth"],
+                    enabled=_msa_inputs["subsample_enabled"],
+                )
+                B_msa, M, L_msa = msa_i.shape
+                msa_oh = F.one_hot(
+                    msa_i.permute(0, 2, 1).long(), num_classes=NUM_RES_TYPES
+                ).float()
+                msa_attn = (
+                    mask_i.permute(0, 2, 1).float()
+                    if mask_i is not None
+                    else tok_mask[:, :, None].expand(-1, -1, M).float()
+                )
+                # Bias-free MSAEncoder.embed requires zeroed padding.
+                msa_oh = msa_oh * msa_attn.unsqueeze(-1)
+                hd = (
+                    hd_i.permute(0, 2, 1).float()
+                    if hd_i is not None
+                    else torch.zeros(B_msa, L_msa, M, device=msa_i.device)
+                )
+                dv = (
+                    dv_i.permute(0, 2, 1).float()
+                    if dv_i is not None
+                    else torch.zeros(B_msa, L_msa, M, device=msa_i.device)
+                )
+                msa_pair = self.msa_encoder(
+                    x_pair=z_inject_pair,
+                    x_inputs=_msa_inputs["x_inputs"],
+                    msa_oh=msa_oh,
+                    has_deletion=hd,
+                    deletion_value=dv,
+                    msa_attention_mask=msa_attn,
+                ).to(z_inject_pair.dtype)
+                z_inject_pair = (
+                    msa_pair
+                    if self.config.msa_encoder_overwrite
+                    else (z_inject_pair + msa_pair)
+                )
             if refined_lm_z is not None:
                 z_inject_pair = z_inject_pair + refined_lm_z.to(z_inject_pair.dtype)
         deletion_value: Tensor | None = None,
         msa_attention_mask: Tensor | None = None,
         input_ids: Tensor | None = None,
+        lm_hidden_states: Tensor | None = None,
+        num_loops: int | None = None,
+        num_diffusion_samples: int | None = None,
+        num_sampling_steps: int | None = None,
+        lm_mask_pct: float | None = None,
+        msa_max_depth: int = 1024,
+        msa_column_mask_rate: float = 0.1,
+        msa_subsample_at_inference: bool = True,
+        **kwargs,
+    ) -> dict[str, Tensor]:
         tok_mask = token_attention_mask
         atm_mask = atom_attention_mask
         disto_idx = distogram_atom_idx
                 lm_hidden_states is None
                 and input_ids is not None
                 and self._esmc is not None
+            ):
+                lm_hidden_states = self._compute_lm_hidden_states(
+                    input_ids,
+                    asym_id,
+                    residue_index,
+                    mol_type,
+                    tok_mask,
+                    lm_mask_pct=(
+                        self.config.lm_mask_pct
+                        if lm_mask_pct is None
+                        else lm_mask_pct
+                    ),
+                )
             lm_z: Tensor | None = None
             if lm_hidden_states is not None:
                 lm_z = self.language_model(lm_hidden_states.detach())
             a = a.view(1, 1, 1, -1).to(device=z.device, dtype=z.dtype)
             b_mat = b.to(device=z.device, dtype=z.dtype)
+            _msa_inputs: dict | None = None
+            if self.msa_encoder is not None and msa is not None:
+                msa_attention_mask = maybe_apply_msa_column_masking(
+                    msa_attention_mask,
+                    msa_column_mask_rate,
+                )
+                _msa_inputs = dict(
+                    x_inputs=x_inputs,
+                    msa=msa,
+                    msa_attention_mask=msa_attention_mask,
+                    has_deletion=has_deletion,
+                    deletion_value=deletion_value,
+                    max_depth=msa_max_depth,
+                    subsample_enabled=msa_subsample_at_inference,
+                )
             # Method call (not inline loop) frees per-iter L²×c_z locals.
             z = self._run_one_loop(
+                z=z,
+                z_init=z_init,
+                lm_z=lm_z,
+                _msa_inputs=_msa_inputs,
+                pair_mask=pair_mask,
+                a=a,
+                b_mat=b_mat,
+                tok_mask=tok_mask,
+                total_steps=total_steps,
+            )
+            del z_init, lm_z, _msa_inputs, a, b_mat
             z = self.parcae_readout(z)
             z = self.parcae_coda(z, pair_attention_mask=pair_mask)
             complex_id=complex_id,
         )
+    def _fold_protein_no_ttt(
+        self,
+        sequence: str,
+        *,
+        chain_id: str = "A",
+        msa: Any | None = None,
+        msa_path: str | Path | None = None,
+        msa_max_sequences: int | None = None,
+        num_loops: int = 3,
+        num_sampling_steps: int = 50,
+        num_diffusion_samples: int = 1,
+        seed: int | None = None,
+        complex_id: str = "pred",
+    ):
+        from .esmfold2_types import MSA, ProteinInput, StructurePredictionInput
+        assert not (
+            msa is not None and msa_path is not None
+        ), "Pass at most one of msa or msa_path."
+        if msa_path is not None:
+            msa = MSA.from_a3m(msa_path, max_sequences=msa_max_sequences)
+        if msa is not None:
+            query = str(msa.query).replace("-", "").upper()
+            assert query == sequence.upper(), (
+                f"MSA query does not match sequence: expected {sequence.upper()!r}, got {query!r}"
+            )
+        input = StructurePredictionInput(
+            sequences=[ProteinInput(id=chain_id, sequence=sequence, msa=msa)]
+        )
+        return self.fold(
+            input,
             num_loops=num_loops,
             num_sampling_steps=num_sampling_steps,
             num_diffusion_samples=num_diffusion_samples,
     def fold_protein(
         self,
+        sequence: str,
+        *,
+        chain_id: str = "A",
+        msa: Any | None = None,
+        msa_path: str | Path | None = None,
+        msa_max_sequences: int | None = None,
+        num_loops: int = 3,
+        num_sampling_steps: int = 50,
+        num_diffusion_samples: int = 1,
         seed: int | None = None,
         complex_id: str = "pred",
         ttt: bool = False,
     ):
         if ttt:
             return self.fold_protein_ttt(
+                sequence=sequence,
+                chain_id=chain_id,
+                msa=msa,
+                msa_path=msa_path,
+                msa_max_sequences=msa_max_sequences,
+                num_loops=num_loops,
+                num_sampling_steps=num_sampling_steps,
+                num_diffusion_samples=num_diffusion_samples,
                 seed=seed,
                 complex_id=complex_id,
                 ttt_config=ttt_config,
             )
         return self._fold_protein_no_ttt(
+            sequence=sequence,
+            chain_id=chain_id,
+            msa=msa,
+            msa_path=msa_path,
+            msa_max_sequences=msa_max_sequences,
+            num_loops=num_loops,
+            num_sampling_steps=num_sampling_steps,
+            num_diffusion_samples=num_diffusion_samples,
             seed=seed,
             complex_id=complex_id,
         )
     def fold_protein_ttt(
         self,
+        sequence: str,
+        *,
+        chain_id: str = "A",
+        msa: Any | None = None,
+        msa_path: str | Path | None = None,
+        msa_max_sequences: int | None = None,
+        num_loops: int = 3,
+        num_sampling_steps: int = 50,
+        num_diffusion_samples: int = 1,
         seed: int | None = None,
         complex_id: str = "pred",
         ttt_config: TTTConfig | dict[str, Any] | None = None,
+    ):
+        assert self._esmc is not None, "ESMFold2 TTT requires load_esmc=True."
+        if self._esmc_fp8:
+            raise RuntimeError("ESMFold2 TTT is not supported with fp8 ESM++.")
+        fold_kwargs = {
+            "chain_id": chain_id,
+            "msa": msa,
+            "msa_path": msa_path,
+            "msa_max_sequences": msa_max_sequences,
+            "num_loops": num_loops,
+            "num_sampling_steps": num_sampling_steps,
+            "num_diffusion_samples": num_diffusion_samples,
             "seed": seed,
             "complex_id": complex_id,
         }

modeling_esmfold2_common.py CHANGED Viewed

@@ -140,61 +140,61 @@ _EPS = 1e-5
 # chunk=64 leaves headroom for the largest foldbench targets). Override via
 # ``model.set_chunk_size(...)``; pass None to disable chunking (faster for
 # short L but OOM-prone past ~600).
-_DEFAULT_CHUNK_SIZE = 64
-# ===========================================================================
-# MSA inference-time diversity augmentations
-# ===========================================================================
-def maybe_subsample_msa(  # Randomly subsample the MSA to max_depth rows along dim 1, always keeping row 0.
-    msa: Tensor,  # indexed below as msa[:, indices], so dim 1 is the depth (row) axis
-    msa_attention_mask: Tensor | None,  # optional; subsampled along dim 1 with the same indices
-    has_deletion: Tensor | None,  # optional; subsampled along dim 1 with the same indices
-    deletion_value: Tensor | None,  # optional; subsampled along dim 1 with the same indices
-    *,
-    max_depth: int | None,  # row budget; None disables subsampling
-    enabled: bool,  # False disables subsampling
-) -> tuple[Tensor, Tensor | None, Tensor | None, Tensor | None]:  # the four inputs, in argument order
-    if not enabled or max_depth is None:  # feature off: hand the inputs back untouched
-        return msa, msa_attention_mask, has_deletion, deletion_value
-    depth = msa.size(1)
-    if depth <= 1 or depth <= max_depth:  # already within budget: nothing to drop
-        return msa, msa_attention_mask, has_deletion, deletion_value
-    indices = torch.zeros(max_depth, dtype=torch.long, device=msa.device)  # slot 0 stays 0, so row 0 is always selected (presumably the query row; confirm)
-    indices[1:] = torch.randperm(depth - 1, device=msa.device)[: max_depth - 1] + 1  # distinct random rows from 1..depth-1; no generator is passed. NOTE(review): max_depth < 1 is not guarded and max_depth=0 would fail here on a size mismatch
-    indices = indices.sort().values  # keep the selected rows in their original relative order
-    msa = msa[:, indices]
-    if msa_attention_mask is not None:
-        msa_attention_mask = msa_attention_mask[:, indices]
-    if has_deletion is not None:
-        has_deletion = has_deletion[:, indices]
-    if deletion_value is not None:
-        deletion_value = deletion_value[:, indices]
-    return msa, msa_attention_mask, has_deletion, deletion_value
-def maybe_apply_msa_column_masking(  # Randomly mask out whole MSA columns in the attention mask, leaving row 0 untouched.
-    msa_attention_mask: Tensor | None,  # unpacked below as a 3-D tensor: (batch, rows, length)
-    rate: float,  # a column is dropped when a uniform [0, 1) draw is < rate; rate <= 0 disables masking
-) -> Tensor | None:  # the input itself on the no-op path, otherwise a new boolean mask
-    if msa_attention_mask is None or rate <= 0.0 or msa_attention_mask.size(1) <= 1:  # no mask, masking disabled, or at most one row
-        return msa_attention_mask
-    batch_size, _, length = msa_attention_mask.shape
-    col_keep = torch.rand(batch_size, length, device=msa_attention_mask.device) >= rate  # one keep/drop decision per (batch, column)
-    col_keep = col_keep.unsqueeze(1).expand_as(msa_attention_mask).clone()  # share the decision across all rows; clone() materialises the expanded view before the in-place write below
-    col_keep[:, 0, :] = True  # row 0 never loses columns
-    return msa_attention_mask.bool() & col_keep  # result is boolean regardless of the input dtype
-# ===========================================================================
-# Atom-token utilities
-# ===========================================================================
 def gather_token_to_atom(token_features: Tensor, atom_to_token_idx: Tensor) -> Tensor:
@@ -2182,17 +2182,17 @@ def _seed_context(seed: int | None, *, cuda: bool = True):
 # ===========================================================================
-def compute_lm_hidden_states(
-    esmc: nn.Module,
-    input_ids: Tensor,
-    asym_id: Tensor,
-    residue_index: Tensor,
-    mol_type: Tensor,
-    token_mask: Tensor,
-    pad_to_multiple: int | None = None,
-    lm_mask_pct: float = 0.0,
-    mask_token_id: int = 32,
-) -> Tensor:
     """Run ESMC with BOS/EOS wrapping, return hidden states [B, L, N, D] with N=81 layers.
     Atom-tokenized modified residues (HYP, MSE, ACE, NH2, ...) span multiple
@@ -2277,21 +2277,21 @@ def compute_lm_hidden_states(
     for b in range(B):
         lm_input_ids[b, : lm_lengths[b]] = lm_input_list[b]
-    # sequence_id for chain-aware attention; PAD tokens get -1 (no attention).
-    sequence_id = (lm_input_ids == 0).cumsum(dim=1) - 1  # BOS=0
-    sequence_id = sequence_id.masked_fill(lm_input_ids == 1, -1)  # PAD=1
-    if lm_mask_pct > 0.0:
-        special = (lm_input_ids == 0) | (lm_input_ids == 1) | (lm_input_ids == 2)
-        do_mask = (
-            torch.rand(lm_input_ids.shape, device=device) < lm_mask_pct
-        ) & ~special
-        lm_input_ids = lm_input_ids.masked_fill(do_mask, mask_token_id)
-    with torch.inference_mode():
-        esmc_out = esmc(
-            input_ids=lm_input_ids, sequence_id=sequence_id, output_hidden_states=True
-        )
     hs = esmc_out.hidden_states  # [n_layers+1, B, max_len, D]
     n_layers_plus_1, _, _, D = hs.shape

 # chunk=64 leaves headroom for the largest foldbench targets). Override via
 # ``model.set_chunk_size(...)``; pass None to disable chunking (faster for
 # short L but OOM-prone past ~600).
+_DEFAULT_CHUNK_SIZE = 64
+# ===========================================================================
+# MSA inference-time diversity augmentations
+# ===========================================================================
+def maybe_subsample_msa(  # Randomly subsample the MSA to max_depth rows along dim 1, always keeping row 0.
+    msa: Tensor,  # indexed below as msa[:, indices], so dim 1 is the depth (row) axis
+    msa_attention_mask: Tensor | None,  # optional; subsampled along dim 1 with the same indices
+    has_deletion: Tensor | None,  # optional; subsampled along dim 1 with the same indices
+    deletion_value: Tensor | None,  # optional; subsampled along dim 1 with the same indices
+    *,
+    max_depth: int | None,  # row budget; None disables subsampling
+    enabled: bool,  # False disables subsampling
+) -> tuple[Tensor, Tensor | None, Tensor | None, Tensor | None]:  # the four inputs, in argument order
+    if not enabled or max_depth is None:  # feature off: hand the inputs back untouched
+        return msa, msa_attention_mask, has_deletion, deletion_value
+    depth = msa.size(1)
+    if depth <= 1 or depth <= max_depth:  # already within budget: nothing to drop
+        return msa, msa_attention_mask, has_deletion, deletion_value
+    indices = torch.zeros(max_depth, dtype=torch.long, device=msa.device)  # slot 0 stays 0, so row 0 is always selected (presumably the query row; confirm)
+    indices[1:] = torch.randperm(depth - 1, device=msa.device)[: max_depth - 1] + 1  # distinct random rows from 1..depth-1; no generator is passed. NOTE(review): max_depth < 1 is not guarded and max_depth=0 would fail here on a size mismatch
+    indices = indices.sort().values  # keep the selected rows in their original relative order
+    msa = msa[:, indices]
+    if msa_attention_mask is not None:
+        msa_attention_mask = msa_attention_mask[:, indices]
+    if has_deletion is not None:
+        has_deletion = has_deletion[:, indices]
+    if deletion_value is not None:
+        deletion_value = deletion_value[:, indices]
+    return msa, msa_attention_mask, has_deletion, deletion_value
+def maybe_apply_msa_column_masking(  # Randomly mask out whole MSA columns in the attention mask, leaving row 0 untouched.
+    msa_attention_mask: Tensor | None,  # unpacked below as a 3-D tensor: (batch, rows, length)
+    rate: float,  # a column is dropped when a uniform [0, 1) draw is < rate; rate <= 0 disables masking
+) -> Tensor | None:  # the input itself on the no-op path, otherwise a new boolean mask
+    if msa_attention_mask is None or rate <= 0.0 or msa_attention_mask.size(1) <= 1:  # no mask, masking disabled, or at most one row
+        return msa_attention_mask
+    batch_size, _, length = msa_attention_mask.shape
+    col_keep = torch.rand(batch_size, length, device=msa_attention_mask.device) >= rate  # one keep/drop decision per (batch, column)
+    col_keep = col_keep.unsqueeze(1).expand_as(msa_attention_mask).clone()  # share the decision across all rows; clone() materialises the expanded view before the in-place write below
+    col_keep[:, 0, :] = True  # row 0 never loses columns
+    return msa_attention_mask.bool() & col_keep  # result is boolean regardless of the input dtype
+# ===========================================================================
+# Atom-token utilities
+# ===========================================================================
 def gather_token_to_atom(token_features: Tensor, atom_to_token_idx: Tensor) -> Tensor:
 # ===========================================================================
+def compute_lm_hidden_states(
+    esmc: nn.Module,
+    input_ids: Tensor,
+    asym_id: Tensor,
+    residue_index: Tensor,
+    mol_type: Tensor,
+    token_mask: Tensor,
+    pad_to_multiple: int | None = None,
+    lm_mask_pct: float = 0.0,
+    mask_token_id: int = 32,
+) -> Tensor:
     """Run ESMC with BOS/EOS wrapping, return hidden states [B, L, N, D] with N=81 layers.
     Atom-tokenized modified residues (HYP, MSE, ACE, NH2, ...) span multiple
     for b in range(B):
         lm_input_ids[b, : lm_lengths[b]] = lm_input_list[b]
+    # sequence_id for chain-aware attention; PAD tokens get -1 (no attention).
+    sequence_id = (lm_input_ids == 0).cumsum(dim=1) - 1  # BOS=0
+    sequence_id = sequence_id.masked_fill(lm_input_ids == 1, -1)  # PAD=1
+    if lm_mask_pct > 0.0:
+        special = (lm_input_ids == 0) | (lm_input_ids == 1) | (lm_input_ids == 2)
+        do_mask = (
+            torch.rand(lm_input_ids.shape, device=device) < lm_mask_pct
+        ) & ~special
+        lm_input_ids = lm_input_ids.masked_fill(do_mask, mask_token_id)
+    with torch.inference_mode():
+        esmc_out = esmc(
+            input_ids=lm_input_ids, sequence_id=sequence_id, output_hidden_states=True
+        )
     hs = esmc_out.hidden_states  # [n_layers+1, B, max_len, D]
     n_layers_plus_1, _, _, D = hs.shape

modeling_esmfold2_experimental.py CHANGED Viewed

@@ -521,14 +521,14 @@ class ESMFold2ExperimentalModel(PreTrainedModel):
             "bf16": torch.bfloat16,
             "fp32": torch.float32,
         }
-        if precision not in dtype_map:
-            if precision == "fp8":
-                raise RuntimeError(
-                    "esmc_precision='fp8' is supported only by the standard "
-                    "released ESMFold2 model. The experimental binder-design "
-                    "model keeps the FastPLMs ESM++ backbone in bf16 or fp32."
-                )
-            raise ValueError(f"precision must be one of {list(dtype_map)}, got {precision!r}")
         esmc = _load_fastplms_esmplusplus_for_esmfold2(
             esmc_model_path=esmc_model_path,
             attn_backend=self.config.esmc_attn_backend,
@@ -852,13 +852,29 @@ class ESMFold2ExperimentalModel(PreTrainedModel):
                 return_atom_repr=False,
                 denoising_early_exit_rmsd=(0.10 if early_exit else None),
             )
-        sample_coords = structure_output["sample_atom_coords"]
-        assert sample_coords is not None
-        output: dict[str, Tensor] = {
-            "distogram_logits": distogram_logits,
-            "sample_atom_coords": sample_coords,
-        }
         if calculate_confidence and self.confidence_head is not None:
             confidence_output = self.confidence_head(
                 s_inputs=x_inputs.detach(),

             "bf16": torch.bfloat16,
             "fp32": torch.float32,
         }
+        if precision not in dtype_map:
+            if precision == "fp8":
+                raise RuntimeError(
+                    "esmc_precision='fp8' is supported only by the standard "
+                    "released ESMFold2 model. The experimental binder-design "
+                    "model keeps the FastPLMs ESM++ backbone in bf16 or fp32."
+                )
+            raise ValueError(f"precision must be one of {list(dtype_map)}, got {precision!r}")
         esmc = _load_fastplms_esmplusplus_for_esmfold2(
             esmc_model_path=esmc_model_path,
             attn_backend=self.config.esmc_attn_backend,
                 return_atom_repr=False,
                 denoising_early_exit_rmsd=(0.10 if early_exit else None),
             )
+        sample_coords = structure_output["sample_atom_coords"]
+        assert sample_coords is not None
+        if sample_coords.ndim == 4:
+            batch, sample_count, atom_count, coord_dim = sample_coords.shape
+            sample_coords_for_gather = sample_coords.reshape(
+                batch * sample_count,
+                atom_count,
+                coord_dim,
+            )
+            rep_idx = distogram_atom_idx.repeat_interleave(sample_count, 0).long()
+        else:
+            sample_coords_for_gather = sample_coords
+            rep_idx = distogram_atom_idx.long()
+        representative_atom_coords = gather_rep_atom_coords(
+            sample_coords_for_gather,
+            rep_idx,
+        )
+        output: dict[str, Tensor] = {
+            "distogram_logits": distogram_logits,
+            "sample_atom_coords": sample_coords,
+            "representative_atom_coords": representative_atom_coords,
+        }
         if calculate_confidence and self.confidence_head is not None:
             confidence_output = self.confidence_head(
                 s_inputs=x_inputs.detach(),