Upload modeling_fast_esmfold.py with huggingface_hub
Browse files- modeling_fast_esmfold.py +195 -52
modeling_fast_esmfold.py
CHANGED
|
@@ -665,6 +665,59 @@ class FastEsmBackbone(nn.Module):
|
|
| 665 |
_ESM_STANDARD_AA = list("ACDEFGHIKLMNPQRSTVWY")
|
| 666 |
|
| 667 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
@dataclass
|
| 669 |
class TTTConfig:
|
| 670 |
lr: float = 4e-4
|
|
@@ -683,7 +736,7 @@ class TTTConfig:
|
|
| 683 |
freeze_embeddings: bool = True
|
| 684 |
lora_rank: int = 8
|
| 685 |
lora_alpha: float = 32.0
|
| 686 |
-
|
| 687 |
|
| 688 |
def verify(self) -> None:
|
| 689 |
assert self.lr > 0.0, "TTT learning rate must be positive."
|
|
@@ -761,16 +814,19 @@ class FastEsmForProteinFolding(EsmForProteinFolding):
|
|
| 761 |
super().__init__(config)
|
| 762 |
|
| 763 |
# Replace standard ESM2 backbone with FastESM2 (multi-backend attention)
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
self.esm.
|
|
|
|
|
|
|
| 768 |
|
| 769 |
# MLM head for TTT (pretrained EsmLMHead: Dense -> GELU -> LN -> Linear)
|
| 770 |
self.mlm_head = EsmLMHead(config)
|
| 771 |
|
| 772 |
# TTT state (lazy initialization)
|
| 773 |
-
|
|
|
|
| 774 |
self._ttt_cfg.verify()
|
| 775 |
self._ttt_initialized = False
|
| 776 |
self._ttt_initial_state = None
|
|
@@ -800,6 +856,9 @@ class FastEsmForProteinFolding(EsmForProteinFolding):
|
|
| 800 |
self.mlm_head.eval()
|
| 801 |
for p in self.mlm_head.parameters():
|
| 802 |
p.requires_grad = False
|
|
|
|
|
|
|
|
|
|
| 803 |
self._inject_lora()
|
| 804 |
else:
|
| 805 |
# Legacy path: jointly-trained random linear projection head
|
|
@@ -816,25 +875,32 @@ class FastEsmForProteinFolding(EsmForProteinFolding):
|
|
| 816 |
return self._ttt_cfg.lora_rank > 0
|
| 817 |
|
| 818 |
def _inject_lora(self) -> None:
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
|
|
|
| 822 |
r=self._ttt_cfg.lora_rank,
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
|
|
|
| 827 |
)
|
| 828 |
-
inject_adapter_in_model(lora_config, self.esm, adapter_name="ttt")
|
| 829 |
|
| 830 |
# ---- TTT State Management ----
|
| 831 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 832 |
def _ttt_get_state(self) -> Dict[str, Any]:
|
| 833 |
if self._uses_lora:
|
| 834 |
-
lora_state =
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
|
|
|
|
|
|
| 838 |
return {"_lora_state": lora_state}
|
| 839 |
return {
|
| 840 |
"esm": copy.deepcopy(self.esm),
|
|
@@ -843,9 +909,11 @@ class FastEsmForProteinFolding(EsmForProteinFolding):
|
|
| 843 |
|
| 844 |
def _ttt_set_state(self, state: Dict[str, Any]) -> None:
|
| 845 |
if "_lora_state" in state:
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
|
|
|
|
|
|
| 849 |
return
|
| 850 |
if "esm" in state:
|
| 851 |
self.esm = copy.deepcopy(state["esm"])
|
|
@@ -993,11 +1061,9 @@ class FastEsmForProteinFolding(EsmForProteinFolding):
|
|
| 993 |
|
| 994 |
for parameter in self.parameters():
|
| 995 |
parameter.requires_grad = False
|
| 996 |
-
for
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
lora_params = [p for n, p in self.esm.named_parameters() if "lora_" in n]
|
| 1000 |
-
optimizer = self._ttt_get_optimizer(iter(lora_params))
|
| 1001 |
optimizer.zero_grad(set_to_none=True)
|
| 1002 |
|
| 1003 |
self.eval()
|
|
@@ -1097,49 +1163,126 @@ class FastEsmForProteinFolding(EsmForProteinFolding):
|
|
| 1097 |
|
| 1098 |
# ---- High-Level API ----
|
| 1099 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1100 |
def fold_protein(
|
| 1101 |
self,
|
| 1102 |
sequence: str,
|
| 1103 |
-
ttt: bool = False,
|
| 1104 |
return_pdb_string: bool = True,
|
| 1105 |
) -> Dict[str, Any]:
|
| 1106 |
-
"""Fold a protein sequence
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1107 |
|
| 1108 |
Args:
|
| 1109 |
sequence: Protein sequence (single-letter amino acid codes)
|
| 1110 |
-
ttt: If True, run test-time training before folding (improves accuracy)
|
| 1111 |
return_pdb_string: If True, include PDB string in output
|
| 1112 |
|
| 1113 |
Returns:
|
| 1114 |
Dict with keys:
|
| 1115 |
-
- plddt: float, mean
|
| 1116 |
-
- ptm: float, predicted TM-score
|
| 1117 |
-
- pdb_string: str (if return_pdb_string=True), PDB
|
| 1118 |
-
-
|
|
|
|
| 1119 |
"""
|
| 1120 |
-
|
| 1121 |
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
|
|
|
|
|
|
| 1125 |
|
| 1126 |
-
|
| 1127 |
-
|
| 1128 |
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
-
|
| 1132 |
-
else:
|
| 1133 |
-
mean_plddt = float(plddt.mean().item())
|
| 1134 |
|
| 1135 |
-
|
| 1136 |
-
|
|
|
|
| 1137 |
|
| 1138 |
-
|
| 1139 |
-
|
| 1140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1141 |
|
| 1142 |
-
|
| 1143 |
-
|
| 1144 |
|
| 1145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 665 |
_ESM_STANDARD_AA = list("ACDEFGHIKLMNPQRSTVWY")
|
| 666 |
|
| 667 |
|
| 668 |
+
class LoraInjectedLinear(nn.Module):
    """LoRA-augmented linear layer matching lora_diffusion's behavior.

    Replaces an existing nn.Linear with base(x) + lora_up(lora_down(x)) * scale.
    Initialization follows cloneofsimo/lora: down=Normal(0, 1/r), up=zeros, so
    the wrapped layer is numerically identical to the base layer at injection
    time and only diverges once the adapter weights are trained.
    """

    def __init__(self, original_linear: nn.Linear, r: int = 4, scale: float = 1.0):
        """Wrap ``original_linear`` with a rank-``r`` trainable low-rank update.

        Args:
            original_linear: The frozen base layer; kept and used as-is.
            r: LoRA rank; must not exceed either base dimension.
            scale: Multiplier applied to the low-rank update in forward().

        Raises:
            ValueError: If ``r`` exceeds min(in_features, out_features).
                (A bare assert here would be stripped under ``python -O``.)
        """
        super().__init__()
        self.linear = original_linear
        in_features = original_linear.in_features
        out_features = original_linear.out_features
        if r > min(in_features, out_features):
            raise ValueError(
                f"LoRA rank {r} exceeds dimensions ({in_features}, {out_features})"
            )
        self.lora_down = nn.Linear(in_features, r, bias=False)
        self.lora_up = nn.Linear(r, out_features, bias=False)
        self.scale = scale
        # down=Normal(0, 1/r), up=zeros => initial adapter output is exactly 0.
        nn.init.normal_(self.lora_down.weight, std=1.0 / r)
        nn.init.zeros_(self.lora_up.weight)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Base projection plus the scaled low-rank correction."""
        return self.linear(x) + self.lora_up(self.lora_down(x)) * self.scale
|
| 689 |
+
|
| 690 |
+
|
| 691 |
+
def inject_trainable_lora(
    model: nn.Module,
    target_class_name: str,
    r: int,
    scale: float,
) -> List[nn.Parameter]:
    """Replace nn.Linear children of matching modules with LoRA-augmented layers.

    Mirrors lora_diffusion's inject_trainable_lora: every module whose class
    name equals ``target_class_name`` has its direct nn.Linear children swapped
    in place for LoraInjectedLinear wrappers (base weights preserved, adapter
    weights trainable).

    Args:
        model: Root module to scan; modified in place.
        target_class_name: Class name of the parent modules to target
            (e.g. "EsmSelfAttention").
        r: LoRA rank passed to each injected layer.
        scale: Multiplier applied to the low-rank update in the forward pass.

    Returns:
        The list of newly created trainable LoRA parameters (down + up weights).
    """
    lora_params: List[nn.Parameter] = []
    # Materialize the matching parents BEFORE mutating: setattr below rewires
    # the module tree, and doing that while the named_modules() generator is
    # still live risks traversing freshly injected wrappers mid-iteration.
    parents = [
        module
        for _name, module in model.named_modules()
        if module.__class__.__name__ == target_class_name
    ]
    for parent_module in parents:
        for child_name, child_module in list(parent_module.named_children()):
            if not isinstance(child_module, nn.Linear):
                continue
            lora_linear = LoraInjectedLinear(child_module, r=r, scale=scale)
            # Keep the adapter on the same device/dtype as the wrapped weight.
            lora_linear = lora_linear.to(
                device=child_module.weight.device,
                dtype=child_module.weight.dtype,
            )
            setattr(parent_module, child_name, lora_linear)
            lora_params.extend(lora_linear.lora_down.parameters())
            lora_params.extend(lora_linear.lora_up.parameters())
    return lora_params
|
| 719 |
+
|
| 720 |
+
|
| 721 |
@dataclass
|
| 722 |
class TTTConfig:
|
| 723 |
lr: float = 4e-4
|
|
|
|
| 736 |
freeze_embeddings: bool = True
|
| 737 |
lora_rank: int = 8
|
| 738 |
lora_alpha: float = 32.0
|
| 739 |
+
lora_target_class: str = "EsmSelfAttention"
|
| 740 |
|
| 741 |
def verify(self) -> None:
|
| 742 |
assert self.lr > 0.0, "TTT learning rate must be positive."
|
|
|
|
| 814 |
super().__init__(config)
|
| 815 |
|
| 816 |
# Replace standard ESM2 backbone with FastESM2 (multi-backend attention)
|
| 817 |
+
# unless use_standard_backbone is set (for TTT debugging/compatibility)
|
| 818 |
+
if not config.ttt_config.get("use_standard_backbone", False):
|
| 819 |
+
self.esm = FastEsmBackbone(config)
|
| 820 |
+
self.esm.requires_grad_(False)
|
| 821 |
+
if config.esmfold_config.fp16_esm:
|
| 822 |
+
self.esm.half()
|
| 823 |
|
| 824 |
# MLM head for TTT (pretrained EsmLMHead: Dense -> GELU -> LN -> Linear)
|
| 825 |
self.mlm_head = EsmLMHead(config)
|
| 826 |
|
| 827 |
# TTT state (lazy initialization)
|
| 828 |
+
ttt_kwargs = {k: v for k, v in config.ttt_config.items() if k != "use_standard_backbone"}
|
| 829 |
+
self._ttt_cfg = TTTConfig(**ttt_kwargs)
|
| 830 |
self._ttt_cfg.verify()
|
| 831 |
self._ttt_initialized = False
|
| 832 |
self._ttt_initial_state = None
|
|
|
|
| 856 |
self.mlm_head.eval()
|
| 857 |
for p in self.mlm_head.parameters():
|
| 858 |
p.requires_grad = False
|
| 859 |
+
# Seed global state before LoRA init for reproducible weight initialization
|
| 860 |
+
if self._ttt_cfg.seed is not None:
|
| 861 |
+
torch.manual_seed(self._ttt_cfg.seed)
|
| 862 |
self._inject_lora()
|
| 863 |
else:
|
| 864 |
# Legacy path: jointly-trained random linear projection head
|
|
|
|
| 875 |
return self._ttt_cfg.lora_rank > 0
|
| 876 |
|
| 877 |
def _inject_lora(self) -> None:
    """Inject LoRA adapters into ESM2 attention layers (matching lora_diffusion behavior).

    Side effect: stores the injected trainable parameters on ``self._lora_params``.

    Raises:
        RuntimeError: If no module matched ``lora_target_class`` and therefore
            no adapters were injected. An ``assert`` here would silently vanish
            under ``python -O``, so the check is an explicit raise.
    """
    self._lora_params = inject_trainable_lora(
        self.esm,
        target_class_name=self._ttt_cfg.lora_target_class,
        r=self._ttt_cfg.lora_rank,
        scale=self._ttt_cfg.lora_alpha,
    )
    if not self._lora_params:
        raise RuntimeError(
            f"No LoRA params injected. Check target_class_name='{self._ttt_cfg.lora_target_class}' "
            f"matches attention modules in the backbone."
        )
|
|
|
|
| 889 |
|
| 890 |
# ---- TTT State Management ----
|
| 891 |
|
| 892 |
+
def _get_lora_modules(self) -> List[LoraInjectedLinear]:
    """Return every LoRA-injected linear layer currently present in the backbone."""
    found: List[LoraInjectedLinear] = []
    for submodule in self.esm.modules():
        if isinstance(submodule, LoraInjectedLinear):
            found.append(submodule)
    return found
|
| 895 |
+
|
| 896 |
def _ttt_get_state(self) -> Dict[str, Any]:
|
| 897 |
if self._uses_lora:
|
| 898 |
+
lora_state = []
|
| 899 |
+
for m in self._get_lora_modules():
|
| 900 |
+
lora_state.append({
|
| 901 |
+
"down": m.lora_down.weight.data.clone(),
|
| 902 |
+
"up": m.lora_up.weight.data.clone(),
|
| 903 |
+
})
|
| 904 |
return {"_lora_state": lora_state}
|
| 905 |
return {
|
| 906 |
"esm": copy.deepcopy(self.esm),
|
|
|
|
| 909 |
|
| 910 |
def _ttt_set_state(self, state: Dict[str, Any]) -> None:
|
| 911 |
if "_lora_state" in state:
|
| 912 |
+
modules = self._get_lora_modules()
|
| 913 |
+
assert len(modules) == len(state["_lora_state"])
|
| 914 |
+
for m, saved in zip(modules, state["_lora_state"]):
|
| 915 |
+
m.lora_down.weight.data.copy_(saved["down"])
|
| 916 |
+
m.lora_up.weight.data.copy_(saved["up"])
|
| 917 |
return
|
| 918 |
if "esm" in state:
|
| 919 |
self.esm = copy.deepcopy(state["esm"])
|
|
|
|
| 1061 |
|
| 1062 |
for parameter in self.parameters():
|
| 1063 |
parameter.requires_grad = False
|
| 1064 |
+
for p in self._lora_params:
|
| 1065 |
+
p.requires_grad = True
|
| 1066 |
+
optimizer = self._ttt_get_optimizer(self._lora_params)
|
|
|
|
|
|
|
| 1067 |
optimizer.zero_grad(set_to_none=True)
|
| 1068 |
|
| 1069 |
self.eval()
|
|
|
|
| 1163 |
|
| 1164 |
# ---- High-Level API ----
|
| 1165 |
|
| 1166 |
+
def _fold_single(self, sequence: str, return_pdb_string: bool = True) -> Dict[str, Any]:
    """Fold a sequence once; report mean pLDDT, ptm, and optionally a PDB string.

    Args:
        sequence: Protein sequence (single-letter amino acid codes).
        return_pdb_string: When True, also render the prediction to a PDB string.

    Returns:
        Dict with "plddt" (float mean), "ptm" (float or None when absent),
        and "pdb_string" when requested.
    """
    with torch.no_grad():
        output = self.infer(sequence)
        confidences = output["plddt"]
        # Average the trailing axis first for >=2-D layouts; assumes the
        # trailing axis is per-atom pLDDT — TODO confirm against infer().
        if confidences.dim() >= 2:
            mean_plddt = float(confidences.mean(dim=-1).mean().item())
        else:
            mean_plddt = float(confidences.mean().item())
        summary: Dict[str, Any] = {
            "plddt": mean_plddt,
            "ptm": float(output["ptm"].item()) if "ptm" in output else None,
        }
        if return_pdb_string:
            rendered = self.output_to_pdb(output)
            summary["pdb_string"] = rendered[0] if isinstance(rendered, list) else rendered
    return summary
|
| 1183 |
+
|
| 1184 |
def fold_protein(
    self,
    sequence: str,
    return_pdb_string: bool = True,
) -> Dict[str, Any]:
    """Fold a protein sequence with test-time training.

    Runs TTT (masked language model adaptation via LoRA) for the configured
    number of steps, folding after each optimizer step to track pLDDT. Returns
    the structure with the highest pLDDT across all steps (including baseline).

    Args:
        sequence: Protein sequence (single-letter amino acid codes)
        return_pdb_string: If True, include PDB string in output

    Returns:
        Dict with keys:
            - plddt: float, best mean pLDDT across all TTT steps
            - ptm: float, predicted TM-score from best step
            - pdb_string: str (if return_pdb_string=True), PDB from best step
            - step_plddts: list[float], pLDDT at each step [baseline, s1, ...]
            - best_step: int, which step produced best structure (0=baseline)
    """
    self._ensure_ttt_ready()

    # Cast to fp32 for TTT stability; original dtype is restored on exit.
    esm_dtype = next(self.esm.parameters()).dtype
    if esm_dtype != torch.float32:
        self.esm.float()
        self.mlm_head.float()

    device = next(self.parameters()).device
    non_blocking = device.type == "cuda"

    # Step 0: baseline fold (no TTT adaptation).
    best = self._fold_single(sequence, return_pdb_string=return_pdb_string)
    step_plddts = [best["plddt"]]

    if self._ttt_cfg.steps > 0:
        # Tokenize once for masked-LM training.
        tokens = self._ttt_tokenize(sequence)

        # Freeze everything, then unfreeze only what TTT trains.
        for param in self.parameters():
            param.requires_grad = False
        if self._uses_lora:
            for param in self._lora_params:
                param.requires_grad = True
            optimizer = self._ttt_get_optimizer(self._lora_params)
        else:
            # Legacy path: full backbone + linear projection head.
            for param in self.esm.parameters():
                param.requires_grad = True
            if self._ttt_cfg.freeze_embeddings:
                for param in self.esm.embeddings.parameters():
                    param.requires_grad = False
            for param in self._ttt_lm_proj.parameters():
                param.requires_grad = True
            trainable = [p for p in self.parameters() if p.requires_grad]
            optimizer = self._ttt_get_optimizer(trainable)
        optimizer.zero_grad(set_to_none=True)

        self.eval()
        # `ags` micro-batches per optimizer step (gradient accumulation).
        for step in range(self._ttt_cfg.steps * self._ttt_cfg.ags):
            batch_masked, targets, mask, _start = self._ttt_sample_batch(tokens)
            batch_masked = batch_masked.to(device, non_blocking=non_blocking)
            targets = targets.to(device, non_blocking=non_blocking)
            mask = mask.to(device, non_blocking=non_blocking)

            self.train()
            logits = self._ttt_predict_logits(batch_masked)
            loss = self._ttt_cross_entropy_loss(logits, targets, mask)
            loss.backward()

            if (step + 1) % self._ttt_cfg.ags == 0:
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)

                # Fold after each optimizer step (not each micro-batch)
                # to record the pLDDT trajectory and keep the best result.
                self.eval()
                current = self._fold_single(sequence, return_pdb_string=return_pdb_string)
                step_plddts.append(current["plddt"])
                if current["plddt"] > best["plddt"]:
                    best = current

        self.eval()

    # Teardown: re-freeze everything for the frozen-backbone baseline state.
    for param in self.parameters():
        param.requires_grad = False

    # Reset LoRA weights so the next sequence starts from the initial state.
    self.ttt_reset()

    # Restore the original backbone/head dtype.
    if esm_dtype != torch.float32:
        self.esm.to(esm_dtype)
        self.mlm_head.to(esm_dtype)

    return {
        "plddt": best["plddt"],
        "ptm": best["ptm"],
        "pdb_string": best.get("pdb_string"),
        "step_plddts": step_plddts,
        "best_step": step_plddts.index(max(step_plddts)),
    }
|