Spaces:

yushize
/

Embedding-predictor

Running

App Files Files Community

yushize committed 2 days ago

Commit

f7bcbc4

verified ·

1 Parent(s): 21429bd

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -48

app.py CHANGED Viewed

@@ -2,10 +2,12 @@ import gc
 import io
 import os
 import re
 import zipfile
 import tempfile
 from dataclasses import dataclass
-from typing import Dict, List, Optional
 import gradio as gr
 import torch
@@ -16,6 +18,8 @@ APP_TITLE = "Protein Embedding"
 ALLOWED_AA = set(list("ACDEFGHIKLMNPQRSTVWYXBZJUO"))
 REPLACE_WITH_X = set(list("UZOB"))
 @dataclass
 class ModelSpec:
@@ -189,6 +193,21 @@ def normalize_to_Ld(
     raise ValueError(f"Cannot normalize token length {T} to residue length {expected_len}.")
 class SingleModelRunner:
     def __init__(self):
         self.model_key = None
@@ -235,6 +254,8 @@ class SingleModelRunner:
             self.model.eval()
         elif spec.family == "prosst":
             self.tokenizer = AutoTokenizer.from_pretrained(spec.tokenizer_id, trust_remote_code=True)
             self.model = AutoModel.from_pretrained(
                 spec.model_id,
@@ -244,8 +265,6 @@ class SingleModelRunner:
             self.model.to(target_device)
             self.model.eval()
-            # Official ProSST sequence-only route:
-            # predict structure tokens from sequence, then feed them into ProSST.
             from prosst.structure.get_sst_seq import SSTPredictor
             self.sst_predictor = SSTPredictor()
@@ -335,76 +354,86 @@ def embed_esmc(seq: str) -> torch.Tensor:
     raise ValueError(f"ESMC returned shape {tuple(emb.shape)} for sequence length {len(seq)}.")
-@torch.no_grad()
-def embed_prosst(seq: str) -> torch.Tensor:
-    # Sequence-only mode:
-    # 1) predict structure token sequence from amino-acid sequence
-    # 2) feed sequence + structure tokens into ProSST
-    structure_tokens = RUNNER.sst_predictor.predict(seq)
-    # Structure tokens may come back as list[int], np.ndarray, or space-separated string
-    if isinstance(structure_tokens, str):
-        sst_seq = structure_tokens
     else:
-        sst_seq = " ".join([str(x) for x in structure_tokens])
-    aa_spaced = protein_to_spaced(seq)
-    enc = RUNNER.tokenizer(
         aa_spaced,
         return_tensors="pt",
         add_special_tokens=True,
         return_special_tokens_mask=True,
         truncation=False,
     )
-    enc = {k: v.to(RUNNER.device) for k, v in enc.items()}
-    # Different ProSST remote-code implementations may expect different kwarg names.
-    # Try the common names first.
-    tried = []
     for kw in ("ss_input_ids", "structure_ids", "sst_input_ids", "struc_input_ids"):
         try:
-            sst_enc = RUNNER.tokenizer(
-                sst_seq,
-                return_tensors="pt",
-                add_special_tokens=True,
-                truncation=False,
-            )
-            sst_ids = sst_enc["input_ids"].to(RUNNER.device)
             out = RUNNER.model(
-                input_ids=enc["input_ids"],
-                attention_mask=enc.get("attention_mask", None),
                 output_hidden_states=True,
                 **{kw: sst_ids},
             )
             hidden = out.hidden_states[-1][0]
             emb = normalize_to_Ld(
                 hidden=hidden,
                 expected_len=len(seq),
-                special_tokens_mask=enc.get("special_tokens_mask", None)[0] if enc.get("special_tokens_mask", None) is not None else None,
-                attention_mask=enc.get("attention_mask", None)[0] if enc.get("attention_mask", None) is not None else None,
             )
-            return emb.detach().cpu().float()
         except Exception as e:
             tried.append(f"{kw}: {repr(e)}")
-    raise RuntimeError(
-        "Failed to run ProSST. The installed ProSST remote-code signature may differ. "
-        + " | ".join(tried)
-    )
-def embed_one_sequence(seq: str) -> torch.Tensor:
     if RUNNER.family == "hf_encoder":
-        return embed_hf_encoder(seq)
     if RUNNER.family == "t5_encoder":
-        return embed_t5_encoder(seq)
     if RUNNER.family == "esmc":
-        return embed_esmc(seq)
     if RUNNER.family == "prosst":
         return embed_prosst(seq)
     raise ValueError(f"Unsupported family: {RUNNER.family}")
@@ -430,17 +459,21 @@ def run_embedding(fasta_text: str, model_keys: List[str], device: str, progress=
             for rec in records:
                 step += 1
                 progress(step / total_steps, desc=f"{model_key} | {rec['id']}")
-                emb = embed_one_sequence(rec["sequence"])
                 if emb.ndim != 2 or emb.shape[0] != len(rec["sequence"]):
                     raise ValueError(
                         f"{model_key} failed on {rec['id']}: got shape {tuple(emb.shape)}, expected ({len(rec['sequence'])}, d)"
                     )
-                inner_name = f"{safe_filename(model_key)}/{safe_filename(rec['id'])}.pt"
-                buffer = io.BytesIO()
-                torch.save(emb, buffer)
-                zf.writestr(inner_name, buffer.getvalue())
     return zip_path, f"Done: {len(records)} sequence(s), {len(model_keys)} model(s)."

 import io
 import os
 import re
+import sys
 import zipfile
 import tempfile
+import subprocess
 from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
 import gradio as gr
 import torch
 ALLOWED_AA = set(list("ACDEFGHIKLMNPQRSTVWYXBZJUO"))
 REPLACE_WITH_X = set(list("UZOB"))
+PROSST_REPO_DIR = "/tmp/ProSST"
 @dataclass
 class ModelSpec:
     raise ValueError(f"Cannot normalize token length {T} to residue length {expected_len}.")
def ensure_prosst_repo(repo_dir: Optional[str] = None) -> None:
    """Make the ProSST source checkout importable, cloning it if necessary.

    The checkout is considered usable when it contains a ``prosst``
    sub-directory. Otherwise the repository is shallow-cloned from GitHub.
    In both cases the checkout directory is appended to ``sys.path`` (once).

    Args:
        repo_dir: Directory of the checkout. Defaults to ``PROSST_REPO_DIR``.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a non-zero
            status.
        OSError: If a stale checkout directory cannot be removed, or ``git``
            cannot be executed.
    """
    import shutil

    if repo_dir is None:
        repo_dir = PROSST_REPO_DIR

    if not os.path.isdir(os.path.join(repo_dir, "prosst")):
        # A directory without the package is a leftover (e.g. an interrupted
        # clone). git refuses to clone into a non-empty directory, so without
        # this cleanup every later call would fail the same way.
        if os.path.isdir(repo_dir):
            shutil.rmtree(repo_dir)
        subprocess.run(
            ["git", "clone", "--depth", "1", "https://github.com/openmedlab/ProSST.git", repo_dir],
            check=True,
        )

    if repo_dir not in sys.path:
        sys.path.append(repo_dir)
 class SingleModelRunner:
     def __init__(self):
         self.model_key = None
             self.model.eval()
         elif spec.family == "prosst":
+            ensure_prosst_repo()
             self.tokenizer = AutoTokenizer.from_pretrained(spec.tokenizer_id, trust_remote_code=True)
             self.model = AutoModel.from_pretrained(
                 spec.model_id,
             self.model.to(target_device)
             self.model.eval()
             from prosst.structure.get_sst_seq import SSTPredictor
             self.sst_predictor = SSTPredictor()
     raise ValueError(f"ESMC returned shape {tuple(emb.shape)} for sequence length {len(seq)}.")
def get_sst_tokens(seq: str) -> List[int]:
    """Predict structure tokens for ``seq`` and return them as a flat int list.

    The raw output of ``RUNNER.sst_predictor.predict(seq)`` is accepted as a
    whitespace-separated string, a ``torch.Tensor``, any object exposing
    ``tolist()``, or a list/tuple. A result wrapped in one extra batch level
    (``[[...]]``) is unwrapped to its first row.

    Args:
        seq: Amino-acid sequence; its length is the required token count.

    Returns:
        A list of exactly ``len(seq)`` integer structure tokens.

    Raises:
        ValueError: If the predictor output type is unsupported, or the token
            count cannot be brought to ``len(seq)``.
    """
    sst = RUNNER.sst_predictor.predict(seq)

    if isinstance(sst, str):
        tokens = sst.strip().split()
    elif isinstance(sst, torch.Tensor):
        # reshape (unlike view) also handles non-contiguous tensors.
        tokens = sst.detach().cpu().reshape(-1).tolist()
    elif hasattr(sst, "tolist"):
        tokens = sst.tolist()
    elif isinstance(sst, (list, tuple)):
        tokens = list(sst)
    else:
        raise ValueError(f"Unsupported SSTPredictor output type: {type(sst)}")

    # Unwrap a single batch level for every container kind, so a plain nested
    # list behaves the same as a nested array-like result.
    if isinstance(tokens, list) and tokens and isinstance(tokens[0], (list, tuple)):
        tokens = tokens[0]

    tokens = [int(x) for x in tokens]

    # Best-effort normalization to length L.
    expected_len = len(seq)
    if len(tokens) == expected_len + 2:
        # Presumably one leading and one trailing special token - TODO confirm.
        tokens = tokens[1:-1]
    elif len(tokens) > expected_len:
        tokens = tokens[:expected_len]

    if len(tokens) != expected_len:
        raise ValueError(f"SST token length mismatch: got {len(tokens)}, expected {len(seq)}")
    return tokens
@torch.no_grad()
def embed_prosst(seq: str) -> Tuple[torch.Tensor, List[int]]:
    """Embed ``seq`` with the loaded ProSST model.

    Structure tokens are obtained from ``get_sst_tokens`` and passed to the
    model under several candidate keyword names, in order, until one call
    succeeds.

    Args:
        seq: Amino-acid sequence.

    Returns:
        A pair ``(embedding, structure_tokens)``: the output of
        ``normalize_to_Ld`` moved to CPU as float32, and the structure tokens
        that were fed to the model.

    Raises:
        RuntimeError: If every candidate keyword name fails; the message lists
            the exception raised for each attempt.
    """
    structure_tokens = get_sst_tokens(seq)

    encoded = RUNNER.tokenizer(
        protein_to_spaced(seq),
        return_tensors="pt",
        add_special_tokens=True,
        return_special_tokens_mask=True,
        truncation=False,
    )
    encoded = {name: tensor.to(RUNNER.device) for name, tensor in encoded.items()}

    # Structure tokens are passed as an extra id tensor of shape [1, L].
    # NOTE(review): the tokenizer is asked to add special tokens while this
    # tensor has one id per residue - confirm the model accepts the
    # differing lengths.
    sst_ids = torch.tensor([structure_tokens], dtype=torch.long, device=RUNNER.device)

    attn_mask = encoded.get("attention_mask")
    special_mask = encoded.get("special_tokens_mask")

    failures = []
    for kwarg_name in ("ss_input_ids", "structure_ids", "sst_input_ids", "struc_input_ids"):
        try:
            outputs = RUNNER.model(
                input_ids=encoded["input_ids"],
                attention_mask=attn_mask,
                output_hidden_states=True,
                **{kwarg_name: sst_ids},
            )
            # Last hidden layer of the first (only) batch item.
            last_hidden = outputs.hidden_states[-1][0]
            residue_emb = normalize_to_Ld(
                hidden=last_hidden,
                expected_len=len(seq),
                special_tokens_mask=None if special_mask is None else special_mask[0],
                attention_mask=None if attn_mask is None else attn_mask[0],
            )
            return residue_emb.detach().cpu().float(), structure_tokens
        except Exception as exc:
            # Record the failure and fall through to the next keyword name.
            failures.append(f"{kwarg_name}: {repr(exc)}")

    raise RuntimeError("Failed to run ProSST with known structure-token arg names: " + " | ".join(failures))
def embed_one_sequence(seq: str):
    """Embed ``seq`` with whichever model family ``RUNNER`` currently holds.

    Returns:
        A pair ``(embedding, structure_tokens)``. ``structure_tokens`` is
        ``None`` for every family except ``"prosst"``, whose embedder returns
        the pair itself.

    Raises:
        ValueError: If ``RUNNER.family`` is not a supported family.
    """
    family = RUNNER.family

    # ProSST is the only family whose embedder already returns the pair.
    if family == "prosst":
        return embed_prosst(seq)

    if family == "hf_encoder":
        embedding = embed_hf_encoder(seq)
    elif family == "t5_encoder":
        embedding = embed_t5_encoder(seq)
    elif family == "esmc":
        embedding = embed_esmc(seq)
    else:
        raise ValueError(f"Unsupported family: {RUNNER.family}")
    return embedding, None
             for rec in records:
                 step += 1
                 progress(step / total_steps, desc=f"{model_key} | {rec['id']}")
+                emb, sst_tokens = embed_one_sequence(rec["sequence"])
                 if emb.ndim != 2 or emb.shape[0] != len(rec["sequence"]):
                     raise ValueError(
                         f"{model_key} failed on {rec['id']}: got shape {tuple(emb.shape)}, expected ({len(rec['sequence'])}, d)"
                     )
+                pt_name = f"{safe_filename(model_key)}/{safe_filename(rec['id'])}.pt"
+                pt_buf = io.BytesIO()
+                torch.save(emb, pt_buf)
+                zf.writestr(pt_name, pt_buf.getvalue())
+                if sst_tokens is not None:
+                    tok_name = f"{safe_filename(model_key)}_structure_tokens/{safe_filename(rec['id'])}.txt"
+                    zf.writestr(tok_name, " ".join(map(str, sst_tokens)))
     return zip_path, f"Done: {len(records)} sequence(s), {len(model_keys)} model(s)."