Spaces:

ChatterjeeLab
/

MetaLATTE-demo

Runtime error

App Files Files Community

yinuozhang committed on Oct 8, 2025

Commit

fa4c075

verified ·

1 Parent(s): 3008831

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -46

app.py CHANGED Viewed

@@ -37,68 +37,96 @@ def snapshot_to(local_name, repo_id, revision, allow_patterns):
         local_dir=local_dir,  # new hub ignores symlink flag; this is enough
     )
-# Download tokenizer files (small)
 esm_local = snapshot_to(
-    "esm2_tokenizer",
-    TOKENIZER_ID,
-    TOKENIZER_REV,
     allow_patterns=[
         "tokenizer.json","tokenizer_config.json","vocab.*","merges.*",
-        "special_tokens_map.json","*.model","tokenizer*.txt","spiece.*","*.tiktoken",
-        "config.json"  # some tokenizers use it
     ],
 )
-# Download MetaLATTE weights + config ONLY (skip stage1 blob)
 metalatte_local = snapshot_to(
-    "metalatte_model",
-    MODEL_ID,
-    MODEL_REV,
-    allow_patterns=["config.json", "pytorch_model.bin"],
 )
-# Your local custom code
-metalatte_path = '.'
-sys.path.insert(0, metalatte_path)
 from configuration import MetaLATTEConfig
 from modeling_metalatte import MultitaskProteinModel
 AutoConfig.register("metalatte", MetaLATTEConfig)
 AutoModel.register(MetaLATTEConfig, MultitaskProteinModel)
-# Load config + instantiate model (no network)
-config = AutoConfig.from_pretrained(metalatte_local, local_files_only=True)
-# Find the weight file locally
-weight_candidates = [
-    "pytorch_model.bin",
-    "model/pytorch_model.bin",
-    "model.safetensors",
-    "model/model.safetensors",
-    "stage1_model.bin",
-    "model/stage1_model.bin",
-]
-weight_path = None
-for c in weight_candidates:
-    p = os.path.join(metalatte_local, c)
-    if os.path.exists(p):
-        weight_path = p
-        break
-if weight_path is None:
-    raise FileNotFoundError(f"No weights found in {metalatte_local}. Looked for: {weight_candidates}")
-# Build model and load the local state dict
-model = MultitaskProteinModel(config)
-if weight_path.endswith(".safetensors"):
-    from safetensors.torch import load_file
-    state_dict = load_file(weight_path, device="cpu", weights_only=False)
-else:
-    state_dict = torch.load(weight_path, map_location="cpu", weights_only=False)
-missing, unexpected = model.load_state_dict(state_dict, strict=False)
-if missing or unexpected:
-    print(f"[load_state_dict] missing={len(missing)} unexpected={len(unexpected)}")
-model.eval()
-# Tokenizer
 tokenizer = AutoTokenizer.from_pretrained(esm_local, local_files_only=True)
 @torch.inference_mode()
 def predict(sequence):

         local_dir=local_dir,  # new hub ignores symlink flag; this is enough
     )
+# Download only the tokenizer files of facebook/esm2_t33_650M_UR50D; the revision is read from TOKENIZER_REV (empty string when unset)
 esm_local = snapshot_to(
+    "esm2_tokenizer", "facebook/esm2_t33_650M_UR50D", os.getenv("TOKENIZER_REV",""),
     allow_patterns=[
         "tokenizer.json","tokenizer_config.json","vocab.*","merges.*",
+        "special_tokens_map.json","*.model","tokenizer*.txt","spiece.*","*.tiktoken","config.json"
     ],
 )
+# Download the MetaLATTE config plus every candidate weight file (main and stage1, .bin and .safetensors, repo root or model/); the revision is pinned unless MODEL_REV overrides it
 metalatte_local = snapshot_to(
+    "metalatte_model", "ChatterjeeLab/MetaLATTE", os.getenv("MODEL_REV", "ad1716045c768b30ce87eb6b3963d58578fa5401"),
+    allow_patterns=[
+        "config.json",
+        "pytorch_model.bin",
+        "model/pytorch_model.bin",
+        "model.safetensors",
+        "model/model.safetensors",
+        "stage1_model.bin",
+        "model/stage1_model.bin",
+    ],
 )
+import os, sys, torch, pandas as pd, gradio as gr
+from transformers import AutoTokenizer, AutoModel, AutoConfig
+# --- your local package ---
+sys.path.insert(0, ".")
 from configuration import MetaLATTEConfig
 from modeling_metalatte import MultitaskProteinModel
+# Register types BEFORE loading
 AutoConfig.register("metalatte", MetaLATTEConfig)
 AutoModel.register(MetaLATTEConfig, MultitaskProteinModel)
+# ---- Monkey-patch: make your from_pretrained support local dirs ----
+def _local_aware_from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+    """Load the model from a local directory if one is given, else defer to the original loader.
+
+    Args:
+        cls: Model class this classmethod replacement is bound to.
+        pretrained_model_name_or_path: Local directory path, or any identifier
+            the original ``from_pretrained`` understands.
+        *args: Forwarded to the original loader on the non-local path.
+        **kwargs: Forwarded to the original loader on the non-local path; on the
+            local path only ``config`` is read (built from the directory if absent).
+
+    Returns:
+        The model in eval mode with weights loaded on CPU (local path), or
+        whatever the original loader returns.
+
+    Raises:
+        FileNotFoundError: None of the candidate weight files exists in the directory.
+        RuntimeError: Raised by ``load_state_dict(strict=True)`` on any key mismatch.
+    """
+    import os
+    from transformers import AutoConfig
+    # Not a local directory: defer to the original loader. `_orig_from_pretrained` is the plain
+    # function behind the original classmethod, so `cls` has to be passed explicitly; without it
+    # the path string would be bound as `cls`.
+    if not os.path.isdir(pretrained_model_name_or_path):
+        return _orig_from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs)
+    config = kwargs.get("config")
+    if config is None:
+        try:
+            # Relies on the "metalatte" type being registered with AutoConfig beforehand
+            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, local_files_only=True)
+        except Exception:
+            # Fallback in case AutoConfig cannot resolve the type
+            config = MetaLATTEConfig.from_pretrained(pretrained_model_name_or_path, local_files_only=True)
+    model = cls(config)
+    # Candidate weight locations, in order of preference: .safetensors > pytorch .bin > stage1
+    candidates = [
+        "model/model.safetensors", "model.safetensors",
+        "model/pytorch_model.bin", "pytorch_model.bin",
+        "model/stage1_model.bin", "stage1_model.bin",
+    ]
+    candidate_paths = (os.path.join(pretrained_model_name_or_path, c) for c in candidates)
+    weight_path = next((p for p in candidate_paths if os.path.exists(p)), None)
+    if weight_path is None:
+        raise FileNotFoundError(f"No weights found in {pretrained_model_name_or_path}; tried {candidates}")
+    if weight_path.endswith(".safetensors"):
+        # Imported lazily so a .bin-only checkpoint does not require safetensors
+        from safetensors.torch import load_file as load_safetensors
+        state = load_safetensors(weight_path, device="cpu")
+    else:
+        state = torch.load(weight_path, map_location="cpu")
+    # strict=True raises RuntimeError itself on missing/unexpected keys, so no manual check is needed
+    model.load_state_dict(state, strict=True)
+    model.eval()
+    return model
+# Save the original loader (the plain function behind the classmethod, via __func__), then install the local-aware version in its place
+_orig_from_pretrained = MultitaskProteinModel.from_pretrained.__func__
+MultitaskProteinModel.from_pretrained = classmethod(_local_aware_from_pretrained)
+# --------------------------------------------------------------------
+# Load config, tokenizer and model from the local snapshot dirs only (local_files_only=True); the model load is expected to go through the patched from_pretrained — TODO confirm AutoModel dispatches to it
+config = AutoConfig.from_pretrained(metalatte_local, local_files_only=True)
 tokenizer = AutoTokenizer.from_pretrained(esm_local, local_files_only=True)
+model = AutoModel.from_pretrained(metalatte_local, config=config, local_files_only=True)
+model.eval()
 @torch.inference_mode()
 def predict(sequence):