Spaces:

melvinalves
/

protein_function_prediction

Build error

App Files Community

melvinalves committed on May 23, 2025

Commit

d31f1ca

verified ·

1 Parent(s): 0cbd3d0

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -65

app.py CHANGED Viewed

@@ -7,96 +7,90 @@ from transformers import AutoTokenizer, AutoModel
 from huggingface_hub import hf_hub_download
 from keras.models import load_model
-# ---------- Configuração ----------
-SPACE_ID = "melvinalves/protein_function_prediction"
-TOP_N = 10
-CHUNK_PB = 512
-CHUNK_ESM = 1024
-# ---------- Cache de downloads ----------
 @st.cache_resource
-def download_model_file(filename):
-    local_path = hf_hub_download(
         repo_id=SPACE_ID,
         repo_type="space",
-        filename=f"models/{filename}",
     )
-    print(f"📦 {filename} → {os.path.getsize(local_path)} bytes")
-    return local_path
 @st.cache_resource
-def load_hf_model(model_name):
-    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
-    model = AutoModel.from_pretrained(model_name)
-    model.eval()
-    return tokenizer, model
 @st.cache_resource
-def load_keras_model(filename):
-    path = download_model_file(filename)
-    return load_model(path, compile=False)
-# ---------- Carregar modelos ----------
-mlp_pb   = load_keras_model("mlp_protbert.keras")
-mlp_bfd  = load_keras_model("mlp_protbertbfd.keras")
-mlp_esm  = load_keras_model("mlp_esm2.keras")
-stacking = load_keras_model("ensemble_stacking.keras")
-# ---------- Carregar MultiLabelBinarizer ----------
-mlb = joblib.load(hf_hub_download(
-    repo_id=SPACE_ID,
-    repo_type="space",
-    filename="data/mlb_597.pkl"
-))
 go_terms = mlb.classes_
-# ---------- Função para gerar embeddings ----------
-def embed_sequence(model_name, seq, chunk_size):
-    tokenizer, model = load_hf_model(model_name)
-    def format_seq(s): return " ".join(list(s))
-    chunks = [seq[i:i+chunk_size] for i in range(0, len(seq), chunk_size)]
-    embeddings = []
-    for chunk in chunks:
-        formatted = format_seq(chunk)
-        inputs = tokenizer(formatted, return_tensors="pt", truncation=True)
         with torch.no_grad():
-            outputs = model(**inputs)
-        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
-        embeddings.append(cls_embedding)
-    return np.mean(embeddings, axis=0, keepdims=True)
-# ---------- Interface Streamlit ----------
 st.title("🔬 Predição de Funções de Proteínas")
-user_input = st.text_area("Insere a sequência FASTA:", height=200)
-if user_input and st.button("Prever GO terms"):
-    # Limpar sequência FASTA
-    sequence = "\n".join([line for line in user_input.splitlines() if not line.startswith(">")])
-    sequence = sequence.replace(" ", "").replace("\n", "").strip().upper()
-    if not sequence:
         st.warning("Por favor, insere uma sequência válida.")
         st.stop()
     st.write("⏳ A gerar embeddings…")
-    emb_pb  = embed_sequence("Rostlab/prot_bert",            sequence, CHUNK_PB)
-    emb_bfd = embed_sequence("Rostlab/prot_bert_bfd",        sequence, CHUNK_PB)
-    emb_esm = embed_sequence("facebook/esm2_t33_650M_UR50D", sequence, CHUNK_ESM)
-    st.write("🧠 A fazer predições...")
     y_pb  = mlp_pb.predict(emb_pb)
     y_bfd = mlp_bfd.predict(emb_bfd)
-    y_esm = mlp_esm.predict(emb_esm)[:, :597]  # garantir alinhamento
-    X_stack = np.concatenate([y_pb, y_bfd, y_esm], axis=1)
-    y_pred = stacking.predict(X_stack)
-    st.subheader("🎯 GO terms com probabilidade ≥ 0.5")
     hits = mlb.inverse_transform((y_pred >= 0.5).astype(int))[0]
     st.code("\n".join(hits) if hits else "— nenhum —")
-    st.subheader(f"⭐ Top {TOP_N} GO terms mais prováveis")
-    top_idx = np.argsort(-y_pred[0])[:TOP_N]
-    for i in top_idx:
-        st.write(f"{go_terms[i]} : {y_pred[0][i]:.4f}")

 from huggingface_hub import hf_hub_download
 from keras.models import load_model
+# ———————————————————  CONFIGURAÇÃO  ——————————————————— #
+SPACE_ID      = "melvinalves/protein_function_prediction"   # id deste Space
+TOP_N         = 10
+CHUNK_PB      = 512
+CHUNK_ESM     = 1024
+# ———————————————————  HELPERS DE CACHE  ——————————————————— #
 @st.cache_resource
+def download_file(path_in_repo: str):
+    """Descarrega (e faz cache) um ficheiro do próprio Space, mesmo que esteja em LFS."""
+    local = hf_hub_download(
         repo_id=SPACE_ID,
         repo_type="space",
+        filename=path_in_repo,
     )
+    return local
 @st.cache_resource
+def load_keras(file_name: str):
+    """Carrega um modelo Keras (.h5) via hf_hub_download + load_model()."""
+    full_path = download_file(f"models/{file_name}")
+    return load_model(full_path, compile=False)
 @st.cache_resource
+def load_hf_encoder(model_name: str):
+    """Carrega tokenizer + encoder HuggingFace (ProtBERT/BFD/ESM)."""
+    tok = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
+    mdl = AutoModel.from_pretrained(model_name)
+    mdl.eval()
+    return tok, mdl
+# ———————————————————  MODELOS KERAS (.h5)  ——————————————————— #
+mlp_pb   = load_keras("mlp_protbert.h5")
+mlp_bfd  = load_keras("mlp_protbertbfd.h5")
+mlp_esm  = load_keras("mlp_esm2.h5")           # 602 saídas → corta-se p/ 597
+stacking = load_keras("ensemble_stacking.h5")  # espera 1791 entradas
+# ———————————————————  LABEL BINARIZER  ——————————————————— #
+mlb = joblib.load(download_file("data/mlb_597.pkl"))
 go_terms = mlb.classes_
+# ———————————————————  EMBEDDING POR CHUNKS  ——————————————————— #
+def embed_seq(encoder_name: str, seq: str, chunk: int) -> np.ndarray:
+    tok, mdl = load_hf_encoder(encoder_name)
+    fmt = lambda s: " ".join(list(s))
+    parts = [seq[i:i+chunk] for i in range(0, len(seq), chunk)]
+    vecs  = []
+    for p in parts:
         with torch.no_grad():
+            out = mdl(**tok(fmt(p), return_tensors="pt", truncation=True))
+        vecs.append(out.last_hidden_state[:, 0, :].squeeze().numpy())
+    return np.mean(vecs, axis=0, keepdims=True)
+# ———————————————————  INTERFACE STREAMLIT  ——————————————————— #
 st.title("🔬 Predição de Funções de Proteínas")
+fasta = st.text_area("Insere a sequência FASTA:", height=200)
+if fasta and st.button("Prever GO terms"):
+    # limpar FASTA
+    seq = "\n".join(l for l in fasta.splitlines() if not l.startswith(">"))
+    seq = seq.replace(" ", "").replace("\n", "").upper()
+    if not seq:
         st.warning("Por favor, insere uma sequência válida.")
         st.stop()
     st.write("⏳ A gerar embeddings…")
+    emb_pb  = embed_seq("Rostlab/prot_bert",            seq, CHUNK_PB)
+    emb_bfd = embed_seq("Rostlab/prot_bert_bfd",        seq, CHUNK_PB)
+    emb_esm = embed_seq("facebook/esm2_t33_650M_UR50D", seq, CHUNK_ESM)
+    st.write("🧠 A fazer predições…")
     y_pb  = mlp_pb.predict(emb_pb)
     y_bfd = mlp_bfd.predict(emb_bfd)
+    y_esm = mlp_esm.predict(emb_esm)[:, :597]      # corta 602 → 597
+    X_stack = np.concatenate([y_pb, y_bfd, y_esm], axis=1)  # (1, 1791)
+    y_pred  = stacking.predict(X_stack)
+    # ——— Resultados ———
+    st.subheader("GO terms com probabilidade ≥ 0.5")
     hits = mlb.inverse_transform((y_pred >= 0.5).astype(int))[0]
     st.code("\n".join(hits) if hits else "— nenhum —")
+    st.subheader(f"Top {TOP_N} GO terms mais prováveis")
+    for idx in np.argsort(-y_pred[0])[:TOP_N]:
+        st.write(f"{go_terms[idx]} : {y_pred[0][idx]:.4f}")