melvinalves committed on
Commit
bdfb703
·
verified ·
1 Parent(s): 6f54ff6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -42
app.py CHANGED
@@ -8,97 +8,100 @@ from huggingface_hub import hf_hub_download
8
  from keras.models import load_model
9
 
10
# ——————————————————— CONFIGURATION ——————————————————— #
SPACE_ID = "melvinalves/protein_function_prediction"  # id of this Space
TOP_N = 10        # how many top-scoring GO terms to display
CHUNK_PB = 512    # residues per chunk for ProtBERT / ProtBERT-BFD
CHUNK_ESM = 1024  # residues per chunk for ESM-2
16
  # β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€” HELPERS DE CACHE β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€” #
17
@st.cache_resource
def download_file(path_in_repo: str):
    """Download (and cache) a file from this Space's repo, LFS files included."""
    return hf_hub_download(
        repo_id=SPACE_ID,
        repo_type="space",
        filename=path_in_repo,
    )
26
 
27
@st.cache_resource
def load_keras(file_name: str):
    """Load a Keras (.h5) model from the Space's models/ folder."""
    local_path = download_file(f"models/{file_name}")
    return load_model(local_path, compile=False)
32
 
33
@st.cache_resource
def load_hf_encoder(model_name: str):
    """Load a HuggingFace tokenizer + encoder (ProtBERT/BFD/ESM) in eval mode."""
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
    encoder = AutoModel.from_pretrained(model_name)
    encoder.eval()  # inference only — disable dropout etc.
    return tokenizer, encoder
40
 
41
# ——————————————————— KERAS MODELS (.h5) ——————————————————— #
mlp_pb = load_keras("mlp_protbert.h5")
mlp_bfd = load_keras("mlp_protbertbfd.h5")
mlp_esm = load_keras("mlp_esm2.h5")            # 602 outputs -> trimmed to 597 later
stacking = load_keras("ensemble_stacking.h5")  # expects 1791 concatenated inputs

# ——————————————————— LABEL BINARIZER ——————————————————— #
mlb = joblib.load(download_file("data/mlb_597.pkl"))
go_terms = mlb.classes_
50
 
51
# ——————————————————— CHUNKED EMBEDDINGS ——————————————————— #
def embed_seq(encoder_name: str, seq: str, chunk: int) -> np.ndarray:
    """Return the mean CLS embedding of `seq`, computed over `chunk`-sized slices.

    Each slice is embedded separately and the per-slice CLS vectors are
    averaged into a single (1, hidden_dim) array.
    """
    tok, mdl = load_hf_encoder(encoder_name)
    slices = [seq[start:start + chunk] for start in range(0, len(seq), chunk)]
    cls_vectors = []
    for piece in slices:
        # Residues are space-separated, as the ProtBERT-style tokenizers expect.
        inputs = tok(" ".join(list(piece)), return_tensors="pt", truncation=True)
        with torch.no_grad():
            out = mdl(**inputs)
        cls_vectors.append(out.last_hidden_state[:, 0, :].squeeze().numpy())
    return np.mean(cls_vectors, axis=0, keepdims=True)
62
 
63
# ——————————————————— STREAMLIT UI ——————————————————— #
st.title("🔬 Predição de Funções de Proteínas")

# Slightly smaller font inside the FASTA textarea.
st.markdown("""
<style>
textarea {
    font-size: 0.9rem !important;
}
</style>
""", unsafe_allow_html=True)

fasta = st.text_area("Insere a sequência FASTA:", height=200)

if fasta and st.button("Prever GO terms"):
    # Drop FASTA header lines, then strip spaces/newlines and upper-case.
    seq = "".join(l for l in fasta.splitlines() if not l.startswith(">"))
    seq = seq.replace(" ", "").upper()

    if not seq:
        st.warning("Por favor, insere uma sequência válida.")
        st.stop()

    st.write("⏳ A gerar embeddings…")
    emb_pb = embed_seq("Rostlab/prot_bert", seq, CHUNK_PB)
    emb_bfd = embed_seq("Rostlab/prot_bert_bfd", seq, CHUNK_PB)
    emb_esm = embed_seq("facebook/esm2_t33_650M_UR50D", seq, CHUNK_ESM)

    st.write("🧠 A fazer predições…")
    y_pb = mlp_pb.predict(emb_pb)
    y_bfd = mlp_bfd.predict(emb_bfd)
    y_esm = mlp_esm.predict(emb_esm)[:, :597]  # trim 602 -> 597 outputs

    # Stacking ensemble over the concatenated per-model probabilities.
    X_stack = np.concatenate([y_pb, y_bfd, y_esm], axis=1)  # (1, 1791)
    y_pred = stacking.predict(X_stack)

    # ——— Results ———
    st.subheader("GO terms com probabilidade ≥ 0.5")
    hits = mlb.inverse_transform((y_pred >= 0.5).astype(int))[0]
    st.code("\n".join(hits) if hits else "— nenhum —")

    st.subheader(f"Top {TOP_N} GO terms mais prováveis")
    for idx in np.argsort(-y_pred[0])[:TOP_N]:
        st.write(f"{go_terms[idx]} : {y_pred[0][idx]:.4f}")
 
 
 
 
 
 
 
 
 
 
 
8
  from keras.models import load_model
9
 
10
# ——————————————————— CONFIGURATION ——————————————————— #
SPACE_ID = "melvinalves/protein_function_prediction"
TOP_N = 10        # how many top-scoring GO terms to display
THRESH = 0.50     # probability threshold for listing GO terms
CHUNK_PB = 512    # residues per chunk for ProtBERT / ProtBERT-BFD
CHUNK_ESM = 1024  # residues per chunk for ESM-2
16
 
17
# ——————————————————— CACHE HELPERS ——————————————————— #
@st.cache_resource
def download_file(path_in_repo: str):
    """Fetch (and cache) a file from this Space's repo, LFS files included."""
    return hf_hub_download(
        repo_id=SPACE_ID,
        repo_type="space",
        filename=path_in_repo,
    )
 
 
 
 
 
 
21
 
22
@st.cache_resource
def load_keras(file_name: str):
    """Load a cached Keras .h5 model from the Space's models/ folder."""
    local_path = download_file(f"models/{file_name}")
    return load_model(local_path, compile=False)
 
 
25
 
26
@st.cache_resource
def load_hf_encoder(model_name: str):
    """Load a HuggingFace tokenizer + encoder (ProtBERT/BFD/ESM) in eval mode."""
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
    encoder = AutoModel.from_pretrained(model_name)
    encoder.eval()  # inference only — disable dropout etc.
    return tokenizer, encoder
32
 
33
# ——————————————————— MODELS ——————————————————— #
mlp_pb = load_keras("mlp_protbert.h5")
mlp_bfd = load_keras("mlp_protbertbfd.h5")
mlp_esm = load_keras("mlp_esm2.h5")
# NOTE(review): the previous revision loaded "ensemble_stacking.h5" —
# confirm this matches the file actually stored under models/.
stacking = load_keras("ensemble_stack.h5")

# ——————————————————— LABEL BINARIZER ——————————————————— #
mlb = joblib.load(download_file("data/mlb_597.pkl"))
GO_TERMS = mlb.classes_
42
 
43
# ——————————————————— EMBEDDINGS ——————————————————— #
def embed_seq(model_name: str, seq: str, chunk: int) -> np.ndarray:
    """Return the mean CLS embedding of `seq`, computed over `chunk`-sized slices.

    Parameters
    ----------
    model_name : HuggingFace model id (ProtBERT / ProtBERT-BFD / ESM-2).
    seq        : protein sequence (plain residues, no FASTA header).
    chunk      : number of residues per slice.

    Returns a (1, hidden_dim) numpy array.

    Raises ValueError for an empty sequence (np.mean over zero vectors
    would otherwise fail).
    """
    if not seq:
        raise ValueError("embed_seq: empty sequence")
    tok, mdl = load_hf_encoder(model_name)
    parts = [seq[i:i + chunk] for i in range(0, len(seq), chunk)]
    vecs = []
    for p in parts:
        # Residues space-separated, as the ProtBERT-style tokenizers expect.
        # truncation=True restores the previous revision's behaviour: with
        # special tokens added, a CHUNK_PB-sized slice can exceed the
        # encoder's maximum positions (e.g. 512 for ProtBERT), and
        # truncation=False would then fail at inference — TODO confirm
        # against the encoders' max_position_embeddings.
        inputs = tok(" ".join(p), return_tensors="pt", truncation=True)
        with torch.no_grad():
            out = mdl(**inputs)
        vecs.append(out.last_hidden_state[:, 0, :].squeeze().numpy())
    return np.mean(vecs, axis=0, keepdims=True)
53
 
54
# ——————————————————— UI ——————————————————— #
st.title("🔬 Predição de Funções de Proteínas")

# Slightly smaller font inside the FASTA textarea.
st.markdown(
    """
<style> textarea { font-size: 0.9rem !important; } </style>
""",
    unsafe_allow_html=True,
)
 
63
 
64
fasta = st.text_area("Insere a sequência FASTA:", height=200)

# ---------- PREDICT BUTTON ----------
if fasta and st.button("Prever GO terms"):

    # Drop FASTA header lines, then strip spaces/newlines and upper-case.
    seq = "".join(l for l in fasta.splitlines() if not l.startswith(">"))
    seq = seq.replace(" ", "").upper()

    if not seq:
        st.warning("Por favor, insere uma sequência válida.")
        st.stop()

    # 1) embeddings
    st.write("⏳ A gerar embeddings…")
    emb_pb = embed_seq("Rostlab/prot_bert", seq, CHUNK_PB)
    emb_bfd = embed_seq("Rostlab/prot_bert_bfd", seq, CHUNK_PB)
    emb_esm = embed_seq("facebook/esm2_t33_650M_UR50D", seq, CHUNK_ESM)

    # 2) per-model predictions
    st.write("🧠 A fazer predições…")
    y_pb = mlp_pb.predict(emb_pb)
    y_bfd = mlp_bfd.predict(emb_bfd)
    y_esm = mlp_esm.predict(emb_esm)[:, :597]  # trim 602 -> 597 outputs

    # 3) stacking ensemble over the concatenated probabilities
    X_stack = np.concatenate([y_pb, y_bfd, y_esm], axis=1)  # (1, 1791)
    y_ens = stacking.predict(X_stack)

    # ——— Helper to render one model's predictions ———
    def show_results(label: str, y_pred):
        """Show thresholded hits and the top-N GO terms inside an expander."""
        with st.expander(label, expanded=(label == "Ensemble (Stacking)")):
            hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
            st.markdown(f"**GO terms com prob ≥ {THRESH}**")
            st.code("\n".join(hits) if hits else "— nenhum —")

            st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
            for i in np.argsort(-y_pred[0])[:TOP_N]:
                st.write(f"{GO_TERMS[i]} : {y_pred[0][i]:.4f}")

    # 4) output
    show_results("ProtBERT (MLP)", y_pb)
    show_results("ProtBERT-BFD (MLP)", y_bfd)
    show_results("ESM-2 (MLP)", y_esm)
    show_results("Ensemble (Stacking)", y_ens)