Spaces:

melvinalves
/

protein_function_prediction

Sleeping

App Files Files Community

melvinalves committed on May 25, 2025

Commit

be01d59

verified ·

1 Parent(s): 4ee1dd6

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -14

app.py CHANGED Viewed

@@ -6,11 +6,12 @@ import streamlit as st
 from transformers import AutoTokenizer, AutoModel
 from huggingface_hub import hf_hub_download
 from keras.models import load_model
 # ———————————————————  CONFIG  ——————————————————— #
 SPACE_ID   = "melvinalves/protein_function_prediction"
 TOP_N      = 10
-THRESH     = 0.35
 CHUNK_PB   = 512
 CHUNK_ESM  = 1024
@@ -40,6 +41,14 @@ def embed_seq(model, seq, chunk):
         vecs.append(out.last_hidden_state[:, 0, :].squeeze().numpy())
     return np.mean(vecs, axis=0, keepdims=True)
 # ———————————————————  CARGA MODELOS  ——————————————————— #
 mlp_pb   = load_keras("mlp_protbert.h5")
 mlp_bfd  = load_keras("mlp_protbertbfd.h5")
@@ -60,24 +69,35 @@ st.markdown(
 )
 fasta_input = st.text_area("Insere a sequência FASTA:", height=200)
 predict_clicked = st.button("Prever GO terms")
 if predict_clicked:
-    # ——— Validação mínima ———
-    seq = "".join(l.strip() for l in fasta_input.splitlines() if not l.startswith(">")).replace(" ", "").upper()
     if not seq:
         st.warning("Por favor, insere primeiro uma sequência FASTA válida.")
         st.stop()
-    # ——— 1) EMBEDDINGS ———
     with st.spinner("⏳ A gerar embeddings…"):
         emb_pb  = embed_seq("Rostlab/prot_bert",            seq, CHUNK_PB)
         emb_bfd = embed_seq("Rostlab/prot_bert_bfd",        seq, CHUNK_PB)
         emb_esm = embed_seq("facebook/esm2_t33_650M_UR50D", seq, CHUNK_ESM)
-    # ——— 2) PREDIÇÕES ———
     with st.spinner("🧠 A fazer predições…"):
         y_pb  = mlp_pb.predict(emb_pb)
         y_bfd = mlp_bfd.predict(emb_bfd)
@@ -85,17 +105,32 @@ if predict_clicked:
         X     = np.concatenate([y_pb, y_bfd, y_esm], axis=1)
         y_ens = stacking.predict(X)
-    # ——— 3) MOSTRAR RESULTADOS ———
     def mostrar(tag, y_pred):
-        with st.expander(tag, expanded=(tag == "Ensemble (Stacking)")):
             hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
             st.markdown(f"**GO terms com prob ≥ {THRESH}**")
-            st.code("\n".join(hits) if hits else "— nenhum —")
             st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
-            for i in np.argsort(-y_pred[0])[:TOP_N]:
-                st.write(f"{GO[i]} : {y_pred[0][i]:.4f}")
-    #mostrar("ProtBERT (MLP)",      y_pb)
-    #mostrar("ProtBERT-BFD (MLP)",  y_bfd)
-    #mostrar("ESM-2 (MLP)",         y_esm)
-    mostrar("Ensemble (Stacking)", y_ens)

 from transformers import AutoTokenizer, AutoModel
 from huggingface_hub import hf_hub_download
 from keras.models import load_model
+from goatools.obo_parser import GODag
 # ———————————————————  CONFIG  ——————————————————— #
 SPACE_ID   = "melvinalves/protein_function_prediction"
 TOP_N      = 10
+THRESH     = 0.37
 CHUNK_PB   = 512
 CHUNK_ESM  = 1024
         vecs.append(out.last_hidden_state[:, 0, :].squeeze().numpy())
     return np.mean(vecs, axis=0, keepdims=True)
+@st.cache_resource
+def load_go_info():
+    obo_path = download_file("data/go.obo")
+    dag = GODag(obo_path, optional_attrs=['defn'])
+    return {tid: (term.name, term.defn) for tid, term in dag.items()}
+GO_INFO = load_go_info()
 # ———————————————————  CARGA MODELOS  ——————————————————— #
 mlp_pb   = load_keras("mlp_protbert.h5")
 mlp_bfd  = load_keras("mlp_protbertbfd.h5")
 )
 fasta_input = st.text_area("Insere a sequência FASTA:", height=200)
+selected_model = st.selectbox("Modelo a utilizar:", [
+    "ProtBERT (MLP)",
+    "ProtBERT-BFD (MLP)",
+    "ESM-2 (MLP)",
+    "Ensemble (Stacking)"
+])
 predict_clicked = st.button("Prever GO terms")
 if predict_clicked:
+    # ——— 1) PRÉ-PROCESSAMENTO FASTA ———
+    lines = fasta_input.splitlines()
+    header = next((l for l in lines if l.startswith(">")), None)
+    seq = "".join(l.strip() for l in lines if not l.startswith(">")).replace(" ", "").upper()
     if not seq:
         st.warning("Por favor, insere primeiro uma sequência FASTA válida.")
         st.stop()
+    if header:
+        st.markdown(f"**🧬 ID da proteína:** `{header[1:].strip()}`")
+    # ——— 2) EMBEDDINGS ———
     with st.spinner("⏳ A gerar embeddings…"):
         emb_pb  = embed_seq("Rostlab/prot_bert",            seq, CHUNK_PB)
         emb_bfd = embed_seq("Rostlab/prot_bert_bfd",        seq, CHUNK_PB)
         emb_esm = embed_seq("facebook/esm2_t33_650M_UR50D", seq, CHUNK_ESM)
+    # ——— 3) PREDIÇÕES ———
     with st.spinner("🧠 A fazer predições…"):
         y_pb  = mlp_pb.predict(emb_pb)
         y_bfd = mlp_bfd.predict(emb_bfd)
         X     = np.concatenate([y_pb, y_bfd, y_esm], axis=1)
         y_ens = stacking.predict(X)
+    # ——— 4) RESULTADOS ———
     def mostrar(tag, y_pred):
+        with st.expander(tag, expanded=True):
             hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
             st.markdown(f"**GO terms com prob ≥ {THRESH}**")
+            if hits:
+                for go_id in hits:
+                    name, defin = GO_INFO.get(go_id, ("— sem nome —", "— sem definição —"))
+                    st.write(f"**{go_id} — {name}**")
+                    st.caption(defin)
+            else:
+                st.code("— nenhum —")
             st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
+            for idx in np.argsort(-y_pred[0])[:TOP_N]:
+                go_id = GO[idx]
+                name, _ = GO_INFO.get(go_id, ("", ""))
+                st.write(f"{go_id} — {name} : {y_pred[0][idx]:.4f}")
+    # ——— 5) MOSTRAR RESULTADO DO MODELO ESCOLHIDO ———
+    if selected_model == "ProtBERT (MLP)":
+        mostrar("ProtBERT (MLP)", y_pb)
+    elif selected_model == "ProtBERT-BFD (MLP)":
+        mostrar("ProtBERT-BFD (MLP)", y_bfd)
+    elif selected_model == "ESM-2 (MLP)":
+        mostrar("ESM-2 (MLP)", y_esm)
+    else:
+        mostrar("Ensemble (Stacking)", y_ens)