Spaces:

melvinalves
/

protein_function_prediction

Sleeping

App Files Community

melvinalves committed on Jun 17, 2025

Commit

1dadffc

verified ·

1 Parent(s): c6dfc57

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -46

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 # -------------------------------------------------------------------------------------------------
 #  app.py  –  Streamlit app para predição de GO:MF
-#  Versão: usa ProtBERT & ProtBERT-BFD fine-tuned (melvinalves/FineTune) + ESM-2 base
 # -------------------------------------------------------------------------------------------------
 import os, re, numpy as np, torch, joblib, streamlit as st
 from huggingface_hub import login
@@ -8,17 +9,17 @@ from transformers import AutoTokenizer, AutoModel
 from keras.models import load_model
 from goatools.obo_parser import GODag
-# ———————————————————  AUTHENTICAÇÃO  ——————————————————— #
 login(os.environ["HF_TOKEN"])
 # ———————————————————  CONFIG  ——————————————————— #
 SPACE_ID   = "melvinalves/protein_function_prediction"
 TOP_N      = 10
 THRESH     = 0.37
-CHUNK_PB   = 512
-CHUNK_ESM  = 1024
-# Repositórios dos modelos
 FINETUNED_PB   = ("melvinalves/FineTune", "fineTunedProtbert")
 FINETUNED_BFD  = ("melvinalves/FineTune", "fineTunedProtbertbfd")
 BASE_ESM       = "facebook/esm2_t33_650M_UR50D"
@@ -26,53 +27,60 @@ BASE_ESM       = "facebook/esm2_t33_650M_UR50D"
 # ———————————————————  HELPERS  ——————————————————— #
 @st.cache_resource
 def download_file(path):
-    """Ficheiros pequenos guardados no repositório do Space (≤1 GB total)."""
     from huggingface_hub import hf_hub_download
     return hf_hub_download(repo_id=SPACE_ID, repo_type="space", filename=path)
 @st.cache_resource
 def load_keras(name):
-    """Carrega modelos Keras (MLPs + stacking)."""
     return load_model(download_file(f"models/{name}"), compile=False)
 @st.cache_resource
-def load_hf_encoder(repo_id, subfolder=None, base_tok="Rostlab/prot_bert"):
     """
-    Carrega um encoder HF (PyTorch) – se existir apenas tf_model.h5 no repo,
-    usa from_tf=True para converter on-the-fly.
     """
     tok = AutoTokenizer.from_pretrained(base_tok, do_lower_case=False)
-    mdl = AutoModel.from_pretrained(
-        repo_id,
-        subfolder=subfolder,
-        from_tf=True,   # converte pesos TF se necessário
-    )
     mdl.eval()
     return tok, mdl
 def embed_seq(model_ref, seq, chunk):
     """
-    Extrai embedding médio (CLS) para sequências grandes usando chunks.
-    - model_ref  pode ser string (modelo base) ou tuple (repo_id, subfolder) p/ fine-tuned.
     """
-    if isinstance(model_ref, tuple):
-        tok, mdl = load_hf_encoder(*model_ref)
-    else:
-        # mantém o tokenizer apropriado
-        base_tok = "Rostlab/prot_bert" if "prot_bert" in model_ref else model_ref
-        tok, mdl = load_hf_encoder(model_ref, base_tok=base_tok)
     parts = [seq[i:i+chunk] for i in range(0, len(seq), chunk)]
-    vecs = []
     for p in parts:
-        tokens = tok(" ".join(p), return_tensors="pt", truncation=False)
         with torch.no_grad():
-            out = mdl(**{k: v.to(mdl.device) for k, v in tokens.items()})
         vecs.append(out.last_hidden_state[:, 0, :].cpu().numpy())
     return np.mean(vecs, axis=0, keepdims=True)
 @st.cache_resource
 def load_go_info():
     obo_path = download_file("data/go.obo")
     dag = GODag(obo_path, optional_attrs=["defn"])
     return {tid: (term.name, term.defn) for tid, term in dag.items()}
@@ -91,25 +99,28 @@ GO       = mlb.classes_
 # ———————————————————  UI  ——————————————————— #
 st.title("Predição de Funções Moleculares de Proteínas")
-st.markdown(
-    "<style> textarea { font-size: 0.9rem !important; } </style>",
-    unsafe_allow_html=True,
-)
 fasta_input = st.text_area("Insere uma ou mais sequências FASTA:", height=300)
 predict_clicked = st.button("Prever GO terms")
 # ———————————————————  PARSE DE MÚLTIPLAS SEQUÊNCIAS  ——————————————————— #
 def parse_fasta_multiple(fasta_str):
     entries, parsed = fasta_str.strip().split(">"), []
     for i, entry in enumerate(entries):
         if not entry.strip():
             continue
         lines = entry.strip().splitlines()
-        if i > 0:
             header = lines[0].strip()
             seq = "".join(lines[1:]).replace(" ", "").upper()
-        else:
             header = f"Seq_{i+1}"
             seq = "".join(lines).replace(" ", "").upper()
         if seq:
@@ -125,42 +136,46 @@ if predict_clicked:
     for header, seq in parsed_seqs:
         with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
-            # ——— Embeddings ——— #
             emb_pb  = embed_seq(FINETUNED_PB,  seq, CHUNK_PB)
             emb_bfd = embed_seq(FINETUNED_BFD, seq, CHUNK_PB)
             emb_esm = embed_seq(BASE_ESM,       seq, CHUNK_ESM)
-            # ——— Predições dos MLPs ——— #
             y_pb  = mlp_pb.predict(emb_pb)
             y_bfd = mlp_bfd.predict(emb_bfd)
-            y_esm = mlp_esm.predict(emb_esm)[:, :597]
-            # ——— Stacking ——— #
             X     = np.concatenate([y_pb, y_bfd, y_esm], axis=1)
             y_ens = stacking.predict(X)
-        # ———————————————————  RESULTADOS ——————————————————— #
-        def mostrar_resultados(tag, y_pred):
             with st.expander(tag, expanded=True):
-                hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
                 st.markdown(f"**GO terms com prob ≥ {THRESH}**")
                 if hits:
                     for go_id in hits:
                         name, defin = GO_INFO.get(go_id, ("— sem nome —", ""))
-                        limp = re.sub(r'^\s*"?(.+?)"?\s*(\[[^\]]*\])?\s*$', r'\1', defin or "")
                         st.write(f"**{go_id} — {name}**")
-                        st.caption(limp)
                 else:
                     st.code("— nenhum —")
                 st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
                 for idx in np.argsort(-y_pred[0])[:TOP_N]:
                     go_id = GO[idx]
                     name, _ = GO_INFO.get(go_id, ("", ""))
                     st.write(f"{go_id} — {name} : {y_pred[0][idx]:.4f}")
-        # Mostrar apenas ensemble (descomenta se quiseres os individuais)
-        # mostrar_resultados(f"{header} — ProtBERT",      y_pb)
-        # mostrar_resultados(f"{header} — ProtBERT-BFD",  y_bfd)
-        # mostrar_resultados(f"{header} — ESM-2",         y_esm)
-        mostrar_resultados(header, y_ens)

 # -------------------------------------------------------------------------------------------------
 #  app.py  –  Streamlit app para predição de GO:MF
+#  • ProtBERT / ProtBERT-BFD fine-tuned   (melvinalves/FineTune)
+#  • ESM-2 base                            (facebook/esm2_t33_650M_UR50D)
 # -------------------------------------------------------------------------------------------------
 import os, re, numpy as np, torch, joblib, streamlit as st
 from huggingface_hub import login
 from keras.models import load_model
 from goatools.obo_parser import GODag
+# ———————————————————  AUTENTICAÇÃO  ——————————————————— #
 login(os.environ["HF_TOKEN"])
 # ———————————————————  CONFIG  ——————————————————— #
 SPACE_ID   = "melvinalves/protein_function_prediction"
 TOP_N      = 10
 THRESH     = 0.37
+CHUNK_PB   = 512   # janela ProtBERT / ProtBERT-BFD
+CHUNK_ESM  = 1024  # janela ESM-2
+# repositórios HF
 FINETUNED_PB   = ("melvinalves/FineTune", "fineTunedProtbert")
 FINETUNED_BFD  = ("melvinalves/FineTune", "fineTunedProtbertbfd")
 BASE_ESM       = "facebook/esm2_t33_650M_UR50D"
 # ———————————————————  HELPERS  ——————————————————— #
 @st.cache_resource
 def download_file(path):
+    """Ficheiros pequenos (≤1 GB) guardados no Space."""
     from huggingface_hub import hf_hub_download
     return hf_hub_download(repo_id=SPACE_ID, repo_type="space", filename=path)
 @st.cache_resource
 def load_keras(name):
+    """Carrega modelos Keras (MLPs e stacking)."""
     return load_model(download_file(f"models/{name}"), compile=False)
+# ---------- carregar tokenizer + encoder ----------
 @st.cache_resource
+def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
     """
+    • repo_id   : repositório HF ou caminho local
+    • subfolder : subpasta onde vivem pesos/config (None se não houver)
+    • base_tok  : repo para o tokenizer      (None => usa repo_id)
+    Converte tf_model.h5 → PyTorch on-the-fly (from_tf=True).
     """
+    if base_tok is None:
+        base_tok = repo_id
     tok = AutoTokenizer.from_pretrained(base_tok, do_lower_case=False)
+    kwargs = dict(from_tf=True)
+    if subfolder:
+        kwargs["subfolder"] = subfolder
+    mdl = AutoModel.from_pretrained(repo_id, **kwargs)
     mdl.eval()
     return tok, mdl
+# ---------- extrair embedding ----------
 def embed_seq(model_ref, seq, chunk):
     """
+    • model_ref = string (modelo base)  OU  tuple(repo_id, subfolder) (modelo fine-tuned)
+    Retorna embedding CLS médio (caso a sequência seja dividida em chunks).
     """
+    if isinstance(model_ref, tuple):                # ProtBERT / ProtBERT-BFD fine-tuned
+        repo_id, subf = model_ref
+        tok, mdl = load_hf_encoder(repo_id, subfolder=subf,
+                                   base_tok="Rostlab/prot_bert")
+    else:                                           # modelo base (ESM-2)
+        tok, mdl = load_hf_encoder(model_ref)
     parts = [seq[i:i+chunk] for i in range(0, len(seq), chunk)]
+    vecs  = []
     for p in parts:
+        toks = tok(" ".join(p), return_tensors="pt", truncation=False)
         with torch.no_grad():
+            out = mdl(**{k: v.to(mdl.device) for k, v in toks.items()})
         vecs.append(out.last_hidden_state[:, 0, :].cpu().numpy())
     return np.mean(vecs, axis=0, keepdims=True)
 @st.cache_resource
 def load_go_info():
+    """Lê GO.obo e devolve dicionário id → (name, definition)."""
     obo_path = download_file("data/go.obo")
     dag = GODag(obo_path, optional_attrs=["defn"])
     return {tid: (term.name, term.defn) for tid, term in dag.items()}
 # ———————————————————  UI  ——————————————————— #
 st.title("Predição de Funções Moleculares de Proteínas")
+# Pequeno ajuste de fonte no textarea
+st.markdown("<style> textarea { font-size: 0.9rem !important; } </style>",
+            unsafe_allow_html=True)
 fasta_input = st.text_area("Insere uma ou mais sequências FASTA:", height=300)
 predict_clicked = st.button("Prever GO terms")
 # ———————————————————  PARSE DE MÚLTIPLAS SEQUÊNCIAS  ——————————————————— #
 def parse_fasta_multiple(fasta_str):
+    """
+    Devolve lista de (header, seq) a partir de texto FASTA possivelmente múltiplo.
+    Suporta bloco inicial sem '>'.
+    """
     entries, parsed = fasta_str.strip().split(">"), []
     for i, entry in enumerate(entries):
         if not entry.strip():
             continue
         lines = entry.strip().splitlines()
+        if i > 0:  # bloco típico FASTA
             header = lines[0].strip()
             seq = "".join(lines[1:]).replace(" ", "").upper()
+        else:      # sequência sem '>'
             header = f"Seq_{i+1}"
             seq = "".join(lines).replace(" ", "").upper()
         if seq:
     for header, seq in parsed_seqs:
         with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
+            # ————————————  EMBEDDINGS  ———————————— #
             emb_pb  = embed_seq(FINETUNED_PB,  seq, CHUNK_PB)
             emb_bfd = embed_seq(FINETUNED_BFD, seq, CHUNK_PB)
             emb_esm = embed_seq(BASE_ESM,       seq, CHUNK_ESM)
+            # ————————————  PREDIÇÕES MLPs  ———————————— #
             y_pb  = mlp_pb.predict(emb_pb)
             y_bfd = mlp_bfd.predict(emb_bfd)
+            y_esm = mlp_esm.predict(emb_esm)[:, :597]  # alinhar nº de termos
+            # ————————————  STACKING  ———————————— #
             X     = np.concatenate([y_pb, y_bfd, y_esm], axis=1)
             y_ens = stacking.predict(X)
+        # ———————————————————  RESULTADOS  ——————————————————— #
+        def mostrar(tag, y_pred):
             with st.expander(tag, expanded=True):
+                # GO terms acima do threshold
                 st.markdown(f"**GO terms com prob ≥ {THRESH}**")
+                hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
                 if hits:
                     for go_id in hits:
                         name, defin = GO_INFO.get(go_id, ("— sem nome —", ""))
+                        defin = re.sub(r'^\s*"?(.+?)"?\s*(\[[^\]]*\])?\s*$', r'\1',
+                                       defin or "")
                         st.write(f"**{go_id} — {name}**")
+                        st.caption(defin)
                 else:
                     st.code("— nenhum —")
+                # Top-N mais prováveis
                 st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
                 for idx in np.argsort(-y_pred[0])[:TOP_N]:
                     go_id = GO[idx]
                     name, _ = GO_INFO.get(go_id, ("", ""))
                     st.write(f"{go_id} — {name} : {y_pred[0][idx]:.4f}")
+        # ———————————————————  ESCOLHE QUAIS MOSTRAR  ——————————————————— #
+        #   Descomenta se quiseres ver as saídas individuais
+        # mostrar(f"{header} — ProtBERT (MLP)",     y_pb)
+        # mostrar(f"{header} — ProtBERT-BFD (MLP)", y_bfd)
+        # mostrar(f"{header} — ESM-2 (MLP)",        y_esm)
+        mostrar(header, y_ens)  # ensemble