melvinalves committed on
Commit
1e0b741
·
verified ·
1 Parent(s): c9aceda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -58
app.py CHANGED
@@ -14,10 +14,10 @@ login(os.environ["HF_TOKEN"])
14
 
15
  # ——————————————————— CONFIG ——————————————————— #
16
  SPACE_ID = "melvinalves/protein_function_prediction"
17
- TOP_N = 20 # top-20
18
  THRESH = 0.37
19
- CHUNK_PB = 512 # janela ProtBERT
20
- CHUNK_ESM = 1024 # janela ESM-2
21
 
22
  # repositórios HF
23
  FINETUNED_PB = ("melvinalves/FineTune", "fineTunedProtbert")
@@ -36,14 +36,10 @@ def load_keras(name):
36
  """Carrega modelos Keras (MLPs e stacking)."""
37
  return load_model(download_file(f"models/{name}"), compile=False)
38
 
39
- # ---------- carregar tokenizer + encoder ----------
40
  @st.cache_resource
41
  def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
42
  """
43
- • repo_id : repositório HF ou caminho local
44
- • subfolder : subpasta (None se não houver)
45
- • base_tok : repo para o tokenizer (None => usa repo_id)
46
- Converte tf_model.h5 → PyTorch on-the-fly (from_tf=True).
47
  """
48
  if base_tok is None:
49
  base_tok = repo_id
@@ -56,17 +52,15 @@ def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
56
  mdl.eval()
57
  return tok, mdl
58
 
59
- # ---------- extrair embedding ----------
60
  def embed_seq(model_ref, seq, chunk):
61
  """
62
- • model_ref = string (modelo base) ou tuple(repo_id, subfolder) (fine-tuned)
63
- Retorna embedding CLS médio (se a sequência for dividida em chunks).
64
  """
65
- if isinstance(model_ref, tuple): # ProtBERT fine-tuned
66
  repo_id, subf = model_ref
67
  tok, mdl = load_hf_encoder(repo_id, subfolder=subf,
68
  base_tok="Rostlab/prot_bert")
69
- else: # modelo base (ESM-2)
70
  tok, mdl = load_hf_encoder(model_ref)
71
 
72
  parts = [seq[i:i+chunk] for i in range(0, len(seq), chunk)]
@@ -80,9 +74,8 @@ def embed_seq(model_ref, seq, chunk):
80
 
81
  @st.cache_resource
82
  def load_go_info():
83
- """Lê GO.obo e devolve dicionário id → (name, definition)."""
84
- obo_path = download_file("data/go.obo")
85
- dag = GODag(obo_path, optional_attrs=["defn"])
86
  return {tid: (term.name, term.defn) for tid, term in dag.items()}
87
 
88
  GO_INFO = load_go_info()
@@ -100,72 +93,68 @@ GO = mlb.classes_
100
  st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
101
  page_icon="🧬", layout="centered")
102
 
103
- # CSS: fundo branco e separador vertical
104
  st.markdown(
105
  """
106
  <style>
107
- /* traço vertical entre colunas dentro dos expanders */
 
 
108
  div[data-testid="column"]:first-child {
109
- border-right:1px solid #000000;
110
- padding-right:1rem !important;
111
  }
112
  </style>
113
  """,
114
  unsafe_allow_html=True
115
  )
116
 
117
- # Logo (logo.png na raiz do Space)
118
  if os.path.exists("logo.png"):
119
  st.image("logo.png", width=180)
120
 
121
  st.title("Predição de Funções Moleculares de Proteínas (GO:MF)")
122
 
123
- fasta_input = st.text_area("Insere uma ou mais sequências FASTA:", height=300)
124
- predict_clicked = st.button("Prever GO terms")
125
 
126
- # ——————————————————— PARSE FASTA ——————————————————— #
127
- def parse_fasta_multiple(fasta_str):
128
- """
129
- Devolve lista de (header, seq). Suporta bloco inicial sem '>'.
130
- """
131
- entries, parsed = fasta_str.strip().split(">"), []
132
- for i, entry in enumerate(entries):
133
- if not entry.strip():
134
  continue
135
- lines = entry.strip().splitlines()
136
- if i > 0: # bloco FASTA normal
137
- header = lines[0].strip()
138
- seq = "".join(lines[1:]).replace(" ", "").upper()
139
- else: # sequência sem '>'
140
- header = f"Seq_{i+1}"
141
- seq = "".join(lines).replace(" ", "").upper()
142
  if seq:
143
- parsed.append((header, seq))
144
- return parsed
 
 
 
 
 
 
 
145
 
146
- # ——————————————————— LIGAÇÕES ——————————————————— #
147
  def go_link(go_id, name=""):
148
- """Link QuickGO."""
149
  url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
150
  return f"[{go_id} — {name}]({url})" if name else f"[{go_id}]({url})"
151
 
152
  # ——————————————————— MOSTRAR RESULTADOS ——————————————————— #
153
  def mostrar(header, y_pred):
154
- """Expander com botão UniProt e 2 colunas (≥0.37 | Top-20)."""
155
  pid = header.split()[0]
156
  uniprot = f"https://www.uniprot.org/uniprotkb/{pid}"
157
 
158
  with st.expander(header, expanded=True):
159
- # botão UniProt
160
  st.markdown(
161
  f"""
162
  <div style="text-align:right;margin-bottom:0.5rem">
163
  <a href="{uniprot}" target="_blank">
164
  <button style="background:#2b8cbe;border:none;border-radius:4px;
165
  padding:0.35rem 0.8rem;color:#fff;font-size:0.9rem;
166
- cursor:pointer">
167
- Visitar UniProt
168
- </button>
169
  </a>
170
  </div>
171
  """,
@@ -174,26 +163,24 @@ def mostrar(header, y_pred):
174
 
175
  col1, col2 = st.columns(2)
176
 
177
- # coluna 1 — GO terms acima do threshold
178
  with col1:
179
  st.markdown(f"**GO terms com prob ≥ {THRESH}**")
180
  hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
181
  if hits:
182
  for go_id in hits:
183
- name, defin = GO_INFO.get(go_id, ("— sem nome —", ""))
184
- # limpeza: remove [ … ], aspas e espaços extra
185
- defin = re.sub(r"\\[[^\\]]*\\]", "", defin or "")
186
- defin = defin.strip(' "')
187
  st.markdown(f"- {go_link(go_id, name)}")
188
  if defin:
189
  st.caption(defin)
190
  else:
191
  st.code("— nenhum —")
192
 
193
- # coluna 2 — top-20
194
  with col2:
195
  st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
196
- for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], start=1):
197
  go_id = GO[idx]
198
  name, _ = GO_INFO.get(go_id, ("", ""))
199
  st.markdown(f"{rank}. {go_link(go_id, name)} : {y_pred[0][idx]:.4f}")
@@ -202,22 +189,19 @@ def mostrar(header, y_pred):
202
  if predict_clicked:
203
  for header, seq in parse_fasta_multiple(fasta_input):
204
  with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
205
- # embeddings
206
  emb_pb = embed_seq(FINETUNED_PB, seq, CHUNK_PB)
207
  emb_bfd = embed_seq(FINETUNED_BFD, seq, CHUNK_PB)
208
  emb_esm = embed_seq(BASE_ESM, seq, CHUNK_ESM)
209
 
210
- # predições
211
  y_pb = mlp_pb.predict(emb_pb)
212
  y_bfd = mlp_bfd.predict(emb_bfd)
213
  y_esm = mlp_esm.predict(emb_esm)[:, :597]
214
 
215
- # stacking
216
  y_ens = stacking.predict(np.concatenate([y_pb, y_bfd, y_esm], axis=1))
217
 
218
  mostrar(header, y_ens)
219
 
220
- # ——————————————————— LISTA COMPLETA DOS 597 TERMOS ——————————————————— #
221
  with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
222
  cols = st.columns(3)
223
  for i, go_id in enumerate(GO):
 
14
 
15
  # ——————————————————— CONFIG ——————————————————— #
16
  SPACE_ID = "melvinalves/protein_function_prediction"
17
+ TOP_N = 20
18
  THRESH = 0.37
19
+ CHUNK_PB = 512
20
+ CHUNK_ESM = 1024
21
 
22
  # repositórios HF
23
  FINETUNED_PB = ("melvinalves/FineTune", "fineTunedProtbert")
 
36
  """Carrega modelos Keras (MLPs e stacking)."""
37
  return load_model(download_file(f"models/{name}"), compile=False)
38
 
 
39
  @st.cache_resource
40
  def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
41
  """
42
+ Carrega tokenizer + encoder HF (converte TF-weights p/ PyTorch on-the-fly).
 
 
 
43
  """
44
  if base_tok is None:
45
  base_tok = repo_id
 
52
  mdl.eval()
53
  return tok, mdl
54
 
 
55
  def embed_seq(model_ref, seq, chunk):
56
  """
57
+ Devolve embedding CLS médio (divide seq. longa em chunks se necessário).
 
58
  """
59
+ if isinstance(model_ref, tuple): # ProtBERT fine-tuned
60
  repo_id, subf = model_ref
61
  tok, mdl = load_hf_encoder(repo_id, subfolder=subf,
62
  base_tok="Rostlab/prot_bert")
63
+ else: # modelo base ESM-2
64
  tok, mdl = load_hf_encoder(model_ref)
65
 
66
  parts = [seq[i:i+chunk] for i in range(0, len(seq), chunk)]
 
74
 
75
  @st.cache_resource
76
  def load_go_info():
77
+ """Lê GO.obo e devolve {id: (name, definition bruta)}."""
78
+ dag = GODag(download_file("data/go.obo"), optional_attrs=["defn"])
 
79
  return {tid: (term.name, term.defn) for tid, term in dag.items()}
80
 
81
  GO_INFO = load_go_info()
 
93
  st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
94
  page_icon="🧬", layout="centered")
95
 
 
96
  st.markdown(
97
  """
98
  <style>
99
+ body, .stApp { background:#FFFFFF !important; }
100
+ .block-container { padding-top:1.5rem; }
101
+ textarea { font-size:0.9rem !important; }
102
  div[data-testid="column"]:first-child {
103
+ border-right:1px solid #E0E0E0; padding-right:1rem !important;
 
104
  }
105
  </style>
106
  """,
107
  unsafe_allow_html=True
108
  )
109
 
 
110
  if os.path.exists("logo.png"):
111
  st.image("logo.png", width=180)
112
 
113
  st.title("Predição de Funções Moleculares de Proteínas (GO:MF)")
114
 
115
+ fasta_input = st.text_area("Insere uma ou mais sequências FASTA:", height=300)
116
+ predict_clicked = st.button("Prever GO terms")
117
 
118
+ # ——————————————————— UTILITÁRIOS ——————————————————— #
119
+ def parse_fasta_multiple(text):
120
+ """Extrai [(header, seq)] de texto FASTA (bloco inicial sem '>' suportado)."""
121
+ out, blocks = [], text.strip().split(">")
122
+ for i, blk in enumerate(blocks):
123
+ if not blk.strip():
 
 
124
  continue
125
+ lines = blk.strip().splitlines()
126
+ if i > 0:
127
+ header, seq = lines[0].strip(), "".join(lines[1:]).replace(" ", "").upper()
128
+ else:
129
+ header, seq = f"Seq_{i+1}", "".join(lines).replace(" ", "").upper()
 
 
130
  if seq:
131
+ out.append((header, seq))
132
+ return out
133
+
134
+ def clean_definition(defin: str) -> str:
135
+ """Remove '\"', blocos [ ... ] e múltiplos espaços."""
136
+ defin = re.sub(r"\\[[^\\]]*\\]", "", defin or "") # tira citações [...]
137
+ defin = defin.replace('"', "") # tira aspas
138
+ defin = re.sub(r"\\s{2,}", " ", defin) # colapsa espaços
139
+ return defin.strip()
140
 
 
141
  def go_link(go_id, name=""):
 
142
  url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
143
  return f"[{go_id} — {name}]({url})" if name else f"[{go_id}]({url})"
144
 
145
  # ——————————————————— MOSTRAR RESULTADOS ——————————————————— #
146
  def mostrar(header, y_pred):
 
147
  pid = header.split()[0]
148
  uniprot = f"https://www.uniprot.org/uniprotkb/{pid}"
149
 
150
  with st.expander(header, expanded=True):
 
151
  st.markdown(
152
  f"""
153
  <div style="text-align:right;margin-bottom:0.5rem">
154
  <a href="{uniprot}" target="_blank">
155
  <button style="background:#2b8cbe;border:none;border-radius:4px;
156
  padding:0.35rem 0.8rem;color:#fff;font-size:0.9rem;
157
+ cursor:pointer">Visitar UniProt</button>
 
 
158
  </a>
159
  </div>
160
  """,
 
163
 
164
  col1, col2 = st.columns(2)
165
 
166
+ # --- coluna 1 : ≥ threshold
167
  with col1:
168
  st.markdown(f"**GO terms com prob ≥ {THRESH}**")
169
  hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
170
  if hits:
171
  for go_id in hits:
172
+ name, defin_raw = GO_INFO.get(go_id, ("— sem nome —", ""))
173
+ defin = clean_definition(defin_raw)
 
 
174
  st.markdown(f"- {go_link(go_id, name)}")
175
  if defin:
176
  st.caption(defin)
177
  else:
178
  st.code("— nenhum —")
179
 
180
+ # --- coluna 2 : Top-20
181
  with col2:
182
  st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
183
+ for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], 1):
184
  go_id = GO[idx]
185
  name, _ = GO_INFO.get(go_id, ("", ""))
186
  st.markdown(f"{rank}. {go_link(go_id, name)} : {y_pred[0][idx]:.4f}")
 
189
  if predict_clicked:
190
  for header, seq in parse_fasta_multiple(fasta_input):
191
  with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
 
192
  emb_pb = embed_seq(FINETUNED_PB, seq, CHUNK_PB)
193
  emb_bfd = embed_seq(FINETUNED_BFD, seq, CHUNK_PB)
194
  emb_esm = embed_seq(BASE_ESM, seq, CHUNK_ESM)
195
 
 
196
  y_pb = mlp_pb.predict(emb_pb)
197
  y_bfd = mlp_bfd.predict(emb_bfd)
198
  y_esm = mlp_esm.predict(emb_esm)[:, :597]
199
 
 
200
  y_ens = stacking.predict(np.concatenate([y_pb, y_bfd, y_esm], axis=1))
201
 
202
  mostrar(header, y_ens)
203
 
204
+ # ——————————————————— LISTA COMPLETA (597) ——————————————————— #
205
  with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
206
  cols = st.columns(3)
207
  for i, go_id in enumerate(GO):