melvinalves committed on
Commit
c9aceda
·
verified ·
1 Parent(s): 3b8f083

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -56
app.py CHANGED
@@ -14,9 +14,9 @@ login(os.environ["HF_TOKEN"])
14
 
15
  # ——————————————————— CONFIG ——————————————————— #
16
  SPACE_ID = "melvinalves/protein_function_prediction"
17
- TOP_N = 20 # mostra agora top-20
18
  THRESH = 0.37
19
- CHUNK_PB = 512 # janela ProtBERT / ProtBERT-BFD
20
  CHUNK_ESM = 1024 # janela ESM-2
21
 
22
  # repositórios HF
@@ -41,7 +41,7 @@ def load_keras(name):
41
  def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
42
  """
43
  • repo_id : repositório HF ou caminho local
44
- • subfolder : subpasta onde vivem pesos/config (None se não houver)
45
  • base_tok : repo para o tokenizer (None => usa repo_id)
46
  Converte tf_model.h5 → PyTorch on-the-fly (from_tf=True).
47
  """
@@ -59,8 +59,8 @@ def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
59
  # ---------- extrair embedding ----------
60
  def embed_seq(model_ref, seq, chunk):
61
  """
62
- • model_ref = string (modelo base) OU tuple(repo_id, subfolder) (modelo fine-tuned)
63
- Retorna embedding CLS médio (caso a sequência seja dividida em chunks).
64
  """
65
  if isinstance(model_ref, tuple): # ProtBERT fine-tuned
66
  repo_id, subf = model_ref
@@ -100,80 +100,72 @@ GO = mlb.classes_
100
  st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
101
  page_icon="🧬", layout="centered")
102
 
103
- # Fundo branco + separador de colunas
104
  st.markdown(
105
  """
106
  <style>
107
- body, .stApp { background-color: #FFFFFF !important; }
108
- .block-container { padding-top: 1.5rem; }
109
- textarea { font-size: 0.9rem !important; }
110
-
111
- /* traço vertical entre primeiras colunas */
112
  div[data-testid="column"]:first-child {
113
- border-right: 1px solid #E0E0E0;
114
- padding-right: 1rem !important;
115
  }
116
  </style>
117
  """,
118
  unsafe_allow_html=True
119
  )
120
 
121
- # Logo (coloca logo.png na raiz do Space)
122
- LOGO_PATH = "logo.png"
123
- if os.path.exists(LOGO_PATH):
124
- st.image(LOGO_PATH, width=180)
125
 
126
  st.title("Predição de Funções Moleculares de Proteínas (GO:MF)")
127
 
128
  fasta_input = st.text_area("Insere uma ou mais sequências FASTA:", height=300)
129
  predict_clicked = st.button("Prever GO terms")
130
 
131
- # ——————————————————— PARSE DE MÚLTIPLAS SEQUÊNCIAS ——————————————————— #
132
  def parse_fasta_multiple(fasta_str):
133
  """
134
- Devolve lista de (header, seq) a partir de texto FASTA possivelmente múltiplo.
135
- Suporta bloco inicial sem '>'.
136
  """
137
  entries, parsed = fasta_str.strip().split(">"), []
138
  for i, entry in enumerate(entries):
139
  if not entry.strip():
140
  continue
141
  lines = entry.strip().splitlines()
142
- if i > 0: # bloco típico FASTA
143
  header = lines[0].strip()
144
  seq = "".join(lines[1:]).replace(" ", "").upper()
145
- else: # sequência sem '>'
146
  header = f"Seq_{i+1}"
147
  seq = "".join(lines).replace(" ", "").upper()
148
  if seq:
149
  parsed.append((header, seq))
150
  return parsed
151
 
152
- # ——————————————————— FUNÇÕES AUXILIARES DE LAYOUT ——————————————————— #
153
  def go_link(go_id, name=""):
154
- """Cria link para página do GO term (QuickGO)."""
155
  url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
156
- label = f"{go_id} — {name}" if name else go_id
157
- return f"[{label}]({url})"
158
 
159
- # ——————————————————— FUNÇÃO PRINCIPAL DE RESULTADOS ——————————————————— #
160
  def mostrar(header, y_pred):
161
- """Mostra resultados (botão UniProt + duas colunas)."""
162
- prot_id = header.split()[0]
163
- prot_url = f"https://www.uniprot.org/uniprotkb/{prot_id}"
164
 
165
  with st.expander(header, expanded=True):
166
- # Botão «Visitar UniProt»
167
  st.markdown(
168
  f"""
169
- <div style="text-align:right; margin-bottom:0.5rem;">
170
- <a href="{prot_url}" target="_blank">
171
- <button style="
172
- background:#2b8cbe;border:none;border-radius:4px;
173
- padding:0.35rem 0.8rem;color:#fff;font-size:0.9rem;
174
- cursor:pointer;">
175
- Visitar UniProt
176
- </button>
177
  </a>
178
  </div>
179
  """,
@@ -182,22 +174,23 @@ def mostrar(header, y_pred):
182
 
183
  col1, col2 = st.columns(2)
184
 
185
- # ——— coluna 1 : termos acima do threshold
186
  with col1:
187
  st.markdown(f"**GO terms com prob ≥ {THRESH}**")
188
  hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
189
  if hits:
190
  for go_id in hits:
191
  name, defin = GO_INFO.get(go_id, ("— sem nome —", ""))
192
- defin = re.sub(r'^\\s*"?(.+?)"?\\s*(\\[[^\\]]*\\])?\\s*$', r'\\1',
193
- defin or "")
 
194
  st.markdown(f"- {go_link(go_id, name)}")
195
  if defin:
196
  st.caption(defin)
197
  else:
198
  st.code("— nenhum —")
199
 
200
- # ——— coluna 2 : top-N mais prováveis
201
  with col2:
202
  st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
203
  for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], start=1):
@@ -207,30 +200,24 @@ def mostrar(header, y_pred):
207
 
208
  # ——————————————————— INFERÊNCIA ——————————————————— #
209
  if predict_clicked:
210
- parsed_seqs = parse_fasta_multiple(fasta_input)
211
- if not parsed_seqs:
212
- st.warning("Não foi possível encontrar nenhuma sequência válida.")
213
- st.stop()
214
-
215
- for header, seq in parsed_seqs:
216
  with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
217
- # ———————————— EMBEDDINGS ———————————— #
218
  emb_pb = embed_seq(FINETUNED_PB, seq, CHUNK_PB)
219
  emb_bfd = embed_seq(FINETUNED_BFD, seq, CHUNK_PB)
220
  emb_esm = embed_seq(BASE_ESM, seq, CHUNK_ESM)
221
 
222
- # ———————————— PREDIÇÕES MLPs ———————————— #
223
  y_pb = mlp_pb.predict(emb_pb)
224
  y_bfd = mlp_bfd.predict(emb_bfd)
225
- y_esm = mlp_esm.predict(emb_esm)[:, :597] # alinhar nº de termos
226
 
227
- # ———————————— STACKING ———————————— #
228
- X = np.concatenate([y_pb, y_bfd, y_esm], axis=1)
229
- y_ens = stacking.predict(X)
230
 
231
  mostrar(header, y_ens)
232
 
233
- # ——————————————————— LISTA COMPLETA DE TERMOS SUPORTADOS ——————————————————— #
234
  with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
235
  cols = st.columns(3)
236
  for i, go_id in enumerate(GO):
 
14
 
15
  # ——————————————————— CONFIG ——————————————————— #
16
  SPACE_ID = "melvinalves/protein_function_prediction"
17
+ TOP_N = 20 # top-20
18
  THRESH = 0.37
19
+ CHUNK_PB = 512 # janela ProtBERT
20
  CHUNK_ESM = 1024 # janela ESM-2
21
 
22
  # repositórios HF
 
41
  def load_hf_encoder(repo_id, subfolder=None, base_tok=None):
42
  """
43
  • repo_id : repositório HF ou caminho local
44
+ • subfolder : subpasta (None se não houver)
45
  • base_tok : repo para o tokenizer (None => usa repo_id)
46
  Converte tf_model.h5 → PyTorch on-the-fly (from_tf=True).
47
  """
 
59
  # ---------- extrair embedding ----------
60
  def embed_seq(model_ref, seq, chunk):
61
  """
62
+ • model_ref = string (modelo base) ou tuple(repo_id, subfolder) (fine-tuned)
63
+ Retorna embedding CLS médio (se a sequência for dividida em chunks).
64
  """
65
  if isinstance(model_ref, tuple): # ProtBERT fine-tuned
66
  repo_id, subf = model_ref
 
100
  st.set_page_config(page_title="Predição de Funções Moleculares de Proteínas",
101
  page_icon="🧬", layout="centered")
102
 
103
+ # CSS: fundo branco e separador vertical
104
  st.markdown(
105
  """
106
  <style>
107
+ /* traço vertical entre colunas dentro dos expanders */
 
 
 
 
108
  div[data-testid="column"]:first-child {
109
+ border-right:1px solid #000000;
110
+ padding-right:1rem !important;
111
  }
112
  </style>
113
  """,
114
  unsafe_allow_html=True
115
  )
116
 
117
+ # Logo (logo.png na raiz do Space)
118
+ if os.path.exists("logo.png"):
119
+ st.image("logo.png", width=180)
 
120
 
121
  st.title("Predição de Funções Moleculares de Proteínas (GO:MF)")
122
 
123
  fasta_input = st.text_area("Insere uma ou mais sequências FASTA:", height=300)
124
  predict_clicked = st.button("Prever GO terms")
125
 
126
+ # ——————————————————— PARSE FASTA ——————————————————— #
127
  def parse_fasta_multiple(fasta_str):
128
  """
129
+ Devolve lista de (header, seq). Suporta bloco inicial sem '>'.
 
130
  """
131
  entries, parsed = fasta_str.strip().split(">"), []
132
  for i, entry in enumerate(entries):
133
  if not entry.strip():
134
  continue
135
  lines = entry.strip().splitlines()
136
+ if i > 0: # bloco FASTA normal
137
  header = lines[0].strip()
138
  seq = "".join(lines[1:]).replace(" ", "").upper()
139
+ else: # sequência sem '>'
140
  header = f"Seq_{i+1}"
141
  seq = "".join(lines).replace(" ", "").upper()
142
  if seq:
143
  parsed.append((header, seq))
144
  return parsed
145
 
146
+ # ——————————————————— LIGAÇÕES ——————————————————— #
147
  def go_link(go_id, name=""):
148
+ """Link QuickGO."""
149
  url = f"https://www.ebi.ac.uk/QuickGO/term/{go_id}"
150
+ return f"[{go_id} — {name}]({url})" if name else f"[{go_id}]({url})"
 
151
 
152
+ # ——————————————————— MOSTRAR RESULTADOS ——————————————————— #
153
  def mostrar(header, y_pred):
154
+ """Expander com botão UniProt e 2 colunas (≥0.37 | Top-20)."""
155
+ pid = header.split()[0]
156
+ uniprot = f"https://www.uniprot.org/uniprotkb/{pid}"
157
 
158
  with st.expander(header, expanded=True):
159
+ # botão UniProt
160
  st.markdown(
161
  f"""
162
+ <div style="text-align:right;margin-bottom:0.5rem">
163
+ <a href="{uniprot}" target="_blank">
164
+ <button style="background:#2b8cbe;border:none;border-radius:4px;
165
+ padding:0.35rem 0.8rem;color:#fff;font-size:0.9rem;
166
+ cursor:pointer">
167
+ Visitar UniProt
168
+ </button>
 
169
  </a>
170
  </div>
171
  """,
 
174
 
175
  col1, col2 = st.columns(2)
176
 
177
+ # coluna 1 — GO terms acima do threshold
178
  with col1:
179
  st.markdown(f"**GO terms com prob ≥ {THRESH}**")
180
  hits = mlb.inverse_transform((y_pred >= THRESH).astype(int))[0]
181
  if hits:
182
  for go_id in hits:
183
  name, defin = GO_INFO.get(go_id, ("— sem nome —", ""))
184
+ # limpeza: remove [ … ], aspas e espaços extra
185
+ defin = re.sub(r"\\[[^\\]]*\\]", "", defin or "")
186
+ defin = defin.strip(' "')
187
  st.markdown(f"- {go_link(go_id, name)}")
188
  if defin:
189
  st.caption(defin)
190
  else:
191
  st.code("— nenhum —")
192
 
193
+ # coluna 2 — top-20
194
  with col2:
195
  st.markdown(f"**Top {TOP_N} GO terms mais prováveis**")
196
  for rank, idx in enumerate(np.argsort(-y_pred[0])[:TOP_N], start=1):
 
200
 
201
  # ——————————————————— INFERÊNCIA ——————————————————— #
202
  if predict_clicked:
203
+ for header, seq in parse_fasta_multiple(fasta_input):
 
 
 
 
 
204
  with st.spinner(f"A processar {header}… (pode demorar alguns minutos)"):
205
+ # embeddings
206
  emb_pb = embed_seq(FINETUNED_PB, seq, CHUNK_PB)
207
  emb_bfd = embed_seq(FINETUNED_BFD, seq, CHUNK_PB)
208
  emb_esm = embed_seq(BASE_ESM, seq, CHUNK_ESM)
209
 
210
+ # prediΓ§Γ΅es
211
  y_pb = mlp_pb.predict(emb_pb)
212
  y_bfd = mlp_bfd.predict(emb_bfd)
213
+ y_esm = mlp_esm.predict(emb_esm)[:, :597]
214
 
215
+ # stacking
216
+ y_ens = stacking.predict(np.concatenate([y_pb, y_bfd, y_esm], axis=1))
 
217
 
218
  mostrar(header, y_ens)
219
 
220
+ # ——————————————————— LISTA COMPLETA DOS 597 TERMOS ——————————————————— #
221
  with st.expander("Mostrar lista completa dos 597 GO terms possíveis", expanded=False):
222
  cols = st.columns(3)
223
  for i, go_id in enumerate(GO):