openfree committed on
Commit
cb1dc3c
·
verified ·
1 Parent(s): 8c47e15

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +261 -219
app.py CHANGED
@@ -1,56 +1,60 @@
1
  import os
2
  import json
3
  import time
4
- import hashlib
5
  from typing import List, Dict, Tuple
6
 
7
  import streamlit as st
8
  import requests
9
 
10
- # Optional heavy deps; guard imports so the app still loads
11
  try:
12
  import torch
13
  from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
14
- except Exception as e:
15
- torch = None
16
- AutoTokenizer = None
17
- AutoModel = None
18
- AutoModelForMaskedLM = None
19
 
20
  try:
21
  from datasets import load_dataset
 
22
  except Exception:
23
- load_dataset = None
24
 
25
  try:
26
  from sentence_transformers import SentenceTransformer
 
27
  except Exception:
28
- SentenceTransformer = None
29
 
30
  try:
31
- import faiss # faiss-cpu
 
32
  except Exception:
33
- faiss = None
34
 
35
  try:
36
  from Bio import SeqIO
 
37
  except Exception:
38
- SeqIO = None
39
 
 
40
  APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
41
  DISCLAIMER = (
42
  "This tool is for research/education and is not a medical device. "
43
  "Do not use outputs for diagnosis or treatment decisions."
44
  )
45
 
46
- # --------------- Helpers ---------------
47
 
48
  def get_secret(name: str, fallback: str = "") -> str:
49
  """Get secret from st.secrets, environment, or fallback"""
50
  try:
51
- return st.secrets.get(name, os.environ.get(name, fallback))
52
- except Exception:
53
- return os.environ.get(name, fallback)
 
 
54
 
55
  def brave_search(query: str, count: int = 5) -> List[Dict]:
56
  """Search using Brave Search API"""
@@ -79,9 +83,7 @@ def brave_search(query: str, count: int = 5) -> List[Dict]:
79
  "url": item.get("url", ""),
80
  "snippet": item.get("description", ""),
81
  })
82
- if not results:
83
- results = [{"title": "No results", "url": "", "snippet": "Query returned no results."}]
84
- return results
85
  except Exception as e:
86
  return [{"title": "Search error", "url": "", "snippet": str(e)}]
87
 
@@ -123,11 +125,11 @@ def load_text_from_file(upload) -> str:
123
 
124
  try:
125
  text = content.decode("utf-8", errors="ignore")
126
- except Exception:
127
  text = str(content)
128
 
129
- # FASTA quick parse
130
- if name.endswith((".fa", ".fasta", ".faa", ".fna")) and SeqIO is not None:
131
  upload.seek(0)
132
  try:
133
  records = list(SeqIO.parse(upload, "fasta"))
@@ -135,14 +137,14 @@ def load_text_from_file(upload) -> str:
135
  for r in records:
136
  seqs.append(f">{r.id}\n{str(r.seq)}")
137
  return "\n\n".join(seqs)
138
- except Exception:
139
- return text
140
 
141
  return text
142
 
143
  def build_vector_index(texts: List[str], embedder_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
144
  """Build FAISS vector index from texts"""
145
- if SentenceTransformer is None or faiss is None:
146
  return None, None, None
147
 
148
  try:
@@ -169,15 +171,18 @@ def search_index(query: str, index, model, texts: List[str], k: int = 4):
169
  if 0 <= idx < len(texts):
170
  hits.append({"score": float(score), "text": texts[idx]})
171
  return hits
172
- except Exception:
173
  return []
174
 
175
  def esm2_embed(seq: str, model_id: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
176
  """Generate ESM-2 embedding for protein sequence"""
177
- if AutoTokenizer is None or AutoModelForMaskedLM is None or torch is None:
178
- return {"error": "Transformers/torch not available"}
179
 
180
  try:
 
 
 
181
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
182
  model = AutoModelForMaskedLM.from_pretrained(model_id, trust_remote_code=True)
183
  model.eval()
@@ -185,18 +190,21 @@ def esm2_embed(seq: str, model_id: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
185
  with torch.no_grad():
186
  toks = tokenizer(seq, return_tensors="pt")
187
  out = model(**toks, output_hidden_states=True)
188
- hidden = out.hidden_states[-1].mean(dim=1).squeeze(0) # [hidden_size]
189
  vec = hidden.detach().cpu().numpy()
190
  return {"embedding": vec.tolist(), "hidden_size": vec.shape[0]}
191
  except Exception as e:
192
  return {"error": str(e)}
193
 
194
  def dna_embed(seq: str, model_id: str = "zhihan1996/DNABERT-2-117M") -> Dict:
195
- """Generate DNABERT-2 or Nucleotide Transformer embedding for DNA sequence"""
196
- if AutoTokenizer is None or AutoModel is None or torch is None:
197
- return {"error": "Transformers/torch not available"}
198
 
199
  try:
 
 
 
200
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
201
  model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
202
  model.eval()
@@ -219,112 +227,13 @@ def chunk_text(text: str, chunk_size: int = 1200, overlap: int = 200) -> List[st
219
  while start < len(text):
220
  end = min(len(text), start + chunk_size)
221
  chunks.append(text[start:end])
222
- start = end - overlap
223
- if start < 0:
224
- start = 0
225
  if end >= len(text):
226
  break
 
227
 
228
  return chunks
229
 
230
- def safe_len(obj, default=0):
231
- """Safely get length of object"""
232
- try:
233
- return len(obj)
234
- except Exception:
235
- return default
236
-
237
- # --------------- UI ---------------
238
-
239
- st.set_page_config(page_title=APP_TITLE, page_icon="๐Ÿงฌ", layout="wide")
240
- st.title(APP_TITLE)
241
- st.caption(DISCLAIMER)
242
-
243
- # Sidebar configuration
244
- with st.sidebar:
245
- st.header("Keys and settings")
246
- fw_key = st.text_input("FIREWORKS_API_KEY", value=get_secret("FIREWORKS_API_KEY", ""), type="password")
247
- brave_key = st.text_input("BRAVE_API_KEY", value=get_secret("BRAVE_API_KEY", ""), type="password")
248
-
249
- if fw_key:
250
- os.environ["FIREWORKS_API_KEY"] = fw_key
251
- if brave_key:
252
- os.environ["BRAVE_API_KEY"] = brave_key
253
-
254
- st.markdown("### Model selections")
255
- esm2_id = st.text_input(
256
- "Protein model (ESM-2)",
257
- value="facebook/esm2_t6_8M_UR50D",
258
- help="Try larger models like facebook/esm2_t33_650M_UR50D if resources allow."
259
- )
260
- dna_id = st.text_input(
261
- "DNA model",
262
- value="zhihan1996/DNABERT-2-117M",
263
- help="Alternative: InstaDeepAI/nucleotide-transformer-500m-human-ref"
264
- )
265
-
266
- use_web = st.checkbox("Use Brave web search for context", value=True)
267
- web_k = st.slider("Web results", 1, 10, 4)
268
-
269
- st.markdown("### Datasets (optional)")
270
- ds_hint = "Enter a Hugging Face dataset repo id, e.g., 'genomics-benchmark/jaspar_motifs'"
271
- dataset_ids = st.text_area("Datasets to load (one per line)", value="", help=ds_hint)
272
-
273
- st.divider()
274
- st.markdown("Files you upload are indexed locally and used for answers.")
275
-
276
- # Main tabs
277
- tabs = st.tabs(["Chat", "Protein", "DNA", "Examples", "About"])
278
-
279
- # File upload and indexing
280
- with st.expander("Upload files for context (txt/csv/json/fasta/vcf)", expanded=True):
281
- uploads = st.file_uploader(
282
- "Add files",
283
- type=["txt", "md", "csv", "tsv", "json", "fa", "fasta", "faa", "fna", "vcf"],
284
- accept_multiple_files=True
285
- )
286
- docs = []
287
- if uploads:
288
- for up in uploads:
289
- try:
290
- txt = load_text_from_file(up)
291
- docs.extend(chunk_text(txt))
292
- except Exception as e:
293
- st.warning(f"Failed to read {up.name}: {e}")
294
- st.caption(f"Indexed chunks: {len(docs)}")
295
-
296
- # Build vector index
297
- index = None
298
- index_model = None
299
- if docs:
300
- with st.spinner("Building vector index..."):
301
- index, emb, index_model = build_vector_index(docs)
302
-
303
- # Load datasets
304
- loaded_datasets = []
305
- if dataset_ids.strip():
306
- if load_dataset is None:
307
- st.warning("datasets library not available")
308
- else:
309
- for rid in [x.strip() for x in dataset_ids.splitlines() if x.strip()]:
310
- with st.spinner(f"Loading dataset {rid} ..."):
311
- try:
312
- ds = load_dataset(rid)
313
- # Show a sample without materializing fully
314
- sample = ""
315
- for split in ds.keys():
316
- try:
317
- row = ds[split][0]
318
- sample = json.dumps(row, ensure_ascii=False)[:500]
319
- break
320
- except Exception:
321
- pass
322
- loaded_datasets.append((rid, sample))
323
- st.success(f"Loaded {rid}")
324
- except Exception as e:
325
- st.error(f"Failed to load {rid}: {e}")
326
-
327
- def build_context(user_query: str) -> Tuple[str, List[Dict]]:
328
  """Build context from various sources"""
329
  pieces = []
330
  sources = []
@@ -355,9 +264,9 @@ def build_context(user_query: str) -> Tuple[str, List[Dict]]:
355
  context = "\n\n---\n\n".join(pieces)[:6000]
356
  return context, sources
357
 
358
- def chat_answer(user_query: str) -> Tuple[str, List[Dict]]:
359
  """Generate chat answer with context"""
360
- context, sources = build_context(user_query)
361
  system = (
362
  "You are a concise, careful bioinformatics assistant for protein and DNA. "
363
  "Answer with factual, verifiable statements. "
@@ -373,93 +282,226 @@ def chat_answer(user_query: str) -> Tuple[str, List[Dict]]:
373
  answer = call_fireworks(messages, temperature=0.4, max_tokens=1200)
374
  return answer, sources
375
 
376
- # Chat tab
377
- with tabs[0]:
378
- st.subheader("Chat")
379
- q = st.text_area("Ask a question about protein/DNA", value="ESM-2 ์ž„๋ฒ ๋”ฉ์€ ๋‹จ๋ฐฑ์งˆ ๊ธฐ๋Šฅ ํ•ด์„์— ์–ด๋–ป๊ฒŒ ๋„์›€๋˜๋‚˜์š”?")
 
 
 
 
 
 
380
 
381
- if st.button("Answer", type="primary"):
382
- with st.spinner("Thinking..."):
383
- ans, srcs = chat_answer(q)
384
- st.write(ans)
 
 
 
 
 
 
 
 
 
 
 
385
 
386
- if srcs:
387
- st.markdown("#### Sources")
388
- for s in srcs:
389
- if s.get("type") == "web" and s.get("url"):
390
- st.markdown(f"- {s.get('title','web')}: {s.get('url')}")
391
- elif s.get("type") == "dataset":
392
- st.markdown(f"- dataset: {s.get('id')}")
393
- elif s.get("type") == "file":
394
- snippet = s.get("text", "")
395
- st.markdown(f"- file snippet: {snippet[:120]}...")
396
-
397
- # Protein tab
398
- with tabs[1]:
399
- st.subheader("Protein analysis")
400
- seq = st.text_area("Protein sequence (FASTA seq only; single sequence)", value="MKTIIALSYIFCLVFADYKDDDDK")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
 
402
- col1, col2 = st.columns(2)
403
- with col1:
404
- st.caption("ESM-2 embedding")
405
- if st.button("Run ESM-2", key="run_esm2"):
406
- with st.spinner("Computing ESM-2 embedding..."):
407
- out = esm2_embed(seq, esm2_id)
408
- if "error" in out:
409
- st.error(out["error"])
410
- else:
411
- st.success(f"Vector size: {out['hidden_size']}")
412
- st.json({"embedding_preview": out["embedding"][:8]})
413
-
414
- with col2:
415
- st.caption("Quick stats")
416
- s = seq.replace("\n", "").replace(" ", "")
417
- length = len(s)
418
- aa_set = sorted(set(list(s)))
419
- st.write(f"Length: {length}")
420
- st.write(f"Unique AAs: {''.join(aa_set)[:30]}")
421
-
422
- # DNA tab
423
- with tabs[2]:
424
- st.subheader("DNA analysis")
425
- dseq = st.text_area("DNA sequence (ACGT only)", value="ATGCGTACGTAGCTAGCTAGCTAGGCTAGC")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
 
427
- col3, col4 = st.columns(2)
428
- with col3:
429
- st.caption("DNABERT-2 / Nucleotide Transformer embedding")
430
- if st.button("Run DNA embed", key="run_dna"):
431
- with st.spinner("Computing DNA embedding..."):
432
- out = dna_embed(dseq, dna_id)
433
- if "error" in out:
434
- st.error(out["error"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
435
  else:
436
- st.success(f"Vector size: {out['hidden_size']}")
437
- st.json({"embedding_preview": out["embedding"][:8]})
 
438
 
439
- with col4:
440
- st.caption("GC content")
441
- s = dseq.upper().replace("N", "")
442
- if len(s) > 0:
443
- gc = (s.count("G") + s.count("C")) / len(s)
444
- else:
445
- gc = 0
446
- st.write(f"Length: {len(s)}")
447
- st.write(f"GC: {gc:.3f}")
448
-
449
- # Examples tab
450
- with tabs[3]:
451
- st.subheader("Examples")
452
- st.markdown("- ์—…๋กœ๋“œํ•œ FASTA์—์„œ ํŠน์ • ๋‹จ๋ฐฑ์งˆ์˜ ๊ธฐ๋Šฅ ์š”์•ฝ๊ณผ ๋ณ€์ด ์˜ํ–ฅ ์งˆ๋ฌธ")
453
- st.markdown("- DNA ์„œ์—ด์—์„œ ํ”„๋กœ๋ชจํ„ฐ ๊ฐ€๋Šฅ์„ฑ๊ณผ ์ „์‚ฌ์ธ์ž ๋ชจํ‹ฐํ”„ ๊ด€๋ จ ๊ทผ๊ฑฐ ์š”์ฒญ")
454
- st.markdown("- Enzyme active site ๊ทผ์ ‘ ๋ณ€์ด์˜ ๋ฆฌ์Šคํฌ ํ•ด์„(์—ฐ๊ตฌ ๊ด€์ )")
455
- st.markdown("- ENCODE/UniProt/AlphaFold ๊ฐœ๋… ์„ค๋ช… ์š”์ฒญ")
456
- st.markdown("- RAG ๊ธฐ๋ฐ˜์œผ๋กœ ๋ฌธ์„œ ์ธ์šฉ๊ณผ ํ•จ๊ป˜ ๊ฐ„๋žต ๋‹ต๋ณ€ ์š”์ฒญ")
457
-
458
- # About tab
459
- with tabs[4]:
460
- st.subheader("About this Space")
461
- st.write("Models suggested: ESM-2 for proteins; DNABERT-2 or Nucleotide Transformer for DNA.")
462
- st.write("Datasets commonly used: UniProtKB, AlphaFoldDB, ENCODE, JASPAR, ClinVar.")
463
- st.write("Web search powered by Brave Search if API key is provided.")
464
- st.write("")
465
- st.info(DISCLAIMER)
 
 
 
 
1
  import os
2
  import json
3
  import time
 
4
  from typing import List, Dict, Tuple
5
 
6
  import streamlit as st
7
  import requests
8
 
9
+ # Guard imports for optional dependencies
10
  try:
11
  import torch
12
  from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
13
+ TORCH_AVAILABLE = True
14
+ except Exception:
15
+ TORCH_AVAILABLE = False
 
 
16
 
17
  try:
18
  from datasets import load_dataset
19
+ DATASETS_AVAILABLE = True
20
  except Exception:
21
+ DATASETS_AVAILABLE = False
22
 
23
  try:
24
  from sentence_transformers import SentenceTransformer
25
+ SENTENCE_TRANSFORMERS_AVAILABLE = True
26
  except Exception:
27
+ SENTENCE_TRANSFORMERS_AVAILABLE = False
28
 
29
  try:
30
+ import faiss
31
+ FAISS_AVAILABLE = True
32
  except Exception:
33
+ FAISS_AVAILABLE = False
34
 
35
  try:
36
  from Bio import SeqIO
37
+ BIOPYTHON_AVAILABLE = True
38
  except Exception:
39
+ BIOPYTHON_AVAILABLE = False
40
 
41
+ # Constants
42
  APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
43
  DISCLAIMER = (
44
  "This tool is for research/education and is not a medical device. "
45
  "Do not use outputs for diagnosis or treatment decisions."
46
  )
47
 
48
+ # --------------- Helper Functions ---------------
49
 
def get_secret(name: str, fallback: str = "") -> str:
    """Look up a secret by name.

    Resolution order: ``st.secrets`` (when available and readable), then the
    environment variable ``name``, then ``fallback``.

    Args:
        name: Key to look up in ``st.secrets`` and ``os.environ``.
        fallback: Value returned when the key is found in neither place.

    Returns:
        The resolved secret value, or ``fallback``.
    """
    env_value = os.environ.get(name, fallback)
    try:
        if hasattr(st, "secrets"):
            return st.secrets.get(name, env_value)
    except Exception:
        # Best-effort lookup: accessing st.secrets may raise (presumably when
        # no secrets file is configured -- confirm against Streamlit docs).
        # Only ordinary errors are swallowed; KeyboardInterrupt/SystemExit
        # now propagate, which the previous bare ``except:`` prevented.
        pass
    return env_value
58
 
59
  def brave_search(query: str, count: int = 5) -> List[Dict]:
60
  """Search using Brave Search API"""
 
83
  "url": item.get("url", ""),
84
  "snippet": item.get("description", ""),
85
  })
86
+ return results if results else [{"title": "No results", "url": "", "snippet": "Query returned no results."}]
 
 
87
  except Exception as e:
88
  return [{"title": "Search error", "url": "", "snippet": str(e)}]
89
 
 
125
 
126
  try:
127
  text = content.decode("utf-8", errors="ignore")
128
+ except:
129
  text = str(content)
130
 
131
+ # FASTA file handling
132
+ if name.endswith((".fa", ".fasta", ".faa", ".fna")) and BIOPYTHON_AVAILABLE:
133
  upload.seek(0)
134
  try:
135
  records = list(SeqIO.parse(upload, "fasta"))
 
137
  for r in records:
138
  seqs.append(f">{r.id}\n{str(r.seq)}")
139
  return "\n\n".join(seqs)
140
+ except:
141
+ pass
142
 
143
  return text
144
 
145
  def build_vector_index(texts: List[str], embedder_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
146
  """Build FAISS vector index from texts"""
147
+ if not SENTENCE_TRANSFORMERS_AVAILABLE or not FAISS_AVAILABLE:
148
  return None, None, None
149
 
150
  try:
 
171
  if 0 <= idx < len(texts):
172
  hits.append({"score": float(score), "text": texts[idx]})
173
  return hits
174
+ except:
175
  return []
176
 
177
  def esm2_embed(seq: str, model_id: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
178
  """Generate ESM-2 embedding for protein sequence"""
179
+ if not TORCH_AVAILABLE:
180
+ return {"error": "Transformers/torch not available. Please wait for dependencies to install."}
181
 
182
  try:
183
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
184
+ import torch
185
+
186
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
187
  model = AutoModelForMaskedLM.from_pretrained(model_id, trust_remote_code=True)
188
  model.eval()
 
190
  with torch.no_grad():
191
  toks = tokenizer(seq, return_tensors="pt")
192
  out = model(**toks, output_hidden_states=True)
193
+ hidden = out.hidden_states[-1].mean(dim=1).squeeze(0)
194
  vec = hidden.detach().cpu().numpy()
195
  return {"embedding": vec.tolist(), "hidden_size": vec.shape[0]}
196
  except Exception as e:
197
  return {"error": str(e)}
198
 
199
  def dna_embed(seq: str, model_id: str = "zhihan1996/DNABERT-2-117M") -> Dict:
200
+ """Generate DNA embedding"""
201
+ if not TORCH_AVAILABLE:
202
+ return {"error": "Transformers/torch not available. Please wait for dependencies to install."}
203
 
204
  try:
205
+ from transformers import AutoTokenizer, AutoModel
206
+ import torch
207
+
208
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
209
  model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
210
  model.eval()
 
227
  while start < len(text):
228
  end = min(len(text), start + chunk_size)
229
  chunks.append(text[start:end])
 
 
 
230
  if end >= len(text):
231
  break
232
+ start = end - overlap
233
 
234
  return chunks
235
 
236
+ def build_context(user_query: str, index, index_model, docs: List[str], loaded_datasets: List, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  """Build context from various sources"""
238
  pieces = []
239
  sources = []
 
264
  context = "\n\n---\n\n".join(pieces)[:6000]
265
  return context, sources
266
 
267
+ def chat_answer(user_query: str, index, index_model, docs: List[str], loaded_datasets: List, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
268
  """Generate chat answer with context"""
269
+ context, sources = build_context(user_query, index, index_model, docs, loaded_datasets, use_web, web_k)
270
  system = (
271
  "You are a concise, careful bioinformatics assistant for protein and DNA. "
272
  "Answer with factual, verifiable statements. "
 
282
  answer = call_fireworks(messages, temperature=0.4, max_tokens=1200)
283
  return answer, sources
284
 
285
+ # --------------- Main Application ---------------
286
+
287
+ def main():
288
+ st.set_page_config(page_title=APP_TITLE, page_icon="๐Ÿงฌ", layout="wide")
289
+ st.title(APP_TITLE)
290
+ st.caption(DISCLAIMER)
291
+
292
+ # Check dependencies status
293
+ if not TORCH_AVAILABLE:
294
+ st.warning("โณ PyTorch is being installed. Some features may be unavailable initially. Please refresh in a minute.")
295
 
296
+ # Initialize session state
297
+ if 'docs' not in st.session_state:
298
+ st.session_state.docs = []
299
+ if 'index' not in st.session_state:
300
+ st.session_state.index = None
301
+ if 'index_model' not in st.session_state:
302
+ st.session_state.index_model = None
303
+ if 'loaded_datasets' not in st.session_state:
304
+ st.session_state.loaded_datasets = []
305
+
306
+ # Sidebar configuration
307
+ with st.sidebar:
308
+ st.header("Keys and settings")
309
+ fw_key = st.text_input("FIREWORKS_API_KEY", value=get_secret("FIREWORKS_API_KEY", ""), type="password")
310
+ brave_key = st.text_input("BRAVE_API_KEY", value=get_secret("BRAVE_API_KEY", ""), type="password")
311
 
312
+ if fw_key:
313
+ os.environ["FIREWORKS_API_KEY"] = fw_key
314
+ if brave_key:
315
+ os.environ["BRAVE_API_KEY"] = brave_key
316
+
317
+ st.markdown("### Model selections")
318
+ esm2_id = st.text_input(
319
+ "Protein model (ESM-2)",
320
+ value="facebook/esm2_t6_8M_UR50D",
321
+ help="Try larger models like facebook/esm2_t33_650M_UR50D if resources allow."
322
+ )
323
+ dna_id = st.text_input(
324
+ "DNA model",
325
+ value="zhihan1996/DNABERT-2-117M",
326
+ help="Alternative: InstaDeepAI/nucleotide-transformer-500m-human-ref"
327
+ )
328
+
329
+ use_web = st.checkbox("Use Brave web search for context", value=True)
330
+ web_k = st.slider("Web results", 1, 10, 4)
331
+
332
+ st.markdown("### Datasets (optional)")
333
+ dataset_ids = st.text_area(
334
+ "Datasets to load (one per line)",
335
+ value="",
336
+ help="Enter Hugging Face dataset repo ids, e.g., 'genomics-benchmark/jaspar_motifs'"
337
+ )
338
+
339
+ st.divider()
340
+ st.markdown("Files you upload are indexed locally and used for answers.")
341
 
342
+ # Main tabs
343
+ tabs = st.tabs(["Chat", "Protein", "DNA", "Examples", "About"])
344
+
345
+ # File upload section
346
+ with st.expander("Upload files for context (txt/csv/json/fasta/vcf)", expanded=True):
347
+ uploads = st.file_uploader(
348
+ "Add files",
349
+ type=["txt", "md", "csv", "tsv", "json", "fa", "fasta", "faa", "fna", "vcf"],
350
+ accept_multiple_files=True,
351
+ key="file_uploader"
352
+ )
353
+
354
+ if uploads:
355
+ docs = []
356
+ for up in uploads:
357
+ try:
358
+ txt = load_text_from_file(up)
359
+ docs.extend(chunk_text(txt))
360
+ except Exception as e:
361
+ st.warning(f"Failed to read {up.name}: {e}")
362
+
363
+ st.session_state.docs = docs
364
+ st.caption(f"Indexed chunks: {len(docs)}")
365
+
366
+ # Build index if docs available
367
+ if docs and SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
368
+ with st.spinner("Building vector index..."):
369
+ index, emb, index_model = build_vector_index(docs)
370
+ st.session_state.index = index
371
+ st.session_state.index_model = index_model
372
+ else:
373
+ st.caption("No files uploaded yet")
374
+
375
+ # Load datasets if specified
376
+ if dataset_ids.strip() and DATASETS_AVAILABLE:
377
+ dataset_list = [x.strip() for x in dataset_ids.splitlines() if x.strip()]
378
+ if dataset_list != [d[0] for d in st.session_state.loaded_datasets]:
379
+ st.session_state.loaded_datasets = []
380
+ for rid in dataset_list:
381
+ with st.spinner(f"Loading dataset {rid}..."):
382
+ try:
383
+ ds = load_dataset(rid)
384
+ sample = ""
385
+ for split in ds.keys():
386
+ try:
387
+ row = ds[split][0]
388
+ sample = json.dumps(row, ensure_ascii=False)[:500]
389
+ break
390
+ except:
391
+ pass
392
+ st.session_state.loaded_datasets.append((rid, sample))
393
+ st.success(f"Loaded {rid}")
394
+ except Exception as e:
395
+ st.error(f"Failed to load {rid}: {e}")
396
+
397
+ # Chat tab
398
+ with tabs[0]:
399
+ st.subheader("Chat")
400
+ q = st.text_area("Ask a question about protein/DNA", value="ESM-2 ์ž„๋ฒ ๋”ฉ์€ ๋‹จ๋ฐฑ์งˆ ๊ธฐ๋Šฅ ํ•ด์„์— ์–ด๋–ป๊ฒŒ ๋„์›€๋˜๋‚˜์š”?")
401
+
402
+ if st.button("Answer", type="primary"):
403
+ with st.spinner("Thinking..."):
404
+ ans, srcs = chat_answer(
405
+ q,
406
+ st.session_state.index,
407
+ st.session_state.index_model,
408
+ st.session_state.docs,
409
+ st.session_state.loaded_datasets,
410
+ use_web,
411
+ web_k
412
+ )
413
+ st.write(ans)
414
+
415
+ if srcs:
416
+ st.markdown("#### Sources")
417
+ for s in srcs:
418
+ if s.get("type") == "web" and s.get("url"):
419
+ st.markdown(f"- {s.get('title', 'web')}: {s.get('url')}")
420
+ elif s.get("type") == "dataset":
421
+ st.markdown(f"- dataset: {s.get('id')}")
422
+ elif s.get("type") == "file":
423
+ snippet = s.get("text", "")
424
+ st.markdown(f"- file snippet: {snippet[:120]}...")
425
+
426
+ # Protein tab
427
+ with tabs[1]:
428
+ st.subheader("Protein analysis")
429
+ seq = st.text_area("Protein sequence (amino acids only)", value="MKTIIALSYIFCLVFADYKDDDDK")
430
+
431
+ col1, col2 = st.columns(2)
432
+ with col1:
433
+ st.caption("ESM-2 embedding")
434
+ if st.button("Run ESM-2", key="run_esm2"):
435
+ with st.spinner("Computing ESM-2 embedding..."):
436
+ out = esm2_embed(seq.strip(), esm2_id)
437
+ if "error" in out:
438
+ st.error(out["error"])
439
+ else:
440
+ st.success(f"Vector size: {out['hidden_size']}")
441
+ st.json({"embedding_preview": out["embedding"][:8]})
442
+
443
+ with col2:
444
+ st.caption("Quick stats")
445
+ s = seq.replace("\n", "").replace(" ", "").upper()
446
+ length = len(s)
447
+ aa_set = sorted(set(list(s)))
448
+ st.write(f"Length: {length}")
449
+ st.write(f"Unique AAs: {''.join(aa_set)[:30]}")
450
 
451
+ # DNA tab
452
+ with tabs[2]:
453
+ st.subheader("DNA analysis")
454
+ dseq = st.text_area("DNA sequence (ACGT only)", value="ATGCGTACGTAGCTAGCTAGCTAGGCTAGC")
455
+
456
+ col3, col4 = st.columns(2)
457
+ with col3:
458
+ st.caption("DNA embedding")
459
+ if st.button("Run DNA embed", key="run_dna"):
460
+ with st.spinner("Computing DNA embedding..."):
461
+ out = dna_embed(dseq.strip(), dna_id)
462
+ if "error" in out:
463
+ st.error(out["error"])
464
+ else:
465
+ st.success(f"Vector size: {out['hidden_size']}")
466
+ st.json({"embedding_preview": out["embedding"][:8]})
467
+
468
+ with col4:
469
+ st.caption("GC content")
470
+ s = dseq.upper().replace("N", "").replace(" ", "").replace("\n", "")
471
+ if len(s) > 0:
472
+ gc = (s.count("G") + s.count("C")) / len(s)
473
  else:
474
+ gc = 0
475
+ st.write(f"Length: {len(s)}")
476
+ st.write(f"GC: {gc:.3f}")
477
 
478
+ # Examples tab
479
+ with tabs[3]:
480
+ st.subheader("Examples")
481
+ st.markdown("### Example questions you can ask:")
482
+ st.markdown("- ์—…๋กœ๋“œํ•œ FASTA์—์„œ ํŠน์ • ๋‹จ๋ฐฑ์งˆ์˜ ๊ธฐ๋Šฅ ์š”์•ฝ๊ณผ ๋ณ€์ด ์˜ํ–ฅ ์งˆ๋ฌธ")
483
+ st.markdown("- DNA ์„œ์—ด์—์„œ ํ”„๋กœ๋ชจํ„ฐ ๊ฐ€๋Šฅ์„ฑ๊ณผ ์ „์‚ฌ์ธ์ž ๋ชจํ‹ฐํ”„ ๊ด€๋ จ ๊ทผ๊ฑฐ ์š”์ฒญ")
484
+ st.markdown("- Enzyme active site ๊ทผ์ ‘ ๋ณ€์ด์˜ ๋ฆฌ์Šคํฌ ํ•ด์„ (์—ฐ๊ตฌ ๊ด€์ )")
485
+ st.markdown("- ENCODE/UniProt/AlphaFold ๊ฐœ๋… ์„ค๋ช… ์š”์ฒญ")
486
+ st.markdown("- RAG ๊ธฐ๋ฐ˜์œผ๋กœ ๋ฌธ์„œ ์ธ์šฉ๊ณผ ํ•จ๊ป˜ ๊ฐ„๋žต ๋‹ต๋ณ€ ์š”์ฒญ")
487
+
488
+ # About tab
489
+ with tabs[4]:
490
+ st.subheader("About this Space")
491
+ st.write("**Models suggested:**")
492
+ st.write("- ESM-2 for proteins")
493
+ st.write("- DNABERT-2 or Nucleotide Transformer for DNA")
494
+ st.write("")
495
+ st.write("**Common datasets:**")
496
+ st.write("- UniProtKB, AlphaFoldDB, ENCODE, JASPAR, ClinVar")
497
+ st.write("")
498
+ st.write("**Features:**")
499
+ st.write("- Web search powered by Brave Search API")
500
+ st.write("- LLM powered by Fireworks AI")
501
+ st.write("- Vector search with FAISS")
502
+ st.write("")
503
+ st.info(DISCLAIMER)
504
+
505
+ # Run the app
506
+ if __name__ == "__main__":
507
+ main()