openfree committed on
Commit
c383a1a
·
verified ·
1 Parent(s): 8ee77e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +727 -268
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import json
3
  from typing import List, Dict, Tuple
 
4
 
5
  import streamlit as st
6
  import requests
@@ -64,7 +65,7 @@ except ImportError:
64
  print("[WARNING] PyPDF2 not available")
65
 
66
  # 상수
67
- APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
68
  DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
69
 
70
  # --------------- Helper Functions ---------------
@@ -72,12 +73,10 @@ DISCLAIMER = "This tool is for research/education and is not a medical device. D
72
  def get_secret(name: str, fallback: str = "") -> str:
73
  """Get secret from st.secrets or environment"""
74
  try:
75
- # Streamlit secrets
76
  if hasattr(st, 'secrets') and name in st.secrets:
77
  return st.secrets[name]
78
  except:
79
  pass
80
- # Environment variable
81
  return os.environ.get(name, fallback)
82
 
83
  def brave_search(query: str, count: int = 5) -> List[Dict]:
@@ -112,8 +111,8 @@ def brave_search(query: str, count: int = 5) -> List[Dict]:
112
  except Exception as e:
113
  return [{"title": "Error", "url": "", "snippet": str(e)}]
114
 
115
- def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 4000) -> str:
116
- """Call Fireworks AI API"""
117
  api_key = get_secret("FIREWORKS_API_KEY", "")
118
  if not api_key:
119
  return "FIREWORKS_API_KEY missing. Set it in Secrets or sidebar."
@@ -122,7 +121,7 @@ def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 4
122
  payload = {
123
  "model": "accounts/fireworks/models/llama-v3p1-70b-instruct",
124
  "messages": messages,
125
- "max_tokens": max_tokens,
126
  "temperature": temperature,
127
  "top_p": 1,
128
  "frequency_penalty": 0,
@@ -134,12 +133,152 @@ def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 4
134
  }
135
 
136
  try:
137
- r = requests.post(url, headers=headers, json=payload, timeout=60)
138
  r.raise_for_status()
139
  return r.json()["choices"][0]["message"]["content"]
140
  except Exception as e:
141
  return f"[LLM Error] {e}"
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  def load_file_text(upload) -> str:
144
  """Load text from uploaded file (PDF 지원 포함)"""
145
  name = upload.name.lower()
@@ -194,8 +333,8 @@ def load_file_text(upload) -> str:
194
 
195
  return text
196
 
197
- def chunk_text(text: str, size: int = 1200, overlap: int = 200) -> List[str]:
198
- """Split text into chunks"""
199
  chunks = []
200
  start = 0
201
  text_len = len(text)
@@ -210,12 +349,13 @@ def chunk_text(text: str, size: int = 1200, overlap: int = 200) -> List[str]:
210
  return chunks
211
 
212
  def build_index(texts: List[str]):
213
- """Build vector index"""
214
  if not SENTENCE_TRANSFORMERS_AVAILABLE or not FAISS_AVAILABLE:
215
  return None, None
216
 
217
  try:
218
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 
219
  embeddings = model.encode(texts, show_progress_bar=False)
220
 
221
  dim = embeddings.shape[1]
@@ -227,8 +367,8 @@ def build_index(texts: List[str]):
227
  st.warning(f"Index build failed: {e}")
228
  return None, None
229
 
230
- def search_index(query: str, index, model, texts: List[str], k: int = 4) -> List[Dict]:
231
- """Search vector index"""
232
  if index is None or model is None:
233
  return []
234
 
@@ -247,8 +387,33 @@ def search_index(query: str, index, model, texts: List[str], k: int = 4) -> List
247
  except:
248
  return []
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
251
- """ESM-2 protein embedding"""
252
  if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
253
  return {"error": "PyTorch/Transformers not available"}
254
 
@@ -262,6 +427,9 @@ def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
262
  outputs = model(**inputs, output_hidden_states=True)
263
  hidden = outputs.hidden_states[-1].mean(dim=1).squeeze(0)
264
  vec = hidden.cpu().numpy()
 
 
 
265
 
266
  # 메모리 정리
267
  del model
@@ -270,14 +438,17 @@ def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
270
  torch.cuda.empty_cache()
271
 
272
  return {
273
- "embedding": vec.tolist()[:10], # 미리보기용 첫 10개만
274
- "size": vec.shape[0]
 
 
 
275
  }
276
  except Exception as e:
277
  return {"error": str(e)}
278
 
279
  def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
280
- """DNA embedding"""
281
  if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
282
  return {"error": "PyTorch/Transformers not available"}
283
 
@@ -288,8 +459,14 @@ def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
288
  except ImportError:
289
  return {"error": "einops package required. Please wait for installation and refresh the page."}
290
 
291
- # 간단한 대안: 더 안정적인 모델 사용
292
- # DNABERT-2가 문제를 일으키면 기본 BERT 사용
 
 
 
 
 
 
293
  try:
294
  from transformers import AutoTokenizer, AutoModel
295
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -298,7 +475,6 @@ def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
298
  # 대체 모델 사용
299
  try:
300
  from transformers import BertTokenizer, BertModel
301
- # 기본 BERT 모델로 폴백
302
  fallback_model = "bert-base-uncased"
303
  tokenizer = BertTokenizer.from_pretrained(fallback_model)
304
  model = BertModel.from_pretrained(fallback_model)
@@ -308,19 +484,13 @@ def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
308
 
309
  model.eval()
310
 
311
- # DNA 서열을 k-mer 변환 (DNABERT 스타일)
312
- def seq_to_kmer(seq, k=6):
313
- """DNA 서열을 k-mer로 변환"""
314
- kmers = []
315
- for i in range(len(seq) - k + 1):
316
- kmers.append(seq[i:i+k])
317
- return ' '.join(kmers)
318
-
319
- # k-mer 변환 또는 직접 사용
320
  if len(seq) > 6:
321
  input_seq = seq_to_kmer(seq, k=6)
 
322
  else:
323
  input_seq = seq
 
324
 
325
  with torch.no_grad():
326
  inputs = tokenizer(
@@ -332,7 +502,6 @@ def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
332
  )
333
  outputs = model(**inputs)
334
 
335
- # last_hidden_state 또는 pooler_output 사용
336
  if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
337
  vec = outputs.pooler_output.squeeze(0).cpu().numpy()
338
  else:
@@ -346,67 +515,16 @@ def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
346
  torch.cuda.empty_cache()
347
 
348
  return {
349
- "embedding": vec.tolist()[:10], # 미리보기용 첫 10개만
350
- "size": vec.shape[0]
 
 
 
351
  }
352
 
353
  except Exception as e:
354
  return {"error": f"분석 중 오류 발생: {str(e)[:200]}"}
355
 
356
- def build_context(query: str, docs: List[str], index, model, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
357
- """Build context from sources"""
358
- pieces = []
359
- sources = []
360
-
361
- # File search
362
- if index and model and docs:
363
- hits = search_index(query, index, model, docs, k=4)
364
- for h in hits:
365
- pieces.append(f"[FILE] {h['text'][:500]}")
366
- sources.append({"type": "file", "text": h['text'][:100]})
367
-
368
- # Web search
369
- if use_web:
370
- results = brave_search(query, count=web_k)
371
- for r in results:
372
- pieces.append(f"[WEB] {r['title']}\n{r['snippet']}")
373
- sources.append({"type": "web", "title": r['title'], "url": r['url']})
374
-
375
- context = "\n\n---\n\n".join(pieces)[:4000]
376
- return context, sources
377
-
378
- def answer_question(query: str, context: str) -> str:
379
- """Generate answer"""
380
- system = (
381
- "You are an expert bioinformatics assistant who explains complex biological concepts in an accessible way. "
382
- "Your responses should be:\n"
383
- "1. Comprehensive yet easy to understand\n"
384
- "2. Well-structured with clear sections\n"
385
- "3. Include relevant examples and analogies\n"
386
- "4. Provide actionable insights when appropriate\n"
387
- "5. Use Korean if the user writes in Korean, otherwise English\n"
388
- "6. Never provide medical diagnosis or treatment advice\n"
389
- "7. Format your response with headers, bullet points, and clear paragraphs\n"
390
- "8. Aim for 300-500 words minimum for complex questions"
391
- )
392
-
393
- user_msg = f"""Context information:\n{context}\n\n
394
- User Question: {query}
395
-
396
- Please provide a detailed, well-structured response that:
397
- - Directly answers the question
398
- - Explains the biological background
399
- - Includes practical implications when relevant
400
- - Uses simple analogies to explain complex concepts
401
- - Cites the context when appropriate"""
402
-
403
- messages = [
404
- {"role": "system", "content": system},
405
- {"role": "user", "content": user_msg}
406
- ]
407
-
408
- return call_llm(messages, temperature=0.4, max_tokens=4000)
409
-
410
  # --------------- Streamlit UI ---------------
411
 
412
  st.set_page_config(page_title=APP_TITLE, page_icon="🧬", layout="wide")
@@ -420,20 +538,24 @@ if "index" not in st.session_state:
420
  st.session_state.index = None
421
  if "model" not in st.session_state:
422
  st.session_state.model = None
 
 
423
 
424
  # Sidebar
425
  with st.sidebar:
426
- st.header("Configuration")
427
 
428
  fw_key = st.text_input(
429
  "FIREWORKS_API_KEY",
430
  value=get_secret("FIREWORKS_API_KEY", ""),
431
- type="password"
 
432
  )
433
  brave_key = st.text_input(
434
  "BRAVE_API_KEY",
435
  value=get_secret("BRAVE_API_KEY", ""),
436
- type="password"
 
437
  )
438
 
439
  if fw_key:
@@ -443,73 +565,115 @@ with st.sidebar:
443
 
444
  st.divider()
445
 
 
446
  esm_model = st.text_input(
447
  "ESM-2 Model",
448
- value="facebook/esm2_t6_8M_UR50D"
 
449
  )
450
  dna_model = st.text_input(
451
  "DNA Model",
452
- value="bert-base-uncased", # 더 안정적인 기본 모델
453
- help="Options: bert-base-uncased (stable), zhihan1996/DNABERT-2-117M (specialized but may require more memory)"
454
  )
455
 
 
 
 
456
  use_web = st.checkbox("Enable web search", value=True)
457
- web_results = st.slider("Web results", 1, 10, 3)
 
 
 
 
 
 
 
 
 
 
458
 
459
  # Tabs
460
- tab1, tab2, tab3, tab4 = st.tabs(["Chat", "Protein", "DNA", "About"])
461
 
462
  # File upload
463
  with st.expander("📁 Upload Files", expanded=True):
464
  files = st.file_uploader(
465
- "Upload text/FASTA/PDF files", # PDF 추가
466
- type=["txt", "fa", "fasta", "csv", "json", "pdf"], # PDF 추가
467
- accept_multiple_files=True
 
468
  )
469
 
470
  if files:
471
  docs = []
472
  for f in files:
473
  try:
474
- # PDF 파일인 경우 경고 메시지 추가
475
  if f.name.lower().endswith(".pdf"):
476
  if not (PDFPLUMBER_AVAILABLE or PYPDF2_AVAILABLE):
477
- st.warning(f"⚠️ PDF 지원을 위해 pdfplumber 설치 필요: pip install pdfplumber")
478
  continue
479
 
480
  text = load_file_text(f)
481
  if text:
482
  docs.extend(chunk_text(text))
483
- st.success(f"✅ {f.name} 로드 완료")
484
  except Exception as e:
485
  st.error(f"Error reading {f.name}: {e}")
486
 
487
  if docs:
488
  st.session_state.docs = docs
489
- st.success(f" {len(docs)}개 청크 생성 완료")
490
 
491
  if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
492
- with st.spinner("인덱스 구축 중..."):
493
  index, model = build_index(docs)
494
  if index:
495
  st.session_state.index = index
496
  st.session_state.model = model
 
497
 
498
- # Chat tab
499
  with tab1:
500
- st.subheader("💬 Chat Assistant")
 
 
 
 
 
 
 
 
 
 
 
 
 
501
 
502
  question = st.text_area(
503
- "Ask about proteins, DNA, or bioinformatics:",
504
- value="What is the role of ESM-2 embeddings in protein analysis?",
505
  height=100
506
  )
507
 
508
- if st.button("Get Answer", type="primary"):
 
 
 
 
 
 
509
  if not get_secret("FIREWORKS_API_KEY"):
510
- st.error("Please set FIREWORKS_API_KEY")
511
  else:
512
- with st.spinner("Thinking..."):
 
 
 
 
 
 
 
513
  context, sources = build_context(
514
  question,
515
  st.session_state.docs,
@@ -519,246 +683,541 @@ with tab1:
519
  web_results
520
  )
521
 
522
- answer = answer_question(question, context)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
 
524
- st.markdown("### Answer")
525
- st.write(answer)
526
 
527
- if sources:
528
- st.markdown("### Sources")
 
 
 
 
 
 
 
 
529
  for s in sources:
530
  if s["type"] == "web":
531
  st.write(f"- 🌐 [{s['title']}]({s['url']})")
532
  elif s["type"] == "file":
533
- st.write(f"- 📄 File: {s['text'][:80]}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
534
 
535
- # Protein tab
536
  with tab2:
537
- st.subheader("🧬 Protein Analysis")
538
 
539
- st.info("""
540
- **단백질 서열 분석이란?**
541
- - 단백질의 아미노산 서열을 AI가 분석하여 기능과 구조를 예측합니다
542
- - ESM-2는 Meta가 개발한 AI로, 6억 5천만개 단백질을 학습했습니다
543
- - 용도: 신약 개발, 질병 연구, 진화 분석
544
- """)
 
 
 
 
 
 
 
 
 
545
 
546
  protein_seq = st.text_area(
547
- "단백질 서열 입력 (복사-붙여넣기 가능):",
548
  value="MKTIIALSYIFCLVFA",
549
- help="단백질 서열은 20개 아미노산 문자(A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y)로 구성됩니다",
550
  height=100
551
  )
552
 
553
- st.markdown("**예제 서열 (클릭해서 복사):**")
554
- col1, col2, col3 = st.columns(3)
 
555
  with col1:
556
- if st.button("인슐린", key="ins"):
557
  st.code("FVNQHLCGSHLVEALYLVCGERGFFYTPKT", language=None)
558
  with col2:
559
- if st.button("엔돌핀", key="end"):
560
  st.code("YGGFMTSEKSQTPLVTLFKNAIIKNAYKKGE", language=None)
561
  with col3:
562
- if st.button("옥시토신", key="oxy"):
563
  st.code("CYIQNCPLG", language=None)
 
 
 
564
 
565
- if st.button("🔬 단백질 분석 시작", type="primary"):
566
  seq = protein_seq.strip().upper()
567
 
568
- # Basic stats
569
- st.markdown("### 📊 기본 분석 결과")
570
- col1, col2 = st.columns(2)
 
 
 
571
 
572
- with col1:
573
- st.metric("서열 길이", f"{len(seq)} 아미노산")
574
- st.metric("분자량 (추정)", f"~{len(seq) * 110} Da")
 
 
 
575
 
576
- with col2:
577
- unique_aa = len(set(seq))
578
- st.metric("사용된 아미노산 종류", f"{unique_aa}")
579
- hydrophobic = sum(1 for aa in seq if aa in "AVILMFYW")
580
- st.metric("소수성 비율", f"{hydrophobic/len(seq)*100:.1f}%")
581
-
582
- # AI Analysis
583
- if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
584
- st.markdown("### 🤖 AI 임베딩 분석")
585
- with st.spinner("AI 모델이 단백질을 분석중... (10-30초)"):
586
- result = esm2_embed(seq, esm_model)
587
- if "error" in result:
588
- st.error(result["error"])
589
- else:
590
- st.success(" AI 분석 완료!")
591
-
592
- col1, col2 = st.columns(2)
593
- with col1:
594
- st.metric("벡터 차원", result['size'])
595
- st.caption("이 숫자들은 단백질의 특성을 수치화한 것입니다")
596
-
597
- with col2:
598
- st.markdown("**임베딩 벡터 미리보기:**")
599
- st.code(result["embedding"][:5])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
600
 
601
- st.markdown("""
602
- **🎯 분석의 활용:**
603
- - 유사한 기능의 단백질 찾기
604
- - 구조 예측의 기초 데이터
605
- - 돌연변이 영향 예측
606
- - 신약 타겟 발굴
607
- """)
608
- else:
609
- st.warning("⚠️ AI 모델 로딩 중... 잠시 후 다시 시도해주세요")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
 
611
- # DNA tab
612
  with tab3:
613
- st.subheader("🧬 DNA Analysis")
614
 
615
- st.info("""
616
- **DNA 서열 분석이란?**
617
- - DNA 염기서열(A,T,G,C)을 AI가 분석하여 기능을 예측합니다
618
- - DNABERT-2는 인간 게놈 전체를 학습한 AI 모델입니다
619
- - 용도: 유전자 기능 예측, 질병 변이 발견, 진화 연구
620
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
 
622
  dna_seq = st.text_area(
623
- "DNA 서열 입력 (복사-붙여넣기 가능):",
624
  value="ATGCGATCGTAGC",
625
- help="DNA 4개 염기(A: 아데닌, T: 티민, G: 구아닌, C: 시토신)로 구성됩니다",
626
  height=100
627
  )
628
 
629
- st.markdown("**예제 서열 (클릭해서 복사):**")
630
- col1, col2, col3 = st.columns(3)
 
631
  with col1:
632
- if st.button("TATA 박스", key="tata"):
633
- st.code("GCGCGATATAAAGGCGCGGGCGCGCG", language=None)
634
- st.caption("유전자 발현 시작 신호")
635
  with col2:
636
- if st.button("프로모터", key="prom"):
637
  st.code("TTGACAGGCTAGCTCAGTCCTAGGTATAATGCTAGC", language=None)
638
- st.caption("유전자 조절 영역")
639
  with col3:
640
- if st.button("CRISPR 타겟", key="crispr"):
641
  st.code("GTCACCTCCAATGACTAGGGTGG", language=None)
642
- st.caption("유전자 편집 부위")
 
 
 
 
643
 
644
- if st.button("🔬 DNA 분석 시작", type="primary"):
645
- seq = dna_seq.strip().upper().replace("U", "T") # RNA의 U를 T로 변환
646
- seq = ''.join(c for c in seq if c in 'ATGC') # ATGC만 남기기
647
 
648
  if len(seq) < 3:
649
- st.error("최소 3개 이상의 염기를 입력해주세요")
650
  else:
651
- st.markdown("### 📊 기본 분석 결과")
652
- col1, col2 = st.columns(2)
 
 
653
 
654
  with col1:
655
- st.metric("서열 길이", f"{len(seq)} bp")
 
 
 
656
  gc = (seq.count("G") + seq.count("C")) / len(seq) * 100
657
- st.metric("GC 함량", f"{gc:.1f}%")
658
- if gc > 60:
659
- st.caption("🔴 높음: 안정적이지만 복제 어려움")
660
- elif gc < 40:
661
- st.caption("🔵 낮음: 불안정하지만 복제 용이")
 
 
 
 
662
  else:
663
- st.caption("🟢 적정: 일반적인 범위")
664
 
665
- with col2:
666
- at = (seq.count("A") + seq.count("T")) / len(seq) * 100
667
- st.metric("AT 함량", f"{at:.1f}%")
668
-
669
- # 코돈 분석 (3의 배수인 경우)
670
- if len(seq) % 3 == 0:
671
- st.metric("가능한 코돈 수", f"{len(seq)//3}개")
672
- st.caption("단백질로 번역 가능")
 
 
 
 
 
 
673
 
674
- # 특별 서열 찾기
675
- st.markdown("### 🔍 주요 모티프 검색")
676
  motifs_found = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
677
 
678
- if "TATAAAA" in seq or "TATAAA" in seq:
679
- motifs_found.append("✅ TATA box 발견 (전사 시작 신호)")
680
- if "CAAT" in seq or "CCAAT" in seq:
681
- motifs_found.append(" CAAT box 발견 (전사 조절)")
682
- if "ATG" in seq:
683
- motifs_found.append("✅ 시작 코돈(ATG) 발견")
684
- if "TAA" in seq or "TAG" in seq or "TGA" in seq:
685
- motifs_found.append("✅ 정지 코돈 발견")
686
- if seq.count("CG") > len(seq)/20:
687
- motifs_found.append("✅ CpG 섬 가능성 (유전자 조절)")
688
 
689
  if motifs_found:
690
  for motif in motifs_found:
691
  st.write(motif)
692
  else:
693
- st.write("특별한 모티프가 발견되지 않았습니다")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
694
 
695
  # AI Analysis
696
  if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
697
- st.markdown("### 🤖 AI 임베딩 분석")
698
- with st.spinner("AI 모델이 DNA를 분석중... (10-30초)"):
699
  result = dna_embed(seq, dna_model)
 
700
  if "error" in result:
701
- st.error(result["error"])
702
  else:
703
- st.success("✅ AI 분석 완료!")
704
 
705
- col1, col2 = st.columns(2)
706
  with col1:
707
- st.metric("벡터 차원", result['size'])
708
- st.caption("DNA 특성을 수치화한 결과입니다")
709
-
710
  with col2:
711
- st.markdown("**임베딩 벡터 미리보기:**")
712
- st.code(result["embedding"][:5])
 
713
 
714
  st.markdown("""
715
- **🎯 분석의 활용:**
716
- - 유전자 기능 예측
717
- - 프로모터/인핸서 찾기
718
- - 진화적 보존 영역 발견
719
- - 질병 관련 변이 예측
720
- - CRISPR 타겟 부위 평가
 
 
721
  """)
722
  else:
723
- st.warning("⚠️ AI 모델 로딩 중... 잠시 다시 시도해주세요")
724
 
725
- # About tab
726
  with tab4:
727
- st.subheader("ℹ️ About")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
728
  st.markdown("""
729
- ### Features
730
- - 💬 RAG-based chat for bioinformatics questions
731
- - 🧬 Protein sequence analysis with ESM-2
732
- - 🧬 DNA sequence analysis with DNABERT-2
733
- - 🔍 Web search integration via Brave API
734
- - 📁 File upload and vector search (including PDF support)
735
-
736
- ### Models
737
- - **Proteins:** ESM-2 (Facebook)
738
- - **DNA:** DNABERT-2 (Microsoft) / BERT (fallback)
739
- - **LLM:** Llama 3.1 70B (via Fireworks)
740
-
741
- ### Disclaimer
742
- This tool is for research and educational purposes only.
743
- Not for medical diagnosis or treatment decisions.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
744
  """)
745
 
746
- # Dependency check
747
- st.divider()
748
- st.subheader("System Status")
749
- deps = {
750
  "PyTorch": TORCH_AVAILABLE,
751
  "Transformers": TRANSFORMERS_AVAILABLE,
752
  "Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
753
  "FAISS": FAISS_AVAILABLE,
 
 
 
754
  "BioPython": BIOPYTHON_AVAILABLE,
755
  "Datasets": DATASETS_AVAILABLE,
756
- "PDF Support (pdfplumber)": PDFPLUMBER_AVAILABLE, # PDF 지원 추가
757
- "PDF Support (PyPDF2)": PYPDF2_AVAILABLE # PDF 지원 추가
758
  }
759
 
760
- for name, available in deps.items():
761
- if available:
762
- st.success(f"✅ {name}")
763
- else:
764
- st.warning(f"⚠️ {name} not available")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
  from typing import List, Dict, Tuple
4
+ import time
5
 
6
  import streamlit as st
7
  import requests
 
65
  print("[WARNING] PyPDF2 not available")
66
 
67
  # 상수
68
+ APP_TITLE = "BioSeq Chat Pro: Advanced Collaborative AI System"
69
  DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
70
 
71
  # --------------- Helper Functions ---------------
 
73
  def get_secret(name: str, fallback: str = "") -> str:
74
  """Get secret from st.secrets or environment"""
75
  try:
 
76
  if hasattr(st, 'secrets') and name in st.secrets:
77
  return st.secrets[name]
78
  except:
79
  pass
 
80
  return os.environ.get(name, fallback)
81
 
82
  def brave_search(query: str, count: int = 5) -> List[Dict]:
 
111
  except Exception as e:
112
  return [{"title": "Error", "url": "", "snippet": str(e)}]
113
 
114
+ def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 8000) -> str:
115
+ """Call Fireworks AI API with increased token limit"""
116
  api_key = get_secret("FIREWORKS_API_KEY", "")
117
  if not api_key:
118
  return "FIREWORKS_API_KEY missing. Set it in Secrets or sidebar."
 
121
  payload = {
122
  "model": "accounts/fireworks/models/llama-v3p1-70b-instruct",
123
  "messages": messages,
124
+ "max_tokens": max_tokens, # 8000으로 증가
125
  "temperature": temperature,
126
  "top_p": 1,
127
  "frequency_penalty": 0,
 
133
  }
134
 
135
  try:
136
+ r = requests.post(url, headers=headers, json=payload, timeout=120)
137
  r.raise_for_status()
138
  return r.json()["choices"][0]["message"]["content"]
139
  except Exception as e:
140
  return f"[LLM Error] {e}"
141
 
142
+ def collaborative_answer(query: str, context: str, collaboration_type: str = "full") -> Dict[str, str]:
143
+ """
144
+ 협업 AI 시스템: 감독자, 비평자, 조사자가 협력하여 답변 생성
145
+
146
+ Args:
147
+ query: 사용자 질문
148
+ context: 검색된 문맥 정보
149
+ collaboration_type: "full" (전체 협업), "quick" (빠른 답변), "deep" (심층 분석)
150
+
151
+ Returns:
152
+ 각 역할자의 기여와 최종 답변을 포함한 딕셔너리
153
+ """
154
+
155
+ # 1. 조사자(Investigator) - 사실 수집 및 검증
156
+ investigator_prompt = f"""You are an INVESTIGATOR specializing in bioinformatics fact-checking.
157
+
158
+ Context: {context}
159
+ Question: {query}
160
+
161
+ Your task:
162
+ 1. Extract and verify all relevant facts from the context
163
+ 2. Identify any missing information that would improve the answer
164
+ 3. Flag any potentially conflicting or uncertain information
165
+ 4. Suggest additional areas for research
166
+ 5. Provide confidence scores for key facts (0-100%)
167
+
168
+ Format your response with:
169
+ - VERIFIED FACTS: (with confidence scores)
170
+ - UNCERTAIN AREAS:
171
+ - MISSING INFORMATION:
172
+ - RESEARCH SUGGESTIONS:
173
+ - KEY CITATIONS:"""
174
+
175
+ investigator_msg = [
176
+ {"role": "system", "content": "You are a meticulous scientific fact-checker and researcher."},
177
+ {"role": "user", "content": investigator_prompt}
178
+ ]
179
+
180
+ investigator_response = call_llm(investigator_msg, temperature=0.2, max_tokens=2000)
181
+
182
+ # 2. 감독자(Supervisor) - 구조화된 답변 생성
183
+ supervisor_prompt = f"""You are a SUPERVISOR creating a comprehensive answer.
184
+
185
+ Question: {query}
186
+ Context: {context}
187
+ Investigator's Analysis:
188
+ {investigator_response}
189
+
190
+ Your task:
191
+ 1. Create a well-structured, scientifically accurate answer
192
+ 2. Include:
193
+ - Executive Summary (2-3 sentences)
194
+ - Background & Context
195
+ - Detailed Explanation with subsections
196
+ - Practical Applications
197
+ - Current Research Status
198
+ - Future Perspectives
199
+ 3. Use clear headings and logical flow
200
+ 4. Integrate verified facts from the investigator
201
+ 5. Aim for 500-1000 words minimum
202
+ 6. Include relevant examples and analogies
203
+
204
+ Format with clear markdown headers and bullet points where appropriate."""
205
+
206
+ supervisor_msg = [
207
+ {"role": "system", "content": "You are an expert bioinformatics educator who creates comprehensive, well-structured scientific explanations."},
208
+ {"role": "user", "content": supervisor_prompt}
209
+ ]
210
+
211
+ supervisor_response = call_llm(supervisor_msg, temperature=0.4, max_tokens=3500)
212
+
213
+ # 3. 비평자(Critic) - 품질 검증 및 개선
214
+ critic_prompt = f"""You are a CRITIC reviewing the following answer for scientific accuracy.
215
+
216
+ Original Question: {query}
217
+ Supervisor's Answer:
218
+ {supervisor_response}
219
+
220
+ Investigator's Facts:
221
+ {investigator_response}
222
+
223
+ Your task:
224
+ 1. Check for scientific accuracy and completeness
225
+ 2. Identify any errors, omissions, or unclear explanations
226
+ 3. Verify that all claims are properly supported
227
+ 4. Assess the answer's clarity and accessibility
228
+ 5. Suggest specific improvements
229
+ 6. Provide a quality score (0-100)
230
+
231
+ Format your critique:
232
+ - ACCURACY ASSESSMENT:
233
+ - COMPLETENESS CHECK:
234
+ - CLARITY EVALUATION:
235
+ - ERRORS/ISSUES FOUND:
236
+ - IMPROVEMENT SUGGESTIONS:
237
+ - QUALITY SCORE: X/100"""
238
+
239
+ critic_msg = [
240
+ {"role": "system", "content": "You are a rigorous scientific peer reviewer specializing in bioinformatics."},
241
+ {"role": "user", "content": critic_prompt}
242
+ ]
243
+
244
+ critic_response = call_llm(critic_msg, temperature=0.3, max_tokens=1500)
245
+
246
+ # 4. 최종 통합 답변 (Final Integration)
247
+ if collaboration_type == "full":
248
+ integration_prompt = f"""Create the FINAL INTEGRATED ANSWER incorporating all feedback.
249
+
250
+ Question: {query}
251
+ Supervisor's Answer: {supervisor_response}
252
+ Critic's Feedback: {critic_response}
253
+ Verified Facts: {investigator_response}
254
+
255
+ Create a polished, final answer that:
256
+ 1. Addresses all critic's concerns
257
+ 2. Maintains scientific rigor
258
+ 3. Includes proper citations
259
+ 4. Uses clear structure with markdown formatting
260
+ 5. Provides comprehensive coverage (800-1500 words)
261
+ 6. Includes a TL;DR section at the beginning
262
+ 7. Ends with key takeaways and further reading suggestions
263
+
264
+ Use Korean if the question is in Korean, otherwise English."""
265
+
266
+ integration_msg = [
267
+ {"role": "system", "content": "You are a master science communicator creating the definitive answer by integrating all expert inputs."},
268
+ {"role": "user", "content": integration_prompt}
269
+ ]
270
+
271
+ final_answer = call_llm(integration_msg, temperature=0.35, max_tokens=8000)
272
+ else:
273
+ final_answer = supervisor_response
274
+
275
+ return {
276
+ "investigator": investigator_response,
277
+ "supervisor": supervisor_response,
278
+ "critic": critic_response,
279
+ "final": final_answer
280
+ }
281
+
282
  def load_file_text(upload) -> str:
283
  """Load text from uploaded file (PDF 지원 포함)"""
284
  name = upload.name.lower()
 
333
 
334
  return text
335
 
336
+ def chunk_text(text: str, size: int = 1500, overlap: int = 300) -> List[str]:
337
+ """Split text into chunks with larger size for better context"""
338
  chunks = []
339
  start = 0
340
  text_len = len(text)
 
349
  return chunks
350
 
351
  def build_index(texts: List[str]):
352
+ """Build vector index with better model"""
353
  if not SENTENCE_TRANSFORMERS_AVAILABLE or not FAISS_AVAILABLE:
354
  return None, None
355
 
356
  try:
357
+ # 나은 임베딩 모델 사용
358
+ model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
359
  embeddings = model.encode(texts, show_progress_bar=False)
360
 
361
  dim = embeddings.shape[1]
 
367
  st.warning(f"Index build failed: {e}")
368
  return None, None
369
 
370
+ def search_index(query: str, index, model, texts: List[str], k: int = 5) -> List[Dict]:
371
+ """Search vector index with more results"""
372
  if index is None or model is None:
373
  return []
374
 
 
387
  except:
388
  return []
389
 
390
+ def build_context(query: str, docs: List[str], index, model, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
391
+ """Build enhanced context from sources"""
392
+ pieces = []
393
+ sources = []
394
+
395
+ # File search with more results
396
+ if index and model and docs:
397
+ hits = search_index(query, index, model, docs, k=6)
398
+ for h in hits:
399
+ pieces.append(f"[FILE SOURCE] {h['text'][:800]}")
400
+ sources.append({"type": "file", "text": h['text'][:150], "score": h['score']})
401
+
402
+ # Web search with scientific focus
403
+ if use_web:
404
+ # 과학적 키워드 추가
405
+ scientific_query = f"{query} scientific research pubmed nature science"
406
+ results = brave_search(scientific_query, count=web_k)
407
+ for r in results:
408
+ pieces.append(f"[WEB SOURCE] {r['title']}\n{r['snippet']}")
409
+ sources.append({"type": "web", "title": r['title'], "url": r['url']})
410
+
411
+ context = "\n\n---\n\n".join(pieces)[:6000] # 컨텍스트 크기 증가
412
+ return context, sources
413
+
414
+ # Enhanced analysis functions
415
  def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
416
+ """Enhanced ESM-2 protein embedding with more analysis"""
417
  if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
418
  return {"error": "PyTorch/Transformers not available"}
419
 
 
427
  outputs = model(**inputs, output_hidden_states=True)
428
  hidden = outputs.hidden_states[-1].mean(dim=1).squeeze(0)
429
  vec = hidden.cpu().numpy()
430
+
431
+ # 추가 분석
432
+ attention_weights = outputs.hidden_states[-1].std(dim=1).squeeze(0).cpu().numpy()
433
 
434
  # 메모리 정리
435
  del model
 
438
  torch.cuda.empty_cache()
439
 
440
  return {
441
+ "embedding": vec.tolist()[:10],
442
+ "size": vec.shape[0],
443
+ "mean": float(vec.mean()),
444
+ "std": float(vec.std()),
445
+ "attention_peaks": attention_weights.tolist()[:10]
446
  }
447
  except Exception as e:
448
  return {"error": str(e)}
449
 
450
  def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
451
+ """Enhanced DNA embedding with k-mer analysis"""
452
  if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
453
  return {"error": "PyTorch/Transformers not available"}
454
 
 
459
  except ImportError:
460
  return {"error": "einops package required. Please wait for installation and refresh the page."}
461
 
462
+ # k-mer 변환 함수
463
+ def seq_to_kmer(seq, k=6):
464
+ kmers = []
465
+ for i in range(len(seq) - k + 1):
466
+ kmers.append(seq[i:i+k])
467
+ return ' '.join(kmers)
468
+
469
+ # 모델 로딩 시도
470
  try:
471
  from transformers import AutoTokenizer, AutoModel
472
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
475
  # 대체 모델 사용
476
  try:
477
  from transformers import BertTokenizer, BertModel
 
478
  fallback_model = "bert-base-uncased"
479
  tokenizer = BertTokenizer.from_pretrained(fallback_model)
480
  model = BertModel.from_pretrained(fallback_model)
 
484
 
485
  model.eval()
486
 
487
+ # k-mer 변환
 
 
 
 
 
 
 
 
488
  if len(seq) > 6:
489
  input_seq = seq_to_kmer(seq, k=6)
490
+ kmer_count = len(seq) - 5
491
  else:
492
  input_seq = seq
493
+ kmer_count = 1
494
 
495
  with torch.no_grad():
496
  inputs = tokenizer(
 
502
  )
503
  outputs = model(**inputs)
504
 
 
505
  if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
506
  vec = outputs.pooler_output.squeeze(0).cpu().numpy()
507
  else:
 
515
  torch.cuda.empty_cache()
516
 
517
  return {
518
+ "embedding": vec.tolist()[:10],
519
+ "size": vec.shape[0],
520
+ "kmer_count": kmer_count,
521
+ "mean": float(vec.mean()),
522
+ "std": float(vec.std())
523
  }
524
 
525
  except Exception as e:
526
  return {"error": f"분석 중 오류 발생: {str(e)[:200]}"}
527
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  # --------------- Streamlit UI ---------------
529
 
530
  st.set_page_config(page_title=APP_TITLE, page_icon="🧬", layout="wide")
 
538
  st.session_state.index = None
539
  if "model" not in st.session_state:
540
  st.session_state.model = None
541
+ if "chat_history" not in st.session_state:
542
+ st.session_state.chat_history = []
543
 
544
  # Sidebar
545
  with st.sidebar:
546
+ st.header("⚙️ Configuration")
547
 
548
  fw_key = st.text_input(
549
  "FIREWORKS_API_KEY",
550
  value=get_secret("FIREWORKS_API_KEY", ""),
551
+ type="password",
552
+ help="Required for AI responses"
553
  )
554
  brave_key = st.text_input(
555
  "BRAVE_API_KEY",
556
  value=get_secret("BRAVE_API_KEY", ""),
557
+ type="password",
558
+ help="Required for web search"
559
  )
560
 
561
  if fw_key:
 
565
 
566
  st.divider()
567
 
568
+ st.subheader("🤖 AI Models")
569
  esm_model = st.text_input(
570
  "ESM-2 Model",
571
+ value="facebook/esm2_t6_8M_UR50D",
572
+ help="Protein analysis model"
573
  )
574
  dna_model = st.text_input(
575
  "DNA Model",
576
+ value="bert-base-uncased",
577
+ help="DNA analysis model"
578
  )
579
 
580
+ st.divider()
581
+
582
+ st.subheader("🔍 Search Settings")
583
  use_web = st.checkbox("Enable web search", value=True)
584
+ web_results = st.slider("Web results", 1, 10, 5)
585
+
586
+ st.divider()
587
+
588
+ st.subheader("🎭 Collaboration Mode")
589
+ collab_mode = st.radio(
590
+ "AI Collaboration Type",
591
+ ["full", "quick", "deep"],
592
+ index=0,
593
+ help="Full: Complete collaboration\nQuick: Fast response\nDeep: In-depth analysis"
594
+ )
595
 
596
  # Tabs
597
+ tab1, tab2, tab3, tab4, tab5 = st.tabs(["💬 Chat", "🧬 Protein", "🧬 DNA", "📊 Analysis", "ℹ️ About"])
598
 
599
  # File upload
600
  with st.expander("📁 Upload Files", expanded=True):
601
  files = st.file_uploader(
602
+ "Upload text/FASTA/PDF files",
603
+ type=["txt", "fa", "fasta", "csv", "json", "pdf"],
604
+ accept_multiple_files=True,
605
+ help="Support for multiple file types including PDF"
606
  )
607
 
608
  if files:
609
  docs = []
610
  for f in files:
611
  try:
 
612
  if f.name.lower().endswith(".pdf"):
613
  if not (PDFPLUMBER_AVAILABLE or PYPDF2_AVAILABLE):
614
+ st.warning(f"⚠️ PDF support requires: pip install pdfplumber")
615
  continue
616
 
617
  text = load_file_text(f)
618
  if text:
619
  docs.extend(chunk_text(text))
620
+ st.success(f"✅ {f.name} loaded ({len(text)} chars)")
621
  except Exception as e:
622
  st.error(f"Error reading {f.name}: {e}")
623
 
624
  if docs:
625
  st.session_state.docs = docs
626
+ st.info(f"📚 Total chunks created: {len(docs)}")
627
 
628
  if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
629
+ with st.spinner("Building semantic index..."):
630
  index, model = build_index(docs)
631
  if index:
632
  st.session_state.index = index
633
  st.session_state.model = model
634
+ st.success("✅ Index built successfully")
635
 
636
+ # Chat tab with collaborative AI
637
  with tab1:
638
+ st.subheader("💬 Advanced Collaborative Chat")
639
+
640
+ # 협업 시스템 설명
641
+ with st.expander("🎭 How Collaborative AI Works", expanded=False):
642
+ st.markdown("""
643
+ ### Three AI Experts Work Together:
644
+
645
+ 1. **🔍 Investigator**: Fact-checks and verifies information
646
+ 2. **📝 Supervisor**: Creates structured, comprehensive answers
647
+ 3. **✅ Critic**: Reviews for accuracy and clarity
648
+ 4. **🎯 Integrator**: Combines all inputs for the final answer
649
+
650
+ This system ensures maximum accuracy and comprehensiveness.
651
+ """)
652
 
653
  question = st.text_area(
654
+ "Ask about proteins, DNA, or any bioinformatics topic:",
655
+ value="Explain how AlphaFold revolutionized protein structure prediction and its impact on drug discovery.",
656
  height=100
657
  )
658
 
659
+ col1, col2 = st.columns([3, 1])
660
+ with col1:
661
+ answer_button = st.button("🚀 Get Collaborative Answer", type="primary", use_container_width=True)
662
+ with col2:
663
+ show_process = st.checkbox("Show process", value=False, help="Display each AI's contribution")
664
+
665
+ if answer_button:
666
  if not get_secret("FIREWORKS_API_KEY"):
667
+ st.error("⚠️ Please set FIREWORKS_API_KEY")
668
  else:
669
+ # Progress tracking
670
+ progress_bar = st.progress(0)
671
+ status_text = st.empty()
672
+
673
+ with st.spinner("🔍 Building knowledge base..."):
674
+ status_text.text("Searching sources...")
675
+ progress_bar.progress(10)
676
+
677
  context, sources = build_context(
678
  question,
679
  st.session_state.docs,
 
683
  web_results
684
  )
685
 
686
+ progress_bar.progress(20)
687
+ status_text.text("Collaborative AI system working...")
688
+
689
+ # Get collaborative answer
690
+ start_time = time.time()
691
+ collaborative_result = collaborative_answer(
692
+ question,
693
+ context,
694
+ collaboration_type=collab_mode
695
+ )
696
+ elapsed_time = time.time() - start_time
697
+
698
+ progress_bar.progress(100)
699
+ status_text.text(f"✅ Completed in {elapsed_time:.1f} seconds")
700
+
701
+ # Display results
702
+ if show_process:
703
+ # Show each AI's contribution
704
+ with st.expander("🔍 Investigator's Analysis", expanded=False):
705
+ st.markdown(collaborative_result["investigator"])
706
 
707
+ with st.expander("📝 Supervisor's Draft", expanded=False):
708
+ st.markdown(collaborative_result["supervisor"])
709
 
710
+ with st.expander("✅ Critic's Review", expanded=False):
711
+ st.markdown(collaborative_result["critic"])
712
+
713
+ # Final answer
714
+ st.markdown("### 🎯 Final Integrated Answer")
715
+ st.markdown(collaborative_result["final"])
716
+
717
+ # Sources
718
+ if sources:
719
+ with st.expander("📚 Sources & References", expanded=False):
720
  for s in sources:
721
  if s["type"] == "web":
722
  st.write(f"- 🌐 [{s['title']}]({s['url']})")
723
  elif s["type"] == "file":
724
+ st.write(f"- 📄 File: {s['text'][:100]}... (Score: {s.get('score', 0):.2f})")
725
+
726
+ # Save to history
727
+ st.session_state.chat_history.append({
728
+ "question": question,
729
+ "answer": collaborative_result["final"],
730
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
731
+ "mode": collab_mode
732
+ })
733
+
734
+ # Feedback
735
+ col1, col2, col3 = st.columns(3)
736
+ with col1:
737
+ if st.button("👍 Helpful"):
738
+ st.success("Thank you for your feedback!")
739
+ with col2:
740
+ if st.button("👎 Not helpful"):
741
+ st.info("We'll work on improving our responses.")
742
+ with col3:
743
+ if st.button("💾 Save Answer"):
744
+ st.download_button(
745
+ label="Download",
746
+ data=collaborative_result["final"],
747
+ file_name=f"bioseq_answer_{time.strftime('%Y%m%d_%H%M%S')}.md",
748
+ mime="text/markdown"
749
+ )
750
 
751
+ # Enhanced Protein tab
752
  with tab2:
753
+ st.subheader("🧬 Advanced Protein Analysis")
754
 
755
+ with st.expander("📚 Learn About Protein Analysis", expanded=False):
756
+ st.markdown("""
757
+ ### What is Protein Sequence Analysis?
758
+
759
+ **Proteins** are the workhorses of cells, performing nearly every function necessary for life:
760
+ - 🧪 **Enzymes**: Catalyze chemical reactions
761
+ - 🛡️ **Antibodies**: Defend against pathogens
762
+ - 🚚 **Transporters**: Move molecules across membranes
763
+ - 📡 **Receptors**: Receive and transmit signals
764
+
765
+ **ESM-2** (Evolutionary Scale Modeling) is Meta's breakthrough AI that:
766
+ - Trained on 65 million protein sequences
767
+ - Predicts structure and function from sequence alone
768
+ - Enables drug discovery and protein engineering
769
+ """)
770
 
771
  protein_seq = st.text_area(
772
+ "Enter protein sequence (single letter amino acid code):",
773
  value="MKTIIALSYIFCLVFA",
774
+ help="Standard amino acids: A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y",
775
  height=100
776
  )
777
 
778
+ # Example sequences
779
+ st.markdown("**🧪 Example Sequences (Click to copy):**")
780
+ col1, col2, col3, col4 = st.columns(4)
781
  with col1:
782
+ if st.button("💉 Insulin", key="ins"):
783
  st.code("FVNQHLCGSHLVEALYLVCGERGFFYTPKT", language=None)
784
  with col2:
785
+ if st.button("😊 Endorphin", key="end"):
786
  st.code("YGGFMTSEKSQTPLVTLFKNAIIKNAYKKGE", language=None)
787
  with col3:
788
+ if st.button("❤️ Oxytocin", key="oxy"):
789
  st.code("CYIQNCPLG", language=None)
790
+ with col4:
791
+ if st.button("🦠 Lysozyme", key="lys"):
792
+ st.code("KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNR", language=None)
793
 
794
+ if st.button("🔬 Analyze Protein", type="primary", use_container_width=True):
795
  seq = protein_seq.strip().upper()
796
 
797
+ # Validation
798
+ valid_aa = set("ACDEFGHIKLMNPQRSTVWY")
799
+ invalid = set(seq) - valid_aa
800
+ if invalid:
801
+ st.warning(f"⚠️ Invalid amino acids detected: {', '.join(invalid)}")
802
+ seq = ''.join([aa for aa in seq if aa in valid_aa])
803
 
804
+ if len(seq) < 3:
805
+ st.error("Sequence too short. Please enter at least 3 amino acids.")
806
+ else:
807
+ # Basic analysis
808
+ st.markdown("### 📊 Sequence Statistics")
809
+ col1, col2, col3, col4 = st.columns(4)
810
 
811
+ with col1:
812
+ st.metric("Length", f"{len(seq)} aa")
813
+ st.metric("Mol. Weight", f"~{len(seq) * 110:.1f} Da")
814
+
815
+ with col2:
816
+ unique_aa = len(set(seq))
817
+ st.metric("Unique AA", f"{unique_aa}/20")
818
+ charged = sum(1 for aa in seq if aa in "DEKR")
819
+ st.metric("Charged", f"{charged/len(seq)*100:.1f}%")
820
+
821
+ with col3:
822
+ hydrophobic = sum(1 for aa in seq if aa in "AVILMFYW")
823
+ st.metric("Hydrophobic", f"{hydrophobic/len(seq)*100:.1f}%")
824
+ aromatic = sum(1 for aa in seq if aa in "FWY")
825
+ st.metric("Aromatic", f"{aromatic/len(seq)*100:.1f}%")
826
+
827
+ with col4:
828
+ basic = sum(1 for aa in seq if aa in "KRH")
829
+ acidic = sum(1 for aa in seq if aa in "DE")
830
+ pi_estimate = 7 + (basic - acidic) * 0.5
831
+ st.metric("pI (est.)", f"~{pi_estimate:.1f}")
832
+ st.metric("Basic/Acidic", f"{basic}/{acidic}")
833
+
834
+ # Secondary structure prediction (simplified)
835
+ st.markdown("### 🔮 Predicted Properties")
836
+ col1, col2 = st.columns(2)
837
+
838
+ with col1:
839
+ # Helix propensity
840
+ helix_aa = "AELMQKRH"
841
+ helix_score = sum(1 for aa in seq if aa in helix_aa) / len(seq)
842
+ st.metric("α-Helix Propensity", f"{helix_score*100:.1f}%")
843
+
844
+ # Beta propensity
845
+ beta_aa = "FIVWY"
846
+ beta_score = sum(1 for aa in seq if aa in beta_aa) / len(seq)
847
+ st.metric("β-Sheet Propensity", f"{beta_score*100:.1f}%")
848
+
849
+ with col2:
850
+ # Disorder prediction
851
+ disorder_aa = "PESKTQ"
852
+ disorder_score = sum(1 for aa in seq if aa in disorder_aa) / len(seq)
853
+ st.metric("Disorder Tendency", f"{disorder_score*100:.1f}%")
854
+
855
+ # Solubility estimate
856
+ soluble_score = 100 - (hydrophobic/len(seq)*100)
857
+ st.metric("Solubility Score", f"{soluble_score:.1f}%")
858
+
859
+ # AI Analysis
860
+ if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
861
+ st.markdown("### 🤖 AI-Powered Analysis")
862
+ with st.spinner("Running ESM-2 analysis... This may take 10-30 seconds"):
863
+ result = esm2_embed(seq, esm_model)
864
 
865
+ if "error" in result:
866
+ st.error(f"Analysis failed: {result['error']}")
867
+ else:
868
+ st.success("✅ AI analysis complete!")
869
+
870
+ col1, col2, col3 = st.columns(3)
871
+ with col1:
872
+ st.metric("Embedding Dimension", result['size'])
873
+ with col2:
874
+ st.metric("Mean Value", f"{result.get('mean', 0):.3f}")
875
+ with col3:
876
+ st.metric("Std Dev", f"{result.get('std', 0):.3f}")
877
+
878
+ # Visualization placeholder
879
+ st.markdown("**🎨 Embedding Visualization:**")
880
+ st.info("The protein has been encoded into a high-dimensional space where similar proteins cluster together.")
881
+
882
+ # Applications
883
+ st.markdown("""
884
+ ### 🎯 Applications of This Analysis:
885
+
886
+ 1. **🔍 Similar Protein Search**: Find proteins with similar functions
887
+ 2. **💊 Drug Target Identification**: Predict binding sites and interactions
888
+ 3. **🧬 Mutation Impact**: Assess how changes affect protein function
889
+ 4. **🏗️ Structure Prediction**: Input for AlphaFold-like systems
890
+ 5. **⚗️ Protein Engineering**: Design improved variants
891
+ """)
892
+ else:
893
+ st.warning("⚠️ AI models are loading. Please refresh in a moment.")
894
 
895
+ # Enhanced DNA tab
896
  with tab3:
897
+ st.subheader("🧬 Advanced DNA Analysis")
898
 
899
+ with st.expander("📚 Learn About DNA Analysis", expanded=False):
900
+ st.markdown("""
901
+ ### Understanding DNA Sequences
902
+
903
+ **DNA** is the blueprint of life, encoding all genetic information in four bases:
904
+ - **A** (Adenine): Pairs with T
905
+ - **T** (Thymine): Pairs with A
906
+ - **G** (Guanine): Pairs with C
907
+ - **C** (Cytosine): Pairs with G
908
+
909
+ **Key Concepts:**
910
+ - **Gene**: A DNA segment that codes for a protein
911
+ - **Promoter**: Controls when genes are turned on/off
912
+ - **Codon**: Three bases that code for one amino acid
913
+ - **GC Content**: Affects stability and gene expression
914
+
915
+ **DNABERT-2** is an AI model that understands DNA "language" to predict:
916
+ - Gene function
917
+ - Regulatory elements
918
+ - Disease-causing mutations
919
+ - Evolution patterns
920
+ """)
921
 
922
  dna_seq = st.text_area(
923
+ "Enter DNA sequence:",
924
  value="ATGCGATCGTAGC",
925
+ help="Use A, T, G, C for DNA (U will be converted to T for RNA)",
926
  height=100
927
  )
928
 
929
+ # Example sequences
930
+ st.markdown("**🧪 Example Sequences (Click to analyze):**")
931
+ col1, col2, col3, col4 = st.columns(4)
932
  with col1:
933
+ if st.button("📋 TATA Box", key="tata"):
934
+ st.code("TATAAAAGCGCGCGCG", language=None)
935
+ st.caption("Gene start signal")
936
  with col2:
937
+ if st.button("🎯 Promoter", key="prom"):
938
  st.code("TTGACAGGCTAGCTCAGTCCTAGGTATAATGCTAGC", language=None)
939
+ st.caption("Gene control region")
940
  with col3:
941
+ if st.button("✂️ CRISPR", key="crispr"):
942
  st.code("GTCACCTCCAATGACTAGGGTGG", language=None)
943
+ st.caption("Gene editing target")
944
+ with col4:
945
+ if st.button("🧬 Telomere", key="telo"):
946
+ st.code("TTAGGGTTAGGGTTAGGG", language=None)
947
+ st.caption("Chromosome end")
948
 
949
+ if st.button("🔬 Analyze DNA", type="primary", use_container_width=True):
950
+ seq = dna_seq.strip().upper().replace("U", "T")
951
+ seq = ''.join(c for c in seq if c in 'ATGC')
952
 
953
  if len(seq) < 3:
954
+ st.error("Sequence too short. Please enter at least 3 bases.")
955
  else:
956
+ # Advanced statistics
957
+ st.markdown("### 📊 Sequence Analysis")
958
+
959
+ col1, col2, col3, col4 = st.columns(4)
960
 
961
  with col1:
962
+ st.metric("Length", f"{len(seq)} bp")
963
+ st.metric("Size", f"~{len(seq)*660:.0f} Da")
964
+
965
+ with col2:
966
  gc = (seq.count("G") + seq.count("C")) / len(seq) * 100
967
+ st.metric("GC Content", f"{gc:.1f}%")
968
+ if gc > 65:
969
+ st.caption("🔴 Very high")
970
+ elif gc > 55:
971
+ st.caption("🟠 High")
972
+ elif gc < 35:
973
+ st.caption("🔵 Low")
974
+ elif gc < 25:
975
+ st.caption("🟣 Very low")
976
  else:
977
+ st.caption("🟢 Normal")
978
 
979
+ with col3:
980
+ at = 100 - gc
981
+ st.metric("AT Content", f"{at:.1f}%")
982
+ tm = 4 * (seq.count("G") + seq.count("C")) + 2 * (seq.count("A") + seq.count("T"))
983
+ st.metric("Tm (est.)", f"{tm}°C")
984
+
985
+ with col4:
986
+ cpg = seq.count("CG")
987
+ cpg_ratio = (cpg * len(seq)) / (seq.count("C") * seq.count("G")) if seq.count("C") * seq.count("G") > 0 else 0
988
+ st.metric("CpG Sites", cpg)
989
+ st.metric("CpG O/E", f"{cpg_ratio:.2f}")
990
+
991
+ # Motif search
992
+ st.markdown("### 🔍 Regulatory Elements & Motifs")
993
 
 
 
994
  motifs_found = []
995
+ motif_positions = []
996
+
997
+ # Extended motif database
998
+ motif_db = {
999
+ "TATA Box": ["TATAAA", "TATAWAW"],
1000
+ "CAAT Box": ["CAAT", "CCAAT", "GGCCAATCT"],
1001
+ "GC Box": ["GGGCGG", "GGCGGG"],
1002
+ "Start Codon": ["ATG"],
1003
+ "Stop Codons": ["TAA", "TAG", "TGA"],
1004
+ "Kozak Sequence": ["GCCRCCATGG"],
1005
+ "Poly-A Signal": ["AATAAA", "ATTAAA"],
1006
+ "E-box": ["CANNTG"],
1007
+ "CRE": ["TGACGTCA"],
1008
+ "NF-κB": ["GGGACTTTCC"]
1009
+ }
1010
 
1011
+ for motif_name, patterns in motif_db.items():
1012
+ for pattern in patterns:
1013
+ # Simple pattern matching (R=A/G, W=A/T, N=any)
1014
+ simple_pattern = pattern.replace("R", "[AG]").replace("W", "[AT]").replace("N", "[ATGC]")
1015
+ import re
1016
+ if re.search(simple_pattern, seq):
1017
+ motifs_found.append(f" {motif_name}: {pattern}")
1018
+ break
 
 
1019
 
1020
  if motifs_found:
1021
  for motif in motifs_found:
1022
  st.write(motif)
1023
  else:
1024
+ st.info("No known regulatory motifs detected")
1025
+
1026
+ # Codon analysis
1027
+ if len(seq) >= 3:
1028
+ st.markdown("### 🧬 Coding Potential Analysis")
1029
+
1030
+ col1, col2 = st.columns(2)
1031
+
1032
+ with col1:
1033
+ # Reading frames
1034
+ st.markdown("**Open Reading Frames:**")
1035
+ for frame in range(3):
1036
+ frame_seq = seq[frame:]
1037
+ if "ATG" in frame_seq:
1038
+ start_pos = frame_seq.index("ATG") + frame
1039
+ st.write(f"Frame {frame+1}: Start at position {start_pos+1}")
1040
+
1041
+ with col2:
1042
+ # Codon usage
1043
+ if len(seq) % 3 == 0:
1044
+ st.markdown("**Codon Statistics:**")
1045
+ codon_count = len(seq) // 3
1046
+ st.metric("Total Codons", codon_count)
1047
+
1048
+ # Count stops
1049
+ stops = seq.count("TAA") + seq.count("TAG") + seq.count("TGA")
1050
+ st.metric("Stop Codons", stops)
1051
 
1052
  # AI Analysis
1053
  if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
1054
+ st.markdown("### 🤖 AI-Powered Genomic Analysis")
1055
+ with st.spinner("Running DNABERT analysis... This may take 10-30 seconds"):
1056
  result = dna_embed(seq, dna_model)
1057
+
1058
  if "error" in result:
1059
+ st.error(f"Analysis failed: {result['error']}")
1060
  else:
1061
+ st.success("✅ AI analysis complete!")
1062
 
1063
+ col1, col2, col3 = st.columns(3)
1064
  with col1:
1065
+ st.metric("Embedding Dimension", result['size'])
 
 
1066
  with col2:
1067
+ st.metric("k-mer Count", result.get('kmer_count', 'N/A'))
1068
+ with col3:
1069
+ st.metric("Mean Value", f"{result.get('mean', 0):.3f}")
1070
 
1071
  st.markdown("""
1072
+ ### 🎯 Applications of DNA Analysis:
1073
+
1074
+ 1. **🔬 Gene Discovery**: Identify coding and regulatory regions
1075
+ 2. **🏥 Disease Diagnosis**: Detect pathogenic mutations
1076
+ 3. **✂️ CRISPR Design**: Find optimal gene editing sites
1077
+ 4. **🌱 Evolution Studies**: Compare sequences across species
1078
+ 5. **💊 Personalized Medicine**: Tailor treatments to genetic profiles
1079
+ 6. **🦠 Pathogen Detection**: Identify viral/bacterial DNA
1080
  """)
1081
  else:
1082
+ st.warning("⚠️ AI models are loading. Please refresh in a moment.")
1083
 
1084
# Analysis History tab: browse recent Q&A sessions, export them, clear them.
with tab4:
    st.subheader("📊 Analysis History & Insights")

    history = st.session_state.chat_history

    if history:
        st.markdown(f"### 💾 Previous Analyses ({len(history)} total)")

        # Show only the five most recent entries, newest first.
        for i, entry in enumerate(reversed(history[-5:])):
            with st.expander(f"🕐 {entry['timestamp']} - Mode: {entry['mode']}", expanded=False):
                st.markdown("**Question:**")
                st.write(entry['question'])
                st.markdown("**Answer:**")
                # Preview is truncated at 500 chars; full text via the button below.
                answer = entry['answer']
                st.write(answer[:500] + "..." if len(answer) > 500 else answer)

                if st.button("View Full", key=f"view_{i}"):
                    st.markdown(answer)
    else:
        st.info("No analysis history yet. Start by asking a question in the Chat tab!")

    # Export options
    if history:
        st.markdown("### 📤 Export Options")
        col1, col2 = st.columns(2)

        with col1:
            # Render the download button directly: nesting st.download_button
            # inside `if st.button(...)` makes it vanish on the next rerun.
            md_content = "\n\n---\n\n".join(
                f"## {entry['timestamp']}\n\n**Q:** {entry['question']}\n\n**A:** {entry['answer']}"
                for entry in history
            )
            st.download_button(
                "Export as Markdown",
                md_content,
                f"bioseq_history_{time.strftime('%Y%m%d')}.md",
                "text/markdown"
            )

        with col2:
            if st.button("Clear History"):
                st.session_state.chat_history = []
                st.rerun()
1125
+
1126
+ # Enhanced About tab
1127
+ with tab5:
1128
+ st.subheader("ℹ️ About BioSeq Chat Pro")
1129
+
1130
  st.markdown("""
1131
+ ### 🚀 Enhanced Features
1132
+
1133
+ #### **Collaborative AI System**
1134
+ - 🔍 **Investigator**: Verifies facts and identifies knowledge gaps
1135
+ - 📝 **Supervisor**: Creates comprehensive, structured answers
1136
+ - ✅ **Critic**: Reviews for accuracy and clarity
1137
+ - 🎯 **Integrator**: Synthesizes all inputs into final answer
1138
+
1139
+ #### **Technical Improvements**
1140
+ - **8000 token responses** for comprehensive answers
1141
+ - **Enhanced context building** with semantic search
1142
+ - **Multiple collaboration modes** (Full, Quick, Deep)
1143
+ - **Scientific source prioritization** in web search
1144
+ - **Larger embedding models** for better accuracy
1145
+
1146
+ ### 🧬 Supported Analyses
1147
+ - **Protein Analysis**: ESM-2 embeddings, property prediction
1148
+ - **DNA Analysis**: DNABERT-2/BERT embeddings, motif search
1149
+ - **RAG Chat**: Context-aware Q&A with file integration
1150
+ - **PDF Support**: Direct analysis of research papers
1151
+
1152
+ ### 📚 Models & Technologies
1153
+ - **LLM**: Llama 3.1 70B (via Fireworks AI)
1154
+ - **Protein**: ESM-2 (Meta/Facebook)
1155
+ - **DNA**: DNABERT-2 (Microsoft) / BERT (Google)
1156
+ - **Embeddings**: all-mpnet-base-v2 (Sentence Transformers)
1157
+ - **Vector Search**: FAISS (Facebook)
1158
+
1159
+ ### ⚠️ Disclaimer
1160
+ This tool is designed for **research and educational purposes only**.
1161
+ - Not intended for medical diagnosis or treatment
1162
+ - Not validated for clinical use
1163
+ - Always consult qualified professionals for medical decisions
1164
+
1165
+ ### 🔧 System Status
1166
  """)
1167
 
1168
+ # System status with better formatting
1169
+ col1, col2 = st.columns(2)
1170
+
1171
+ deps_essential = {
1172
  "PyTorch": TORCH_AVAILABLE,
1173
  "Transformers": TRANSFORMERS_AVAILABLE,
1174
  "Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
1175
  "FAISS": FAISS_AVAILABLE,
1176
+ }
1177
+
1178
+ deps_optional = {
1179
  "BioPython": BIOPYTHON_AVAILABLE,
1180
  "Datasets": DATASETS_AVAILABLE,
1181
+ "PDF (pdfplumber)": PDFPLUMBER_AVAILABLE,
1182
+ "PDF (PyPDF2)": PYPDF2_AVAILABLE
1183
  }
1184
 
1185
+ with col1:
1186
+ st.markdown("**Essential Components:**")
1187
+ for name, available in deps_essential.items():
1188
+ if available:
1189
+ st.success(f" {name}")
1190
+ else:
1191
+ st.error(f"❌ {name}")
1192
+
1193
+ with col2:
1194
+ st.markdown("**Optional Components:**")
1195
+ for name, available in deps_optional.items():
1196
+ if available:
1197
+ st.success(f"✅ {name}")
1198
+ else:
1199
+ st.warning(f"⚠️ {name}")
1200
+
1201
+ # Performance metrics
1202
+ if st.session_state.chat_history:
1203
+ st.markdown("### 📈 Usage Statistics")
1204
+ col1, col2, col3 = st.columns(3)
1205
+ with col1:
1206
+ st.metric("Total Queries", len(st.session_state.chat_history))
1207
+ with col2:
1208
+ modes = [h['mode'] for h in st.session_state.chat_history]
1209
+ most_used = max(set(modes), key=modes.count) if modes else "N/A"
1210
+ st.metric("Most Used Mode", most_used)
1211
+ with col3:
1212
+ avg_length = sum(len(h['answer']) for h in st.session_state.chat_history) / len(st.session_state.chat_history)
1213
+ st.metric("Avg Answer Length", f"{avg_length:.0f} chars")
1214
+
1215
+ st.markdown("""
1216
+ ---
1217
+ ### 📞 Support & Feedback
1218
+ - Report issues or suggest features
1219
+ - Contribute to development
1220
+ - Share your research results
1221
+
1222
+ **Version**: 2.0.0 Pro | **Last Updated**: 2025
1223
+ """)