Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
from typing import List, Dict, Tuple
|
|
|
|
| 4 |
|
| 5 |
import streamlit as st
|
| 6 |
import requests
|
|
@@ -64,7 +65,7 @@ except ImportError:
|
|
| 64 |
print("[WARNING] PyPDF2 not available")
|
| 65 |
|
| 66 |
# 상수
|
| 67 |
-
APP_TITLE = "BioSeq Chat:
|
| 68 |
DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
|
| 69 |
|
| 70 |
# --------------- Helper Functions ---------------
|
|
@@ -72,12 +73,10 @@ DISCLAIMER = "This tool is for research/education and is not a medical device. D
|
|
| 72 |
def get_secret(name: str, fallback: str = "") -> str:
|
| 73 |
"""Get secret from st.secrets or environment"""
|
| 74 |
try:
|
| 75 |
-
# Streamlit secrets
|
| 76 |
if hasattr(st, 'secrets') and name in st.secrets:
|
| 77 |
return st.secrets[name]
|
| 78 |
except:
|
| 79 |
pass
|
| 80 |
-
# Environment variable
|
| 81 |
return os.environ.get(name, fallback)
|
| 82 |
|
| 83 |
def brave_search(query: str, count: int = 5) -> List[Dict]:
|
|
@@ -112,8 +111,8 @@ def brave_search(query: str, count: int = 5) -> List[Dict]:
|
|
| 112 |
except Exception as e:
|
| 113 |
return [{"title": "Error", "url": "", "snippet": str(e)}]
|
| 114 |
|
| 115 |
-
def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int =
|
| 116 |
-
"""Call Fireworks AI API"""
|
| 117 |
api_key = get_secret("FIREWORKS_API_KEY", "")
|
| 118 |
if not api_key:
|
| 119 |
return "FIREWORKS_API_KEY missing. Set it in Secrets or sidebar."
|
|
@@ -122,7 +121,7 @@ def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 4
|
|
| 122 |
payload = {
|
| 123 |
"model": "accounts/fireworks/models/llama-v3p1-70b-instruct",
|
| 124 |
"messages": messages,
|
| 125 |
-
"max_tokens": max_tokens,
|
| 126 |
"temperature": temperature,
|
| 127 |
"top_p": 1,
|
| 128 |
"frequency_penalty": 0,
|
|
@@ -134,12 +133,152 @@ def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 4
|
|
| 134 |
}
|
| 135 |
|
| 136 |
try:
|
| 137 |
-
r = requests.post(url, headers=headers, json=payload, timeout=
|
| 138 |
r.raise_for_status()
|
| 139 |
return r.json()["choices"][0]["message"]["content"]
|
| 140 |
except Exception as e:
|
| 141 |
return f"[LLM Error] {e}"
|
| 142 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
def load_file_text(upload) -> str:
|
| 144 |
"""Load text from uploaded file (PDF 지원 포함)"""
|
| 145 |
name = upload.name.lower()
|
|
@@ -194,8 +333,8 @@ def load_file_text(upload) -> str:
|
|
| 194 |
|
| 195 |
return text
|
| 196 |
|
| 197 |
-
def chunk_text(text: str, size: int =
|
| 198 |
-
"""Split text into chunks"""
|
| 199 |
chunks = []
|
| 200 |
start = 0
|
| 201 |
text_len = len(text)
|
|
@@ -210,12 +349,13 @@ def chunk_text(text: str, size: int = 1200, overlap: int = 200) -> List[str]:
|
|
| 210 |
return chunks
|
| 211 |
|
| 212 |
def build_index(texts: List[str]):
|
| 213 |
-
"""Build vector index"""
|
| 214 |
if not SENTENCE_TRANSFORMERS_AVAILABLE or not FAISS_AVAILABLE:
|
| 215 |
return None, None
|
| 216 |
|
| 217 |
try:
|
| 218 |
-
|
|
|
|
| 219 |
embeddings = model.encode(texts, show_progress_bar=False)
|
| 220 |
|
| 221 |
dim = embeddings.shape[1]
|
|
@@ -227,8 +367,8 @@ def build_index(texts: List[str]):
|
|
| 227 |
st.warning(f"Index build failed: {e}")
|
| 228 |
return None, None
|
| 229 |
|
| 230 |
-
def search_index(query: str, index, model, texts: List[str], k: int =
|
| 231 |
-
"""Search vector index"""
|
| 232 |
if index is None or model is None:
|
| 233 |
return []
|
| 234 |
|
|
@@ -247,8 +387,33 @@ def search_index(query: str, index, model, texts: List[str], k: int = 4) -> List
|
|
| 247 |
except:
|
| 248 |
return []
|
| 249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
|
| 251 |
-
"""ESM-2 protein embedding"""
|
| 252 |
if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
|
| 253 |
return {"error": "PyTorch/Transformers not available"}
|
| 254 |
|
|
@@ -262,6 +427,9 @@ def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
|
|
| 262 |
outputs = model(**inputs, output_hidden_states=True)
|
| 263 |
hidden = outputs.hidden_states[-1].mean(dim=1).squeeze(0)
|
| 264 |
vec = hidden.cpu().numpy()
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
# 메모리 정리
|
| 267 |
del model
|
|
@@ -270,14 +438,17 @@ def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
|
|
| 270 |
torch.cuda.empty_cache()
|
| 271 |
|
| 272 |
return {
|
| 273 |
-
"embedding": vec.tolist()[:10],
|
| 274 |
-
"size": vec.shape[0]
|
|
|
|
|
|
|
|
|
|
| 275 |
}
|
| 276 |
except Exception as e:
|
| 277 |
return {"error": str(e)}
|
| 278 |
|
| 279 |
def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
|
| 280 |
-
"""DNA embedding"""
|
| 281 |
if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
|
| 282 |
return {"error": "PyTorch/Transformers not available"}
|
| 283 |
|
|
@@ -288,8 +459,14 @@ def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
|
|
| 288 |
except ImportError:
|
| 289 |
return {"error": "einops package required. Please wait for installation and refresh the page."}
|
| 290 |
|
| 291 |
-
#
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
try:
|
| 294 |
from transformers import AutoTokenizer, AutoModel
|
| 295 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
|
@@ -298,7 +475,6 @@ def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
|
|
| 298 |
# 대체 모델 사용
|
| 299 |
try:
|
| 300 |
from transformers import BertTokenizer, BertModel
|
| 301 |
-
# 기본 BERT 모델로 폴백
|
| 302 |
fallback_model = "bert-base-uncased"
|
| 303 |
tokenizer = BertTokenizer.from_pretrained(fallback_model)
|
| 304 |
model = BertModel.from_pretrained(fallback_model)
|
|
@@ -308,19 +484,13 @@ def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
|
|
| 308 |
|
| 309 |
model.eval()
|
| 310 |
|
| 311 |
-
#
|
| 312 |
-
def seq_to_kmer(seq, k=6):
|
| 313 |
-
"""DNA 서열을 k-mer로 변환"""
|
| 314 |
-
kmers = []
|
| 315 |
-
for i in range(len(seq) - k + 1):
|
| 316 |
-
kmers.append(seq[i:i+k])
|
| 317 |
-
return ' '.join(kmers)
|
| 318 |
-
|
| 319 |
-
# k-mer 변환 또는 직접 사용
|
| 320 |
if len(seq) > 6:
|
| 321 |
input_seq = seq_to_kmer(seq, k=6)
|
|
|
|
| 322 |
else:
|
| 323 |
input_seq = seq
|
|
|
|
| 324 |
|
| 325 |
with torch.no_grad():
|
| 326 |
inputs = tokenizer(
|
|
@@ -332,7 +502,6 @@ def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
|
|
| 332 |
)
|
| 333 |
outputs = model(**inputs)
|
| 334 |
|
| 335 |
-
# last_hidden_state 또는 pooler_output 사용
|
| 336 |
if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
|
| 337 |
vec = outputs.pooler_output.squeeze(0).cpu().numpy()
|
| 338 |
else:
|
|
@@ -346,67 +515,16 @@ def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
|
|
| 346 |
torch.cuda.empty_cache()
|
| 347 |
|
| 348 |
return {
|
| 349 |
-
"embedding": vec.tolist()[:10],
|
| 350 |
-
"size": vec.shape[0]
|
|
|
|
|
|
|
|
|
|
| 351 |
}
|
| 352 |
|
| 353 |
except Exception as e:
|
| 354 |
return {"error": f"분석 중 오류 발생: {str(e)[:200]}"}
|
| 355 |
|
| 356 |
-
def build_context(query: str, docs: List[str], index, model, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
|
| 357 |
-
"""Build context from sources"""
|
| 358 |
-
pieces = []
|
| 359 |
-
sources = []
|
| 360 |
-
|
| 361 |
-
# File search
|
| 362 |
-
if index and model and docs:
|
| 363 |
-
hits = search_index(query, index, model, docs, k=4)
|
| 364 |
-
for h in hits:
|
| 365 |
-
pieces.append(f"[FILE] {h['text'][:500]}")
|
| 366 |
-
sources.append({"type": "file", "text": h['text'][:100]})
|
| 367 |
-
|
| 368 |
-
# Web search
|
| 369 |
-
if use_web:
|
| 370 |
-
results = brave_search(query, count=web_k)
|
| 371 |
-
for r in results:
|
| 372 |
-
pieces.append(f"[WEB] {r['title']}\n{r['snippet']}")
|
| 373 |
-
sources.append({"type": "web", "title": r['title'], "url": r['url']})
|
| 374 |
-
|
| 375 |
-
context = "\n\n---\n\n".join(pieces)[:4000]
|
| 376 |
-
return context, sources
|
| 377 |
-
|
| 378 |
-
def answer_question(query: str, context: str) -> str:
|
| 379 |
-
"""Generate answer"""
|
| 380 |
-
system = (
|
| 381 |
-
"You are an expert bioinformatics assistant who explains complex biological concepts in an accessible way. "
|
| 382 |
-
"Your responses should be:\n"
|
| 383 |
-
"1. Comprehensive yet easy to understand\n"
|
| 384 |
-
"2. Well-structured with clear sections\n"
|
| 385 |
-
"3. Include relevant examples and analogies\n"
|
| 386 |
-
"4. Provide actionable insights when appropriate\n"
|
| 387 |
-
"5. Use Korean if the user writes in Korean, otherwise English\n"
|
| 388 |
-
"6. Never provide medical diagnosis or treatment advice\n"
|
| 389 |
-
"7. Format your response with headers, bullet points, and clear paragraphs\n"
|
| 390 |
-
"8. Aim for 300-500 words minimum for complex questions"
|
| 391 |
-
)
|
| 392 |
-
|
| 393 |
-
user_msg = f"""Context information:\n{context}\n\n
|
| 394 |
-
User Question: {query}
|
| 395 |
-
|
| 396 |
-
Please provide a detailed, well-structured response that:
|
| 397 |
-
- Directly answers the question
|
| 398 |
-
- Explains the biological background
|
| 399 |
-
- Includes practical implications when relevant
|
| 400 |
-
- Uses simple analogies to explain complex concepts
|
| 401 |
-
- Cites the context when appropriate"""
|
| 402 |
-
|
| 403 |
-
messages = [
|
| 404 |
-
{"role": "system", "content": system},
|
| 405 |
-
{"role": "user", "content": user_msg}
|
| 406 |
-
]
|
| 407 |
-
|
| 408 |
-
return call_llm(messages, temperature=0.4, max_tokens=4000)
|
| 409 |
-
|
| 410 |
# --------------- Streamlit UI ---------------
|
| 411 |
|
| 412 |
st.set_page_config(page_title=APP_TITLE, page_icon="🧬", layout="wide")
|
|
@@ -420,20 +538,24 @@ if "index" not in st.session_state:
|
|
| 420 |
st.session_state.index = None
|
| 421 |
if "model" not in st.session_state:
|
| 422 |
st.session_state.model = None
|
|
|
|
|
|
|
| 423 |
|
| 424 |
# Sidebar
|
| 425 |
with st.sidebar:
|
| 426 |
-
st.header("Configuration")
|
| 427 |
|
| 428 |
fw_key = st.text_input(
|
| 429 |
"FIREWORKS_API_KEY",
|
| 430 |
value=get_secret("FIREWORKS_API_KEY", ""),
|
| 431 |
-
type="password"
|
|
|
|
| 432 |
)
|
| 433 |
brave_key = st.text_input(
|
| 434 |
"BRAVE_API_KEY",
|
| 435 |
value=get_secret("BRAVE_API_KEY", ""),
|
| 436 |
-
type="password"
|
|
|
|
| 437 |
)
|
| 438 |
|
| 439 |
if fw_key:
|
|
@@ -443,73 +565,115 @@ with st.sidebar:
|
|
| 443 |
|
| 444 |
st.divider()
|
| 445 |
|
|
|
|
| 446 |
esm_model = st.text_input(
|
| 447 |
"ESM-2 Model",
|
| 448 |
-
value="facebook/esm2_t6_8M_UR50D"
|
|
|
|
| 449 |
)
|
| 450 |
dna_model = st.text_input(
|
| 451 |
"DNA Model",
|
| 452 |
-
value="bert-base-uncased",
|
| 453 |
-
help="
|
| 454 |
)
|
| 455 |
|
|
|
|
|
|
|
|
|
|
| 456 |
use_web = st.checkbox("Enable web search", value=True)
|
| 457 |
-
web_results = st.slider("Web results", 1, 10,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
|
| 459 |
# Tabs
|
| 460 |
-
tab1, tab2, tab3, tab4 = st.tabs(["Chat", "Protein", "DNA", "About"])
|
| 461 |
|
| 462 |
# File upload
|
| 463 |
with st.expander("📁 Upload Files", expanded=True):
|
| 464 |
files = st.file_uploader(
|
| 465 |
-
"Upload text/FASTA/PDF files",
|
| 466 |
-
type=["txt", "fa", "fasta", "csv", "json", "pdf"],
|
| 467 |
-
accept_multiple_files=True
|
|
|
|
| 468 |
)
|
| 469 |
|
| 470 |
if files:
|
| 471 |
docs = []
|
| 472 |
for f in files:
|
| 473 |
try:
|
| 474 |
-
# PDF 파일인 경우 경고 메시지 추가
|
| 475 |
if f.name.lower().endswith(".pdf"):
|
| 476 |
if not (PDFPLUMBER_AVAILABLE or PYPDF2_AVAILABLE):
|
| 477 |
-
st.warning(f"⚠️ PDF
|
| 478 |
continue
|
| 479 |
|
| 480 |
text = load_file_text(f)
|
| 481 |
if text:
|
| 482 |
docs.extend(chunk_text(text))
|
| 483 |
-
st.success(f"✅ {f.name}
|
| 484 |
except Exception as e:
|
| 485 |
st.error(f"Error reading {f.name}: {e}")
|
| 486 |
|
| 487 |
if docs:
|
| 488 |
st.session_state.docs = docs
|
| 489 |
-
st.
|
| 490 |
|
| 491 |
if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
|
| 492 |
-
with st.spinner("
|
| 493 |
index, model = build_index(docs)
|
| 494 |
if index:
|
| 495 |
st.session_state.index = index
|
| 496 |
st.session_state.model = model
|
|
|
|
| 497 |
|
| 498 |
-
# Chat tab
|
| 499 |
with tab1:
|
| 500 |
-
st.subheader("💬 Chat
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
|
| 502 |
question = st.text_area(
|
| 503 |
-
"Ask about proteins, DNA, or bioinformatics:",
|
| 504 |
-
value="
|
| 505 |
height=100
|
| 506 |
)
|
| 507 |
|
| 508 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
if not get_secret("FIREWORKS_API_KEY"):
|
| 510 |
-
st.error("Please set FIREWORKS_API_KEY")
|
| 511 |
else:
|
| 512 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
context, sources = build_context(
|
| 514 |
question,
|
| 515 |
st.session_state.docs,
|
|
@@ -519,246 +683,541 @@ with tab1:
|
|
| 519 |
web_results
|
| 520 |
)
|
| 521 |
|
| 522 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 523 |
|
| 524 |
-
st.
|
| 525 |
-
|
| 526 |
|
| 527 |
-
|
| 528 |
-
st.markdown("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 529 |
for s in sources:
|
| 530 |
if s["type"] == "web":
|
| 531 |
st.write(f"- 🌐 [{s['title']}]({s['url']})")
|
| 532 |
elif s["type"] == "file":
|
| 533 |
-
st.write(f"- 📄 File: {s['text'][:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
|
| 535 |
-
# Protein tab
|
| 536 |
with tab2:
|
| 537 |
-
st.subheader("🧬 Protein Analysis")
|
| 538 |
|
| 539 |
-
st.
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 545 |
|
| 546 |
protein_seq = st.text_area(
|
| 547 |
-
"
|
| 548 |
value="MKTIIALSYIFCLVFA",
|
| 549 |
-
help="
|
| 550 |
height=100
|
| 551 |
)
|
| 552 |
|
| 553 |
-
|
| 554 |
-
|
|
|
|
| 555 |
with col1:
|
| 556 |
-
if st.button("
|
| 557 |
st.code("FVNQHLCGSHLVEALYLVCGERGFFYTPKT", language=None)
|
| 558 |
with col2:
|
| 559 |
-
if st.button("
|
| 560 |
st.code("YGGFMTSEKSQTPLVTLFKNAIIKNAYKKGE", language=None)
|
| 561 |
with col3:
|
| 562 |
-
if st.button("
|
| 563 |
st.code("CYIQNCPLG", language=None)
|
|
|
|
|
|
|
|
|
|
| 564 |
|
| 565 |
-
if st.button("🔬
|
| 566 |
seq = protein_seq.strip().upper()
|
| 567 |
|
| 568 |
-
#
|
| 569 |
-
|
| 570 |
-
|
|
|
|
|
|
|
|
|
|
| 571 |
|
| 572 |
-
|
| 573 |
-
st.
|
| 574 |
-
|
|
|
|
|
|
|
|
|
|
| 575 |
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
if
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 600 |
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 610 |
|
| 611 |
-
# DNA tab
|
| 612 |
with tab3:
|
| 613 |
-
st.subheader("🧬 DNA Analysis")
|
| 614 |
|
| 615 |
-
st.
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
|
| 622 |
dna_seq = st.text_area(
|
| 623 |
-
"DNA
|
| 624 |
value="ATGCGATCGTAGC",
|
| 625 |
-
help="DNA
|
| 626 |
height=100
|
| 627 |
)
|
| 628 |
|
| 629 |
-
|
| 630 |
-
|
|
|
|
| 631 |
with col1:
|
| 632 |
-
if st.button("TATA
|
| 633 |
-
st.code("
|
| 634 |
-
st.caption("
|
| 635 |
with col2:
|
| 636 |
-
if st.button("
|
| 637 |
st.code("TTGACAGGCTAGCTCAGTCCTAGGTATAATGCTAGC", language=None)
|
| 638 |
-
st.caption("
|
| 639 |
with col3:
|
| 640 |
-
if st.button("CRISPR
|
| 641 |
st.code("GTCACCTCCAATGACTAGGGTGG", language=None)
|
| 642 |
-
st.caption("
|
|
|
|
|
|
|
|
|
|
|
|
|
| 643 |
|
| 644 |
-
if st.button("🔬 DNA
|
| 645 |
-
seq = dna_seq.strip().upper().replace("U", "T")
|
| 646 |
-
seq = ''.join(c for c in seq if c in 'ATGC')
|
| 647 |
|
| 648 |
if len(seq) < 3:
|
| 649 |
-
st.error("
|
| 650 |
else:
|
| 651 |
-
|
| 652 |
-
|
|
|
|
|
|
|
| 653 |
|
| 654 |
with col1:
|
| 655 |
-
st.metric("
|
|
|
|
|
|
|
|
|
|
| 656 |
gc = (seq.count("G") + seq.count("C")) / len(seq) * 100
|
| 657 |
-
st.metric("GC
|
| 658 |
-
if gc >
|
| 659 |
-
st.caption("🔴
|
| 660 |
-
elif gc
|
| 661 |
-
st.caption("
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
else:
|
| 663 |
-
st.caption("🟢
|
| 664 |
|
| 665 |
-
with
|
| 666 |
-
at =
|
| 667 |
-
st.metric("AT
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 673 |
|
| 674 |
-
# 특별 서열 찾기
|
| 675 |
-
st.markdown("### 🔍 주요 모티프 검색")
|
| 676 |
motifs_found = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 677 |
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
if seq.count("CG") > len(seq)/20:
|
| 687 |
-
motifs_found.append("✅ CpG 섬 가능성 (유전자 조절)")
|
| 688 |
|
| 689 |
if motifs_found:
|
| 690 |
for motif in motifs_found:
|
| 691 |
st.write(motif)
|
| 692 |
else:
|
| 693 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
|
| 695 |
# AI Analysis
|
| 696 |
if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
|
| 697 |
-
st.markdown("### 🤖 AI
|
| 698 |
-
with st.spinner("
|
| 699 |
result = dna_embed(seq, dna_model)
|
|
|
|
| 700 |
if "error" in result:
|
| 701 |
-
st.error(result[
|
| 702 |
else:
|
| 703 |
-
st.success("✅ AI
|
| 704 |
|
| 705 |
-
col1, col2 = st.columns(
|
| 706 |
with col1:
|
| 707 |
-
st.metric("
|
| 708 |
-
st.caption("DNA 특성을 수치화한 결과입니다")
|
| 709 |
-
|
| 710 |
with col2:
|
| 711 |
-
st.
|
| 712 |
-
|
|
|
|
| 713 |
|
| 714 |
st.markdown("""
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
|
|
|
|
|
|
|
| 721 |
""")
|
| 722 |
else:
|
| 723 |
-
st.warning("⚠️ AI
|
| 724 |
|
| 725 |
-
#
|
| 726 |
with tab4:
|
| 727 |
-
st.subheader("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 728 |
st.markdown("""
|
| 729 |
-
### Features
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
-
|
| 733 |
-
-
|
| 734 |
-
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
- **
|
| 739 |
-
- **
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 744 |
""")
|
| 745 |
|
| 746 |
-
#
|
| 747 |
-
st.
|
| 748 |
-
|
| 749 |
-
|
| 750 |
"PyTorch": TORCH_AVAILABLE,
|
| 751 |
"Transformers": TRANSFORMERS_AVAILABLE,
|
| 752 |
"Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
|
| 753 |
"FAISS": FAISS_AVAILABLE,
|
|
|
|
|
|
|
|
|
|
| 754 |
"BioPython": BIOPYTHON_AVAILABLE,
|
| 755 |
"Datasets": DATASETS_AVAILABLE,
|
| 756 |
-
"PDF
|
| 757 |
-
"PDF
|
| 758 |
}
|
| 759 |
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
from typing import List, Dict, Tuple
|
| 4 |
+
import time
|
| 5 |
|
| 6 |
import streamlit as st
|
| 7 |
import requests
|
|
|
|
| 65 |
print("[WARNING] PyPDF2 not available")
|
| 66 |
|
| 67 |
# 상수
|
| 68 |
+
APP_TITLE = "BioSeq Chat Pro: Advanced Collaborative AI System"
|
| 69 |
DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
|
| 70 |
|
| 71 |
# --------------- Helper Functions ---------------
|
|
|
|
| 73 |
def get_secret(name: str, fallback: str = "") -> str:
|
| 74 |
"""Get secret from st.secrets or environment"""
|
| 75 |
try:
|
|
|
|
| 76 |
if hasattr(st, 'secrets') and name in st.secrets:
|
| 77 |
return st.secrets[name]
|
| 78 |
except:
|
| 79 |
pass
|
|
|
|
| 80 |
return os.environ.get(name, fallback)
|
| 81 |
|
| 82 |
def brave_search(query: str, count: int = 5) -> List[Dict]:
|
|
|
|
| 111 |
except Exception as e:
|
| 112 |
return [{"title": "Error", "url": "", "snippet": str(e)}]
|
| 113 |
|
| 114 |
+
def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 8000) -> str:
|
| 115 |
+
"""Call Fireworks AI API with increased token limit"""
|
| 116 |
api_key = get_secret("FIREWORKS_API_KEY", "")
|
| 117 |
if not api_key:
|
| 118 |
return "FIREWORKS_API_KEY missing. Set it in Secrets or sidebar."
|
|
|
|
| 121 |
payload = {
|
| 122 |
"model": "accounts/fireworks/models/llama-v3p1-70b-instruct",
|
| 123 |
"messages": messages,
|
| 124 |
+
"max_tokens": max_tokens, # 8000으로 증가
|
| 125 |
"temperature": temperature,
|
| 126 |
"top_p": 1,
|
| 127 |
"frequency_penalty": 0,
|
|
|
|
| 133 |
}
|
| 134 |
|
| 135 |
try:
|
| 136 |
+
r = requests.post(url, headers=headers, json=payload, timeout=120)
|
| 137 |
r.raise_for_status()
|
| 138 |
return r.json()["choices"][0]["message"]["content"]
|
| 139 |
except Exception as e:
|
| 140 |
return f"[LLM Error] {e}"
|
| 141 |
|
| 142 |
+
def collaborative_answer(query: str, context: str, collaboration_type: str = "full") -> Dict[str, str]:
    """Multi-role collaborative LLM pipeline: investigator -> supervisor -> critic -> integration.

    Args:
        query: The user's question.
        context: Retrieved context (file/web snippets) used to ground the answer.
        collaboration_type: "full" runs the final integration pass; any other
            value returns the supervisor's draft as the final answer.

    Returns:
        Dict with each role's raw output under "investigator", "supervisor",
        "critic", and the polished answer under "final".
    """

    def _ask(system: str, user: str, temperature: float, max_tokens: int) -> str:
        # One role's turn: a single system+user exchange with the LLM.
        # (Extracted helper: the original repeated this two-message
        # build-and-call pattern four times.)
        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ]
        return call_llm(messages, temperature=temperature, max_tokens=max_tokens)

    # 1. Investigator — fact collection and verification (low temperature for precision).
    investigator_prompt = f"""You are an INVESTIGATOR specializing in bioinformatics fact-checking.

Context: {context}
Question: {query}

Your task:
1. Extract and verify all relevant facts from the context
2. Identify any missing information that would improve the answer
3. Flag any potentially conflicting or uncertain information
4. Suggest additional areas for research
5. Provide confidence scores for key facts (0-100%)

Format your response with:
- VERIFIED FACTS: (with confidence scores)
- UNCERTAIN AREAS:
- MISSING INFORMATION:
- RESEARCH SUGGESTIONS:
- KEY CITATIONS:"""

    investigator_response = _ask(
        "You are a meticulous scientific fact-checker and researcher.",
        investigator_prompt,
        temperature=0.2,
        max_tokens=2000,
    )

    # 2. Supervisor — drafts the structured answer from the verified facts.
    supervisor_prompt = f"""You are a SUPERVISOR creating a comprehensive answer.

Question: {query}
Context: {context}
Investigator's Analysis:
{investigator_response}

Your task:
1. Create a well-structured, scientifically accurate answer
2. Include:
   - Executive Summary (2-3 sentences)
   - Background & Context
   - Detailed Explanation with subsections
   - Practical Applications
   - Current Research Status
   - Future Perspectives
3. Use clear headings and logical flow
4. Integrate verified facts from the investigator
5. Aim for 500-1000 words minimum
6. Include relevant examples and analogies

Format with clear markdown headers and bullet points where appropriate."""

    supervisor_response = _ask(
        "You are an expert bioinformatics educator who creates comprehensive, well-structured scientific explanations.",
        supervisor_prompt,
        temperature=0.4,
        max_tokens=3500,
    )

    # 3. Critic — peer-review pass over the draft for accuracy and clarity.
    critic_prompt = f"""You are a CRITIC reviewing the following answer for scientific accuracy.

Original Question: {query}
Supervisor's Answer:
{supervisor_response}

Investigator's Facts:
{investigator_response}

Your task:
1. Check for scientific accuracy and completeness
2. Identify any errors, omissions, or unclear explanations
3. Verify that all claims are properly supported
4. Assess the answer's clarity and accessibility
5. Suggest specific improvements
6. Provide a quality score (0-100)

Format your critique:
- ACCURACY ASSESSMENT:
- COMPLETENESS CHECK:
- CLARITY EVALUATION:
- ERRORS/ISSUES FOUND:
- IMPROVEMENT SUGGESTIONS:
- QUALITY SCORE: X/100"""

    critic_response = _ask(
        "You are a rigorous scientific peer reviewer specializing in bioinformatics.",
        critic_prompt,
        temperature=0.3,
        max_tokens=1500,
    )

    # 4. Final integration — only in "full" mode; otherwise the supervisor's
    # draft stands as the final answer.
    if collaboration_type == "full":
        integration_prompt = f"""Create the FINAL INTEGRATED ANSWER incorporating all feedback.

Question: {query}
Supervisor's Answer: {supervisor_response}
Critic's Feedback: {critic_response}
Verified Facts: {investigator_response}

Create a polished, final answer that:
1. Addresses all critic's concerns
2. Maintains scientific rigor
3. Includes proper citations
4. Uses clear structure with markdown formatting
5. Provides comprehensive coverage (800-1500 words)
6. Includes a TL;DR section at the beginning
7. Ends with key takeaways and further reading suggestions

Use Korean if the question is in Korean, otherwise English."""

        final_answer = _ask(
            "You are a master science communicator creating the definitive answer by integrating all expert inputs.",
            integration_prompt,
            temperature=0.35,
            max_tokens=8000,
        )
    else:
        final_answer = supervisor_response

    return {
        "investigator": investigator_response,
        "supervisor": supervisor_response,
        "critic": critic_response,
        "final": final_answer
    }
|
| 281 |
+
|
| 282 |
def load_file_text(upload) -> str:
|
| 283 |
"""Load text from uploaded file (PDF 지원 포함)"""
|
| 284 |
name = upload.name.lower()
|
|
|
|
| 333 |
|
| 334 |
return text
|
| 335 |
|
| 336 |
+
def chunk_text(text: str, size: int = 1500, overlap: int = 300) -> List[str]:
|
| 337 |
+
"""Split text into chunks with larger size for better context"""
|
| 338 |
chunks = []
|
| 339 |
start = 0
|
| 340 |
text_len = len(text)
|
|
|
|
| 349 |
return chunks
|
| 350 |
|
| 351 |
def build_index(texts: List[str]):
|
| 352 |
+
"""Build vector index with better model"""
|
| 353 |
if not SENTENCE_TRANSFORMERS_AVAILABLE or not FAISS_AVAILABLE:
|
| 354 |
return None, None
|
| 355 |
|
| 356 |
try:
|
| 357 |
+
# 더 나은 임베딩 모델 사용
|
| 358 |
+
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
|
| 359 |
embeddings = model.encode(texts, show_progress_bar=False)
|
| 360 |
|
| 361 |
dim = embeddings.shape[1]
|
|
|
|
| 367 |
st.warning(f"Index build failed: {e}")
|
| 368 |
return None, None
|
| 369 |
|
| 370 |
+
def search_index(query: str, index, model, texts: List[str], k: int = 5) -> List[Dict]:
|
| 371 |
+
"""Search vector index with more results"""
|
| 372 |
if index is None or model is None:
|
| 373 |
return []
|
| 374 |
|
|
|
|
| 387 |
except:
|
| 388 |
return []
|
| 389 |
|
| 390 |
+
def build_context(query: str, docs: List[str], index, model, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
    """Assemble a grounding context string plus source metadata.

    Combines top vector-index hits over the uploaded documents (when an index
    exists) with web search results (when enabled), and caps the joined text.

    Args:
        query: User question used for both searches.
        docs: Chunked document texts backing the index.
        index: FAISS index (or None when unavailable).
        model: Sentence-transformer model used to embed the query.
        use_web: Whether to include Brave web-search results.
        web_k: Number of web results to request.

    Returns:
        (context, sources): the joined context text and a list of per-source
        metadata dicts ("file" or "web" entries).
    """
    pieces: List[str] = []
    sources: List[Dict] = []

    # Local document hits — only when an index was actually built.
    if index and model and docs:
        for hit in search_index(query, index, model, docs, k=6):
            pieces.append(f"[FILE SOURCE] {hit['text'][:800]}")
            sources.append({"type": "file", "text": hit['text'][:150], "score": hit['score']})

    # Web hits, with the query biased toward scientific sources.
    if use_web:
        scientific_query = f"{query} scientific research pubmed nature science"
        for result in brave_search(scientific_query, count=web_k):
            pieces.append(f"[WEB SOURCE] {result['title']}\n{result['snippet']}")
            sources.append({"type": "web", "title": result['title'], "url": result['url']})

    # Cap the combined context so the prompt stays within budget.
    context = "\n\n---\n\n".join(pieces)[:6000]
    return context, sources
|
| 413 |
+
|
| 414 |
+
# Enhanced analysis functions
|
| 415 |
def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
|
| 416 |
+
"""Enhanced ESM-2 protein embedding with more analysis"""
|
| 417 |
if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
|
| 418 |
return {"error": "PyTorch/Transformers not available"}
|
| 419 |
|
|
|
|
| 427 |
outputs = model(**inputs, output_hidden_states=True)
|
| 428 |
hidden = outputs.hidden_states[-1].mean(dim=1).squeeze(0)
|
| 429 |
vec = hidden.cpu().numpy()
|
| 430 |
+
|
| 431 |
+
# 추가 분석
|
| 432 |
+
attention_weights = outputs.hidden_states[-1].std(dim=1).squeeze(0).cpu().numpy()
|
| 433 |
|
| 434 |
# 메모리 정리
|
| 435 |
del model
|
|
|
|
| 438 |
torch.cuda.empty_cache()
|
| 439 |
|
| 440 |
return {
|
| 441 |
+
"embedding": vec.tolist()[:10],
|
| 442 |
+
"size": vec.shape[0],
|
| 443 |
+
"mean": float(vec.mean()),
|
| 444 |
+
"std": float(vec.std()),
|
| 445 |
+
"attention_peaks": attention_weights.tolist()[:10]
|
| 446 |
}
|
| 447 |
except Exception as e:
|
| 448 |
return {"error": str(e)}
|
| 449 |
|
| 450 |
def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
|
| 451 |
+
"""Enhanced DNA embedding with k-mer analysis"""
|
| 452 |
if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
|
| 453 |
return {"error": "PyTorch/Transformers not available"}
|
| 454 |
|
|
|
|
| 459 |
except ImportError:
|
| 460 |
return {"error": "einops package required. Please wait for installation and refresh the page."}
|
| 461 |
|
| 462 |
+
# k-mer 변환 함수
|
| 463 |
+
def seq_to_kmer(seq, k=6):
|
| 464 |
+
kmers = []
|
| 465 |
+
for i in range(len(seq) - k + 1):
|
| 466 |
+
kmers.append(seq[i:i+k])
|
| 467 |
+
return ' '.join(kmers)
|
| 468 |
+
|
| 469 |
+
# 모델 로딩 시도
|
| 470 |
try:
|
| 471 |
from transformers import AutoTokenizer, AutoModel
|
| 472 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
|
|
|
| 475 |
# 대체 모델 사용
|
| 476 |
try:
|
| 477 |
from transformers import BertTokenizer, BertModel
|
|
|
|
| 478 |
fallback_model = "bert-base-uncased"
|
| 479 |
tokenizer = BertTokenizer.from_pretrained(fallback_model)
|
| 480 |
model = BertModel.from_pretrained(fallback_model)
|
|
|
|
| 484 |
|
| 485 |
model.eval()
|
| 486 |
|
| 487 |
+
# k-mer 변환
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
if len(seq) > 6:
|
| 489 |
input_seq = seq_to_kmer(seq, k=6)
|
| 490 |
+
kmer_count = len(seq) - 5
|
| 491 |
else:
|
| 492 |
input_seq = seq
|
| 493 |
+
kmer_count = 1
|
| 494 |
|
| 495 |
with torch.no_grad():
|
| 496 |
inputs = tokenizer(
|
|
|
|
| 502 |
)
|
| 503 |
outputs = model(**inputs)
|
| 504 |
|
|
|
|
| 505 |
if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
|
| 506 |
vec = outputs.pooler_output.squeeze(0).cpu().numpy()
|
| 507 |
else:
|
|
|
|
| 515 |
torch.cuda.empty_cache()
|
| 516 |
|
| 517 |
return {
|
| 518 |
+
"embedding": vec.tolist()[:10],
|
| 519 |
+
"size": vec.shape[0],
|
| 520 |
+
"kmer_count": kmer_count,
|
| 521 |
+
"mean": float(vec.mean()),
|
| 522 |
+
"std": float(vec.std())
|
| 523 |
}
|
| 524 |
|
| 525 |
except Exception as e:
|
| 526 |
return {"error": f"분석 중 오류 발생: {str(e)[:200]}"}
|
| 527 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
# --------------- Streamlit UI ---------------
|
| 529 |
|
| 530 |
st.set_page_config(page_title=APP_TITLE, page_icon="🧬", layout="wide")
|
|
|
|
| 538 |
st.session_state.index = None
|
| 539 |
if "model" not in st.session_state:
|
| 540 |
st.session_state.model = None
|
| 541 |
+
if "chat_history" not in st.session_state:
|
| 542 |
+
st.session_state.chat_history = []
|
| 543 |
|
| 544 |
# Sidebar
|
| 545 |
with st.sidebar:
|
| 546 |
+
st.header("⚙️ Configuration")
|
| 547 |
|
| 548 |
fw_key = st.text_input(
|
| 549 |
"FIREWORKS_API_KEY",
|
| 550 |
value=get_secret("FIREWORKS_API_KEY", ""),
|
| 551 |
+
type="password",
|
| 552 |
+
help="Required for AI responses"
|
| 553 |
)
|
| 554 |
brave_key = st.text_input(
|
| 555 |
"BRAVE_API_KEY",
|
| 556 |
value=get_secret("BRAVE_API_KEY", ""),
|
| 557 |
+
type="password",
|
| 558 |
+
help="Required for web search"
|
| 559 |
)
|
| 560 |
|
| 561 |
if fw_key:
|
|
|
|
| 565 |
|
| 566 |
st.divider()
|
| 567 |
|
| 568 |
+
st.subheader("🤖 AI Models")
|
| 569 |
esm_model = st.text_input(
|
| 570 |
"ESM-2 Model",
|
| 571 |
+
value="facebook/esm2_t6_8M_UR50D",
|
| 572 |
+
help="Protein analysis model"
|
| 573 |
)
|
| 574 |
dna_model = st.text_input(
|
| 575 |
"DNA Model",
|
| 576 |
+
value="bert-base-uncased",
|
| 577 |
+
help="DNA analysis model"
|
| 578 |
)
|
| 579 |
|
| 580 |
+
st.divider()
|
| 581 |
+
|
| 582 |
+
st.subheader("🔍 Search Settings")
|
| 583 |
use_web = st.checkbox("Enable web search", value=True)
|
| 584 |
+
web_results = st.slider("Web results", 1, 10, 5)
|
| 585 |
+
|
| 586 |
+
st.divider()
|
| 587 |
+
|
| 588 |
+
st.subheader("🎭 Collaboration Mode")
|
| 589 |
+
collab_mode = st.radio(
|
| 590 |
+
"AI Collaboration Type",
|
| 591 |
+
["full", "quick", "deep"],
|
| 592 |
+
index=0,
|
| 593 |
+
help="Full: Complete collaboration\nQuick: Fast response\nDeep: In-depth analysis"
|
| 594 |
+
)
|
| 595 |
|
| 596 |
# Tabs
tab1, tab2, tab3, tab4, tab5 = st.tabs(["💬 Chat", "🧬 Protein", "🧬 DNA", "📊 Analysis", "ℹ️ About"])

# File upload: chunk every readable upload and (when deps allow) build a semantic index.
with st.expander("📁 Upload Files", expanded=True):
    files = st.file_uploader(
        "Upload text/FASTA/PDF files",
        type=["txt", "fa", "fasta", "csv", "json", "pdf"],
        accept_multiple_files=True,
        help="Support for multiple file types including PDF"
    )

    if files:
        docs = []
        for f in files:
            try:
                # PDFs need an extractor library; skip early when neither is installed.
                if f.name.lower().endswith(".pdf"):
                    if not (PDFPLUMBER_AVAILABLE or PYPDF2_AVAILABLE):
                        # FIX: was an f-string with no placeholders.
                        st.warning("⚠️ PDF support requires: pip install pdfplumber")
                        continue

                text = load_file_text(f)
                if text:
                    docs.extend(chunk_text(text))
                    st.success(f"✅ {f.name} loaded ({len(text)} chars)")
            except Exception as e:
                # Best-effort batch load: one unreadable file must not abort the rest.
                st.error(f"Error reading {f.name}: {e}")

        if docs:
            st.session_state.docs = docs
            st.info(f"📚 Total chunks created: {len(docs)}")

            # The FAISS index is optional — requires both embedding and index deps.
            if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
                with st.spinner("Building semantic index..."):
                    index, model = build_index(docs)
                    if index:
                        st.session_state.index = index
                        st.session_state.model = model
                        st.success("✅ Index built successfully")
| 635 |
|
| 636 |
+
# Chat tab with collaborative AI
|
| 637 |
with tab1:
|
| 638 |
+
st.subheader("💬 Advanced Collaborative Chat")
|
| 639 |
+
|
| 640 |
+
# 협업 시스템 설명
|
| 641 |
+
with st.expander("🎭 How Collaborative AI Works", expanded=False):
|
| 642 |
+
st.markdown("""
|
| 643 |
+
### Three AI Experts Work Together:
|
| 644 |
+
|
| 645 |
+
1. **🔍 Investigator**: Fact-checks and verifies information
|
| 646 |
+
2. **📝 Supervisor**: Creates structured, comprehensive answers
|
| 647 |
+
3. **✅ Critic**: Reviews for accuracy and clarity
|
| 648 |
+
4. **🎯 Integrator**: Combines all inputs for the final answer
|
| 649 |
+
|
| 650 |
+
This system ensures maximum accuracy and comprehensiveness.
|
| 651 |
+
""")
|
| 652 |
|
| 653 |
question = st.text_area(
|
| 654 |
+
"Ask about proteins, DNA, or any bioinformatics topic:",
|
| 655 |
+
value="Explain how AlphaFold revolutionized protein structure prediction and its impact on drug discovery.",
|
| 656 |
height=100
|
| 657 |
)
|
| 658 |
|
| 659 |
+
col1, col2 = st.columns([3, 1])
|
| 660 |
+
with col1:
|
| 661 |
+
answer_button = st.button("🚀 Get Collaborative Answer", type="primary", use_container_width=True)
|
| 662 |
+
with col2:
|
| 663 |
+
show_process = st.checkbox("Show process", value=False, help="Display each AI's contribution")
|
| 664 |
+
|
| 665 |
+
if answer_button:
|
| 666 |
if not get_secret("FIREWORKS_API_KEY"):
|
| 667 |
+
st.error("⚠️ Please set FIREWORKS_API_KEY")
|
| 668 |
else:
|
| 669 |
+
# Progress tracking
|
| 670 |
+
progress_bar = st.progress(0)
|
| 671 |
+
status_text = st.empty()
|
| 672 |
+
|
| 673 |
+
with st.spinner("🔍 Building knowledge base..."):
|
| 674 |
+
status_text.text("Searching sources...")
|
| 675 |
+
progress_bar.progress(10)
|
| 676 |
+
|
| 677 |
context, sources = build_context(
|
| 678 |
question,
|
| 679 |
st.session_state.docs,
|
|
|
|
| 683 |
web_results
|
| 684 |
)
|
| 685 |
|
| 686 |
+
progress_bar.progress(20)
|
| 687 |
+
status_text.text("Collaborative AI system working...")
|
| 688 |
+
|
| 689 |
+
# Get collaborative answer
|
| 690 |
+
start_time = time.time()
|
| 691 |
+
collaborative_result = collaborative_answer(
|
| 692 |
+
question,
|
| 693 |
+
context,
|
| 694 |
+
collaboration_type=collab_mode
|
| 695 |
+
)
|
| 696 |
+
elapsed_time = time.time() - start_time
|
| 697 |
+
|
| 698 |
+
progress_bar.progress(100)
|
| 699 |
+
status_text.text(f"✅ Completed in {elapsed_time:.1f} seconds")
|
| 700 |
+
|
| 701 |
+
# Display results
|
| 702 |
+
if show_process:
|
| 703 |
+
# Show each AI's contribution
|
| 704 |
+
with st.expander("🔍 Investigator's Analysis", expanded=False):
|
| 705 |
+
st.markdown(collaborative_result["investigator"])
|
| 706 |
|
| 707 |
+
with st.expander("📝 Supervisor's Draft", expanded=False):
|
| 708 |
+
st.markdown(collaborative_result["supervisor"])
|
| 709 |
|
| 710 |
+
with st.expander("✅ Critic's Review", expanded=False):
|
| 711 |
+
st.markdown(collaborative_result["critic"])
|
| 712 |
+
|
| 713 |
+
# Final answer
|
| 714 |
+
st.markdown("### 🎯 Final Integrated Answer")
|
| 715 |
+
st.markdown(collaborative_result["final"])
|
| 716 |
+
|
| 717 |
+
# Sources
|
| 718 |
+
if sources:
|
| 719 |
+
with st.expander("📚 Sources & References", expanded=False):
|
| 720 |
for s in sources:
|
| 721 |
if s["type"] == "web":
|
| 722 |
st.write(f"- 🌐 [{s['title']}]({s['url']})")
|
| 723 |
elif s["type"] == "file":
|
| 724 |
+
st.write(f"- 📄 File: {s['text'][:100]}... (Score: {s.get('score', 0):.2f})")
|
| 725 |
+
|
| 726 |
+
# Save to history
|
| 727 |
+
st.session_state.chat_history.append({
|
| 728 |
+
"question": question,
|
| 729 |
+
"answer": collaborative_result["final"],
|
| 730 |
+
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
|
| 731 |
+
"mode": collab_mode
|
| 732 |
+
})
|
| 733 |
+
|
| 734 |
+
# Feedback
|
| 735 |
+
col1, col2, col3 = st.columns(3)
|
| 736 |
+
with col1:
|
| 737 |
+
if st.button("👍 Helpful"):
|
| 738 |
+
st.success("Thank you for your feedback!")
|
| 739 |
+
with col2:
|
| 740 |
+
if st.button("👎 Not helpful"):
|
| 741 |
+
st.info("We'll work on improving our responses.")
|
| 742 |
+
with col3:
|
| 743 |
+
if st.button("💾 Save Answer"):
|
| 744 |
+
st.download_button(
|
| 745 |
+
label="Download",
|
| 746 |
+
data=collaborative_result["final"],
|
| 747 |
+
file_name=f"bioseq_answer_{time.strftime('%Y%m%d_%H%M%S')}.md",
|
| 748 |
+
mime="text/markdown"
|
| 749 |
+
)
|
| 750 |
|
| 751 |
+
# Enhanced Protein tab
with tab2:
    st.subheader("🧬 Advanced Protein Analysis")

    with st.expander("📚 Learn About Protein Analysis", expanded=False):
        st.markdown("""
### What is Protein Sequence Analysis?

**Proteins** are the workhorses of cells, performing nearly every function necessary for life:
- 🧪 **Enzymes**: Catalyze chemical reactions
- 🛡️ **Antibodies**: Defend against pathogens
- 🚚 **Transporters**: Move molecules across membranes
- 📡 **Receptors**: Receive and transmit signals

**ESM-2** (Evolutionary Scale Modeling) is Meta's breakthrough AI that:
- Trained on 65 million protein sequences
- Predicts structure and function from sequence alone
- Enables drug discovery and protein engineering
""")

    protein_seq = st.text_area(
        "Enter protein sequence (single letter amino acid code):",
        value="MKTIIALSYIFCLVFA",
        help="Standard amino acids: A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y",
        height=100,
    )

    # One-click example peptides; pressing a button displays the sequence to copy.
    st.markdown("**🧪 Example Sequences (Click to copy):**")
    protein_examples = [
        ("💉 Insulin", "ins", "FVNQHLCGSHLVEALYLVCGERGFFYTPKT"),
        ("😊 Endorphin", "end", "YGGFMTSEKSQTPLVTLFKNAIIKNAYKKGE"),
        ("❤️ Oxytocin", "oxy", "CYIQNCPLG"),
        ("🦠 Lysozyme", "lys", "KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNR"),
    ]
    for column, (label, btn_key, example) in zip(st.columns(4), protein_examples):
        with column:
            if st.button(label, key=btn_key):
                st.code(example, language=None)

    if st.button("🔬 Analyze Protein", type="primary", use_container_width=True):
        seq = protein_seq.strip().upper()

        # Keep only the 20 standard residues; warn about anything else.
        valid_aa = set("ACDEFGHIKLMNPQRSTVWY")
        invalid = set(seq) - valid_aa
        if invalid:
            st.warning(f"⚠️ Invalid amino acids detected: {', '.join(invalid)}")
            seq = ''.join(aa for aa in seq if aa in valid_aa)

        if len(seq) < 3:
            st.error("Sequence too short. Please enter at least 3 amino acids.")
        else:
            st.markdown("### 📊 Sequence Statistics")
            stat_cols = st.columns(4)

            with stat_cols[0]:
                st.metric("Length", f"{len(seq)} aa")
                # ~110 Da average residue mass — a rough estimate only.
                st.metric("Mol. Weight", f"~{len(seq) * 110:.1f} Da")

            with stat_cols[1]:
                unique_aa = len(set(seq))
                st.metric("Unique AA", f"{unique_aa}/20")
                charged = sum(seq.count(aa) for aa in "DEKR")
                st.metric("Charged", f"{charged/len(seq)*100:.1f}%")

            with stat_cols[2]:
                hydrophobic = sum(seq.count(aa) for aa in "AVILMFYW")
                st.metric("Hydrophobic", f"{hydrophobic/len(seq)*100:.1f}%")
                aromatic = sum(seq.count(aa) for aa in "FWY")
                st.metric("Aromatic", f"{aromatic/len(seq)*100:.1f}%")

            with stat_cols[3]:
                basic = sum(seq.count(aa) for aa in "KRH")
                acidic = sum(seq.count(aa) for aa in "DE")
                # Very crude isoelectric-point heuristic from net charge balance.
                pi_estimate = 7 + (basic - acidic) * 0.5
                st.metric("pI (est.)", f"~{pi_estimate:.1f}")
                st.metric("Basic/Acidic", f"{basic}/{acidic}")

            # Composition-based heuristics — not a real structure prediction.
            st.markdown("### 🔮 Predicted Properties")
            left, right = st.columns(2)

            with left:
                helix_score = sum(seq.count(aa) for aa in "AELMQKRH") / len(seq)
                st.metric("α-Helix Propensity", f"{helix_score*100:.1f}%")
                beta_score = sum(seq.count(aa) for aa in "FIVWY") / len(seq)
                st.metric("β-Sheet Propensity", f"{beta_score*100:.1f}%")

            with right:
                disorder_score = sum(seq.count(aa) for aa in "PESKTQ") / len(seq)
                st.metric("Disorder Tendency", f"{disorder_score*100:.1f}%")
                soluble_score = 100 - (hydrophobic/len(seq)*100)
                st.metric("Solubility Score", f"{soluble_score:.1f}%")

            # Deep-learning embedding via ESM-2 (loads the model on demand — slow).
            if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
                st.markdown("### 🤖 AI-Powered Analysis")
                with st.spinner("Running ESM-2 analysis... This may take 10-30 seconds"):
                    result = esm2_embed(seq, esm_model)

                if "error" in result:
                    st.error(f"Analysis failed: {result['error']}")
                else:
                    st.success("✅ AI analysis complete!")

                    dim_col, mean_col, std_col = st.columns(3)
                    with dim_col:
                        st.metric("Embedding Dimension", result['size'])
                    with mean_col:
                        st.metric("Mean Value", f"{result.get('mean', 0):.3f}")
                    with std_col:
                        st.metric("Std Dev", f"{result.get('std', 0):.3f}")

                    st.markdown("**🎨 Embedding Visualization:**")
                    st.info("The protein has been encoded into a high-dimensional space where similar proteins cluster together.")

                    st.markdown("""
### 🎯 Applications of This Analysis:

1. **🔍 Similar Protein Search**: Find proteins with similar functions
2. **💊 Drug Target Identification**: Predict binding sites and interactions
3. **🧬 Mutation Impact**: Assess how changes affect protein function
4. **🏗️ Structure Prediction**: Input for AlphaFold-like systems
5. **⚗️ Protein Engineering**: Design improved variants
""")
            else:
                st.warning("⚠️ AI models are loading. Please refresh in a moment.")
| 894 |
|
| 895 |
+
# Enhanced DNA tab
with tab3:
    st.subheader("🧬 Advanced DNA Analysis")

    with st.expander("📚 Learn About DNA Analysis", expanded=False):
        st.markdown("""
### Understanding DNA Sequences

**DNA** is the blueprint of life, encoding all genetic information in four bases:
- **A** (Adenine): Pairs with T
- **T** (Thymine): Pairs with A
- **G** (Guanine): Pairs with C
- **C** (Cytosine): Pairs with G

**Key Concepts:**
- **Gene**: A DNA segment that codes for a protein
- **Promoter**: Controls when genes are turned on/off
- **Codon**: Three bases that code for one amino acid
- **GC Content**: Affects stability and gene expression

**DNABERT-2** is an AI model that understands DNA "language" to predict:
- Gene function
- Regulatory elements
- Disease-causing mutations
- Evolution patterns
""")

    dna_seq = st.text_area(
        "Enter DNA sequence:",
        value="ATGCGATCGTAGC",
        help="Use A, T, G, C for DNA (U will be converted to T for RNA)",
        height=100
    )

    # Example sequences
    st.markdown("**🧪 Example Sequences (Click to analyze):**")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        if st.button("📋 TATA Box", key="tata"):
            st.code("TATAAAAGCGCGCGCG", language=None)
            st.caption("Gene start signal")
    with col2:
        if st.button("🎯 Promoter", key="prom"):
            st.code("TTGACAGGCTAGCTCAGTCCTAGGTATAATGCTAGC", language=None)
            st.caption("Gene control region")
    with col3:
        if st.button("✂️ CRISPR", key="crispr"):
            st.code("GTCACCTCCAATGACTAGGGTGG", language=None)
            st.caption("Gene editing target")
    with col4:
        if st.button("🧬 Telomere", key="telo"):
            st.code("TTAGGGTTAGGGTTAGGG", language=None)
            st.caption("Chromosome end")

    if st.button("🔬 Analyze DNA", type="primary", use_container_width=True):
        # Normalise: uppercase, RNA→DNA (U→T), then strip any non-ATGC characters.
        seq = dna_seq.strip().upper().replace("U", "T")
        seq = ''.join(c for c in seq if c in 'ATGC')

        if len(seq) < 3:
            st.error("Sequence too short. Please enter at least 3 bases.")
        else:
            st.markdown("### 📊 Sequence Analysis")

            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("Length", f"{len(seq)} bp")
                # ~660 Da per base pair (double-stranded estimate).
                st.metric("Size", f"~{len(seq)*660:.0f} Da")

            with col2:
                gc = (seq.count("G") + seq.count("C")) / len(seq) * 100
                st.metric("GC Content", f"{gc:.1f}%")
                # FIX: test the extreme band first — previously `gc < 35`
                # shadowed `gc < 25`, making "Very low" unreachable.
                if gc > 65:
                    st.caption("🔴 Very high")
                elif gc > 55:
                    st.caption("🟠 High")
                elif gc < 25:
                    st.caption("🟣 Very low")
                elif gc < 35:
                    st.caption("🔵 Low")
                else:
                    st.caption("🟢 Normal")

            with col3:
                at = 100 - gc
                st.metric("AT Content", f"{at:.1f}%")
                # Wallace rule: Tm = 4(G+C) + 2(A+T); only valid for short oligos.
                tm = 4 * (seq.count("G") + seq.count("C")) + 2 * (seq.count("A") + seq.count("T"))
                st.metric("Tm (est.)", f"{tm}°C")

            with col4:
                cpg = seq.count("CG")
                # Observed/expected CpG ratio: obs * N / (#C * #G); 0 when undefined.
                cpg_ratio = (cpg * len(seq)) / (seq.count("C") * seq.count("G")) if seq.count("C") * seq.count("G") > 0 else 0
                st.metric("CpG Sites", cpg)
                st.metric("CpG O/E", f"{cpg_ratio:.2f}")

            # Motif search
            st.markdown("### 🔍 Regulatory Elements & Motifs")

            # Extended motif database (IUPAC wildcards: R=A/G, W=A/T, N=any base).
            motif_db = {
                "TATA Box": ["TATAAA", "TATAWAW"],
                "CAAT Box": ["CAAT", "CCAAT", "GGCCAATCT"],
                "GC Box": ["GGGCGG", "GGCGGG"],
                "Start Codon": ["ATG"],
                "Stop Codons": ["TAA", "TAG", "TGA"],
                "Kozak Sequence": ["GCCRCCATGG"],
                "Poly-A Signal": ["AATAAA", "ATTAAA"],
                "E-box": ["CANNTG"],
                "CRE": ["TGACGTCA"],
                "NF-κB": ["GGGACTTTCC"]
            }

            # FIX: `import re` hoisted out of the inner loop; unused
            # `motif_positions` accumulator removed.
            import re
            wildcards = {"R": "[AG]", "W": "[AT]", "N": "[ATGC]"}

            motifs_found = []
            for motif_name, patterns in motif_db.items():
                for pattern in patterns:
                    regex = "".join(wildcards.get(base, base) for base in pattern)
                    if re.search(regex, seq):
                        motifs_found.append(f"✅ {motif_name}: {pattern}")
                        break  # one hit per motif family is enough

            if motifs_found:
                for motif in motifs_found:
                    st.write(motif)
            else:
                st.info("No known regulatory motifs detected")

            # Codon analysis
            if len(seq) >= 3:
                st.markdown("### 🧬 Coding Potential Analysis")

                col1, col2 = st.columns(2)

                with col1:
                    st.markdown("**Open Reading Frames:**")
                    for frame in range(3):
                        # FIX: only accept ATG codons aligned to this reading frame;
                        # previously any "ATG" substring matched regardless of frame.
                        start_pos = next(
                            (p for p in range(frame, len(seq) - 2, 3) if seq[p:p + 3] == "ATG"),
                            None,
                        )
                        if start_pos is not None:
                            st.write(f"Frame {frame+1}: Start at position {start_pos+1}")

                with col2:
                    if len(seq) % 3 == 0:
                        st.markdown("**Codon Statistics:**")
                        codon_count = len(seq) // 3
                        st.metric("Total Codons", codon_count)

                        # FIX: count in-frame stop codons; the old substring count
                        # included out-of-frame matches in a per-codon statistic.
                        stops = sum(
                            1 for p in range(0, len(seq) - 2, 3)
                            if seq[p:p + 3] in ("TAA", "TAG", "TGA")
                        )
                        st.metric("Stop Codons", stops)

            # AI Analysis
            if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
                st.markdown("### 🤖 AI-Powered Genomic Analysis")
                with st.spinner("Running DNABERT analysis... This may take 10-30 seconds"):
                    result = dna_embed(seq, dna_model)

                if "error" in result:
                    st.error(f"Analysis failed: {result['error']}")
                else:
                    st.success("✅ AI analysis complete!")

                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Embedding Dimension", result['size'])
                    with col2:
                        st.metric("k-mer Count", result.get('kmer_count', 'N/A'))
                    with col3:
                        st.metric("Mean Value", f"{result.get('mean', 0):.3f}")

                    st.markdown("""
### 🎯 Applications of DNA Analysis:

1. **🔬 Gene Discovery**: Identify coding and regulatory regions
2. **🏥 Disease Diagnosis**: Detect pathogenic mutations
3. **✂️ CRISPR Design**: Find optimal gene editing sites
4. **🌱 Evolution Studies**: Compare sequences across species
5. **💊 Personalized Medicine**: Tailor treatments to genetic profiles
6. **🦠 Pathogen Detection**: Identify viral/bacterial DNA
""")
            else:
                st.warning("⚠️ AI models are loading. Please refresh in a moment.")
|
| 1083 |
|
| 1084 |
+
# Analysis History tab
with tab4:
    st.subheader("📊 Analysis History & Insights")

    history = st.session_state.chat_history
    if history:
        st.markdown(f"### 💾 Previous Analyses ({len(history)} total)")

        # Show the five most recent entries, newest first.
        for i, entry in enumerate(reversed(history[-5:])):
            header = f"🕐 {entry['timestamp']} - Mode: {entry['mode']}"
            with st.expander(header, expanded=False):
                st.markdown("**Question:**")
                st.write(entry['question'])
                st.markdown("**Answer:**")
                answer = entry['answer']
                # Truncate long answers in the preview; full text behind the button.
                preview = answer[:500] + "..." if len(answer) > 500 else answer
                st.write(preview)

                if st.button("View Full", key=f"view_{i}"):
                    st.markdown(answer)
    else:
        st.info("No analysis history yet. Start by asking a question in the Chat tab!")

    # Export options
    if history:
        st.markdown("### 📤 Export Options")
        export_col, clear_col = st.columns(2)

        with export_col:
            if st.button("Export as Markdown"):
                sections = [
                    f"## {entry['timestamp']}\n\n**Q:** {entry['question']}\n\n**A:** {entry['answer']}"
                    for entry in history
                ]
                st.download_button(
                    "Download MD",
                    "\n\n---\n\n".join(sections),
                    f"bioseq_history_{time.strftime('%Y%m%d')}.md",
                    "text/markdown",
                )

        with clear_col:
            if st.button("Clear History"):
                st.session_state.chat_history = []
                st.rerun()
|
| 1125 |
+
|
| 1126 |
+
# Enhanced About tab: static feature overview plus a live dependency/usage report.
with tab5:
    st.subheader("ℹ️ About BioSeq Chat Pro")

    st.markdown("""
### 🚀 Enhanced Features

#### **Collaborative AI System**
- 🔍 **Investigator**: Verifies facts and identifies knowledge gaps
- 📝 **Supervisor**: Creates comprehensive, structured answers
- ✅ **Critic**: Reviews for accuracy and clarity
- 🎯 **Integrator**: Synthesizes all inputs into final answer

#### **Technical Improvements**
- **8000 token responses** for comprehensive answers
- **Enhanced context building** with semantic search
- **Multiple collaboration modes** (Full, Quick, Deep)
- **Scientific source prioritization** in web search
- **Larger embedding models** for better accuracy

### 🧬 Supported Analyses
- **Protein Analysis**: ESM-2 embeddings, property prediction
- **DNA Analysis**: DNABERT-2/BERT embeddings, motif search
- **RAG Chat**: Context-aware Q&A with file integration
- **PDF Support**: Direct analysis of research papers

### 📚 Models & Technologies
- **LLM**: Llama 3.1 70B (via Fireworks AI)
- **Protein**: ESM-2 (Meta/Facebook)
- **DNA**: DNABERT-2 (Microsoft) / BERT (Google)
- **Embeddings**: all-mpnet-base-v2 (Sentence Transformers)
- **Vector Search**: FAISS (Facebook)

### ⚠️ Disclaimer
This tool is designed for **research and educational purposes only**.
- Not intended for medical diagnosis or treatment
- Not validated for clinical use
- Always consult qualified professionals for medical decisions

### 🔧 System Status
""")

    # System status with better formatting.
    # Availability flags are module-level booleans set by the import guards at file top.
    col1, col2 = st.columns(2)

    # Components the core features cannot run without.
    deps_essential = {
        "PyTorch": TORCH_AVAILABLE,
        "Transformers": TRANSFORMERS_AVAILABLE,
        "Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
        "FAISS": FAISS_AVAILABLE,
    }

    # Nice-to-have components; the app degrades gracefully without them.
    deps_optional = {
        "BioPython": BIOPYTHON_AVAILABLE,
        "Datasets": DATASETS_AVAILABLE,
        "PDF (pdfplumber)": PDFPLUMBER_AVAILABLE,
        "PDF (PyPDF2)": PYPDF2_AVAILABLE
    }

    with col1:
        st.markdown("**Essential Components:**")
        for name, available in deps_essential.items():
            if available:
                st.success(f"✅ {name}")
            else:
                # Missing essentials are rendered as errors (red).
                st.error(f"❌ {name}")

    with col2:
        st.markdown("**Optional Components:**")
        for name, available in deps_optional.items():
            if available:
                st.success(f"✅ {name}")
            else:
                # Missing optionals are only warnings (yellow).
                st.warning(f"⚠️ {name}")

    # Performance metrics — shown only once at least one chat query exists.
    if st.session_state.chat_history:
        st.markdown("### 📈 Usage Statistics")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Queries", len(st.session_state.chat_history))
        with col2:
            # Most frequently used collaboration mode across this session.
            modes = [h['mode'] for h in st.session_state.chat_history]
            most_used = max(set(modes), key=modes.count) if modes else "N/A"
            st.metric("Most Used Mode", most_used)
        with col3:
            avg_length = sum(len(h['answer']) for h in st.session_state.chat_history) / len(st.session_state.chat_history)
            st.metric("Avg Answer Length", f"{avg_length:.0f} chars")

    st.markdown("""
---
### 📞 Support & Feedback
- Report issues or suggest features
- Contribute to development
- Share your research results

**Version**: 2.0.0 Pro | **Last Updated**: 2025
""")