BIOseq / app.py
openfree's picture
Update app.py
4fd4c0d verified
raw
history blame
25.2 kB
import os
import json
from typing import List, Dict, Tuple
import streamlit as st
import requests
# ์„ ํƒ์  ์˜์กด์„ฑ ๊ฐ€๋“œ
try:
import torch
TORCH_AVAILABLE = True
except ImportError:
TORCH_AVAILABLE = False
print("[WARNING] torch not available")
try:
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
TRANSFORMERS_AVAILABLE = True
except ImportError:
TRANSFORMERS_AVAILABLE = False
print("[WARNING] transformers not available")
try:
from datasets import load_dataset
DATASETS_AVAILABLE = True
except ImportError:
DATASETS_AVAILABLE = False
print("[WARNING] datasets not available")
try:
from sentence_transformers import SentenceTransformer
SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
SENTENCE_TRANSFORMERS_AVAILABLE = False
print("[WARNING] sentence_transformers not available")
try:
import faiss
FAISS_AVAILABLE = True
except ImportError:
FAISS_AVAILABLE = False
print("[WARNING] faiss not available")
try:
from Bio import SeqIO
BIOPYTHON_AVAILABLE = True
except ImportError:
BIOPYTHON_AVAILABLE = False
print("[WARNING] biopython not available")
# ์ƒ์ˆ˜
APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
# --------------- Helper Functions ---------------
def get_secret(name: str, fallback: str = "") -> str:
    """Return the secret called *name*.

    Lookup order: ``st.secrets`` first, then the process environment, then
    *fallback*.

    Args:
        name: Key to look up.
        fallback: Value returned when the key is found in neither place.

    Returns:
        The secret value, or *fallback* if it is not configured.
    """
    try:
        # Streamlit secrets
        if hasattr(st, "secrets") and name in st.secrets:
            return st.secrets[name]
    except Exception:
        # Any failure while consulting st.secrets (presumably e.g. no secrets
        # file configured) is treated as "not set" and we fall through to the
        # environment.  `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit still propagate.
        pass
    # Environment variable
    return os.environ.get(name, fallback)
def brave_search(query: str, count: int = 5) -> List[Dict]:
    """Query the Brave web-search endpoint.

    Always returns a non-empty list of ``{"title", "url", "snippet"}`` dicts:
    real hits on success, otherwise a single placeholder entry describing a
    missing API key, an empty result set, or the error that occurred.
    """
    api_key = get_secret("BRAVE_API_KEY", "")
    if not api_key:
        return [{
            "title": "BRAVE_API_KEY missing",
            "url": "",
            "snippet": "Set BRAVE_API_KEY in Space secrets or sidebar"
        }]

    try:
        response = requests.get(
            "https://api.search.brave.com/res/v1/web/search",
            headers={
                "Accept": "application/json",
                "X-Subscription-Token": api_key,
            },
            params={"q": query, "count": count},
            timeout=15,
        )
        response.raise_for_status()
        payload = response.json()
        raw_hits = payload.get("web", {}).get("results", [])[:count]
        hits = [
            {
                "title": hit.get("title", ""),
                "url": hit.get("url", ""),
                "snippet": hit.get("description", ""),
            }
            for hit in raw_hits
        ]
        if hits:
            return hits
        return [{"title": "No results", "url": "", "snippet": ""}]
    except Exception as e:
        # Best effort: surface the failure as a pseudo-result instead of raising.
        return [{"title": "Error", "url": "", "snippet": str(e)}]
def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 4000) -> str:
    """Send *messages* to the Fireworks chat-completions endpoint.

    Returns the content of the first choice.  A missing API key or any
    request/parse failure is reported as a plain string rather than raised.
    """
    token = get_secret("FIREWORKS_API_KEY", "")
    if not token:
        return "FIREWORKS_API_KEY missing. Set it in Secrets or sidebar."

    endpoint = "https://api.fireworks.ai/inference/v1/chat/completions"
    request_body = dict(
        model="accounts/fireworks/models/llama-v3p1-70b-instruct",
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    request_headers = dict([
        ("Content-Type", "application/json"),
        ("Authorization", f"Bearer {token}"),
    ])

    try:
        reply = requests.post(endpoint, headers=request_headers, json=request_body, timeout=60)
        reply.raise_for_status()
        body = reply.json()
        first_choice = body["choices"][0]
        return first_choice["message"]["content"]
    except Exception as e:
        return f"[LLM Error] {e}"
def load_file_text(upload) -> str:
    """Return the textual content of an uploaded file.

    The raw bytes are decoded as UTF-8 (undecodable bytes are dropped).  For
    FASTA-like extensions, when Biopython is available, the records are
    re-serialised as ``>id`` / sequence pairs separated by blank lines.

    Args:
        upload: File-like object exposing ``name`` and ``read()`` returning
            bytes.

    Returns:
        The normalised FASTA text, the decoded raw text, or ``""`` when the
        file cannot be read or decoded.
    """
    import io  # local import: only this function needs it

    name = upload.name.lower()
    try:
        text = upload.read().decode("utf-8", errors="ignore")
    except Exception:
        return ""

    # FASTA handling
    if name.endswith((".fa", ".fasta", ".faa", ".fna")) and BIOPYTHON_AVAILABLE:
        try:
            # Parse the already-decoded text: the upload itself yields bytes,
            # while the FASTA parser expects a text-mode handle.
            records = list(SeqIO.parse(io.StringIO(text), "fasta"))
        except Exception:
            records = []
        if records:
            return "\n\n".join(f">{r.id}\n{str(r.seq)}" for r in records)
        # No records parsed (malformed or empty FASTA): keep the raw text.
    return text
def chunk_text(text: str, size: int = 1200, overlap: int = 200) -> List[str]:
    """Split *text* into consecutive chunks of at most *size* characters.

    Each chunk starts ``size - overlap`` characters after the previous one, so
    neighbouring chunks share *overlap* characters.  If ``overlap >= size`` the
    window would never advance; in that case the step is clamped to one
    character so the function always terminates.

    Args:
        text: Text to split.
        size: Maximum chunk length; must be at least 1.
        overlap: Number of characters shared between neighbouring chunks.

    Returns:
        The list of chunks (empty for empty *text*).

    Raises:
        ValueError: If *size* is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")

    # Guarantee forward progress even for overlap >= size.
    step = max(1, size - overlap)

    chunks = []
    text_len = len(text)
    start = 0
    while start < text_len:
        end = min(start + size, text_len)
        chunks.append(text[start:end])
        if end >= text_len:
            break
        start += step
    return chunks
def build_index(texts: List[str]):
    """Embed *texts* and load them into a ``faiss.IndexFlatIP`` index.

    Returns ``(index, model)`` on success.  Returns ``(None, None)`` when
    sentence-transformers or FAISS is unavailable, or when building fails
    (the failure is shown via ``st.warning``).
    """
    if not (SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE):
        return None, None

    try:
        encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        vectors = encoder.encode(texts, show_progress_bar=False)
        store = faiss.IndexFlatIP(vectors.shape[1])
        store.add(vectors.astype("float32"))
    except Exception as e:
        st.warning(f"Index build failed: {e}")
        return None, None
    return store, encoder
def search_index(query: str, index, model, texts: List[str], k: int = 4) -> List[Dict]:
    """Return up to *k* entries of *texts* ranked by the vector index.

    Args:
        query: Search string; embedded with ``model.encode``.
        index: Object exposing ``search(vectors, k) -> (scores, ids)``.
        model: Object exposing ``encode(list_of_str)``.
        texts: Chunks in the same order they were added to *index*.
        k: Number of neighbours to request.

    Returns:
        A list of ``{"score": float, "text": str}`` dicts.  Empty when the
        index or model is missing, or when the search fails.
    """
    if index is None or model is None:
        return []
    try:
        query_vec = model.encode([query]).astype("float32")
        scores, ids = index.search(query_vec, k)
        # The bounds check drops ids that do not map to a chunk (presumably
        # negative placeholders when fewer than k neighbours exist).
        return [
            {"score": float(score), "text": texts[idx]}
            for idx, score in zip(ids[0], scores[0])
            if 0 <= idx < len(texts)
        ]
    except Exception:
        # Best effort: a failed search yields no file context.  `Exception`
        # instead of a bare `except` so KeyboardInterrupt/SystemExit propagate.
        return []
def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
    """Embed a protein sequence with a masked-LM checkpoint.

    The last hidden layer is averaged over the token axis.  On success the
    result holds ``"embedding"`` (first 10 values only, as a preview) and
    ``"size"`` (full vector length); on any failure it holds ``"error"``.
    """
    if not (TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE):
        return {"error": "PyTorch/Transformers not available"}

    try:
        tok = AutoTokenizer.from_pretrained(model_name)
        net = AutoModelForMaskedLM.from_pretrained(model_name)
        net.eval()

        with torch.no_grad():
            encoded = tok(seq, return_tensors="pt", truncation=True, max_length=1024)
            result = net(**encoded, output_hidden_states=True)
            last_layer = result.hidden_states[-1]
            pooled = last_layer.mean(dim=1).squeeze(0)
            vec = pooled.cpu().numpy()

        # Memory cleanup: drop the model/tokenizer references before returning.
        del net
        del tok
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        preview = vec.tolist()[:10]  # first 10 values only, for preview
        return {"embedding": preview, "size": vec.shape[0]}
    except Exception as e:
        return {"error": str(e)}
def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
    """Embed a DNA sequence with a Hugging Face encoder model.

    Loads *model_name*; if that fails, falls back to ``bert-base-uncased`` and
    shows a warning.  Sequences longer than 6 characters are converted to
    space-separated overlapping 6-mers before tokenisation.

    Args:
        seq: DNA sequence string.
        model_name: Hub id of the tokenizer/model to load.

    Returns:
        ``{"embedding": first 10 values, "size": vector length}`` on success,
        or ``{"error": message}`` on any failure.
    """
    if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
        return {"error": "PyTorch/Transformers not available"}
    try:
        # einops presence check (the import result itself is not used).
        try:
            import einops  # noqa: F401
        except ImportError:
            return {"error": "einops package required. Please wait for installation and refresh the page."}

        # Try the requested model first; fall back to plain BERT if loading fails.
        try:
            from transformers import AutoTokenizer, AutoModel
            # SECURITY NOTE(review): trust_remote_code=True runs Python code
            # shipped with the model repository, and model_name is supplied by
            # the caller.  Consider restricting it to a known allow-list.
            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        except Exception as model_error:
            # Fallback model
            try:
                from transformers import BertTokenizer, BertModel
                fallback_model = "bert-base-uncased"
                tokenizer = BertTokenizer.from_pretrained(fallback_model)
                model = BertModel.from_pretrained(fallback_model)
                st.warning(f"DNABERT-2 ๋กœ๋”ฉ ์‹คํŒจ. ๋Œ€์ฒด ๋ชจ๋ธ ์‚ฌ์šฉ์ค‘: {fallback_model}")
            except Exception:
                # Report the original failure, not the fallback's.
                return {"error": f"๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ: {str(model_error)}"}
        model.eval()

        def seq_to_kmer(seq, k=6):
            """Return the overlapping k-mers of *seq* joined by single spaces."""
            return ' '.join(seq[i:i + k] for i in range(len(seq) - k + 1))

        # Use k-mers for longer inputs, the raw sequence otherwise.
        if len(seq) > 6:
            input_seq = seq_to_kmer(seq, k=6)
        else:
            input_seq = seq

        with torch.no_grad():
            inputs = tokenizer(
                input_seq,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            )
            outputs = model(**inputs)
            # Prefer pooler_output when present, else mean-pool last_hidden_state.
            if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
                vec = outputs.pooler_output.squeeze(0).cpu().numpy()
            else:
                hidden = outputs.last_hidden_state.mean(dim=1).squeeze(0)
                vec = hidden.cpu().numpy()

        # Memory cleanup
        del model
        del tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return {
            "embedding": vec.tolist()[:10],  # first 10 values only, for preview
            "size": vec.shape[0]
        }
    except Exception as e:
        return {"error": f"๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)[:200]}"}
def build_context(query: str, docs: List[str], index, model, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
    """Assemble the prompt context for *query* from file chunks and web hits.

    File chunks are included only when an index, a model and docs are all
    present; web results only when *use_web* is true.  Returns the joined
    context (capped at 4000 characters) and a list of source descriptors.
    """
    pieces: List[str] = []
    sources: List[Dict] = []

    # File search
    if index and model and docs:
        for hit in search_index(query, index, model, docs, k=4):
            chunk = hit["text"]
            pieces.append(f"[FILE] {chunk[:500]}")
            sources.append({"type": "file", "text": chunk[:100]})

    # Web search
    if use_web:
        for item in brave_search(query, count=web_k):
            title = item["title"]
            snippet = item["snippet"]
            pieces.append(f"[WEB] {title}\n{snippet}")
            sources.append({"type": "web", "title": title, "url": item["url"]})

    joined = "\n\n---\n\n".join(pieces)
    return joined[:4000], sources
def answer_question(query: str, context: str) -> str:
    """Ask the LLM to answer *query* using *context*.

    Builds a system prompt describing the assistant's style rules and a user
    message embedding the context and question, then returns ``call_llm``'s
    reply (temperature 0.4, up to 4000 tokens).
    """
    intro = (
        "You are an expert bioinformatics assistant who explains complex biological concepts in an accessible way. "
        "Your responses should be:\n"
    )
    rules = [
        "1. Comprehensive yet easy to understand",
        "2. Well-structured with clear sections",
        "3. Include relevant examples and analogies",
        "4. Provide actionable insights when appropriate",
        "5. Use Korean if the user writes in Korean, otherwise English",
        "6. Never provide medical diagnosis or treatment advice",
        "7. Format your response with headers, bullet points, and clear paragraphs",
        "8. Aim for 300-500 words minimum for complex questions",
    ]
    system_prompt = intro + "\n".join(rules)

    # The template below is kept verbatim; its lines are part of the prompt.
    user_prompt = f"""Context information:\n{context}\n\n
User Question: {query}
Please provide a detailed, well-structured response that:
- Directly answers the question
- Explains the biological background
- Includes practical implications when relevant
- Uses simple analogies to explain complex concepts
- Cites the context when appropriate"""

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return call_llm(conversation, temperature=0.4, max_tokens=4000)
# --------------- Streamlit UI ---------------
st.set_page_config(page_title=APP_TITLE, page_icon="๐Ÿงฌ", layout="wide")
st.title(APP_TITLE)
st.caption(DISCLAIMER)

# Session state init: create each slot once so values survive script reruns.
for _state_key, _initial_value in (("docs", []), ("index", None), ("model", None)):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
# Sidebar: API keys, model ids and web-search options.
with st.sidebar:
    st.header("Configuration")
    # Key inputs are pre-filled from get_secret() and masked.
    fw_key = st.text_input(
        "FIREWORKS_API_KEY",
        value=get_secret("FIREWORKS_API_KEY", ""),
        type="password"
    )
    brave_key = st.text_input(
        "BRAVE_API_KEY",
        value=get_secret("BRAVE_API_KEY", ""),
        type="password"
    )
    # Publish entered keys through the environment so get_secret() can see them.
    # NOTE(review): get_secret() consults st.secrets before os.environ, so a key
    # typed here does not override an existing st.secrets entry — confirm intended.
    if fw_key:
        os.environ["FIREWORKS_API_KEY"] = fw_key
    if brave_key:
        os.environ["BRAVE_API_KEY"] = brave_key
    st.divider()
    # Model ids passed to esm2_embed() / dna_embed() by the analysis tabs.
    esm_model = st.text_input(
        "ESM-2 Model",
        value="facebook/esm2_t6_8M_UR50D"
    )
    dna_model = st.text_input(
        "DNA Model",
        value="bert-base-uncased",  # more stable default model
        help="Options: bert-base-uncased (stable), zhihan1996/DNABERT-2-117M (specialized but may require more memory)"
    )
    # Web-search toggle and number of results requested per question.
    use_web = st.checkbox("Enable web search", value=True)
    web_results = st.slider("Web results", 1, 10, 3)
# Tabs
tab1, tab2, tab3, tab4 = st.tabs(["Chat", "Protein", "DNA", "About"])

# File upload: read every uploaded file, chunk it, and (re)build the vector
# index over the chunks.
with st.expander("๐Ÿ“ Upload Files", expanded=True):
    files = st.file_uploader(
        "Upload text/FASTA files",
        # "faa"/"fna" added so the uploader accepts every FASTA extension that
        # load_file_text() recognises.
        type=["txt", "fa", "fasta", "faa", "fna", "csv", "json"],
        accept_multiple_files=True
    )
    if files:
        docs = []
        for f in files:
            try:
                text = load_file_text(f)
                if text:
                    docs.extend(chunk_text(text))
            except Exception as e:
                st.error(f"Error reading {f.name}: {e}")
        if docs:
            st.session_state.docs = docs
            st.success(f"Loaded {len(docs)} chunks")
            index, model = None, None
            if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
                with st.spinner("Building index..."):
                    index, model = build_index(docs)
            # Always overwrite the stored index/model, even with None: keeping
            # an index built for a previous upload next to the new docs would
            # make search hits point at the wrong chunks.
            st.session_state.index = index
            st.session_state.model = model
# Chat tab: free-text question answered by the LLM, grounded in uploaded
# file chunks and (optionally) web search results.
with tab1:
    st.subheader("๐Ÿ’ฌ Chat Assistant")
    question = st.text_area(
        "Ask about proteins, DNA, or bioinformatics:",
        value="What is the role of ESM-2 embeddings in protein analysis?",
        height=100
    )
    if st.button("Get Answer", type="primary"):
        # The LLM call needs an API key; stop early when none is configured.
        if not get_secret("FIREWORKS_API_KEY"):
            st.error("Please set FIREWORKS_API_KEY")
        else:
            with st.spinner("Thinking..."):
                # Gather context from the session's docs/index and the web.
                context, sources = build_context(
                    question,
                    st.session_state.docs,
                    st.session_state.index,
                    st.session_state.model,
                    use_web,
                    web_results
                )
                answer = answer_question(question, context)
            st.markdown("### Answer")
            st.write(answer)
            # List where the context came from: web hits as links, file hits
            # as a short text preview.
            if sources:
                st.markdown("### Sources")
                for s in sources:
                    if s["type"] == "web":
                        st.write(f"- ๐ŸŒ [{s['title']}]({s['url']})")
                    elif s["type"] == "file":
                        st.write(f"- ๐Ÿ“„ File: {s['text'][:80]}...")
# Protein tab: basic composition statistics plus an ESM-2 embedding preview.
# Multi-line string literals keep their content at column 0 on purpose: the
# text inside them is user-facing and must not gain leading spaces.
with tab2:
    st.subheader("๐Ÿงฌ Protein Analysis")
    st.info("""
**๋‹จ๋ฐฑ์งˆ ์„œ์—ด ๋ถ„์„์ด๋ž€?**
- ๋‹จ๋ฐฑ์งˆ์˜ ์•„๋ฏธ๋…ธ์‚ฐ ์„œ์—ด์„ AI๊ฐ€ ๋ถ„์„ํ•˜์—ฌ ๊ธฐ๋Šฅ๊ณผ ๊ตฌ์กฐ๋ฅผ ์˜ˆ์ธกํ•ฉ๋‹ˆ๋‹ค
- ESM-2๋Š” Meta๊ฐ€ ๊ฐœ๋ฐœํ•œ AI๋กœ, 6์–ต 5์ฒœ๋งŒ๊ฐœ ๋‹จ๋ฐฑ์งˆ์„ ํ•™์Šตํ–ˆ์Šต๋‹ˆ๋‹ค
- ์šฉ๋„: ์‹ ์•ฝ ๊ฐœ๋ฐœ, ์งˆ๋ณ‘ ์—ฐ๊ตฌ, ์ง„ํ™” ๋ถ„์„ ๋“ฑ
""")
    protein_seq = st.text_area(
        "๋‹จ๋ฐฑ์งˆ ์„œ์—ด ์ž…๋ ฅ (๋ณต์‚ฌ-๋ถ™์—ฌ๋„ฃ๊ธฐ ๊ฐ€๋Šฅ):",
        value="MKTIIALSYIFCLVFA",
        help="๋‹จ๋ฐฑ์งˆ ์„œ์—ด์€ 20๊ฐœ ์•„๋ฏธ๋…ธ์‚ฐ ๋ฌธ์ž(A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y)๋กœ ๊ตฌ์„ฑ๋ฉ๋‹ˆ๋‹ค",
        height=100
    )
    # Example sequences: each button shows a copyable sequence.
    st.markdown("**์˜ˆ์ œ ์„œ์—ด (ํด๋ฆญํ•ด์„œ ๋ณต์‚ฌ):**")
    col1, col2, col3 = st.columns(3)
    with col1:
        if st.button("์ธ์А๋ฆฐ", key="ins"):
            st.code("FVNQHLCGSHLVEALYLVCGERGFFYTPKT", language=None)
    with col2:
        if st.button("์—”๋Œํ•€", key="end"):
            st.code("YGGFMTSEKSQTPLVTLFKNAIIKNAYKKGE", language=None)
    with col3:
        if st.button("์˜ฅ์‹œํ† ์‹ ", key="oxy"):
            st.code("CYIQNCPLG", language=None)
    if st.button("๐Ÿ”ฌ ๋‹จ๋ฐฑ์งˆ ๋ถ„์„ ์‹œ์ž‘", type="primary"):
        # Remove ALL whitespace (not just the ends) so a multi-line paste is
        # not counted as extra residues or fed to the model with newlines.
        seq = "".join(protein_seq.split()).upper()
        if not seq:
            # Without this guard the hydrophobic ratio below divides by zero.
            st.error("Please enter a protein sequence.")
        else:
            # Basic stats
            st.markdown("### ๐Ÿ“Š ๊ธฐ๋ณธ ๋ถ„์„ ๊ฒฐ๊ณผ")
            col1, col2 = st.columns(2)
            with col1:
                st.metric("์„œ์—ด ๊ธธ์ด", f"{len(seq)} ์•„๋ฏธ๋…ธ์‚ฐ")
                # Rough estimate: 110 Da per residue.
                st.metric("๋ถ„์ž๋Ÿ‰ (์ถ”์ •)", f"~{len(seq) * 110} Da")
            with col2:
                unique_aa = len(set(seq))
                st.metric("์‚ฌ์šฉ๋œ ์•„๋ฏธ๋…ธ์‚ฐ ์ข…๋ฅ˜", f"{unique_aa}๊ฐœ")
                hydrophobic = sum(1 for aa in seq if aa in "AVILMFYW")
                st.metric("์†Œ์ˆ˜์„ฑ ๋น„์œจ", f"{hydrophobic/len(seq)*100:.1f}%")
            # AI Analysis
            if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
                st.markdown("### ๐Ÿค– AI ์ž„๋ฒ ๋”ฉ ๋ถ„์„")
                with st.spinner("AI ๋ชจ๋ธ์ด ๋‹จ๋ฐฑ์งˆ์„ ๋ถ„์„์ค‘... (10-30์ดˆ)"):
                    result = esm2_embed(seq, esm_model)
                if "error" in result:
                    st.error(result["error"])
                else:
                    st.success("โœ… AI ๋ถ„์„ ์™„๋ฃŒ!")
                    col1, col2 = st.columns(2)
                    with col1:
                        st.metric("๋ฒกํ„ฐ ์ฐจ์›", result['size'])
                        st.caption("์ด ์ˆซ์ž๋“ค์€ ๋‹จ๋ฐฑ์งˆ์˜ ํŠน์„ฑ์„ ์ˆ˜์น˜ํ™”ํ•œ ๊ฒƒ์ž…๋‹ˆ๋‹ค")
                    with col2:
                        st.markdown("**์ž„๋ฒ ๋”ฉ ๋ฒกํ„ฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ:**")
                        st.code(result["embedding"][:5])
                    st.markdown("""
**๐ŸŽฏ ์ด ๋ถ„์„์˜ ํ™œ์šฉ:**
- ์œ ์‚ฌํ•œ ๊ธฐ๋Šฅ์˜ ๋‹จ๋ฐฑ์งˆ ์ฐพ๊ธฐ
- ๊ตฌ์กฐ ์˜ˆ์ธก์˜ ๊ธฐ์ดˆ ๋ฐ์ดํ„ฐ
- ๋Œ์—ฐ๋ณ€์ด ์˜ํ–ฅ ์˜ˆ์ธก
- ์‹ ์•ฝ ํƒ€๊ฒŸ ๋ฐœ๊ตด
""")
            else:
                st.warning("โš ๏ธ AI ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘... ์ž ์‹œ ํ›„ ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”")
# DNA tab: base-composition statistics, simple substring motif checks and an
# embedding preview from dna_embed().
# Multi-line string literals keep their content at column 0 on purpose: the
# text inside them is user-facing and must not gain leading spaces.
with tab3:
    st.subheader("๐Ÿงฌ DNA Analysis")
    st.info("""
**DNA ์„œ์—ด ๋ถ„์„์ด๋ž€?**
- DNA์˜ ์—ผ๊ธฐ์„œ์—ด(A,T,G,C)์„ AI๊ฐ€ ๋ถ„์„ํ•˜์—ฌ ๊ธฐ๋Šฅ์„ ์˜ˆ์ธกํ•ฉ๋‹ˆ๋‹ค
- DNABERT-2๋Š” ์ธ๊ฐ„ ๊ฒŒ๋†ˆ ์ „์ฒด๋ฅผ ํ•™์Šตํ•œ AI ๋ชจ๋ธ์ž…๋‹ˆ๋‹ค
- ์šฉ๋„: ์œ ์ „์ž ๊ธฐ๋Šฅ ์˜ˆ์ธก, ์งˆ๋ณ‘ ๋ณ€์ด ๋ฐœ๊ฒฌ, ์ง„ํ™” ์—ฐ๊ตฌ ๋“ฑ
""")
    dna_seq = st.text_area(
        "DNA ์„œ์—ด ์ž…๋ ฅ (๋ณต์‚ฌ-๋ถ™์—ฌ๋„ฃ๊ธฐ ๊ฐ€๋Šฅ):",
        value="ATGCGATCGTAGC",
        help="DNA๋Š” 4๊ฐœ ์—ผ๊ธฐ(A: ์•„๋ฐ๋‹Œ, T: ํ‹ฐ๋ฏผ, G: ๊ตฌ์•„๋‹Œ, C: ์‹œํ† ์‹ )๋กœ ๊ตฌ์„ฑ๋ฉ๋‹ˆ๋‹ค",
        height=100
    )
    # Example sequences: each button shows a copyable sequence with a caption.
    st.markdown("**์˜ˆ์ œ ์„œ์—ด (ํด๋ฆญํ•ด์„œ ๋ณต์‚ฌ):**")
    col1, col2, col3 = st.columns(3)
    with col1:
        if st.button("TATA ๋ฐ•์Šค", key="tata"):
            st.code("GCGCGATATAAAGGCGCGGGCGCGCG", language=None)
            st.caption("์œ ์ „์ž ๋ฐœํ˜„ ์‹œ์ž‘ ์‹ ํ˜ธ")
    with col2:
        if st.button("ํ”„๋กœ๋ชจํ„ฐ", key="prom"):
            st.code("TTGACAGGCTAGCTCAGTCCTAGGTATAATGCTAGC", language=None)
            st.caption("์œ ์ „์ž ์กฐ์ ˆ ์˜์—ญ")
    with col3:
        if st.button("CRISPR ํƒ€๊ฒŸ", key="crispr"):
            st.code("GTCACCTCCAATGACTAGGGTGG", language=None)
            st.caption("์œ ์ „์ž ํŽธ์ง‘ ๋ถ€์œ„")
    if st.button("๐Ÿ”ฌ DNA ๋ถ„์„ ์‹œ์ž‘", type="primary"):
        seq = dna_seq.strip().upper().replace("U", "T")  # convert RNA U to T
        seq = ''.join(c for c in seq if c in 'ATGC')  # keep only A/T/G/C
        # Require at least 3 bases; this also keeps the divisions below safe.
        if len(seq) < 3:
            st.error("์ตœ์†Œ 3๊ฐœ ์ด์ƒ์˜ ์—ผ๊ธฐ๋ฅผ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”")
        else:
            st.markdown("### ๐Ÿ“Š ๊ธฐ๋ณธ ๋ถ„์„ ๊ฒฐ๊ณผ")
            col1, col2 = st.columns(2)
            with col1:
                st.metric("์„œ์—ด ๊ธธ์ด", f"{len(seq)} bp")
                # GC content as a percentage of the filtered sequence.
                gc = (seq.count("G") + seq.count("C")) / len(seq) * 100
                st.metric("GC ํ•จ๋Ÿ‰", f"{gc:.1f}%")
                # Caption thresholds: above 60 %, below 40 %, otherwise mid-range.
                if gc > 60:
                    st.caption("๐Ÿ”ด ๋†’์Œ: ์•ˆ์ •์ ์ด์ง€๋งŒ ๋ณต์ œ ์–ด๋ ค์›€")
                elif gc < 40:
                    st.caption("๐Ÿ”ต ๋‚ฎ์Œ: ๋ถˆ์•ˆ์ •ํ•˜์ง€๋งŒ ๋ณต์ œ ์šฉ์ด")
                else:
                    st.caption("๐ŸŸข ์ ์ •: ์ผ๋ฐ˜์ ์ธ ๋ฒ”์œ„")
            with col2:
                at = (seq.count("A") + seq.count("T")) / len(seq) * 100
                st.metric("AT ํ•จ๋Ÿ‰", f"{at:.1f}%")
                # Codon count (only when the length is a multiple of 3)
                if len(seq) % 3 == 0:
                    st.metric("๊ฐ€๋Šฅํ•œ ์ฝ”๋ˆ ์ˆ˜", f"{len(seq)//3}๊ฐœ")
                    st.caption("๋‹จ๋ฐฑ์งˆ๋กœ ๋ฒˆ์—ญ ๊ฐ€๋Šฅ")
            # Look for notable motifs.  These are plain substring checks
            # anywhere in the sequence; reading frame is not considered.
            st.markdown("### ๐Ÿ” ์ฃผ์š” ๋ชจํ‹ฐํ”„ ๊ฒ€์ƒ‰")
            motifs_found = []
            if "TATAAAA" in seq or "TATAAA" in seq:
                motifs_found.append("โœ… TATA box ๋ฐœ๊ฒฌ (์ „์‚ฌ ์‹œ์ž‘ ์‹ ํ˜ธ)")
            if "CAAT" in seq or "CCAAT" in seq:
                motifs_found.append("โœ… CAAT box ๋ฐœ๊ฒฌ (์ „์‚ฌ ์กฐ์ ˆ)")
            if "ATG" in seq:
                motifs_found.append("โœ… ์‹œ์ž‘ ์ฝ”๋ˆ(ATG) ๋ฐœ๊ฒฌ")
            if "TAA" in seq or "TAG" in seq or "TGA" in seq:
                motifs_found.append("โœ… ์ •์ง€ ์ฝ”๋ˆ ๋ฐœ๊ฒฌ")
            # Flag when "CG" occurs more than once per 20 bases.
            if seq.count("CG") > len(seq)/20:
                motifs_found.append("โœ… CpG ์„ฌ ๊ฐ€๋Šฅ์„ฑ (์œ ์ „์ž ์กฐ์ ˆ)")
            if motifs_found:
                for motif in motifs_found:
                    st.write(motif)
            else:
                st.write("ํŠน๋ณ„ํ•œ ๋ชจํ‹ฐํ”„๊ฐ€ ๋ฐœ๊ฒฌ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค")
            # AI Analysis
            if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
                st.markdown("### ๐Ÿค– AI ์ž„๋ฒ ๋”ฉ ๋ถ„์„")
                with st.spinner("AI ๋ชจ๋ธ์ด DNA๋ฅผ ๋ถ„์„์ค‘... (10-30์ดˆ)"):
                    result = dna_embed(seq, dna_model)
                if "error" in result:
                    st.error(result["error"])
                else:
                    st.success("โœ… AI ๋ถ„์„ ์™„๋ฃŒ!")
                    col1, col2 = st.columns(2)
                    with col1:
                        st.metric("๋ฒกํ„ฐ ์ฐจ์›", result['size'])
                        st.caption("DNA ํŠน์„ฑ์„ ์ˆ˜์น˜ํ™”ํ•œ ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค")
                    with col2:
                        st.markdown("**์ž„๋ฒ ๋”ฉ ๋ฒกํ„ฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ:**")
                        st.code(result["embedding"][:5])
                    st.markdown("""
**๐ŸŽฏ ์ด ๋ถ„์„์˜ ํ™œ์šฉ:**
- ์œ ์ „์ž ๊ธฐ๋Šฅ ์˜ˆ์ธก
- ํ”„๋กœ๋ชจํ„ฐ/์ธํ•ธ์„œ ์ฐพ๊ธฐ
- ์ง„ํ™”์  ๋ณด์กด ์˜์—ญ ๋ฐœ๊ฒฌ
- ์งˆ๋ณ‘ ๊ด€๋ จ ๋ณ€์ด ์˜ˆ์ธก
- CRISPR ํƒ€๊ฒŸ ๋ถ€์œ„ ํ‰๊ฐ€
""")
            else:
                st.warning("โš ๏ธ AI ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘... ์ž ์‹œ ํ›„ ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”")
# About tab: static feature/model summary followed by a dependency status list.
# Multi-line string literals keep their content at column 0 on purpose: the
# text inside them is user-facing and must not gain leading spaces.
with tab4:
    st.subheader("โ„น๏ธ About")
    # NOTE(review): the "Models" text below attributes DNABERT-2 to Microsoft —
    # confirm the attribution against the model card before relying on it.
    st.markdown("""
### Features
- ๐Ÿ’ฌ RAG-based chat for bioinformatics questions
- ๐Ÿงฌ Protein sequence analysis with ESM-2
- ๐Ÿงฌ DNA sequence analysis with DNABERT-2
- ๐Ÿ” Web search integration via Brave API
- ๐Ÿ“ File upload and vector search
### Models
- **Proteins:** ESM-2 (Facebook)
- **DNA:** DNABERT-2 (Microsoft)
- **LLM:** Llama 3.1 70B (via Fireworks)
### Disclaimer
This tool is for research and educational purposes only.
Not for medical diagnosis or treatment decisions.
""")
    # Dependency check: one status line per optional package, driven by the
    # *_AVAILABLE flags set at import time.
    # NOTE(review): assumed to render inside the About tab — confirm nesting.
    st.divider()
    st.subheader("System Status")
    deps = {
        "PyTorch": TORCH_AVAILABLE,
        "Transformers": TRANSFORMERS_AVAILABLE,
        "Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
        "FAISS": FAISS_AVAILABLE,
        "BioPython": BIOPYTHON_AVAILABLE,
        "Datasets": DATASETS_AVAILABLE
    }
    for name, available in deps.items():
        if available:
            st.success(f"โœ… {name}")
        else:
            st.warning(f"โš ๏ธ {name} not available")