|
|
import io
import json
import os
from typing import List, Dict, Tuple

import requests
import streamlit as st

# Optional heavy dependencies: each guard records availability in a flag so
# the UI can degrade gracefully instead of failing at import time.
try:
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
    print("[WARNING] torch not available")

try:
    from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("[WARNING] transformers not available")

try:
    from datasets import load_dataset
    DATASETS_AVAILABLE = True
except ImportError:
    DATASETS_AVAILABLE = False
    print("[WARNING] datasets not available")

try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMERS_AVAILABLE = False
    print("[WARNING] sentence_transformers not available")

try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False
    print("[WARNING] faiss not available")

try:
    from Bio import SeqIO
    BIOPYTHON_AVAILABLE = True
except ImportError:
    BIOPYTHON_AVAILABLE = False
    print("[WARNING] biopython not available")
|
|
|
|
|
|
|
|
# Used as the browser page title and as the main page heading.
APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
# Rendered as a caption directly under the page heading.
DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
|
|
|
|
|
|
|
|
|
|
|
def get_secret(name: str, fallback: str = "") -> str:
    """Look up a secret by name.

    ``st.secrets`` is consulted first. If that lookup fails for any reason,
    or the key is not present there, the process environment is used.

    Args:
        name: Key to look up.
        fallback: Value returned when the key is found in neither place.

    Returns:
        The value from ``st.secrets`` if present, otherwise the environment
        variable ``name``, otherwise ``fallback``.
    """
    try:
        if hasattr(st, "secrets") and name in st.secrets:
            return st.secrets[name]
    except Exception:
        # Best-effort lookup: any failure while reading st.secrets falls
        # through to the environment. Narrowed from a bare ``except`` so
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        pass

    return os.environ.get(name, fallback)
|
|
|
|
|
def brave_search(query: str, count: int = 5) -> List[Dict]:
    """Query the Brave web-search endpoint.

    Args:
        query: Search text, sent as the ``q`` parameter.
        count: Number of results requested; the parsed list is also cut to
            this length.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts. The function never
        raises: a missing API key, an empty result set, or any exception is
        reported as a single placeholder entry with an empty ``url``.
    """
    token = get_secret("BRAVE_API_KEY", "")
    if not token:
        return [{
            "title": "BRAVE_API_KEY missing",
            "url": "",
            "snippet": "Set BRAVE_API_KEY in Space secrets or sidebar",
        }]

    try:
        response = requests.get(
            "https://api.search.brave.com/res/v1/web/search",
            headers={"Accept": "application/json", "X-Subscription-Token": token},
            params={"q": query, "count": count},
            timeout=15,
        )
        response.raise_for_status()
        raw_hits = response.json().get("web", {}).get("results", [])[:count]
        hits = [
            {
                "title": hit.get("title", ""),
                "url": hit.get("url", ""),
                "snippet": hit.get("description", ""),
            }
            for hit in raw_hits
        ]
        if hits:
            return hits
        return [{"title": "No results", "url": "", "snippet": ""}]
    except Exception as e:
        return [{"title": "Error", "url": "", "snippet": str(e)}]
|
|
|
|
|
def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 4000) -> str:
    """Send a chat-completion request to the Fireworks AI endpoint.

    Args:
        messages: Chat messages forwarded unchanged as the ``messages`` field.
        temperature: Sampling temperature placed in the payload.
        max_tokens: Completion token limit placed in the payload.

    Returns:
        The content of the first choice's message. If the API key is missing
        or the request/parsing fails, a human-readable message string is
        returned instead of raising.
    """
    token = get_secret("FIREWORKS_API_KEY", "")
    if not token:
        return "FIREWORKS_API_KEY missing. Set it in Secrets or sidebar."

    request_body = {
        "model": "accounts/fireworks/models/llama-v3p1-70b-instruct",
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {token}",
    }

    try:
        response = requests.post(
            "https://api.fireworks.ai/inference/v1/chat/completions",
            headers=request_headers,
            json=request_body,
            timeout=60,
        )
        response.raise_for_status()
        first_choice = response.json()["choices"][0]
        return first_choice["message"]["content"]
    except Exception as e:
        return f"[LLM Error] {e}"
|
|
|
|
|
def load_file_text(upload) -> str:
    """Return the text content of an uploaded file.

    The raw bytes are decoded as UTF-8 with undecodable bytes dropped. When
    the file name has a FASTA extension and Biopython is available, the
    decoded text is parsed and re-emitted as ``>id`` / sequence pairs
    separated by blank lines.

    Args:
        upload: File-like object exposing ``name`` and ``read()`` that
            returns bytes.

    Returns:
        The normalised FASTA text when parsing yields at least one record,
        otherwise the decoded text. ``""`` if the file cannot be read.
    """
    name = upload.name.lower()

    try:
        text = upload.read().decode("utf-8", errors="ignore")
    except Exception:
        return ""

    if name.endswith((".fa", ".fasta", ".faa", ".fna")) and BIOPYTHON_AVAILABLE:
        try:
            # SeqIO's FASTA parser needs a text-mode handle. The upload is a
            # bytes stream, so parse the already-decoded text instead.
            records = list(SeqIO.parse(io.StringIO(text), "fasta"))
        except Exception:
            # Best-effort: a malformed FASTA file is still usable as text.
            records = []
        if records:
            return "\n\n".join(f">{rec.id}\n{rec.seq}" for rec in records)

    return text
|
|
|
|
|
def chunk_text(text: str, size: int = 1200, overlap: int = 200) -> List[str]:
    """Split text into fixed-size, overlapping chunks.

    Each chunk starts ``size - overlap`` characters after the previous one,
    so consecutive chunks share ``overlap`` characters. The last chunk may be
    shorter than ``size``.

    Args:
        text: Text to split.
        size: Maximum chunk length in characters; must be positive.
        overlap: Characters shared between consecutive chunks; must be
            smaller than ``size``.

    Returns:
        The chunks in order; an empty list for empty text.

    Raises:
        ValueError: If ``size <= 0`` or ``overlap >= size``. These settings
            previously made the loop spin forever because the window never
            advanced.
    """
    if size <= 0:
        raise ValueError("size must be positive")
    if overlap >= size:
        raise ValueError("overlap must be smaller than size")

    step = size - overlap
    total = len(text)
    chunks = []
    start = 0

    while start < total:
        end = min(start + size, total)
        chunks.append(text[start:end])
        if end >= total:
            break
        start += step

    return chunks
|
|
|
|
|
def build_index(texts: List[str]):
    """Embed the given texts and load them into a FAISS inner-product index.

    Args:
        texts: Strings to embed with the
            ``sentence-transformers/all-MiniLM-L6-v2`` encoder.

    Returns:
        ``(index, encoder)`` on success. ``(None, None)`` when
        sentence-transformers or FAISS is unavailable, or when building
        fails; in the failure case a Streamlit warning is also shown.
    """
    if not (SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE):
        return None, None

    try:
        encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        vectors = encoder.encode(texts, show_progress_bar=False)

        store = faiss.IndexFlatIP(vectors.shape[1])
        store.add(vectors.astype("float32"))
    except Exception as e:
        st.warning(f"Index build failed: {e}")
        return None, None

    return store, encoder
|
|
|
|
|
def search_index(query: str, index, model, texts: List[str], k: int = 4) -> List[Dict]:
    """Return the texts the index ranks closest to the query.

    Args:
        query: Query string; encoded with ``model.encode``.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(scores, ids)``; ``None`` disables the search.
        model: Encoder exposing ``encode``; ``None`` disables the search.
        texts: Texts addressed by the ids the index returns.
        k: Number of neighbours requested.

    Returns:
        ``{"score", "text"}`` dicts in the order the index returned them.
        Ids outside ``texts`` are skipped. An empty list is returned when
        the index or model is missing, or when the search fails.
    """
    if index is None or model is None:
        return []

    try:
        query_vec = model.encode([query])
        scores, ids = index.search(query_vec.astype("float32"), k)
        return [
            {"score": float(score), "text": texts[idx]}
            for idx, score in zip(ids[0], scores[0])
            if 0 <= idx < len(texts)
        ]
    except Exception:
        # Best-effort retrieval; narrowed from a bare ``except`` so
        # KeyboardInterrupt/SystemExit still propagate.
        return []
|
|
|
|
|
def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
    """Compute a fixed-size embedding for a protein sequence.

    The tokenizer and masked-LM model named by ``model_name`` are loaded on
    every call. The last hidden state is averaged over ``dim=1`` and the
    model is released afterwards.

    Args:
        seq: Protein sequence passed to the tokenizer (truncated to 1024
            tokens).
        model_name: Hugging Face model identifier.

    Returns:
        ``{"embedding": first 10 values, "size": vector length}`` on success,
        or ``{"error": message}`` if the libraries are missing or anything
        fails.
    """
    if not (TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE):
        return {"error": "PyTorch/Transformers not available"}

    try:
        tok = AutoTokenizer.from_pretrained(model_name)
        net = AutoModelForMaskedLM.from_pretrained(model_name)
        net.eval()

        with torch.no_grad():
            encoded = tok(seq, return_tensors="pt", truncation=True, max_length=1024)
            result = net(**encoded, output_hidden_states=True)
            pooled = result.hidden_states[-1].mean(dim=1).squeeze(0)
            vec = pooled.cpu().numpy()

        # Drop the references before the next call loads a model again.
        del net
        del tok
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        preview = vec.tolist()[:10]
        return {"embedding": preview, "size": vec.shape[0]}
    except Exception as e:
        return {"error": str(e)}
|
|
|
|
|
def _dna_to_kmers(seq: str, k: int = 6) -> str:
    """Return the overlapping k-mers of ``seq`` joined by single spaces."""
    return " ".join(seq[i:i + k] for i in range(len(seq) - k + 1))


def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
    """Compute a fixed-size embedding for a DNA sequence.

    The model named by ``model_name`` is loaded with
    ``trust_remote_code=True``. If loading fails, ``bert-base-uncased`` is
    used instead and a Streamlit warning is shown. Sequences longer than six
    bases are converted to space-separated 6-mers before tokenisation.

    Args:
        seq: DNA sequence.
        model_name: Hugging Face model identifier.

    Returns:
        ``{"embedding": first 10 values, "size": vector length}`` on success,
        or ``{"error": message}`` if a dependency is missing or anything
        fails.
    """
    if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
        return {"error": "PyTorch/Transformers not available"}

    try:
        try:
            import einops  # noqa: F401 -- imported only to check availability
        except ImportError:
            return {"error": "einops package required. Please wait for installation and refresh the page."}

        try:
            # SECURITY NOTE(review): trust_remote_code=True executes code
            # shipped with the model repository, and model_name can come
            # from a user-editable text field. Consider an allow-list.
            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        except Exception as model_error:
            try:
                from transformers import BertTokenizer, BertModel

                fallback_model = "bert-base-uncased"
                tokenizer = BertTokenizer.from_pretrained(fallback_model)
                model = BertModel.from_pretrained(fallback_model)
                # Names the model that actually failed; the original text
                # always said "DNABERT-2" regardless of model_name.
                st.warning(f"{model_name} 로딩 실패. 대체 모델 사용중: {fallback_model}")
            except Exception:
                return {"error": f"모델 로딩 실패: {str(model_error)}"}

        model.eval()

        # NOTE(review): DNABERT-2 is documented as using BPE tokenisation,
        # so the k-mer step may be unnecessary for it. Kept as-is; confirm.
        input_seq = _dna_to_kmers(seq, k=6) if len(seq) > 6 else seq

        with torch.no_grad():
            inputs = tokenizer(
                input_seq,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True,
            )
            outputs = model(**inputs)

            pooled = getattr(outputs, "pooler_output", None)
            if pooled is not None:
                vec = pooled.squeeze(0).cpu().numpy()
            else:
                # Remote-code models such as DNABERT-2 return a plain tuple
                # whose first item is the hidden states, so attribute access
                # on it used to raise for every call.
                if isinstance(outputs, (tuple, list)):
                    last_hidden = outputs[0]
                else:
                    last_hidden = outputs.last_hidden_state
                vec = last_hidden.mean(dim=1).squeeze(0).cpu().numpy()

        # Drop the references before the next call loads a model again.
        del model
        del tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return {
            "embedding": vec.tolist()[:10],
            "size": vec.shape[0],
        }

    except Exception as e:
        # Message is capped at 200 characters for display.
        return {"error": f"분석 중 오류 발생: {str(e)[:200]}"}
|
|
|
|
|
def build_context(query: str, docs: List[str], index, model, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
    """Assemble prompt context and a source list for a question.

    File hits come from ``search_index`` (top 4) when an index, a model and
    documents are all present. Web hits come from ``brave_search`` when
    ``use_web`` is set.

    Args:
        query: The user's question.
        docs: Text chunks addressed by the index.
        index: Vector index, or a falsy value to skip file search.
        model: Encoder paired with the index, or a falsy value.
        use_web: Whether to add web search results.
        web_k: Number of web results requested.

    Returns:
        ``(context, sources)``: the pieces joined with ``---`` separators and
        cut to 4000 characters, and one dict per piece describing its origin.
    """
    pieces, sources = [], []

    if index and model and docs:
        for hit in search_index(query, index, model, docs, k=4):
            body = hit["text"]
            pieces.append(f"[FILE] {body[:500]}")
            sources.append({"type": "file", "text": body[:100]})

    if use_web:
        for item in brave_search(query, count=web_k):
            pieces.append(f"[WEB] {item['title']}\n{item['snippet']}")
            sources.append({"type": "web", "title": item["title"], "url": item["url"]})

    joined = "\n\n---\n\n".join(pieces)
    return joined[:4000], sources
|
|
|
|
|
def answer_question(query: str, context: str) -> str:
    """Ask the LLM to answer a question using the supplied context.

    Args:
        query: The user's question.
        context: Retrieved text placed ahead of the question in the prompt.

    Returns:
        Whatever ``call_llm`` returns for the two-message conversation
        (temperature 0.4, up to 4000 tokens).
    """
    rules = [
        "1. Comprehensive yet easy to understand",
        "2. Well-structured with clear sections",
        "3. Include relevant examples and analogies",
        "4. Provide actionable insights when appropriate",
        "5. Use Korean if the user writes in Korean, otherwise English",
        "6. Never provide medical diagnosis or treatment advice",
        "7. Format your response with headers, bullet points, and clear paragraphs",
        "8. Aim for 300-500 words minimum for complex questions",
    ]
    system = (
        "You are an expert bioinformatics assistant who explains complex biological concepts in an accessible way. "
        "Your responses should be:\n"
    ) + "\n".join(rules)

    user_msg = (
        f"Context information:\n{context}\n\n\n"
        f"User Question: {query}\n"
        "\n"
        "Please provide a detailed, well-structured response that:\n"
        "- Directly answers the question\n"
        "- Explains the biological background\n"
        "- Includes practical implications when relevant\n"
        "- Uses simple analogies to explain complex concepts\n"
        "- Cites the context when appropriate"
    )

    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user_msg},
    ]
    return call_llm(conversation, temperature=0.4, max_tokens=4000)
|
|
|
|
|
|
|
|
|
|
|
# Page chrome: browser title/icon, main heading, and the disclaimer caption.
# The icon literal had been mis-decoded; it is restored to the DNA emoji.
st.set_page_config(page_title=APP_TITLE, page_icon="🧬", layout="wide")
st.title(APP_TITLE)
st.caption(DISCLAIMER)
|
|
|
|
|
|
|
|
# Seed the per-session slots used by the upload section and the chat tab:
# the chunked documents, the vector index, and the encoder paired with it.
for _slot, _initial in (("docs", []), ("index", None), ("model", None)):
    if _slot not in st.session_state:
        st.session_state[_slot] = _initial
|
|
|
|
|
|
|
|
with st.sidebar:
    st.header("Configuration")

    # API keys: pre-filled from get_secret() and masked in the UI.
    fw_key = st.text_input("FIREWORKS_API_KEY", value=get_secret("FIREWORKS_API_KEY", ""), type="password")
    brave_key = st.text_input("BRAVE_API_KEY", value=get_secret("BRAVE_API_KEY", ""), type="password")

    # Export non-empty entries so get_secret()'s environment fallback sees them.
    for _env_name, _entered in (("FIREWORKS_API_KEY", fw_key), ("BRAVE_API_KEY", brave_key)):
        if _entered:
            os.environ[_env_name] = _entered

    st.divider()

    # Model identifiers handed to esm2_embed() / dna_embed() by the tabs.
    esm_model = st.text_input("ESM-2 Model", value="facebook/esm2_t6_8M_UR50D")
    dna_model = st.text_input(
        "DNA Model",
        value="bert-base-uncased",
        help="Options: bert-base-uncased (stable), zhihan1996/DNABERT-2-117M (specialized but may require more memory)",
    )

    # Web-search options consumed by build_context() in the chat tab.
    use_web = st.checkbox("Enable web search", value=True)
    web_results = st.slider("Web results", 1, 10, 3)
|
|
|
|
|
|
|
|
tab1, tab2, tab3, tab4 = st.tabs(["Chat", "Protein", "DNA", "About"])

# File upload: chunk the uploaded files and index them for the chat tab.
with st.expander("📁 Upload Files", expanded=True):
    files = st.file_uploader(
        "Upload text/FASTA files",
        type=["txt", "fa", "fasta", "csv", "json"],
        accept_multiple_files=True,
    )

    if files:
        # Streamlit re-runs this script on every interaction. Only re-parse
        # and re-embed when the set of uploaded files actually changes.
        upload_signature = tuple((f.name, f.size) for f in files)

        if st.session_state.get("upload_signature") != upload_signature:
            docs = []
            for f in files:
                try:
                    text = load_file_text(f)
                    if text:
                        docs.extend(chunk_text(text))
                except Exception as e:
                    st.error(f"Error reading {f.name}: {e}")

            if docs:
                index, model = None, None
                if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
                    with st.spinner("Building index..."):
                        index, model = build_index(docs)

                # Store docs and index together, even when the build failed
                # (None), so an index from an earlier upload is never paired
                # with the new chunks.
                st.session_state.docs = docs
                st.session_state.index = index
                st.session_state.model = model
                st.session_state.upload_signature = upload_signature

        if st.session_state.get("upload_signature") == upload_signature:
            st.success(f"Loaded {len(st.session_state.docs)} chunks")
|
|
|
|
|
|
|
|
# Chat tab: retrieval-augmented question answering.
with tab1:
    st.subheader("💬 Chat Assistant")

    question = st.text_area(
        "Ask about proteins, DNA, or bioinformatics:",
        value="What is the role of ESM-2 embeddings in protein analysis?",
        height=100,
    )

    if st.button("Get Answer", type="primary"):
        if not get_secret("FIREWORKS_API_KEY"):
            st.error("Please set FIREWORKS_API_KEY")
        elif not question.strip():
            # Avoid spending an LLM call on an empty prompt.
            st.warning("Please enter a question")
        else:
            with st.spinner("Thinking..."):
                context, sources = build_context(
                    question,
                    st.session_state.docs,
                    st.session_state.index,
                    st.session_state.model,
                    use_web,
                    web_results,
                )
                answer = answer_question(question, context)

            st.markdown("### Answer")
            st.write(answer)

            if sources:
                st.markdown("### Sources")
                for s in sources:
                    if s["type"] == "web":
                        if s["url"]:
                            st.write(f"- 🌐 [{s['title']}]({s['url']})")
                        else:
                            # Placeholder entries from brave_search have no
                            # URL; show the title without a broken link.
                            st.write(f"- 🌐 {s['title']}")
                    elif s["type"] == "file":
                        st.write(f"- 📄 File: {s['text'][:80]}...")
|
|
|
|
|
|
|
|
with tab2: |
|
|
st.subheader("๐งฌ Protein Analysis") |
|
|
|
|
|
st.info(""" |
|
|
**๋จ๋ฐฑ์ง ์์ด ๋ถ์์ด๋?** |
|
|
- ๋จ๋ฐฑ์ง์ ์๋ฏธ๋
ธ์ฐ ์์ด์ AI๊ฐ ๋ถ์ํ์ฌ ๊ธฐ๋ฅ๊ณผ ๊ตฌ์กฐ๋ฅผ ์์ธกํฉ๋๋ค |
|
|
- ESM-2๋ Meta๊ฐ ๊ฐ๋ฐํ AI๋ก, 6์ต 5์ฒ๋ง๊ฐ ๋จ๋ฐฑ์ง์ ํ์ตํ์ต๋๋ค |
|
|
- ์ฉ๋: ์ ์ฝ ๊ฐ๋ฐ, ์ง๋ณ ์ฐ๊ตฌ, ์งํ ๋ถ์ ๋ฑ |
|
|
""") |
|
|
|
|
|
protein_seq = st.text_area( |
|
|
"๋จ๋ฐฑ์ง ์์ด ์
๋ ฅ (๋ณต์ฌ-๋ถ์ฌ๋ฃ๊ธฐ ๊ฐ๋ฅ):", |
|
|
value="MKTIIALSYIFCLVFA", |
|
|
help="๋จ๋ฐฑ์ง ์์ด์ 20๊ฐ ์๋ฏธ๋
ธ์ฐ ๋ฌธ์(A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y)๋ก ๊ตฌ์ฑ๋ฉ๋๋ค", |
|
|
height=100 |
|
|
) |
|
|
|
|
|
st.markdown("**์์ ์์ด (ํด๋ฆญํด์ ๋ณต์ฌ):**") |
|
|
col1, col2, col3 = st.columns(3) |
|
|
with col1: |
|
|
if st.button("์ธ์๋ฆฐ", key="ins"): |
|
|
st.code("FVNQHLCGSHLVEALYLVCGERGFFYTPKT", language=None) |
|
|
with col2: |
|
|
if st.button("์๋ํ", key="end"): |
|
|
st.code("YGGFMTSEKSQTPLVTLFKNAIIKNAYKKGE", language=None) |
|
|
with col3: |
|
|
if st.button("์ฅ์ํ ์ ", key="oxy"): |
|
|
st.code("CYIQNCPLG", language=None) |
|
|
|
|
|
if st.button("๐ฌ ๋จ๋ฐฑ์ง ๋ถ์ ์์", type="primary"): |
|
|
seq = protein_seq.strip().upper() |
|
|
|
|
|
|
|
|
st.markdown("### ๐ ๊ธฐ๋ณธ ๋ถ์ ๊ฒฐ๊ณผ") |
|
|
col1, col2 = st.columns(2) |
|
|
|
|
|
with col1: |
|
|
st.metric("์์ด ๊ธธ์ด", f"{len(seq)} ์๋ฏธ๋
ธ์ฐ") |
|
|
st.metric("๋ถ์๋ (์ถ์ )", f"~{len(seq) * 110} Da") |
|
|
|
|
|
with col2: |
|
|
unique_aa = len(set(seq)) |
|
|
st.metric("์ฌ์ฉ๋ ์๋ฏธ๋
ธ์ฐ ์ข
๋ฅ", f"{unique_aa}๊ฐ") |
|
|
hydrophobic = sum(1 for aa in seq if aa in "AVILMFYW") |
|
|
st.metric("์์์ฑ ๋น์จ", f"{hydrophobic/len(seq)*100:.1f}%") |
|
|
|
|
|
|
|
|
if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE: |
|
|
st.markdown("### ๐ค AI ์๋ฒ ๋ฉ ๋ถ์") |
|
|
with st.spinner("AI ๋ชจ๋ธ์ด ๋จ๋ฐฑ์ง์ ๋ถ์์ค... (10-30์ด)"): |
|
|
result = esm2_embed(seq, esm_model) |
|
|
if "error" in result: |
|
|
st.error(result["error"]) |
|
|
else: |
|
|
st.success("โ
AI ๋ถ์ ์๋ฃ!") |
|
|
|
|
|
col1, col2 = st.columns(2) |
|
|
with col1: |
|
|
st.metric("๋ฒกํฐ ์ฐจ์", result['size']) |
|
|
st.caption("์ด ์ซ์๋ค์ ๋จ๋ฐฑ์ง์ ํน์ฑ์ ์์นํํ ๊ฒ์
๋๋ค") |
|
|
|
|
|
with col2: |
|
|
st.markdown("**์๋ฒ ๋ฉ ๋ฒกํฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ:**") |
|
|
st.code(result["embedding"][:5]) |
|
|
|
|
|
st.markdown(""" |
|
|
**๐ฏ ์ด ๋ถ์์ ํ์ฉ:** |
|
|
- ์ ์ฌํ ๊ธฐ๋ฅ์ ๋จ๋ฐฑ์ง ์ฐพ๊ธฐ |
|
|
- ๊ตฌ์กฐ ์์ธก์ ๊ธฐ์ด ๋ฐ์ดํฐ |
|
|
- ๋์ฐ๋ณ์ด ์ํฅ ์์ธก |
|
|
- ์ ์ฝ ํ๊ฒ ๋ฐ๊ตด |
|
|
""") |
|
|
else: |
|
|
st.warning("โ ๏ธ AI ๋ชจ๋ธ ๋ก๋ฉ ์ค... ์ ์ ํ ๋ค์ ์๋ํด์ฃผ์ธ์") |
|
|
|
|
|
|
|
|
with tab3: |
|
|
st.subheader("๐งฌ DNA Analysis") |
|
|
|
|
|
st.info(""" |
|
|
**DNA ์์ด ๋ถ์์ด๋?** |
|
|
- DNA์ ์ผ๊ธฐ์์ด(A,T,G,C)์ AI๊ฐ ๋ถ์ํ์ฌ ๊ธฐ๋ฅ์ ์์ธกํฉ๋๋ค |
|
|
- DNABERT-2๋ ์ธ๊ฐ ๊ฒ๋ ์ ์ฒด๋ฅผ ํ์ตํ AI ๋ชจ๋ธ์
๋๋ค |
|
|
- ์ฉ๋: ์ ์ ์ ๊ธฐ๋ฅ ์์ธก, ์ง๋ณ ๋ณ์ด ๋ฐ๊ฒฌ, ์งํ ์ฐ๊ตฌ ๋ฑ |
|
|
""") |
|
|
|
|
|
dna_seq = st.text_area( |
|
|
"DNA ์์ด ์
๋ ฅ (๋ณต์ฌ-๋ถ์ฌ๋ฃ๊ธฐ ๊ฐ๋ฅ):", |
|
|
value="ATGCGATCGTAGC", |
|
|
help="DNA๋ 4๊ฐ ์ผ๊ธฐ(A: ์๋ฐ๋, T: ํฐ๋ฏผ, G: ๊ตฌ์๋, C: ์ํ ์ )๋ก ๊ตฌ์ฑ๋ฉ๋๋ค", |
|
|
height=100 |
|
|
) |
|
|
|
|
|
st.markdown("**์์ ์์ด (ํด๋ฆญํด์ ๋ณต์ฌ):**") |
|
|
col1, col2, col3 = st.columns(3) |
|
|
with col1: |
|
|
if st.button("TATA ๋ฐ์ค", key="tata"): |
|
|
st.code("GCGCGATATAAAGGCGCGGGCGCGCG", language=None) |
|
|
st.caption("์ ์ ์ ๋ฐํ ์์ ์ ํธ") |
|
|
with col2: |
|
|
if st.button("ํ๋ก๋ชจํฐ", key="prom"): |
|
|
st.code("TTGACAGGCTAGCTCAGTCCTAGGTATAATGCTAGC", language=None) |
|
|
st.caption("์ ์ ์ ์กฐ์ ์์ญ") |
|
|
with col3: |
|
|
if st.button("CRISPR ํ๊ฒ", key="crispr"): |
|
|
st.code("GTCACCTCCAATGACTAGGGTGG", language=None) |
|
|
st.caption("์ ์ ์ ํธ์ง ๋ถ์") |
|
|
|
|
|
if st.button("๐ฌ DNA ๋ถ์ ์์", type="primary"): |
|
|
seq = dna_seq.strip().upper().replace("U", "T") |
|
|
seq = ''.join(c for c in seq if c in 'ATGC') |
|
|
|
|
|
if len(seq) < 3: |
|
|
st.error("์ต์ 3๊ฐ ์ด์์ ์ผ๊ธฐ๋ฅผ ์
๋ ฅํด์ฃผ์ธ์") |
|
|
else: |
|
|
st.markdown("### ๐ ๊ธฐ๋ณธ ๋ถ์ ๊ฒฐ๊ณผ") |
|
|
col1, col2 = st.columns(2) |
|
|
|
|
|
with col1: |
|
|
st.metric("์์ด ๊ธธ์ด", f"{len(seq)} bp") |
|
|
gc = (seq.count("G") + seq.count("C")) / len(seq) * 100 |
|
|
st.metric("GC ํจ๋", f"{gc:.1f}%") |
|
|
if gc > 60: |
|
|
st.caption("๐ด ๋์: ์์ ์ ์ด์ง๋ง ๋ณต์ ์ด๋ ค์") |
|
|
elif gc < 40: |
|
|
st.caption("๐ต ๋ฎ์: ๋ถ์์ ํ์ง๋ง ๋ณต์ ์ฉ์ด") |
|
|
else: |
|
|
st.caption("๐ข ์ ์ : ์ผ๋ฐ์ ์ธ ๋ฒ์") |
|
|
|
|
|
with col2: |
|
|
at = (seq.count("A") + seq.count("T")) / len(seq) * 100 |
|
|
st.metric("AT ํจ๋", f"{at:.1f}%") |
|
|
|
|
|
|
|
|
if len(seq) % 3 == 0: |
|
|
st.metric("๊ฐ๋ฅํ ์ฝ๋ ์", f"{len(seq)//3}๊ฐ") |
|
|
st.caption("๋จ๋ฐฑ์ง๋ก ๋ฒ์ญ ๊ฐ๋ฅ") |
|
|
|
|
|
|
|
|
st.markdown("### ๐ ์ฃผ์ ๋ชจํฐํ ๊ฒ์") |
|
|
motifs_found = [] |
|
|
|
|
|
if "TATAAAA" in seq or "TATAAA" in seq: |
|
|
motifs_found.append("โ
TATA box ๋ฐ๊ฒฌ (์ ์ฌ ์์ ์ ํธ)") |
|
|
if "CAAT" in seq or "CCAAT" in seq: |
|
|
motifs_found.append("โ
CAAT box ๋ฐ๊ฒฌ (์ ์ฌ ์กฐ์ )") |
|
|
if "ATG" in seq: |
|
|
motifs_found.append("โ
์์ ์ฝ๋(ATG) ๋ฐ๊ฒฌ") |
|
|
if "TAA" in seq or "TAG" in seq or "TGA" in seq: |
|
|
motifs_found.append("โ
์ ์ง ์ฝ๋ ๋ฐ๊ฒฌ") |
|
|
if seq.count("CG") > len(seq)/20: |
|
|
motifs_found.append("โ
CpG ์ฌ ๊ฐ๋ฅ์ฑ (์ ์ ์ ์กฐ์ )") |
|
|
|
|
|
if motifs_found: |
|
|
for motif in motifs_found: |
|
|
st.write(motif) |
|
|
else: |
|
|
st.write("ํน๋ณํ ๋ชจํฐํ๊ฐ ๋ฐ๊ฒฌ๋์ง ์์์ต๋๋ค") |
|
|
|
|
|
|
|
|
if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE: |
|
|
st.markdown("### ๐ค AI ์๋ฒ ๋ฉ ๋ถ์") |
|
|
with st.spinner("AI ๋ชจ๋ธ์ด DNA๋ฅผ ๋ถ์์ค... (10-30์ด)"): |
|
|
result = dna_embed(seq, dna_model) |
|
|
if "error" in result: |
|
|
st.error(result["error"]) |
|
|
else: |
|
|
st.success("โ
AI ๋ถ์ ์๋ฃ!") |
|
|
|
|
|
col1, col2 = st.columns(2) |
|
|
with col1: |
|
|
st.metric("๋ฒกํฐ ์ฐจ์", result['size']) |
|
|
st.caption("DNA ํน์ฑ์ ์์นํํ ๊ฒฐ๊ณผ์
๋๋ค") |
|
|
|
|
|
with col2: |
|
|
st.markdown("**์๋ฒ ๋ฉ ๋ฒกํฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ:**") |
|
|
st.code(result["embedding"][:5]) |
|
|
|
|
|
st.markdown(""" |
|
|
**๐ฏ ์ด ๋ถ์์ ํ์ฉ:** |
|
|
- ์ ์ ์ ๊ธฐ๋ฅ ์์ธก |
|
|
- ํ๋ก๋ชจํฐ/์ธํธ์ ์ฐพ๊ธฐ |
|
|
- ์งํ์ ๋ณด์กด ์์ญ ๋ฐ๊ฒฌ |
|
|
- ์ง๋ณ ๊ด๋ จ ๋ณ์ด ์์ธก |
|
|
- CRISPR ํ๊ฒ ๋ถ์ ํ๊ฐ |
|
|
""") |
|
|
else: |
|
|
st.warning("โ ๏ธ AI ๋ชจ๋ธ ๋ก๋ฉ ์ค... ์ ์ ํ ๋ค์ ์๋ํด์ฃผ์ธ์") |
|
|
|
|
|
|
|
|
# About tab: feature overview plus a status list of optional dependencies.
with tab4:
    st.subheader("ℹ️ About")
    st.markdown("""
    ### Features
    - 💬 RAG-based chat for bioinformatics questions
    - 🧬 Protein sequence analysis with ESM-2
    - 🧬 DNA sequence analysis with DNABERT-2
    - 🔍 Web search integration via Brave API
    - 📁 File upload and vector search

    ### Models
    - **Proteins:** ESM-2 (Facebook)
    - **DNA:** DNABERT-2 (Zhou et al., Northwestern University)
    - **LLM:** Llama 3.1 70B (via Fireworks)

    ### Disclaimer
    This tool is for research and educational purposes only.
    Not for medical diagnosis or treatment decisions.
    """)

    # System status: one line per optional dependency flag set at import time.
    st.divider()
    st.subheader("System Status")
    deps = {
        "PyTorch": TORCH_AVAILABLE,
        "Transformers": TRANSFORMERS_AVAILABLE,
        "Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
        "FAISS": FAISS_AVAILABLE,
        "BioPython": BIOPYTHON_AVAILABLE,
        "Datasets": DATASETS_AVAILABLE,
    }

    for name, available in deps.items():
        if available:
            st.success(f"✅ {name}")
        else:
            st.warning(f"⚠️ {name} not available")