Spaces:

openfree
/

BIOseq

Sleeping

App Files Files Community

BIOseq / app.py

openfree

Update app.py

4fd4c0d verified 4 months ago

raw

history blame

25.2 kB

	import os
	import json
	from typing import List, Dict, Tuple

	import streamlit as st
	import requests

	# Optional dependency guards: every heavy package is imported on a
	# best-effort basis, and a module-level *_AVAILABLE flag records whether the
	# import succeeded so later code can check the flag before using the package.
	try:
	    import torch
	except ImportError:
	    TORCH_AVAILABLE = False
	    print("[WARNING] torch not available")
	else:
	    TORCH_AVAILABLE = True

	try:
	    from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
	except ImportError:
	    TRANSFORMERS_AVAILABLE = False
	    print("[WARNING] transformers not available")
	else:
	    TRANSFORMERS_AVAILABLE = True

	try:
	    from datasets import load_dataset
	except ImportError:
	    DATASETS_AVAILABLE = False
	    print("[WARNING] datasets not available")
	else:
	    DATASETS_AVAILABLE = True

	try:
	    from sentence_transformers import SentenceTransformer
	except ImportError:
	    SENTENCE_TRANSFORMERS_AVAILABLE = False
	    print("[WARNING] sentence_transformers not available")
	else:
	    SENTENCE_TRANSFORMERS_AVAILABLE = True

	try:
	    import faiss
	except ImportError:
	    FAISS_AVAILABLE = False
	    print("[WARNING] faiss not available")
	else:
	    FAISS_AVAILABLE = True

	try:
	    from Bio import SeqIO
	except ImportError:
	    BIOPYTHON_AVAILABLE = False
	    print("[WARNING] biopython not available")
	else:
	    BIOPYTHON_AVAILABLE = True

	# Constants: the page title and the disclaimer caption shown beneath it.
	APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
	DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."

	# --------------- Helper Functions ---------------

	def get_secret(name: str, fallback: str = "") -> str:
	    """Look up a secret, preferring Streamlit secrets over the environment.

	    Args:
	        name: Key to look up in ``st.secrets`` and then in ``os.environ``.
	        fallback: Value returned when the key is found in neither place.

	    Returns:
	        The value from ``st.secrets`` when the key is present there, otherwise
	        the environment variable of the same name, otherwise ``fallback``.
	    """
	    try:
	        # Streamlit secrets take precedence over the environment.
	        if hasattr(st, "secrets") and name in st.secrets:
	            return st.secrets[name]
	    except Exception:
	        # Best-effort lookup: any failure while consulting st.secrets is
	        # treated as "not found". Only Exception is caught so that
	        # KeyboardInterrupt/SystemExit still propagate.
	        pass
	    # Environment variable
	    return os.environ.get(name, fallback)

	def brave_search(query: str, count: int = 5) -> List[Dict]:
	    """Run a Brave web search and return simplified result dictionaries.

	    Every returned item has the keys ``title``, ``url`` and ``snippet``.
	    Problems are reported in the same shape rather than raised: a missing
	    API key, an empty result set and any request/parsing error each yield a
	    single placeholder item whose ``url`` is empty.

	    Args:
	        query: Search text sent as the ``q`` parameter.
	        count: Number of results requested and the cap on returned items.
	    """
	    api_key = get_secret("BRAVE_API_KEY", "")
	    if not api_key:
	        missing_key = {
	            "title": "BRAVE_API_KEY missing",
	            "url": "",
	            "snippet": "Set BRAVE_API_KEY in Space secrets or sidebar",
	        }
	        return [missing_key]

	    endpoint = "https://api.search.brave.com/res/v1/web/search"
	    request_headers = {"Accept": "application/json", "X-Subscription-Token": api_key}
	    request_params = {"q": query, "count": count}

	    try:
	        response = requests.get(
	            endpoint, headers=request_headers, params=request_params, timeout=15
	        )
	        response.raise_for_status()
	        payload = response.json()
	        hits = [
	            {
	                "title": entry.get("title", ""),
	                "url": entry.get("url", ""),
	                "snippet": entry.get("description", ""),
	            }
	            for entry in payload.get("web", {}).get("results", [])[:count]
	        ]
	    except Exception as e:
	        return [{"title": "Error", "url": "", "snippet": str(e)}]

	    if hits:
	        return hits
	    return [{"title": "No results", "url": "", "snippet": ""}]

	def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 4000) -> str:
	    """Send a chat-completion request to the Fireworks AI API.

	    Args:
	        messages: Chat messages placed in the request's ``messages`` field.
	        temperature: Sampling temperature forwarded to the API.
	        max_tokens: Completion token limit forwarded to the API.

	    Returns:
	        The content of the first choice's message. When the API key is
	        missing, or the request or response parsing fails, a human-readable
	        message is returned instead of raising.
	    """
	    token = get_secret("FIREWORKS_API_KEY", "")
	    if not token:
	        return "FIREWORKS_API_KEY missing. Set it in Secrets or sidebar."

	    endpoint = "https://api.fireworks.ai/inference/v1/chat/completions"
	    request_headers = {
	        "Content-Type": "application/json",
	        "Authorization": f"Bearer {token}",
	    }
	    body = {
	        "model": "accounts/fireworks/models/llama-v3p1-70b-instruct",
	        "messages": messages,
	        "max_tokens": max_tokens,
	        "temperature": temperature,
	        "top_p": 1,
	        "frequency_penalty": 0,
	        "presence_penalty": 0,
	    }

	    try:
	        response = requests.post(endpoint, headers=request_headers, json=body, timeout=60)
	        response.raise_for_status()
	        first_choice = response.json()["choices"][0]
	        return first_choice["message"]["content"]
	    except Exception as e:
	        return f"[LLM Error] {e}"

	def load_file_text(upload) -> str:
	    """Read an uploaded file and return its text.

	    The raw bytes are decoded as UTF-8, silently dropping undecodable bytes.
	    When the file name has a FASTA extension and Biopython is available, the
	    decoded text is parsed as FASTA and re-emitted as ``>id`` / sequence
	    records separated by blank lines. If parsing fails or yields no records,
	    the decoded text is returned unchanged.

	    Args:
	        upload: File-like object exposing ``name`` and ``read()``.
	            ``read()`` is expected to return bytes.

	    Returns:
	        The extracted text, or ``""`` when the content cannot be read or
	        decoded.
	    """
	    import io  # local import keeps this block self-contained

	    filename = upload.name.lower()

	    try:
	        raw = upload.read()
	        text = raw.decode("utf-8", errors="ignore")
	    except Exception:
	        return ""

	    # FASTA handling
	    if filename.endswith((".fa", ".fasta", ".faa", ".fna")) and BIOPYTHON_AVAILABLE:
	        try:
	            # Parse from the decoded text: Biopython's FASTA parser needs a
	            # text-mode handle, and the upload object yields bytes.
	            records = list(SeqIO.parse(io.StringIO(text), "fasta"))
	        except Exception:
	            # Best effort: fall back to the raw text below.
	            records = []
	        if records:
	            return "\n\n".join(f">{r.id}\n{str(r.seq)}" for r in records)

	    return text

	def chunk_text(text: str, size: int = 1200, overlap: int = 200) -> List[str]:
	    """Split text into chunks of at most ``size`` characters.

	    Each chunk starts ``size - overlap`` characters after the previous one,
	    so consecutive chunks share ``overlap`` characters. Chunking stops with
	    the first chunk that reaches the end of the text.

	    Args:
	        text: Text to split. An empty string yields an empty list.
	        size: Maximum chunk length; must be positive.
	        overlap: Characters shared between consecutive chunks; must be
	            smaller than ``size``.

	    Returns:
	        The chunks in document order.

	    Raises:
	        ValueError: If ``size`` is not positive or ``overlap >= size``.
	            Without this check the start position would never advance.
	    """
	    if size <= 0:
	        raise ValueError("size must be a positive integer")
	    if overlap >= size:
	        raise ValueError("overlap must be smaller than size")

	    stride = size - overlap
	    total = len(text)
	    chunks = []
	    for start in range(0, total, stride):
	        stop = start + size
	        chunks.append(text[start:stop])
	        if stop >= total:
	            # This chunk already covers the tail; a further step would only
	            # emit a redundant fragment.
	            break
	    return chunks

	def build_index(texts: List[str]):
	    """Embed the texts and load them into a FAISS ``IndexFlatIP``.

	    Args:
	        texts: Text chunks to embed with the
	            ``sentence-transformers/all-MiniLM-L6-v2`` model.

	    Returns:
	        ``(index, model)`` on success. ``(None, None)`` when
	        sentence-transformers or FAISS is unavailable, or when building
	        fails (a Streamlit warning is shown in that case).
	    """
	    if not (SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE):
	        return None, None

	    try:
	        encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
	        vectors = encoder.encode(texts, show_progress_bar=False)

	        # The index dimensionality is taken from the embedding width.
	        vector_index = faiss.IndexFlatIP(vectors.shape[1])
	        vector_index.add(vectors.astype("float32"))
	    except Exception as e:
	        st.warning(f"Index build failed: {e}")
	        return None, None

	    return vector_index, encoder

	def search_index(query: str, index, model, texts: List[str], k: int = 4) -> List[Dict]:
	    """Return the stored texts that best match the query.

	    Args:
	        query: Text to embed with ``model.encode``.
	        index: Object exposing ``search(vectors, k)`` that returns
	            ``(scores, ids)``; ``None`` disables the search.
	        model: Embedding model exposing ``encode``; ``None`` disables the
	            search.
	        texts: Texts addressed by the ids the index returns.
	        k: Number of neighbours requested from the index.

	    Returns:
	        A list of ``{"score": float, "text": str}`` in the order the index
	        returned them. Ids outside ``texts`` are skipped. An empty list is
	        returned when the index or model is missing or the search fails.
	    """
	    if index is None or model is None:
	        return []

	    try:
	        query_vec = model.encode([query])
	        scores, ids = index.search(query_vec.astype("float32"), k)
	        return [
	            {"score": float(score), "text": texts[idx]}
	            for idx, score in zip(ids[0], scores[0])
	            # Skip ids that do not address an entry of texts.
	            if 0 <= idx < len(texts)
	        ]
	    except Exception:
	        # Best effort: a failed search yields no hits. Only Exception is
	        # caught so KeyboardInterrupt/SystemExit still propagate.
	        return []

	def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
	    """Embed a protein sequence with a masked-LM checkpoint.

	    The tokenizer and model are loaded on each call. The embedding is the
	    mean over the token axis of the final hidden-state layer.

	    Args:
	        seq: Sequence text; tokenised with truncation at 1024 tokens.
	        model_name: Checkpoint name passed to ``from_pretrained``.

	    Returns:
	        ``{"embedding": first 10 values, "size": vector length}`` on
	        success, or ``{"error": message}`` when PyTorch/Transformers are
	        unavailable or anything fails.
	    """
	    if not (TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE):
	        return {"error": "PyTorch/Transformers not available"}

	    try:
	        tok = AutoTokenizer.from_pretrained(model_name)
	        net = AutoModelForMaskedLM.from_pretrained(model_name)
	        net.eval()

	        with torch.no_grad():
	            encoded = tok(seq, return_tensors="pt", truncation=True, max_length=1024)
	            result = net(**encoded, output_hidden_states=True)
	            last_layer = result.hidden_states[-1]
	            vec = last_layer.mean(dim=1).squeeze(0).cpu().numpy()

	        # Free memory
	        del net
	        del tok
	        if torch.cuda.is_available():
	            torch.cuda.empty_cache()

	        # Only the first 10 values are returned, as a preview.
	        preview = vec.tolist()[:10]
	        return {"embedding": preview, "size": vec.shape[0]}
	    except Exception as e:
	        return {"error": str(e)}

	def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
	    """Embed a DNA sequence with a Hugging Face model.

	    The requested model is loaded with ``trust_remote_code=True``; if that
	    fails, ``bert-base-uncased`` is loaded instead and a Streamlit warning
	    is shown. Sequences longer than 6 characters are converted to
	    space-separated overlapping 6-mers before tokenisation (truncated to
	    512 tokens).

	    Pooling: a ``pooler_output`` is used when the model output provides
	    one; otherwise the last hidden state is mean-pooled over the token
	    axis. Models that return a plain tuple are mean-pooled over their
	    first element.

	    Args:
	        seq: DNA sequence text.
	        model_name: Hub repository id or path passed to ``from_pretrained``.

	    Returns:
	        ``{"embedding": first 10 values, "size": vector length}`` on
	        success, or ``{"error": message}`` on any failure.
	    """
	    if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
	        return {"error": "PyTorch/Transformers not available"}

	    def seq_to_kmer(seq, k=6):
	        """Return the overlapping k-mers of seq joined by single spaces."""
	        return " ".join(seq[i:i + k] for i in range(len(seq) - k + 1))

	    try:
	        # einops check
	        try:
	            import einops  # noqa: F401
	        except ImportError:
	            return {"error": "einops package required. Please wait for installation and refresh the page."}

	        # Try the requested model first; fall back to plain BERT if it
	        # cannot be loaded.
	        try:
	            from transformers import AutoTokenizer, AutoModel
	            # NOTE(security): trust_remote_code=True lets the named Hub
	            # repository run its own Python code while loading. model_name
	            # is caller-supplied; confirm it is limited to trusted repos.
	            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	            model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
	        except Exception as model_error:
	            # Fallback model
	            try:
	                from transformers import BertTokenizer, BertModel
	                fallback_model = "bert-base-uncased"
	                tokenizer = BertTokenizer.from_pretrained(fallback_model)
	                model = BertModel.from_pretrained(fallback_model)
	                st.warning(f"DNABERT-2 로딩 실패. 대체 모델 사용중: {fallback_model}")
	            except Exception:
	                return {"error": f"모델 로딩 실패: {str(model_error)}"}

	        model.eval()

	        # Convert to k-mers, or use the sequence directly when it is short.
	        input_seq = seq_to_kmer(seq, k=6) if len(seq) > 6 else seq

	        with torch.no_grad():
	            inputs = tokenizer(
	                input_seq,
	                return_tensors="pt",
	                truncation=True,
	                max_length=512,
	                padding=True
	            )
	            outputs = model(**inputs)

	            if isinstance(outputs, (tuple, list)):
	                # Remote-code models (DNABERT-2 among them, per its model
	                # card) return a plain tuple whose first element is the
	                # hidden states, so attribute access would fail here.
	                vec = outputs[0].mean(dim=1).squeeze(0).cpu().numpy()
	            elif getattr(outputs, "pooler_output", None) is not None:
	                vec = outputs.pooler_output.squeeze(0).cpu().numpy()
	            else:
	                hidden = outputs.last_hidden_state.mean(dim=1).squeeze(0)
	                vec = hidden.cpu().numpy()

	        # Free memory
	        del model
	        del tokenizer
	        if torch.cuda.is_available():
	            torch.cuda.empty_cache()

	        return {
	            "embedding": vec.tolist()[:10],  # preview: first 10 values only
	            "size": vec.shape[0]
	        }

	    except Exception as e:
	        return {"error": f"분석 중 오류 발생: {str(e)[:200]}"}

	def build_context(query: str, docs: List[str], index, model, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
	    """Assemble LLM context from uploaded-file hits and web search results.

	    Args:
	        query: The user's question.
	        docs: Text chunks that ``index`` was built from.
	        index: Vector index, or ``None`` when no index exists.
	        model: Embedding model paired with ``index``, or ``None``.
	        use_web: Whether to add Brave web search results.
	        web_k: Number of web results to request.

	    Returns:
	        ``(context, sources)``. ``context`` joins up to 500 characters of
	        each file hit and the title/snippet of each web result with
	        ``---`` separators, truncated to 4000 characters. ``sources`` has
	        one dict per piece, of type ``"file"`` or ``"web"``.
	    """
	    pieces = []
	    sources = []

	    # File search. Compare against None explicitly, as search_index does:
	    # the truthiness of an index or model object is not a reliable signal
	    # that it exists.
	    if index is not None and model is not None and docs:
	        for hit in search_index(query, index, model, docs, k=4):
	            pieces.append(f"[FILE] {hit['text'][:500]}")
	            sources.append({"type": "file", "text": hit['text'][:100]})

	    # Web search
	    if use_web:
	        for r in brave_search(query, count=web_k):
	            pieces.append(f"[WEB] {r['title']}\n{r['snippet']}")
	            sources.append({"type": "web", "title": r['title'], "url": r['url']})

	    context = "\n\n---\n\n".join(pieces)[:4000]
	    return context, sources

	def answer_question(query: str, context: str) -> str:
	    """Ask the LLM to answer the query using the supplied context.

	    Builds a system prompt describing the assistant's style rules and a user
	    prompt embedding the context and question, then calls ``call_llm`` with
	    temperature 0.4 and a 4000-token limit.

	    Args:
	        query: The user's question.
	        context: Context text placed ahead of the question.

	    Returns:
	        Whatever ``call_llm`` returns for the two-message conversation.
	    """
	    intro = "You are an expert bioinformatics assistant who explains complex biological concepts in an accessible way. "
	    rules = [
	        "1. Comprehensive yet easy to understand",
	        "2. Well-structured with clear sections",
	        "3. Include relevant examples and analogies",
	        "4. Provide actionable insights when appropriate",
	        "5. Use Korean if the user writes in Korean, otherwise English",
	        "6. Never provide medical diagnosis or treatment advice",
	        "7. Format your response with headers, bullet points, and clear paragraphs",
	        "8. Aim for 300-500 words minimum for complex questions",
	    ]
	    system_prompt = intro + "Your responses should be:\n" + "\n".join(rules)

	    user_prompt = f"""Context information:\n{context}\n\n
	User Question: {query}

	Please provide a detailed, well-structured response that:
	- Directly answers the question
	- Explains the biological background
	- Includes practical implications when relevant
	- Uses simple analogies to explain complex concepts
	- Cites the context when appropriate"""

	    conversation = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": user_prompt},
	    ]
	    return call_llm(conversation, temperature=0.4, max_tokens=4000)

	# --------------- Streamlit UI ---------------

	st.set_page_config(page_title=APP_TITLE, page_icon="🧬", layout="wide")
	st.title(APP_TITLE)
	st.caption(DISCLAIMER)

	# Seed the per-session state on first run: uploaded text chunks, the vector
	# index built from them, and the embedding model paired with that index.
	for _state_key, _state_default in (("docs", []), ("index", None), ("model", None)):
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _state_default

	# Sidebar
	with st.sidebar:
	    st.header("Configuration")

	    # SECURITY: do not pre-fill these inputs with get_secret(...). A widget's
	    # value is sent to the browser, so pre-filling would hand the server-side
	    # key to every visitor even though the field is masked.
	    fw_key = st.text_input(
	        "FIREWORKS_API_KEY",
	        value="",
	        type="password",
	        help="Leave blank to use the key configured on the server, if any."
	    )
	    brave_key = st.text_input(
	        "BRAVE_API_KEY",
	        value="",
	        type="password",
	        help="Leave blank to use the key configured on the server, if any."
	    )

	    # A key typed here is exported through the environment so get_secret()
	    # can find it.
	    # NOTE(review): os.environ is process-wide, so a key entered by one
	    # session is presumably visible to all sessions of this process.
	    if fw_key:
	        os.environ["FIREWORKS_API_KEY"] = fw_key
	    if brave_key:
	        os.environ["BRAVE_API_KEY"] = brave_key

	    st.divider()

	    esm_model = st.text_input(
	        "ESM-2 Model",
	        value="facebook/esm2_t6_8M_UR50D"
	    )
	    dna_model = st.text_input(
	        "DNA Model",
	        value="bert-base-uncased",  # more stable default model
	        help="Options: bert-base-uncased (stable), zhihan1996/DNABERT-2-117M (specialized but may require more memory)"
	    )

	    use_web = st.checkbox("Enable web search", value=True)
	    web_results = st.slider("Web results", 1, 10, 3)

	# Tabs
	tab1, tab2, tab3, tab4 = st.tabs(["Chat", "Protein", "DNA", "About"])

	# File upload
	with st.expander("📁 Upload Files", expanded=True):
	    files = st.file_uploader(
	        "Upload text/FASTA files",
	        # .faa/.fna are accepted because load_file_text treats them as FASTA.
	        type=["txt", "fa", "fasta", "faa", "fna", "csv", "json"],
	        accept_multiple_files=True
	    )

	    if files:
	        # Streamlit re-runs this script on every interaction. Fingerprint the
	        # uploaded set so files are re-read and re-embedded only when the
	        # selection actually changes.
	        signature = tuple((f.name, getattr(f, "size", None)) for f in files)

	        if st.session_state.get("docs_signature") != signature:
	            docs = []
	            for f in files:
	                try:
	                    text = load_file_text(f)
	                    if text:
	                        docs.extend(chunk_text(text))
	                except Exception as e:
	                    st.error(f"Error reading {f.name}: {e}")

	            if docs:
	                st.session_state.docs = docs
	                st.session_state.docs_signature = signature
	                # Drop any index built for a previous upload so a failed
	                # rebuild cannot leave an index that mismatches the new docs.
	                st.session_state.index = None
	                st.session_state.model = None

	                if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
	                    with st.spinner("Building index..."):
	                        index, model = build_index(docs)
	                    if index is not None:
	                        st.session_state.index = index
	                        st.session_state.model = model

	        if st.session_state.get("docs_signature") == signature:
	            st.success(f"Loaded {len(st.session_state.docs)} chunks")

	# Chat tab
	with tab1:
	    st.subheader("💬 Chat Assistant")

	    question = st.text_area(
	        "Ask about proteins, DNA, or bioinformatics:",
	        value="What is the role of ESM-2 embeddings in protein analysis?",
	        height=100
	    )

	    if st.button("Get Answer", type="primary"):
	        query = question.strip()
	        if not query:
	            # Do not spend a search and an LLM call on a blank question.
	            st.warning("Please enter a question")
	        elif not get_secret("FIREWORKS_API_KEY"):
	            st.error("Please set FIREWORKS_API_KEY")
	        else:
	            with st.spinner("Thinking..."):
	                context, sources = build_context(
	                    query,
	                    st.session_state.docs,
	                    st.session_state.index,
	                    st.session_state.model,
	                    use_web,
	                    web_results
	                )

	                answer = answer_question(query, context)

	            st.markdown("### Answer")
	            st.write(answer)

	            if sources:
	                st.markdown("### Sources")
	                for s in sources:
	                    if s["type"] == "web":
	                        if s["url"]:
	                            st.write(f"- 🌐 [{s['title']}]({s['url']})")
	                        else:
	                            # Placeholder entries (missing key, error, no
	                            # results) carry an empty URL; show the title
	                            # without a broken link.
	                            st.write(f"- 🌐 {s['title']}")
	                    elif s["type"] == "file":
	                        st.write(f"- 📄 File: {s['text'][:80]}...")

	# Protein tab
	with tab2:
	    st.subheader("🧬 Protein Analysis")

	    st.info("""
	단백질 서열 분석이란?
	- 단백질의 아미노산 서열을 AI가 분석하여 기능과 구조를 예측합니다
	- ESM-2는 Meta가 개발한 AI로, 6억 5천만개 단백질을 학습했습니다
	- 용도: 신약 개발, 질병 연구, 진화 분석 등
	""")

	    protein_seq = st.text_area(
	        "단백질 서열 입력 (복사-붙여넣기 가능):",
	        value="MKTIIALSYIFCLVFA",
	        help="단백질 서열은 20개 아미노산 문자(A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y)로 구성됩니다",
	        height=100
	    )

	    st.markdown("예제 서열 (클릭해서 복사):")
	    col1, col2, col3 = st.columns(3)
	    with col1:
	        if st.button("인슐린", key="ins"):
	            st.code("FVNQHLCGSHLVEALYLVCGERGFFYTPKT", language=None)
	    with col2:
	        if st.button("엔돌핀", key="end"):
	            st.code("YGGFMTSEKSQTPLVTLFKNAIIKNAYKKGE", language=None)
	    with col3:
	        if st.button("옥시토신", key="oxy"):
	            st.code("CYIQNCPLG", language=None)

	    if st.button("🔬 단백질 분석 시작", type="primary"):
	        # Remove every whitespace character, not only leading/trailing ones:
	        # pasted sequences are often wrapped over several lines, and line
	        # breaks must not be counted as residues.
	        seq = "".join(protein_seq.split()).upper()

	        if not seq:
	            # Guard the percentage below against division by zero.
	            st.error("단백질 서열을 입력해주세요")
	        else:
	            # Basic stats
	            st.markdown("### 📊 기본 분석 결과")
	            col1, col2 = st.columns(2)

	            with col1:
	                st.metric("서열 길이", f"{len(seq)} 아미노산")
	                # Rough estimate: 110 Da per residue.
	                st.metric("분자량 (추정)", f"~{len(seq) * 110} Da")

	            with col2:
	                unique_aa = len(set(seq))
	                st.metric("사용된 아미노산 종류", f"{unique_aa}개")
	                hydrophobic = sum(1 for aa in seq if aa in "AVILMFYW")
	                st.metric("소수성 비율", f"{hydrophobic/len(seq)*100:.1f}%")

	            # AI Analysis
	            if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
	                st.markdown("### 🤖 AI 임베딩 분석")
	                with st.spinner("AI 모델이 단백질을 분석중... (10-30초)"):
	                    result = esm2_embed(seq, esm_model)
	                if "error" in result:
	                    st.error(result["error"])
	                else:
	                    st.success("✅ AI 분석 완료!")

	                    col1, col2 = st.columns(2)
	                    with col1:
	                        st.metric("벡터 차원", result['size'])
	                        st.caption("이 숫자들은 단백질의 특성을 수치화한 것입니다")

	                    with col2:
	                        st.markdown("임베딩 벡터 미리보기:")
	                        st.code(result["embedding"][:5])

	                    st.markdown("""
	🎯 이 분석의 활용:
	- 유사한 기능의 단백질 찾기
	- 구조 예측의 기초 데이터
	- 돌연변이 영향 예측
	- 신약 타겟 발굴
	""")
	            else:
	                st.warning("⚠️ AI 모델 로딩 중... 잠시 후 다시 시도해주세요")

	# DNA tab
	with tab3:
	    st.subheader("🧬 DNA Analysis")

	    st.info("""
	DNA 서열 분석이란?
	- DNA의 염기서열(A,T,G,C)을 AI가 분석하여 기능을 예측합니다
	- DNABERT-2는 인간 게놈 전체를 학습한 AI 모델입니다
	- 용도: 유전자 기능 예측, 질병 변이 발견, 진화 연구 등
	""")

	    dna_seq = st.text_area(
	        "DNA 서열 입력 (복사-붙여넣기 가능):",
	        value="ATGCGATCGTAGC",
	        help="DNA는 4개 염기(A: 아데닌, T: 티민, G: 구아닌, C: 시토신)로 구성됩니다",
	        height=100
	    )

	    st.markdown("예제 서열 (클릭해서 복사):")
	    tata_col, promoter_col, crispr_col = st.columns(3)
	    with tata_col:
	        if st.button("TATA 박스", key="tata"):
	            st.code("GCGCGATATAAAGGCGCGGGCGCGCG", language=None)
	            st.caption("유전자 발현 시작 신호")
	    with promoter_col:
	        if st.button("프로모터", key="prom"):
	            st.code("TTGACAGGCTAGCTCAGTCCTAGGTATAATGCTAGC", language=None)
	            st.caption("유전자 조절 영역")
	    with crispr_col:
	        if st.button("CRISPR 타겟", key="crispr"):
	            st.code("GTCACCTCCAATGACTAGGGTGG", language=None)
	            st.caption("유전자 편집 부위")

	    if st.button("🔬 DNA 분석 시작", type="primary"):
	        # Convert RNA's U to T, then keep only the four DNA bases.
	        normalized = dna_seq.strip().upper().replace("U", "T")
	        seq = ''.join(base for base in normalized if base in 'ATGC')

	        if len(seq) < 3:
	            st.error("최소 3개 이상의 염기를 입력해주세요")
	        else:
	            st.markdown("### 📊 기본 분석 결과")
	            left, right = st.columns(2)

	            with left:
	                st.metric("서열 길이", f"{len(seq)} bp")
	                gc = (seq.count("G") + seq.count("C")) / len(seq) * 100
	                st.metric("GC 함량", f"{gc:.1f}%")
	                if gc > 60:
	                    gc_note = "🔴 높음: 안정적이지만 복제 어려움"
	                elif gc < 40:
	                    gc_note = "🔵 낮음: 불안정하지만 복제 용이"
	                else:
	                    gc_note = "🟢 적정: 일반적인 범위"
	                st.caption(gc_note)

	            with right:
	                at = (seq.count("A") + seq.count("T")) / len(seq) * 100
	                st.metric("AT 함량", f"{at:.1f}%")

	                # Codon analysis (only when the length is a multiple of 3)
	                if len(seq) % 3 == 0:
	                    st.metric("가능한 코돈 수", f"{len(seq)//3}개")
	                    st.caption("단백질로 번역 가능")

	            # Motif search: each entry pairs a presence test with its message.
	            st.markdown("### 🔍 주요 모티프 검색")
	            motif_checks = [
	                (any(m in seq for m in ("TATAAAA", "TATAAA")), "✅ TATA box 발견 (전사 시작 신호)"),
	                (any(m in seq for m in ("CAAT", "CCAAT")), "✅ CAAT box 발견 (전사 조절)"),
	                ("ATG" in seq, "✅ 시작 코돈(ATG) 발견"),
	                (any(m in seq for m in ("TAA", "TAG", "TGA")), "✅ 정지 코돈 발견"),
	                (seq.count("CG") > len(seq)/20, "✅ CpG 섬 가능성 (유전자 조절)"),
	            ]
	            motifs_found = [message for present, message in motif_checks if present]

	            if not motifs_found:
	                st.write("특별한 모티프가 발견되지 않았습니다")
	            else:
	                for message in motifs_found:
	                    st.write(message)

	            # AI Analysis
	            if not (TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE):
	                st.warning("⚠️ AI 모델 로딩 중... 잠시 후 다시 시도해주세요")
	            else:
	                st.markdown("### 🤖 AI 임베딩 분석")
	                with st.spinner("AI 모델이 DNA를 분석중... (10-30초)"):
	                    result = dna_embed(seq, dna_model)
	                if "error" in result:
	                    st.error(result["error"])
	                else:
	                    st.success("✅ AI 분석 완료!")

	                    dim_col, preview_col = st.columns(2)
	                    with dim_col:
	                        st.metric("벡터 차원", result['size'])
	                        st.caption("DNA 특성을 수치화한 결과입니다")

	                    with preview_col:
	                        st.markdown("임베딩 벡터 미리보기:")
	                        st.code(result["embedding"][:5])

	                    st.markdown("""
	🎯 이 분석의 활용:
	- 유전자 기능 예측
	- 프로모터/인핸서 찾기
	- 진화적 보존 영역 발견
	- 질병 관련 변이 예측
	- CRISPR 타겟 부위 평가
	""")

	# About tab
	with tab4:
	    st.subheader("ℹ️ About")
	    st.markdown("""
	### Features
	- 💬 RAG-based chat for bioinformatics questions
	- 🧬 Protein sequence analysis with ESM-2
	- 🧬 DNA sequence analysis with DNABERT-2
	- 🔍 Web search integration via Brave API
	- 📁 File upload and vector search

	### Models
	- Proteins: ESM-2 (Facebook)
	- DNA: DNABERT-2 (Microsoft)
	- LLM: Llama 3.1 70B (via Fireworks)

	### Disclaimer
	This tool is for research and educational purposes only.
	Not for medical diagnosis or treatment decisions.
	""")

	    # Dependency check: one status line per optional package, driven by the
	    # *_AVAILABLE flags set at import time.
	    st.divider()
	    st.subheader("System Status")
	    dependency_status = {
	        "PyTorch": TORCH_AVAILABLE,
	        "Transformers": TRANSFORMERS_AVAILABLE,
	        "Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
	        "FAISS": FAISS_AVAILABLE,
	        "BioPython": BIOPYTHON_AVAILABLE,
	        "Datasets": DATASETS_AVAILABLE,
	    }

	    for label, is_available in dependency_status.items():
	        if not is_available:
	            st.warning(f"⚠️ {label} not available")
	            continue
	        st.success(f"✅ {label}")