|
|
import os |
|
|
import json |
|
|
from typing import List, Dict, Tuple |
|
|
import time |
|
|
|
|
|
import streamlit as st |
|
|
import requests |
|
|
|
|
|
|
|
|
try: |
|
|
import torch |
|
|
TORCH_AVAILABLE = True |
|
|
except ImportError: |
|
|
TORCH_AVAILABLE = False |
|
|
print("[WARNING] torch not available") |
|
|
|
|
|
try: |
|
|
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM |
|
|
TRANSFORMERS_AVAILABLE = True |
|
|
except ImportError: |
|
|
TRANSFORMERS_AVAILABLE = False |
|
|
print("[WARNING] transformers not available") |
|
|
|
|
|
try: |
|
|
from datasets import load_dataset |
|
|
DATASETS_AVAILABLE = True |
|
|
except ImportError: |
|
|
DATASETS_AVAILABLE = False |
|
|
print("[WARNING] datasets not available") |
|
|
|
|
|
try: |
|
|
from sentence_transformers import SentenceTransformer |
|
|
SENTENCE_TRANSFORMERS_AVAILABLE = True |
|
|
except ImportError: |
|
|
SENTENCE_TRANSFORMERS_AVAILABLE = False |
|
|
print("[WARNING] sentence_transformers not available") |
|
|
|
|
|
try: |
|
|
import faiss |
|
|
FAISS_AVAILABLE = True |
|
|
except ImportError: |
|
|
FAISS_AVAILABLE = False |
|
|
print("[WARNING] faiss not available") |
|
|
|
|
|
try: |
|
|
from Bio import SeqIO |
|
|
BIOPYTHON_AVAILABLE = True |
|
|
except ImportError: |
|
|
BIOPYTHON_AVAILABLE = False |
|
|
print("[WARNING] biopython not available") |
|
|
|
|
|
|
|
|
try: |
|
|
import pdfplumber |
|
|
PDFPLUMBER_AVAILABLE = True |
|
|
except ImportError: |
|
|
PDFPLUMBER_AVAILABLE = False |
|
|
print("[WARNING] pdfplumber not available") |
|
|
|
|
|
try: |
|
|
import PyPDF2 |
|
|
PYPDF2_AVAILABLE = True |
|
|
except ImportError: |
|
|
PYPDF2_AVAILABLE = False |
|
|
print("[WARNING] PyPDF2 not available") |
|
|
|
|
|
|
|
|
APP_TITLE = "BioSeq Chat Pro: Advanced Collaborative AI System" |
|
|
DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions." |
|
|
|
|
|
|
|
|
|
|
|
def get_secret(name: str, fallback: str = "") -> str:
    """Look up a secret in st.secrets, falling back to the environment.

    Args:
        name: Secret / environment-variable name.
        fallback: Value returned when the secret is set nowhere.

    Returns:
        The secret value, or ``fallback`` when it is not configured.
    """
    try:
        # st.secrets can raise (e.g. when no secrets.toml exists); treat any
        # failure as "not found" and fall through to the environment.
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate.
        if hasattr(st, 'secrets') and name in st.secrets:
            return st.secrets[name]
    except Exception:
        pass
    return os.environ.get(name, fallback)
|
|
|
|
|
def brave_search(query: str, count: int = 5) -> List[Dict]:
    """Query the Brave Search web API and return simplified results.

    Each result is a dict with "title", "url" and "snippet" keys. Error
    conditions (missing key, HTTP failure, empty result set) are reported
    as a single pseudo-result instead of raising.
    """
    api_key = get_secret("BRAVE_API_KEY", "")
    if not api_key:
        return [{
            "title": "BRAVE_API_KEY missing",
            "url": "",
            "snippet": "Set BRAVE_API_KEY in Space secrets or sidebar"
        }]

    endpoint = "https://api.search.brave.com/res/v1/web/search"
    try:
        response = requests.get(
            endpoint,
            headers={
                "Accept": "application/json",
                "X-Subscription-Token": api_key,
            },
            params={"q": query, "count": count},
            timeout=15,
        )
        response.raise_for_status()
        payload = response.json()

        # Map Brave's schema onto the app-internal result shape.
        hits = [
            {
                "title": item.get("title", ""),
                "url": item.get("url", ""),
                "snippet": item.get("description", ""),
            }
            for item in payload.get("web", {}).get("results", [])[:count]
        ]
        if hits:
            return hits
        return [{"title": "No results", "url": "", "snippet": ""}]
    except Exception as e:
        return [{"title": "Error", "url": "", "snippet": str(e)}]
|
|
|
|
|
def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 8000) -> str:
    """Send a chat-completion request to the Fireworks AI API.

    Returns the assistant message content on success, or a human-readable
    error string (this helper never raises).
    """
    api_key = get_secret("FIREWORKS_API_KEY", "")
    if not api_key:
        return "FIREWORKS_API_KEY missing. Set it in Secrets or sidebar."

    endpoint = "https://api.fireworks.ai/inference/v1/chat/completions"
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    request_body = {
        "model": "accounts/fireworks/models/llama-v3p1-70b-instruct",
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }

    try:
        # Generous 120 s timeout: collaborative answers can be long.
        response = requests.post(endpoint, headers=request_headers, json=request_body, timeout=120)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        return f"[LLM Error] {e}"
|
|
|
|
|
def collaborative_answer(query: str, context: str, collaboration_type: str = "full") -> Dict[str, str]:
    """
    Collaborative AI system: an investigator, a supervisor and a critic
    cooperate through sequential call_llm calls to produce one answer.

    Args:
        query: The user's question.
        context: Retrieved context (file chunks and/or web snippets).
        collaboration_type: "full" adds a final integration pass; any other
            value (e.g. "quick", "deep") returns the supervisor's draft as
            the final answer.

    Returns:
        Dict with each role's contribution under the keys "investigator",
        "supervisor", "critic" and "final".
    """

    # Stage 1: INVESTIGATOR — extract and verify facts from the context.
    investigator_prompt = f"""You are an INVESTIGATOR specializing in bioinformatics fact-checking.

Context: {context}
Question: {query}

Your task:
1. Extract and verify all relevant facts from the context
2. Identify any missing information that would improve the answer
3. Flag any potentially conflicting or uncertain information
4. Suggest additional areas for research
5. Provide confidence scores for key facts (0-100%)

Format your response with:
- VERIFIED FACTS: (with confidence scores)
- UNCERTAIN AREAS:
- MISSING INFORMATION:
- RESEARCH SUGGESTIONS:
- KEY CITATIONS:"""

    investigator_msg = [
        {"role": "system", "content": "You are a meticulous scientific fact-checker and researcher."},
        {"role": "user", "content": investigator_prompt}
    ]

    # Low temperature: fact extraction should be as deterministic as possible.
    investigator_response = call_llm(investigator_msg, temperature=0.2, max_tokens=2000)

    # Stage 2: SUPERVISOR — draft a structured answer using stage-1 output.
    supervisor_prompt = f"""You are a SUPERVISOR creating a comprehensive answer.

Question: {query}
Context: {context}
Investigator's Analysis:
{investigator_response}

Your task:
1. Create a well-structured, scientifically accurate answer
2. Include:
   - Executive Summary (2-3 sentences)
   - Background & Context
   - Detailed Explanation with subsections
   - Practical Applications
   - Current Research Status
   - Future Perspectives
3. Use clear headings and logical flow
4. Integrate verified facts from the investigator
5. Aim for 500-1000 words minimum
6. Include relevant examples and analogies

Format with clear markdown headers and bullet points where appropriate."""

    supervisor_msg = [
        {"role": "system", "content": "You are an expert bioinformatics educator who creates comprehensive, well-structured scientific explanations."},
        {"role": "user", "content": supervisor_prompt}
    ]

    supervisor_response = call_llm(supervisor_msg, temperature=0.4, max_tokens=3500)

    # Stage 3: CRITIC — peer-review the supervisor's draft against the facts.
    critic_prompt = f"""You are a CRITIC reviewing the following answer for scientific accuracy.

Original Question: {query}
Supervisor's Answer:
{supervisor_response}

Investigator's Facts:
{investigator_response}

Your task:
1. Check for scientific accuracy and completeness
2. Identify any errors, omissions, or unclear explanations
3. Verify that all claims are properly supported
4. Assess the answer's clarity and accessibility
5. Suggest specific improvements
6. Provide a quality score (0-100)

Format your critique:
- ACCURACY ASSESSMENT:
- COMPLETENESS CHECK:
- CLARITY EVALUATION:
- ERRORS/ISSUES FOUND:
- IMPROVEMENT SUGGESTIONS:
- QUALITY SCORE: X/100"""

    critic_msg = [
        {"role": "system", "content": "You are a rigorous scientific peer reviewer specializing in bioinformatics."},
        {"role": "user", "content": critic_prompt}
    ]

    critic_response = call_llm(critic_msg, temperature=0.3, max_tokens=1500)

    # Stage 4 (only in "full" mode): INTEGRATOR — merge all inputs into the
    # final polished answer; other modes reuse the supervisor draft as-is.
    if collaboration_type == "full":
        integration_prompt = f"""Create the FINAL INTEGRATED ANSWER incorporating all feedback.

Question: {query}
Supervisor's Answer: {supervisor_response}
Critic's Feedback: {critic_response}
Verified Facts: {investigator_response}

Create a polished, final answer that:
1. Addresses all critic's concerns
2. Maintains scientific rigor
3. Includes proper citations
4. Uses clear structure with markdown formatting
5. Provides comprehensive coverage (800-1500 words)
6. Includes a TL;DR section at the beginning
7. Ends with key takeaways and further reading suggestions

Use Korean if the question is in Korean, otherwise English."""

        integration_msg = [
            {"role": "system", "content": "You are a master science communicator creating the definitive answer by integrating all expert inputs."},
            {"role": "user", "content": integration_prompt}
        ]

        final_answer = call_llm(integration_msg, temperature=0.35, max_tokens=8000)
    else:
        final_answer = supervisor_response

    return {
        "investigator": investigator_response,
        "supervisor": supervisor_response,
        "critic": critic_response,
        "final": final_answer
    }
|
|
|
|
|
def load_file_text(upload) -> str:
    """Extract text from an uploaded file (PDF, FASTA and plain text).

    Args:
        upload: A Streamlit UploadedFile-like object — a binary file-like
            object with a ``.name`` attribute.

    Returns:
        The extracted text, or "" when the file cannot be read.
    """
    name = upload.name.lower()

    # --- PDF: prefer pdfplumber, fall back to PyPDF2 ---
    if name.endswith(".pdf"):
        if PDFPLUMBER_AVAILABLE:
            try:
                text_parts = []
                with pdfplumber.open(upload) as pdf:
                    for page in pdf.pages:
                        page_text = page.extract_text()
                        if page_text:
                            text_parts.append(page_text)
                return "\n\n".join(text_parts)
            except Exception as e:
                st.error(f"PDF 읽기 오류 (pdfplumber): {e}")
                return ""
        elif PYPDF2_AVAILABLE:
            try:
                upload.seek(0)
                pdf_reader = PyPDF2.PdfReader(upload)
                text_parts = [page.extract_text() for page in pdf_reader.pages]
                return "\n\n".join(text_parts)
            except Exception as e:
                st.error(f"PDF 읽기 오류 (PyPDF2): {e}")
                return ""
        else:
            st.error("PDF 파일을 읽으려면 pdfplumber 또는 PyPDF2가 필요합니다")
            return ""

    # --- Everything else: decode the raw bytes as UTF-8 (lossy) ---
    try:
        upload.seek(0)  # rewind in case a previous handler moved the pointer
        content = upload.read()
        text = content.decode("utf-8", errors="ignore")
    except Exception:
        return ""

    # --- FASTA: normalize via Biopython when available ---
    if name.endswith((".fa", ".fasta", ".faa", ".fna")) and BIOPYTHON_AVAILABLE:
        try:
            from io import StringIO
            # BUGFIX: SeqIO.parse needs a *text* handle; the upload object is
            # binary, so parse the decoded text rather than the raw upload
            # (the old code always failed here and silently fell through).
            records = list(SeqIO.parse(StringIO(text), "fasta"))
            seqs = [f">{r.id}\n{str(r.seq)}" for r in records]
            return "\n\n".join(seqs)
        except Exception:
            pass  # best effort: fall back to the raw decoded text

    return text
|
|
|
|
|
def chunk_text(text: str, size: int = 1500, overlap: int = 300) -> List[str]:
    """Split *text* into overlapping chunks.

    Args:
        text: Input text; an empty string yields [].
        size: Maximum chunk length in characters (must be > 0).
        overlap: Characters shared between consecutive chunks
            (must satisfy 0 <= overlap < size so the window advances).

    Returns:
        List of chunks covering the whole text, in order.

    Raises:
        ValueError: If the parameters would make the window stall or move
            backwards (the original loop never terminated in that case).
    """
    if size <= 0 or overlap < 0 or overlap >= size:
        raise ValueError("chunk_text requires size > 0 and 0 <= overlap < size")

    chunks = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = min(start + size, text_len)
        chunks.append(text[start:end])
        if end >= text_len:
            break
        # Step back by `overlap` so adjacent chunks share context.
        start = end - overlap

    return chunks
|
|
|
|
|
def build_index(texts: List[str]):
    """Embed *texts* and build a FAISS inner-product index over them.

    Returns:
        (index, model) on success, or (None, None) when the required
        libraries are missing or index construction fails.
    """
    if not (SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE):
        return None, None

    try:
        encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
        vectors = encoder.encode(texts, show_progress_bar=False)

        # Flat inner-product index; FAISS expects float32 inputs.
        ip_index = faiss.IndexFlatIP(vectors.shape[1])
        ip_index.add(vectors.astype("float32"))
        return ip_index, encoder
    except Exception as e:
        st.warning(f"Index build failed: {e}")
        return None, None
|
|
|
|
|
def search_index(query: str, index, model, texts: List[str], k: int = 5) -> List[Dict]:
    """Retrieve the chunks most similar to *query* from the vector index.

    Args:
        query: Query string to embed.
        index: FAISS index produced by build_index (or None).
        model: The encoder used to build the index (or None).
        texts: The chunk list the index was built from.
        k: Number of neighbours to retrieve.

    Returns:
        List of {"score", "text"} dicts, best match first; [] when the
        index is unavailable or the search fails.
    """
    if index is None or model is None:
        return []

    try:
        q_emb = model.encode([query])
        D, I = index.search(q_emb.astype("float32"), k)

        results = []
        for idx, score in zip(I[0], D[0]):
            # FAISS pads missing neighbours with -1; keep only valid ids.
            if 0 <= idx < len(texts):
                results.append({
                    "score": float(score),
                    "text": texts[idx]
                })
        return results
    except Exception:
        # Narrowed from a bare `except:`; search failure degrades to
        # "no results" instead of crashing the app.
        return []
|
|
|
|
|
def build_context(query: str, docs: List[str], index, model, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
    """Assemble an answer context from local file chunks and web search.

    Returns:
        (context, sources): context is a "---"-separated text capped at
        6000 characters; sources is a list of provenance dicts
        ({"type": "file", ...} or {"type": "web", ...}).
    """
    pieces: List[str] = []
    sources: List[Dict] = []

    # Local hits: semantic search over the uploaded document chunks.
    if index and model and docs:
        for hit in search_index(query, index, model, docs, k=6):
            pieces.append(f"[FILE SOURCE] {hit['text'][:800]}")
            sources.append({"type": "file", "text": hit['text'][:150], "score": hit['score']})

    # Web hits: bias toward scientific sources via query expansion.
    if use_web:
        scientific_query = f"{query} scientific research pubmed nature science"
        for result in brave_search(scientific_query, count=web_k):
            pieces.append(f"[WEB SOURCE] {result['title']}\n{result['snippet']}")
            sources.append({"type": "web", "title": result['title'], "url": result['url']})

    # Cap the combined context so downstream prompts stay bounded.
    context = "\n\n---\n\n".join(pieces)[:6000]
    return context, sources
|
|
|
|
|
|
|
|
def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
    """Embed a protein sequence with an ESM-2 model and summarize the result.

    Args:
        seq: Protein sequence (single-letter amino-acid codes).
        model_name: HuggingFace model id of the ESM-2 checkpoint to load.

    Returns:
        Dict with the first 10 embedding dims ("embedding"), the embedding
        size, mean/std summary stats, and "attention_peaks" (see note
        below), or {"error": ...} on failure.
    """
    if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
        return {"error": "PyTorch/Transformers not available"}

    try:
        # Model is loaded per call and freed afterwards to keep memory low
        # on small Spaces hardware (at the cost of reload time per call).
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForMaskedLM.from_pretrained(model_name)
        model.eval()

        with torch.no_grad():
            inputs = tokenizer(seq, return_tensors="pt", truncation=True, max_length=1024)
            outputs = model(**inputs, output_hidden_states=True)
            # Mean-pool the last hidden layer over tokens -> one vector.
            hidden = outputs.hidden_states[-1].mean(dim=1).squeeze(0)
            vec = hidden.cpu().numpy()

            # NOTE: despite the name, this is the per-dimension std of the
            # last hidden layer across tokens — a variability proxy, not
            # actual attention weights.
            attention_weights = outputs.hidden_states[-1].std(dim=1).squeeze(0).cpu().numpy()

        # Free the model eagerly before returning.
        del model
        del tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return {
            "embedding": vec.tolist()[:10],   # first 10 dims only (display)
            "size": vec.shape[0],
            "mean": float(vec.mean()),
            "std": float(vec.std()),
            "attention_peaks": attention_weights.tolist()[:10]
        }
    except Exception as e:
        return {"error": str(e)}
|
|
|
|
|
def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
    """Embed a DNA sequence with DNABERT-2 (or a BERT fallback).

    Args:
        seq: DNA sequence (A/T/G/C).
        model_name: HuggingFace model id; DNABERT-2 by default. If loading
            fails, falls back to plain "bert-base-uncased" with a warning.

    Returns:
        Dict with the first 10 embedding dims, embedding size, k-mer count
        and mean/std stats, or {"error": ...} on failure.
    """
    if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
        return {"error": "PyTorch/Transformers not available"}

    try:
        # DNABERT-2's remote code requires einops; fail early with a clear
        # message instead of a cryptic import error from transformers.
        try:
            import einops
        except ImportError:
            return {"error": "einops package required. Please wait for installation and refresh the page."}

        # Turn the sequence into overlapping 6-mers separated by spaces
        # (the tokenization format DNABERT-style models expect).
        def seq_to_kmer(seq, k=6):
            kmers = []
            for i in range(len(seq) - k + 1):
                kmers.append(seq[i:i+k])
            return ' '.join(kmers)

        # Primary load path: DNABERT-2 with its custom remote code.
        try:
            from transformers import AutoTokenizer, AutoModel
            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        except Exception as model_error:
            # Fallback: a vanilla BERT so the feature still "works",
            # albeit without DNA-specific pretraining.
            try:
                from transformers import BertTokenizer, BertModel
                fallback_model = "bert-base-uncased"
                tokenizer = BertTokenizer.from_pretrained(fallback_model)
                model = BertModel.from_pretrained(fallback_model)
                st.warning(f"DNABERT-2 로딩 실패. 대체 모델 사용중: {fallback_model}")
            except:
                return {"error": f"모델 로딩 실패: {str(model_error)}"}

        model.eval()

        # Sequences longer than one k-mer are expanded to 6-mer tokens;
        # shorter ones are fed through verbatim.
        if len(seq) > 6:
            input_seq = seq_to_kmer(seq, k=6)
            kmer_count = len(seq) - 5
        else:
            input_seq = seq
            kmer_count = 1

        with torch.no_grad():
            inputs = tokenizer(
                input_seq,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            )
            outputs = model(**inputs)

            # Prefer the pooled [CLS]-style output when the model provides
            # one; otherwise mean-pool the last hidden layer over tokens.
            if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
                vec = outputs.pooler_output.squeeze(0).cpu().numpy()
            else:
                hidden = outputs.last_hidden_state.mean(dim=1).squeeze(0)
                vec = hidden.cpu().numpy()

        # Free the model eagerly before returning (memory-constrained host).
        del model
        del tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return {
            "embedding": vec.tolist()[:10],  # first 10 dims only (display)
            "size": vec.shape[0],
            "kmer_count": kmer_count,
            "mean": float(vec.mean()),
            "std": float(vec.std())
        }

    except Exception as e:
        return {"error": f"분석 중 오류 발생: {str(e)[:200]}"}
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Streamlit page setup and per-session state initialisation
# ---------------------------------------------------------------------------
st.set_page_config(page_title=APP_TITLE, page_icon="🧬", layout="wide")
st.title(APP_TITLE)
st.caption(DISCLAIMER)

# Seed each session-state slot exactly once per browser session.
for _state_key, _state_default in (
    ("docs", []),            # uploaded document chunks
    ("index", None),         # FAISS index over the chunks
    ("model", None),         # sentence-transformer encoder
    ("chat_history", []),    # list of past Q/A records
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
|
|
|
|
|
|
|
|
# Sidebar: API keys, model ids and search/collaboration settings.
with st.sidebar:
    st.header("⚙️ Configuration")

    # API keys: pre-filled from secrets/env, entered as password fields.
    fw_key = st.text_input(
        "FIREWORKS_API_KEY",
        value=get_secret("FIREWORKS_API_KEY", ""),
        type="password",
        help="Required for AI responses"
    )
    brave_key = st.text_input(
        "BRAVE_API_KEY",
        value=get_secret("BRAVE_API_KEY", ""),
        type="password",
        help="Required for web search"
    )

    # Mirror sidebar keys into the environment so get_secret() picks them up.
    if fw_key:
        os.environ["FIREWORKS_API_KEY"] = fw_key
    if brave_key:
        os.environ["BRAVE_API_KEY"] = brave_key

    st.divider()

    # Model ids consumed by the Protein and DNA tabs.
    st.subheader("🤖 AI Models")
    esm_model = st.text_input(
        "ESM-2 Model",
        value="facebook/esm2_t6_8M_UR50D",
        help="Protein analysis model"
    )
    dna_model = st.text_input(
        "DNA Model",
        value="bert-base-uncased",
        help="DNA analysis model"
    )

    st.divider()

    # Web-search toggles consumed by build_context().
    st.subheader("🔍 Search Settings")
    use_web = st.checkbox("Enable web search", value=True)
    web_results = st.slider("Web results", 1, 10, 5)

    st.divider()

    # Collaboration depth passed to collaborative_answer().
    st.subheader("🎭 Collaboration Mode")
    collab_mode = st.radio(
        "AI Collaboration Type",
        ["full", "quick", "deep"],
        index=0,
        help="Full: Complete collaboration\nQuick: Fast response\nDeep: In-depth analysis"
    )
|
|
|
|
|
|
|
|
# Main tab layout; tab4/tab5 are declared here but populated elsewhere.
tab1, tab2, tab3, tab4, tab5 = st.tabs(["💬 Chat", "🧬 Protein", "🧬 DNA", "📊 Analysis", "ℹ️ About"])

# Shared upload area: files become chunks + a semantic index in session state.
with st.expander("📁 Upload Files", expanded=True):
    files = st.file_uploader(
        "Upload text/FASTA/PDF files",
        type=["txt", "fa", "fasta", "csv", "json", "pdf"],
        accept_multiple_files=True,
        help="Support for multiple file types including PDF"
    )

    if files:
        docs = []
        for f in files:
            try:
                # Skip PDFs up front when no PDF backend is installed.
                if f.name.lower().endswith(".pdf"):
                    if not (PDFPLUMBER_AVAILABLE or PYPDF2_AVAILABLE):
                        st.warning(f"⚠️ PDF support requires: pip install pdfplumber")
                        continue

                text = load_file_text(f)
                if text:
                    docs.extend(chunk_text(text))
                    st.success(f"✅ {f.name} loaded ({len(text)} chars)")
            except Exception as e:
                st.error(f"Error reading {f.name}: {e}")

        if docs:
            st.session_state.docs = docs
            st.info(f"📚 Total chunks created: {len(docs)}")

            # Build the semantic index immediately so chat retrieval works.
            if SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
                with st.spinner("Building semantic index..."):
                    index, model = build_index(docs)
                    if index:
                        st.session_state.index = index
                        st.session_state.model = model
                        st.success("✅ Index built successfully")
|
|
|
|
|
|
|
|
# Tab 1: collaborative chat — retrieval + multi-role LLM pipeline.
with tab1:
    st.subheader("💬 Advanced Collaborative Chat")

    with st.expander("🎭 How Collaborative AI Works", expanded=False):
        st.markdown("""
        ### Three AI Experts Work Together:

        1. **🔍 Investigator**: Fact-checks and verifies information
        2. **📝 Supervisor**: Creates structured, comprehensive answers
        3. **✅ Critic**: Reviews for accuracy and clarity
        4. **🎯 Integrator**: Combines all inputs for the final answer

        This system ensures maximum accuracy and comprehensiveness.
        """)

    question = st.text_area(
        "Ask about proteins, DNA, or any bioinformatics topic:",
        value="Explain how AlphaFold revolutionized protein structure prediction and its impact on drug discovery.",
        height=100
    )

    col1, col2 = st.columns([3, 1])
    with col1:
        answer_button = st.button("🚀 Get Collaborative Answer", type="primary", use_container_width=True)
    with col2:
        show_process = st.checkbox("Show process", value=False, help="Display each AI's contribution")

    if answer_button:
        if not get_secret("FIREWORKS_API_KEY"):
            st.error("⚠️ Please set FIREWORKS_API_KEY")
        else:
            progress_bar = st.progress(0)
            status_text = st.empty()

            # Step 1: gather context from uploaded files and/or the web.
            with st.spinner("🔍 Building knowledge base..."):
                status_text.text("Searching sources...")
                progress_bar.progress(10)

                context, sources = build_context(
                    question,
                    st.session_state.docs,
                    st.session_state.index,
                    st.session_state.model,
                    use_web,
                    web_results
                )

            progress_bar.progress(20)
            status_text.text("Collaborative AI system working...")

            # Step 2: run the investigator/supervisor/critic pipeline.
            start_time = time.time()
            collaborative_result = collaborative_answer(
                question,
                context,
                collaboration_type=collab_mode
            )
            elapsed_time = time.time() - start_time

            progress_bar.progress(100)
            status_text.text(f"✅ Completed in {elapsed_time:.1f} seconds")

            # Optionally expose each role's intermediate output.
            if show_process:
                with st.expander("🔍 Investigator's Analysis", expanded=False):
                    st.markdown(collaborative_result["investigator"])

                with st.expander("📝 Supervisor's Draft", expanded=False):
                    st.markdown(collaborative_result["supervisor"])

                with st.expander("✅ Critic's Review", expanded=False):
                    st.markdown(collaborative_result["critic"])

            st.markdown("### 🎯 Final Integrated Answer")
            st.markdown(collaborative_result["final"])

            # Provenance listing for both web and file sources.
            if sources:
                with st.expander("📚 Sources & References", expanded=False):
                    for s in sources:
                        if s["type"] == "web":
                            st.write(f"- 🌐 [{s['title']}]({s['url']})")
                        elif s["type"] == "file":
                            st.write(f"- 📄 File: {s['text'][:100]}... (Score: {s.get('score', 0):.2f})")

            # Persist the exchange for this session.
            st.session_state.chat_history.append({
                "question": question,
                "answer": collaborative_result["final"],
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "mode": collab_mode
            })

            # NOTE(review): these buttons are nested inside the answer-button
            # branch — in Streamlit, clicking them reruns the script and the
            # outer `if answer_button:` becomes False, so the feedback /
            # download widgets likely never fire. Verify and consider moving
            # them out of this branch (e.g. keyed on chat_history).
            col1, col2, col3 = st.columns(3)
            with col1:
                if st.button("👍 Helpful"):
                    st.success("Thank you for your feedback!")
            with col2:
                if st.button("👎 Not helpful"):
                    st.info("We'll work on improving our responses.")
            with col3:
                if st.button("💾 Save Answer"):
                    st.download_button(
                        label="Download",
                        data=collaborative_result["final"],
                        file_name=f"bioseq_answer_{time.strftime('%Y%m%d_%H%M%S')}.md",
                        mime="text/markdown"
                    )
|
|
|
|
|
|
|
|
# Tab 2: protein analysis — heuristic sequence stats + ESM-2 embedding.
with tab2:
    st.subheader("🧬 Advanced Protein Analysis")

    with st.expander("📚 Learn About Protein Analysis", expanded=False):
        st.markdown("""
        ### What is Protein Sequence Analysis?

        **Proteins** are the workhorses of cells, performing nearly every function necessary for life:
        - 🧪 **Enzymes**: Catalyze chemical reactions
        - 🛡️ **Antibodies**: Defend against pathogens
        - 🚚 **Transporters**: Move molecules across membranes
        - 📡 **Receptors**: Receive and transmit signals

        **ESM-2** (Evolutionary Scale Modeling) is Meta's breakthrough AI that:
        - Trained on 65 million protein sequences
        - Predicts structure and function from sequence alone
        - Enables drug discovery and protein engineering
        """)

    protein_seq = st.text_area(
        "Enter protein sequence (single letter amino acid code):",
        value="MKTIIALSYIFCLVFA",
        help="Standard amino acids: A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y",
        height=100
    )

    # Example sequences shown as copyable code snippets.
    st.markdown("**🧪 Example Sequences (Click to copy):**")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        if st.button("💉 Insulin", key="ins"):
            st.code("FVNQHLCGSHLVEALYLVCGERGFFYTPKT", language=None)
    with col2:
        if st.button("😊 Endorphin", key="end"):
            st.code("YGGFMTSEKSQTPLVTLFKNAIIKNAYKKGE", language=None)
    with col3:
        if st.button("❤️ Oxytocin", key="oxy"):
            st.code("CYIQNCPLG", language=None)
    with col4:
        if st.button("🦠 Lysozyme", key="lys"):
            st.code("KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNR", language=None)

    if st.button("🔬 Analyze Protein", type="primary", use_container_width=True):
        seq = protein_seq.strip().upper()

        # Strip anything that is not one of the 20 standard amino acids.
        valid_aa = set("ACDEFGHIKLMNPQRSTVWY")
        invalid = set(seq) - valid_aa
        if invalid:
            st.warning(f"⚠️ Invalid amino acids detected: {', '.join(invalid)}")
            seq = ''.join([aa for aa in seq if aa in valid_aa])

        if len(seq) < 3:
            st.error("Sequence too short. Please enter at least 3 amino acids.")
        else:
            # Simple composition metrics computed directly from the sequence.
            st.markdown("### 📊 Sequence Statistics")
            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("Length", f"{len(seq)} aa")
                # ~110 Da per residue: a rough average-residue-mass estimate.
                st.metric("Mol. Weight", f"~{len(seq) * 110:.1f} Da")

            with col2:
                unique_aa = len(set(seq))
                st.metric("Unique AA", f"{unique_aa}/20")
                charged = sum(1 for aa in seq if aa in "DEKR")
                st.metric("Charged", f"{charged/len(seq)*100:.1f}%")

            with col3:
                hydrophobic = sum(1 for aa in seq if aa in "AVILMFYW")
                st.metric("Hydrophobic", f"{hydrophobic/len(seq)*100:.1f}%")
                aromatic = sum(1 for aa in seq if aa in "FWY")
                st.metric("Aromatic", f"{aromatic/len(seq)*100:.1f}%")

            with col4:
                basic = sum(1 for aa in seq if aa in "KRH")
                acidic = sum(1 for aa in seq if aa in "DE")
                # Crude pI heuristic: shift 7 by half a unit per net charge.
                pi_estimate = 7 + (basic - acidic) * 0.5
                st.metric("pI (est.)", f"~{pi_estimate:.1f}")
                st.metric("Basic/Acidic", f"{basic}/{acidic}")

            # Heuristic residue-propensity scores (not real structure
            # prediction — fraction of residues in hand-picked sets).
            st.markdown("### 🔮 Predicted Properties")
            col1, col2 = st.columns(2)

            with col1:
                helix_aa = "AELMQKRH"
                helix_score = sum(1 for aa in seq if aa in helix_aa) / len(seq)
                st.metric("α-Helix Propensity", f"{helix_score*100:.1f}%")

                beta_aa = "FIVWY"
                beta_score = sum(1 for aa in seq if aa in beta_aa) / len(seq)
                st.metric("β-Sheet Propensity", f"{beta_score*100:.1f}%")

            with col2:
                disorder_aa = "PESKTQ"
                disorder_score = sum(1 for aa in seq if aa in disorder_aa) / len(seq)
                st.metric("Disorder Tendency", f"{disorder_score*100:.1f}%")

                # Inverse of the hydrophobic fraction computed above.
                soluble_score = 100 - (hydrophobic/len(seq)*100)
                st.metric("Solubility Score", f"{soluble_score:.1f}%")

            # ESM-2 embedding (only when torch + transformers imported OK).
            if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
                st.markdown("### 🤖 AI-Powered Analysis")
                with st.spinner("Running ESM-2 analysis... This may take 10-30 seconds"):
                    result = esm2_embed(seq, esm_model)

                if "error" in result:
                    st.error(f"Analysis failed: {result['error']}")
                else:
                    st.success("✅ AI analysis complete!")

                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Embedding Dimension", result['size'])
                    with col2:
                        st.metric("Mean Value", f"{result.get('mean', 0):.3f}")
                    with col3:
                        st.metric("Std Dev", f"{result.get('std', 0):.3f}")

                    st.markdown("**🎨 Embedding Visualization:**")
                    st.info("The protein has been encoded into a high-dimensional space where similar proteins cluster together.")

                st.markdown("""
                ### 🎯 Applications of This Analysis:

                1. **🔍 Similar Protein Search**: Find proteins with similar functions
                2. **💊 Drug Target Identification**: Predict binding sites and interactions
                3. **🧬 Mutation Impact**: Assess how changes affect protein function
                4. **🏗️ Structure Prediction**: Input for AlphaFold-like systems
                5. **⚗️ Protein Engineering**: Design improved variants
                """)
            else:
                st.warning("⚠️ AI models are loading. Please refresh in a moment.")
|
|
|
|
|
|
|
|
with tab3: |
|
|
st.subheader("🧬 Advanced DNA Analysis") |
|
|
|
|
|
with st.expander("📚 Learn About DNA Analysis", expanded=False): |
|
|
st.markdown(""" |
|
|
### Understanding DNA Sequences |
|
|
|
|
|
**DNA** is the blueprint of life, encoding all genetic information in four bases: |
|
|
- **A** (Adenine): Pairs with T |
|
|
- **T** (Thymine): Pairs with A |
|
|
- **G** (Guanine): Pairs with C |
|
|
- **C** (Cytosine): Pairs with G |
|
|
|
|
|
**Key Concepts:** |
|
|
- **Gene**: A DNA segment that codes for a protein |
|
|
- **Promoter**: Controls when genes are turned on/off |
|
|
- **Codon**: Three bases that code for one amino acid |
|
|
- **GC Content**: Affects stability and gene expression |
|
|
|
|
|
**DNABERT-2** is an AI model that understands DNA "language" to predict: |
|
|
- Gene function |
|
|
- Regulatory elements |
|
|
- Disease-causing mutations |
|
|
- Evolution patterns |
|
|
""") |
|
|
|
|
|
dna_seq = st.text_area( |
|
|
"Enter DNA sequence:", |
|
|
value="ATGCGATCGTAGC", |
|
|
help="Use A, T, G, C for DNA (U will be converted to T for RNA)", |
|
|
height=100 |
|
|
) |
|
|
|
|
|
|
|
|
st.markdown("**🧪 Example Sequences (Click to analyze):**") |
|
|
col1, col2, col3, col4 = st.columns(4) |
|
|
with col1: |
|
|
if st.button("📋 TATA Box", key="tata"): |
|
|
st.code("TATAAAAGCGCGCGCG", language=None) |
|
|
st.caption("Gene start signal") |
|
|
with col2: |
|
|
if st.button("🎯 Promoter", key="prom"): |
|
|
st.code("TTGACAGGCTAGCTCAGTCCTAGGTATAATGCTAGC", language=None) |
|
|
st.caption("Gene control region") |
|
|
with col3: |
|
|
if st.button("✂️ CRISPR", key="crispr"): |
|
|
st.code("GTCACCTCCAATGACTAGGGTGG", language=None) |
|
|
st.caption("Gene editing target") |
|
|
with col4: |
|
|
if st.button("🧬 Telomere", key="telo"): |
|
|
st.code("TTAGGGTTAGGGTTAGGG", language=None) |
|
|
st.caption("Chromosome end") |
|
|
|
|
|
# ── DNA analysis: runs when the user presses the analyze button ──
if st.button("🔬 Analyze DNA", type="primary", use_container_width=True):

    # Normalize input: uppercase, convert RNA (U) to DNA (T), then drop
    # every character that is not a canonical base.
    seq = dna_seq.strip().upper().replace("U", "T")
    seq = ''.join(c for c in seq if c in 'ATGC')

    if len(seq) < 3:
        st.error("Sequence too short. Please enter at least 3 bases.")
    else:
        st.markdown("### 📊 Sequence Analysis")

        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Length", f"{len(seq)} bp")
            # ~660 Da per base pair is the customary average for dsDNA.
            st.metric("Size", f"~{len(seq)*660:.0f} Da")

        with col2:
            gc = (seq.count("G") + seq.count("C")) / len(seq) * 100
            st.metric("GC Content", f"{gc:.1f}%")
            # FIX: test the extreme low band before the mild one. The
            # original checked `gc < 35` first, which made the `gc < 25`
            # ("Very low") branch unreachable.
            if gc > 65:
                st.caption("🔴 Very high")
            elif gc > 55:
                st.caption("🟠 High")
            elif gc < 25:
                st.caption("🟣 Very low")
            elif gc < 35:
                st.caption("🔵 Low")
            else:
                st.caption("🟢 Normal")

        with col3:
            at = 100 - gc
            st.metric("AT Content", f"{at:.1f}%")
            # Wallace rule (4 °C per G/C + 2 °C per A/T): a rough estimate
            # intended for short oligos (< ~14 bp); not accurate for long
            # sequences.
            tm = 4 * (seq.count("G") + seq.count("C")) + 2 * (seq.count("A") + seq.count("T"))
            st.metric("Tm (est.)", f"{tm}°C")

        with col4:
            cpg = seq.count("CG")
            # CpG observed/expected ratio: (#CpG * length) / (#C * #G).
            c_times_g = seq.count("C") * seq.count("G")
            cpg_ratio = (cpg * len(seq)) / c_times_g if c_times_g > 0 else 0
            st.metric("CpG Sites", cpg)
            st.metric("CpG O/E", f"{cpg_ratio:.2f}")

        st.markdown("### 🔍 Regulatory Elements & Motifs")

        import re  # hoisted: the original re-imported this inside the scan loop

        # Known promoter/regulatory motifs. Patterns may contain IUPAC
        # degenerate codes (R, W, N) that are expanded to regex classes.
        motif_db = {
            "TATA Box": ["TATAAA", "TATAWAW"],
            "CAAT Box": ["CAAT", "CCAAT", "GGCCAATCT"],
            "GC Box": ["GGGCGG", "GGCGGG"],
            "Start Codon": ["ATG"],
            "Stop Codons": ["TAA", "TAG", "TGA"],
            "Kozak Sequence": ["GCCRCCATGG"],
            "Poly-A Signal": ["AATAAA", "ATTAAA"],
            "E-box": ["CANNTG"],
            "CRE": ["TGACGTCA"],
            "NF-κB": ["GGGACTTTCC"]
        }

        motifs_found = []
        for motif_name, patterns in motif_db.items():
            for pattern in patterns:
                regex = (pattern.replace("R", "[AG]")
                                .replace("W", "[AT]")
                                .replace("N", "[ATGC]"))
                if re.search(regex, seq):
                    motifs_found.append(f"✅ {motif_name}: {pattern}")
                    break  # report each motif family at most once

        if motifs_found:
            for motif in motifs_found:
                st.write(motif)
        else:
            st.info("No known regulatory motifs detected")

        # Coding potential. (len(seq) >= 3 is guaranteed by the guard at the
        # top, so the original's redundant re-check was dropped.)
        st.markdown("### 🧬 Coding Potential Analysis")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("**Open Reading Frames:**")
            for frame in range(3):
                # FIX: locate an ATG that is actually in-frame (step of 3).
                # The original used frame_seq.index("ATG"), which could
                # report an out-of-frame start for this reading frame.
                start_pos = next(
                    (i for i in range(frame, len(seq) - 2, 3) if seq[i:i + 3] == "ATG"),
                    None,
                )
                if start_pos is not None:
                    st.write(f"Frame {frame+1}: Start at position {start_pos+1}")

        with col2:
            if len(seq) % 3 == 0:
                st.markdown("**Codon Statistics:**")
                codon_count = len(seq) // 3
                st.metric("Total Codons", codon_count)
                # FIX: count stop codons only at in-frame positions. The
                # original str.count() matched them at every offset, which
                # is misleading under a "Codon Statistics" heading.
                stops = sum(
                    1 for i in range(0, len(seq), 3)
                    if seq[i:i + 3] in ("TAA", "TAG", "TGA")
                )
                st.metric("Stop Codons", stops)

        if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
            st.markdown("### 🤖 AI-Powered Genomic Analysis")
            with st.spinner("Running DNABERT analysis... This may take 10-30 seconds"):
                # dna_embed / dna_model are defined elsewhere in this file;
                # result is presumably a dict with 'size'/'kmer_count'/'mean'
                # or an 'error' key — matches the handling below.
                result = dna_embed(seq, dna_model)

            if "error" in result:
                st.error(f"Analysis failed: {result['error']}")
            else:
                st.success("✅ AI analysis complete!")

                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Embedding Dimension", result['size'])
                with col2:
                    st.metric("k-mer Count", result.get('kmer_count', 'N/A'))
                with col3:
                    st.metric("Mean Value", f"{result.get('mean', 0):.3f}")

                st.markdown("""
### 🎯 Applications of DNA Analysis:

1. **🔬 Gene Discovery**: Identify coding and regulatory regions
2. **🏥 Disease Diagnosis**: Detect pathogenic mutations
3. **✂️ CRISPR Design**: Find optimal gene editing sites
4. **🌱 Evolution Studies**: Compare sequences across species
5. **💊 Personalized Medicine**: Tailor treatments to genetic profiles
6. **🦠 Pathogen Detection**: Identify viral/bacterial DNA
""")
        else:
            st.warning("⚠️ AI models are loading. Please refresh in a moment.")
|
with tab4:
    # ── History tab: browse, export, and clear previous Q&A entries ──
    st.subheader("📊 Analysis History & Insights")

    if st.session_state.chat_history:
        st.markdown(f"### 💾 Previous Analyses ({len(st.session_state.chat_history)} total)")

        # Show the five most recent entries, newest first.
        for i, entry in enumerate(reversed(st.session_state.chat_history[-5:])):
            with st.expander(f"🕐 {entry['timestamp']} - Mode: {entry['mode']}", expanded=False):
                st.markdown("**Question:**")
                st.write(entry['question'])
                st.markdown("**Answer:**")
                # Truncate long answers in the preview; full text on demand.
                st.write(entry['answer'][:500] + "..." if len(entry['answer']) > 500 else entry['answer'])

                if st.button("View Full", key=f"view_{i}"):
                    st.markdown(entry['answer'])
    else:
        st.info("No analysis history yet. Start by asking a question in the Chat tab!")

    if st.session_state.chat_history:
        st.markdown("### 📤 Export Options")
        col1, col2 = st.columns(2)

        with col1:
            # FIX: render the download button directly. The original nested
            # st.download_button inside an `if st.button(...)` branch, so the
            # download link vanished on the next rerun before the user could
            # click it (a well-known Streamlit anti-pattern).
            md_content = "\n\n---\n\n".join([
                f"## {entry['timestamp']}\n\n**Q:** {entry['question']}\n\n**A:** {entry['answer']}"
                for entry in st.session_state.chat_history
            ])
            st.download_button(
                "Export as Markdown",
                md_content,
                f"bioseq_history_{time.strftime('%Y%m%d')}.md",
                "text/markdown"
            )

        with col2:
            if st.button("Clear History"):
                st.session_state.chat_history = []
                st.rerun()
|
with tab5:
    # About tab: feature overview, dependency status, and usage statistics.
    st.subheader("ℹ️ About BioSeq Chat Pro")

    st.markdown("""
### 🚀 Enhanced Features

#### **Collaborative AI System**
- 🔍 **Investigator**: Verifies facts and identifies knowledge gaps
- 📝 **Supervisor**: Creates comprehensive, structured answers
- ✅ **Critic**: Reviews for accuracy and clarity
- 🎯 **Integrator**: Synthesizes all inputs into final answer

#### **Technical Improvements**
- **8000 token responses** for comprehensive answers
- **Enhanced context building** with semantic search
- **Multiple collaboration modes** (Full, Quick, Deep)
- **Scientific source prioritization** in web search
- **Larger embedding models** for better accuracy

### 🧬 Supported Analyses
- **Protein Analysis**: ESM-2 embeddings, property prediction
- **DNA Analysis**: DNABERT-2/BERT embeddings, motif search
- **RAG Chat**: Context-aware Q&A with file integration
- **PDF Support**: Direct analysis of research papers

### 📚 Models & Technologies
- **LLM**: Llama 3.1 70B (via Fireworks AI)
- **Protein**: ESM-2 (Meta/Facebook)
- **DNA**: DNABERT-2 (Microsoft) / BERT (Google)
- **Embeddings**: all-mpnet-base-v2 (Sentence Transformers)
- **Vector Search**: FAISS (Facebook)

### ⚠️ Disclaimer
This tool is designed for **research and educational purposes only**.
- Not intended for medical diagnosis or treatment
- Not validated for clinical use
- Always consult qualified professionals for medical decisions

### 🔧 System Status
""")

    status_left, status_right = st.columns(2)

    # Availability flags are set at import time near the top of this file.
    core_status = {
        "PyTorch": TORCH_AVAILABLE,
        "Transformers": TRANSFORMERS_AVAILABLE,
        "Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
        "FAISS": FAISS_AVAILABLE,
    }

    extra_status = {
        "BioPython": BIOPYTHON_AVAILABLE,
        "Datasets": DATASETS_AVAILABLE,
        "PDF (pdfplumber)": PDFPLUMBER_AVAILABLE,
        "PDF (PyPDF2)": PYPDF2_AVAILABLE
    }

    with status_left:
        st.markdown("**Essential Components:**")
        # Missing essentials are rendered as errors.
        for label, ok in core_status.items():
            render, mark = (st.success, "✅") if ok else (st.error, "❌")
            render(f"{mark} {label}")

    with status_right:
        st.markdown("**Optional Components:**")
        # Missing optionals only warrant a warning.
        for label, ok in extra_status.items():
            render, mark = (st.success, "✅") if ok else (st.warning, "⚠️")
            render(f"{mark} {label}")

    history = st.session_state.chat_history
    if history:
        st.markdown("### 📈 Usage Statistics")
        stat1, stat2, stat3 = st.columns(3)
        with stat1:
            st.metric("Total Queries", len(history))
        with stat2:
            mode_list = [h['mode'] for h in history]
            favourite = max(set(mode_list), key=mode_list.count) if mode_list else "N/A"
            st.metric("Most Used Mode", favourite)
        with stat3:
            total_chars = sum(len(h['answer']) for h in history)
            st.metric("Avg Answer Length", f"{total_chars / len(history):.0f} chars")

    st.markdown("""
---
### 📞 Support & Feedback
- Report issues or suggest features
- Contribute to development
- Share your research results

**Version**: 2.0.0 Pro | **Last Updated**: 2025
""")