Spaces:

openfree
/

BIOseq

Sleeping

App Files Files Community

openfree committed on Aug 26, 2025

Commit

0398d1e

verified ·

1 Parent(s): 63ef9ea

Update app.py

Browse files

Files changed (1) hide show

app.py +342 -317

app.py CHANGED Viewed

@@ -1,76 +1,100 @@
 import os
 import json
-import time
 from typing import List, Dict, Tuple
 import streamlit as st
 import requests
-# Guard imports for optional dependencies
 try:
     import torch
-    from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
     TORCH_AVAILABLE = True
-except Exception:
     TORCH_AVAILABLE = False
 try:
     from datasets import load_dataset
     DATASETS_AVAILABLE = True
-except Exception:
     DATASETS_AVAILABLE = False
 try:
     from sentence_transformers import SentenceTransformer
     SENTENCE_TRANSFORMERS_AVAILABLE = True
-except Exception:
     SENTENCE_TRANSFORMERS_AVAILABLE = False
 try:
     import faiss
     FAISS_AVAILABLE = True
-except Exception:
     FAISS_AVAILABLE = False
 try:
     from Bio import SeqIO
     BIOPYTHON_AVAILABLE = True
-except Exception:
     BIOPYTHON_AVAILABLE = False
-# Constants
 APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
-DISCLAIMER = (
-    "This tool is for research/education and is not a medical device. "
-    "Do not use outputs for diagnosis or treatment decisions."
-)
 # --------------- Helper Functions ---------------
 def get_secret(name: str, fallback: str = "") -> str:
-    """Get secret from st.secrets, environment, or fallback"""
     try:
-        if hasattr(st, 'secrets'):
-            return st.secrets.get(name, os.environ.get(name, fallback))
     except:
         pass
     return os.environ.get(name, fallback)
 def brave_search(query: str, count: int = 5) -> List[Dict]:
-    """Search using Brave Search API"""
     key = get_secret("BRAVE_API_KEY", "")
     if not key:
-        return [{"title": "BRAVE_API_KEY is missing",
-                 "url": "",
-                 "snippet": "Set BRAVE_API_KEY in Space secrets or sidebar to enable web search."}]
     url = "https://api.search.brave.com/res/v1/web/search"
     headers = {
         "Accept": "application/json",
-        "X-Subscription-Token": key,
-        "Accept-Encoding": "gzip"
     }
-    params = {"q": query, "count": count, "country": "us"}
     try:
         r = requests.get(url, headers=headers, params=params, timeout=15)
@@ -81,206 +105,198 @@ def brave_search(query: str, count: int = 5) -> List[Dict]:
             results.append({
                 "title": item.get("title", ""),
                 "url": item.get("url", ""),
-                "snippet": item.get("description", ""),
             })
-        return results if results else [{"title": "No results", "url": "", "snippet": "Query returned no results."}]
     except Exception as e:
-        return [{"title": "Search error", "url": "", "snippet": str(e)}]
-def call_fireworks(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 1024) -> str:
-    """Call Fireworks AI chat completion API"""
     api_key = get_secret("FIREWORKS_API_KEY", "")
     if not api_key:
-        return "FIREWORKS_API_KEY is missing. Set it in Secrets or the sidebar."
     url = "https://api.fireworks.ai/inference/v1/chat/completions"
     payload = {
         "model": "accounts/fireworks/models/llama-v3p1-70b-instruct",
         "max_tokens": max_tokens,
         "top_p": 1,
-        "top_k": 40,
-        "presence_penalty": 0,
         "frequency_penalty": 0,
-        "temperature": temperature,
-        "messages": messages
     }
     headers = {
-        "Accept": "application/json",
         "Content-Type": "application/json",
         "Authorization": f"Bearer {api_key}"
     }
     try:
-        r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=60)
         r.raise_for_status()
-        data = r.json()
-        return data["choices"][0]["message"]["content"]
     except Exception as e:
-        return f"[Fireworks API error] {e}"
-def load_text_from_file(upload) -> str:
     """Load text from uploaded file"""
     name = upload.name.lower()
-    content = upload.read()
     try:
         text = content.decode("utf-8", errors="ignore")
     except:
-        text = str(content)
-    # FASTA file handling
     if name.endswith((".fa", ".fasta", ".faa", ".fna")) and BIOPYTHON_AVAILABLE:
-        upload.seek(0)
         try:
             records = list(SeqIO.parse(upload, "fasta"))
-            seqs = []
-            for r in records:
-                seqs.append(f">{r.id}\n{str(r.seq)}")
             return "\n\n".join(seqs)
         except:
             pass
     return text
-def build_vector_index(texts: List[str], embedder_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
-    """Build FAISS vector index from texts"""
     if not SENTENCE_TRANSFORMERS_AVAILABLE or not FAISS_AVAILABLE:
-        return None, None, None
     try:
-        model = SentenceTransformer(embedder_name)
-        emb = model.encode(texts, show_progress_bar=False, normalize_embeddings=True)
-        dim = emb.shape[1]
         index = faiss.IndexFlatIP(dim)
-        index.add(emb.astype("float32"))
-        return index, emb, model
     except Exception as e:
-        st.warning(f"Failed to build index: {e}")
-        return None, None, None
-def search_index(query: str, index, model, texts: List[str], k: int = 4):
     """Search vector index"""
     if index is None or model is None:
         return []
     try:
-        q = model.encode([query], normalize_embeddings=True)
-        D, I = index.search(q.astype("float32"), k)
-        hits = []
         for idx, score in zip(I[0], D[0]):
             if 0 <= idx < len(texts):
-                hits.append({"score": float(score), "text": texts[idx]})
-        return hits
     except:
         return []
-def esm2_embed(seq: str, model_id: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
-    """Generate ESM-2 embedding for protein sequence"""
-    if not TORCH_AVAILABLE:
-        return {"error": "Transformers/torch not available. Please wait for dependencies to install."}
     try:
-        from transformers import AutoTokenizer, AutoModelForMaskedLM
-        import torch
-        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-        model = AutoModelForMaskedLM.from_pretrained(model_id, trust_remote_code=True)
         model.eval()
         with torch.no_grad():
-            toks = tokenizer(seq, return_tensors="pt")
-            out = model(**toks, output_hidden_states=True)
-            hidden = out.hidden_states[-1].mean(dim=1).squeeze(0)
-            vec = hidden.detach().cpu().numpy()
-            return {"embedding": vec.tolist(), "hidden_size": vec.shape[0]}
     except Exception as e:
         return {"error": str(e)}
-def dna_embed(seq: str, model_id: str = "zhihan1996/DNABERT-2-117M") -> Dict:
-    """Generate DNA embedding"""
-    if not TORCH_AVAILABLE:
-        return {"error": "Transformers/torch not available. Please wait for dependencies to install."}
     try:
-        from transformers import AutoTokenizer, AutoModel
-        import torch
-        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-        model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
         model.eval()
         with torch.no_grad():
-            toks = tokenizer(seq, return_tensors="pt", truncation=True, max_length=4096)
-            out = model(**toks, output_hidden_states=True)
-            hidden = out.last_hidden_state.mean(dim=1).squeeze(0)
-            vec = hidden.detach().cpu().numpy()
-            return {"embedding": vec.tolist(), "hidden_size": vec.shape[0]}
     except Exception as e:
         return {"error": str(e)}
-def chunk_text(text: str, chunk_size: int = 1200, overlap: int = 200) -> List[str]:
-    """Chunk text with overlap"""
-    text = text.replace("\r\n", "\n")
-    chunks = []
-    start = 0
-    while start < len(text):
-        end = min(len(text), start + chunk_size)
-        chunks.append(text[start:end])
-        if end >= len(text):
-            break
-        start = end - overlap
-    return chunks
-def build_context(user_query: str, index, index_model, docs: List[str], loaded_datasets: List, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
-    """Build context from various sources"""
     pieces = []
     sources = []
-    # From uploaded files
-    if index is not None and index_model is not None and docs:
-        hits = search_index(user_query, index, index_model, docs, k=4)
-        for h in hits:
-            pieces.append(f"[FILE] {h['text'][:800]}")
-            sources.append({"type": "file", "text": h["text"][:200]})
-    # From datasets
-    for rid, sample in loaded_datasets:
-        if sample:
-            pieces.append(f"[DATASET {rid}] {sample}")
-            sources.append({"type": "dataset", "id": rid})
-    # From web
     if use_web:
-        results = brave_search(user_query, count=web_k)
         for r in results:
-            snippet = r.get("snippet", "")
-            url = r.get("url", "")
-            title = r.get("title", "")
-            pieces.append(f"[WEB] {title}\n{snippet}\n{url}")
-            sources.append({"type": "web", "title": title, "url": url})
-    context = "\n\n---\n\n".join(pieces)[:6000]
     return context, sources
-def chat_answer(user_query: str, index, index_model, docs: List[str], loaded_datasets: List, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
-    """Generate chat answer with context"""
-    context, sources = build_context(user_query, index, index_model, docs, loaded_datasets, use_web, web_k)
     system = (
-        "You are a concise, careful bioinformatics assistant for protein and DNA. "
-        "Answer with factual, verifiable statements. "
-        "When uncertain, say so briefly. "
-        "Never give medical advice. Provide short references as plain URLs or titles if present in context. "
-        "User uploads and web/dataset snippets are provided as context below."
     )
-    prompt = f"Context:\n{context}\n\nUser question:\n{user_query}\n\nAnswer in Korean if the user used Korean; otherwise match user language."
     messages = [
         {"role": "system", "content": system},
-        {"role": "user", "content": prompt}
     ]
-    answer = call_fireworks(messages, temperature=0.4, max_tokens=1200)
-    return answer, sources
 # --------------- Streamlit UI ---------------
@@ -288,215 +304,224 @@ st.set_page_config(page_title=APP_TITLE, page_icon="🧬", layout="wide")
 st.title(APP_TITLE)
 st.caption(DISCLAIMER)
-# Check dependencies status
-if not TORCH_AVAILABLE:
-    st.warning("⏳ PyTorch is being installed. Some features may be unavailable initially. Please refresh in a minute.")
-# Initialize session state
-if 'docs' not in st.session_state:
     st.session_state.docs = []
-if 'index' not in st.session_state:
     st.session_state.index = None
-if 'index_model' not in st.session_state:
-    st.session_state.index_model = None
-if 'loaded_datasets' not in st.session_state:
-    st.session_state.loaded_datasets = []
-# Sidebar configuration
 with st.sidebar:
-    st.header("Keys and settings")
-    fw_key = st.text_input("FIREWORKS_API_KEY", value=get_secret("FIREWORKS_API_KEY", ""), type="password")
-    brave_key = st.text_input("BRAVE_API_KEY", value=get_secret("BRAVE_API_KEY", ""), type="password")
     if fw_key:
         os.environ["FIREWORKS_API_KEY"] = fw_key
     if brave_key:
         os.environ["BRAVE_API_KEY"] = brave_key
-    st.markdown("### Model selections")
-    esm2_id = st.text_input(
-        "Protein model (ESM-2)",
-        value="facebook/esm2_t6_8M_UR50D",
-        help="Try larger models like facebook/esm2_t33_650M_UR50D if resources allow."
-    )
-    dna_id = st.text_input(
-        "DNA model",
-        value="zhihan1996/DNABERT-2-117M",
-        help="Alternative: InstaDeepAI/nucleotide-transformer-500m-human-ref"
-    )
-    use_web = st.checkbox("Use Brave web search for context", value=True)
-    web_k = st.slider("Web results", 1, 10, 4)
-    st.markdown("### Datasets (optional)")
-    dataset_ids = st.text_area(
-        "Datasets to load (one per line)",
-        value="",
-        help="Enter Hugging Face dataset repo ids, e.g., 'genomics-benchmark/jaspar_motifs'"
     )
-    st.divider()
-    st.markdown("Files you upload are indexed locally and used for answers.")
-# Main tabs
-tabs = st.tabs(["Chat", "Protein", "DNA", "Examples", "About"])
-# File upload section
-with st.expander("Upload files for context (txt/csv/json/fasta/vcf)", expanded=True):
-    uploads = st.file_uploader(
-        "Add files",
-        type=["txt", "md", "csv", "tsv", "json", "fa", "fasta", "faa", "fna", "vcf"],
-        accept_multiple_files=True,
-        key="file_uploader"
     )
-    if uploads:
         docs = []
-        for up in uploads:
             try:
-                txt = load_text_from_file(up)
-                docs.extend(chunk_text(txt))
             except Exception as e:
-                st.warning(f"Failed to read {up.name}: {e}")
-        st.session_state.docs = docs
-        st.caption(f"Indexed chunks: {len(docs)}")
-        # Build index if docs available
-        if docs and SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
-            with st.spinner("Building vector index..."):
-                index, emb, index_model = build_vector_index(docs)
-                st.session_state.index = index
-                st.session_state.index_model = index_model
-    else:
-        st.caption("No files uploaded yet")
-# Load datasets if specified
-if dataset_ids.strip() and DATASETS_AVAILABLE:
-    dataset_list = [x.strip() for x in dataset_ids.splitlines() if x.strip()]
-    if dataset_list != [d[0] for d in st.session_state.loaded_datasets]:
-        st.session_state.loaded_datasets = []
-        for rid in dataset_list:
-            with st.spinner(f"Loading dataset {rid}..."):
-                try:
-                    ds = load_dataset(rid)
-                    sample = ""
-                    for split in ds.keys():
-                        try:
-                            row = ds[split][0]
-                            sample = json.dumps(row, ensure_ascii=False)[:500]
-                            break
-                        except:
-                            pass
-                    st.session_state.loaded_datasets.append((rid, sample))
-                    st.success(f"Loaded {rid}")
-                except Exception as e:
-                    st.error(f"Failed to load {rid}: {e}")
 # Chat tab
-with tabs[0]:
-    st.subheader("Chat")
-    q = st.text_area("Ask a question about protein/DNA", value="ESM-2 임베딩은 단백질 기능 해석에 어떻게 도움되나요?")
-    if st.button("Answer", type="primary"):
-        with st.spinner("Thinking..."):
-            ans, srcs = chat_answer(
-                q,
-                st.session_state.index,
-                st.session_state.index_model,
-                st.session_state.docs,
-                st.session_state.loaded_datasets,
-                use_web,
-                web_k
-            )
-        st.write(ans)
-        if srcs:
-            st.markdown("#### Sources")
-            for s in srcs:
-                if s.get("type") == "web" and s.get("url"):
-                    st.markdown(f"- {s.get('title', 'web')}: {s.get('url')}")
-                elif s.get("type") == "dataset":
-                    st.markdown(f"- dataset: {s.get('id')}")
-                elif s.get("type") == "file":
-                    snippet = s.get("text", "")
-                    st.markdown(f"- file snippet: {snippet[:120]}...")
 # Protein tab
-with tabs[1]:
-    st.subheader("Protein analysis")
-    seq = st.text_area("Protein sequence (amino acids only)", value="MKTIIALSYIFCLVFADYKDDDDK")
     col1, col2 = st.columns(2)
     with col1:
-        st.caption("ESM-2 embedding")
-        if st.button("Run ESM-2", key="run_esm2"):
-            with st.spinner("Computing ESM-2 embedding..."):
-                out = esm2_embed(seq.strip(), esm2_id)
-            if "error" in out:
-                st.error(out["error"])
             else:
-                st.success(f"Vector size: {out['hidden_size']}")
-                st.json({"embedding_preview": out["embedding"][:8]})
     with col2:
-        st.caption("Quick stats")
-        s = seq.replace("\n", "").replace(" ", "").upper()
-        length = len(s)
-        aa_set = sorted(set(list(s)))
-        st.write(f"Length: {length}")
-        st.write(f"Unique AAs: {''.join(aa_set)[:30]}")
 # DNA tab
-with tabs[2]:
-    st.subheader("DNA analysis")
-    dseq = st.text_area("DNA sequence (ACGT only)", value="ATGCGTACGTAGCTAGCTAGCTAGGCTAGC")
-    col3, col4 = st.columns(2)
-    with col3:
-        st.caption("DNA embedding")
-        if st.button("Run DNA embed", key="run_dna"):
-            with st.spinner("Computing DNA embedding..."):
-                out = dna_embed(dseq.strip(), dna_id)
-            if "error" in out:
-                st.error(out["error"])
             else:
-                st.success(f"Vector size: {out['hidden_size']}")
-                st.json({"embedding_preview": out["embedding"][:8]})
-    with col4:
-        st.caption("GC content")
-        s = dseq.upper().replace("N", "").replace(" ", "").replace("\n", "")
-        if len(s) > 0:
-            gc = (s.count("G") + s.count("C")) / len(s)
-        else:
-            gc = 0
-        st.write(f"Length: {len(s)}")
-        st.write(f"GC: {gc:.3f}")
-# Examples tab
-with tabs[3]:
-    st.subheader("Examples")
-    st.markdown("### Example questions you can ask:")
-    st.markdown("- 업로드한 FASTA에서 특정 단백질의 기능 요약과 변이 영향 질문")
-    st.markdown("- DNA 서열에서 프로모터 가능성과 전사인자 모티프 관련 근거 요청")
-    st.markdown("- Enzyme active site 근접 변이의 리스크 해석 (연구 관점)")
-    st.markdown("- ENCODE/UniProt/AlphaFold 개념 설명 요청")
-    st.markdown("- RAG 기반으로 문서 인용과 함께 간략 답변 요청")
 # About tab
-with tabs[4]:
-    st.subheader("About this Space")
-    st.write("**Models suggested:**")
-    st.write("- ESM-2 for proteins")
-    st.write("- DNABERT-2 or Nucleotide Transformer for DNA")
-    st.write("")
-    st.write("**Common datasets:**")
-    st.write("- UniProtKB, AlphaFoldDB, ENCODE, JASPAR, ClinVar")
-    st.write("")
-    st.write("**Features:**")
-    st.write("- Web search powered by Brave Search API")
-    st.write("- LLM powered by Fireworks AI")
-    st.write("- Vector search with FAISS")
-    st.write("")
-    st.info(DISCLAIMER)

 import os
+import sys
 import json
 from typing import List, Dict, Tuple
+# Streamlit 실행 확인
+def _running_in_streamlit() -> bool:
+    try:
+        from streamlit.runtime.scriptrunner import get_script_run_ctx
+        return get_script_run_ctx() is not None
+    except Exception:
+        return False
# Stop immediately when the file is started without a Streamlit script context.
# The two messages say "this app must be run by a Streamlit server" and give
# the command to use; the process then exits with status 0.
if not _running_in_streamlit():
    print(
        "이 앱은 Streamlit 서버로 실행해야 합니다.",
        "명령: streamlit run app.py --server.port=8501 --server.address=0.0.0.0",
        sep="\n",
    )
    sys.exit(0)
 import streamlit as st
 import requests
+# Optional dependency guards
 try:
     import torch
     TORCH_AVAILABLE = True
+except ImportError:
     TORCH_AVAILABLE = False
+    print("[WARNING] torch not available")
+try:
+    from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
+    TRANSFORMERS_AVAILABLE = True
+except ImportError:
+    TRANSFORMERS_AVAILABLE = False
+    print("[WARNING] transformers not available")
 try:
     from datasets import load_dataset
     DATASETS_AVAILABLE = True
+except ImportError:
     DATASETS_AVAILABLE = False
+    print("[WARNING] datasets not available")
 try:
     from sentence_transformers import SentenceTransformer
     SENTENCE_TRANSFORMERS_AVAILABLE = True
+except ImportError:
     SENTENCE_TRANSFORMERS_AVAILABLE = False
+    print("[WARNING] sentence_transformers not available")
 try:
     import faiss
     FAISS_AVAILABLE = True
+except ImportError:
     FAISS_AVAILABLE = False
+    print("[WARNING] faiss not available")
 try:
     from Bio import SeqIO
     BIOPYTHON_AVAILABLE = True
+except ImportError:
     BIOPYTHON_AVAILABLE = False
+    print("[WARNING] biopython not available")
+# Constants
 APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
+DISCLAIMER = "This tool is for research/education and is not a medical device. Do not use outputs for diagnosis or treatment decisions."
 # --------------- Helper Functions ---------------
 def get_secret(name: str, fallback: str = "") -> str:
     """Look up a secret, preferring Streamlit secrets over the environment.

     Args:
         name: Key to look up.
         fallback: Value returned when the key is found in neither place.

     Returns:
         The value from ``st.secrets`` (converted to ``str``) when present,
         otherwise ``os.environ[name]``, otherwise ``fallback``.
     """
     try:
         if hasattr(st, "secrets") and name in st.secrets:
             return str(st.secrets[name])
     except Exception:
         # Best effort: any failure while reading st.secrets (e.g. no secrets
         # configured) falls through to the environment. Only Exception is
         # caught so KeyboardInterrupt/SystemExit still propagate.
         pass
     return os.environ.get(name, fallback)
 def brave_search(query: str, count: int = 5) -> List[Dict]:
+    """Brave Search API"""
     key = get_secret("BRAVE_API_KEY", "")
     if not key:
+        return [{
+            "title": "BRAVE_API_KEY missing",
+            "url": "",
+            "snippet": "Set BRAVE_API_KEY in Space secrets or sidebar"
+        }]
     url = "https://api.search.brave.com/res/v1/web/search"
     headers = {
         "Accept": "application/json",
+        "X-Subscription-Token": key
     }
+    params = {"q": query, "count": count}
     try:
         r = requests.get(url, headers=headers, params=params, timeout=15)
             results.append({
                 "title": item.get("title", ""),
                 "url": item.get("url", ""),
+                "snippet": item.get("description", "")
             })
+        return results if results else [{"title": "No results", "url": "", "snippet": ""}]
     except Exception as e:
+        return [{"title": "Error", "url": "", "snippet": str(e)}]
def call_llm(messages: List[Dict], temperature: float = 0.6, max_tokens: int = 1024) -> str:
    """Send a chat-completion request to Fireworks AI and return the reply text.

    Args:
        messages: Chat messages, each a dict with "role" and "content".
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion length limit forwarded to the API.

    Returns:
        The content of the first choice. When the API key is missing or the
        request fails, a human-readable message is returned instead of raising.
    """
    token = get_secret("FIREWORKS_API_KEY", "")
    if not token:
        return "FIREWORKS_API_KEY missing. Set it in Secrets or sidebar."

    endpoint = "https://api.fireworks.ai/inference/v1/chat/completions"
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {token}",
    }
    request_body = dict(
        model="accounts/fireworks/models/llama-v3p1-70b-instruct",
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    try:
        response = requests.post(endpoint, headers=request_headers, json=request_body, timeout=60)
        response.raise_for_status()
        reply = response.json()
        return reply["choices"][0]["message"]["content"]
    except Exception as exc:
        return f"[LLM Error] {exc}"
def load_file_text(upload) -> str:
    """Return the text content of an uploaded file.

    The upload is rewound (when it supports ``seek``), read fully and decoded
    as UTF-8 with undecodable bytes dropped. Files with a FASTA extension are
    additionally normalised to ``>id`` / sequence records when Biopython is
    available and at least one record is found.

    Args:
        upload: File-like object exposing ``name`` and ``read()`` that
            returns bytes.

    Returns:
        The decoded (or FASTA-normalised) text, or "" when the upload cannot
        be read or decoded.
    """
    import io  # local import: only needed for the FASTA text handle below

    name = upload.name.lower()
    try:
        # Rewind first so an already-consumed handle is not read as empty.
        if hasattr(upload, "seek"):
            upload.seek(0)
        text = upload.read().decode("utf-8", errors="ignore")
    except Exception:
        return ""

    # FASTA handling
    if name.endswith((".fa", ".fasta", ".faa", ".fna")) and BIOPYTHON_AVAILABLE:
        try:
            # Parse from a text handle: the upload yields bytes, and current
            # Biopython releases reject binary handles for FASTA.
            records = list(SeqIO.parse(io.StringIO(text), "fasta"))
        except Exception:
            records = []
        if records:
            return "\n\n".join(f">{r.id}\n{str(r.seq)}" for r in records)
    # Not FASTA, or nothing could be parsed: keep the raw text.
    return text
def chunk_text(text: str, size: int = 1200, overlap: int = 200) -> List[str]:
    """Split text into consecutive chunks that share ``overlap`` characters.

    Args:
        text: Text to split.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk repeats from the end of the
            previous one; must be smaller than ``size``.

    Returns:
        The chunks in order; an empty list for empty text.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap >= size``.
            Either would stop the window from advancing and loop forever.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    if overlap >= size:
        raise ValueError("overlap must be smaller than size")

    chunks = []
    start = 0
    text_len = len(text)
    while start < text_len:
        end = min(start + size, text_len)
        chunks.append(text[start:end])
        if end >= text_len:
            # The final chunk reached the end; do not emit a trailing
            # fragment that is fully contained in it.
            break
        start = end - overlap
    return chunks
def build_index(texts: List[str]):
    """Embed texts and build a FAISS inner-product index over them.

    Args:
        texts: Text chunks to index.

    Returns:
        ``(index, model)`` on success, where ``model`` is the
        SentenceTransformer used for encoding. ``(None, None)`` when there is
        nothing to index, a required library is unavailable, or building
        fails (a Streamlit warning is shown in the failure case).
    """
    if not texts:
        return None, None
    if not SENTENCE_TRANSFORMERS_AVAILABLE or not FAISS_AVAILABLE:
        return None, None
    try:
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        # IndexFlatIP ranks by raw inner product; unit-length vectors make
        # that score a cosine similarity instead of favouring long vectors.
        embeddings = model.encode(
            texts,
            show_progress_bar=False,
            normalize_embeddings=True,
        )
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings.astype("float32"))
        return index, model
    except Exception as e:
        st.warning(f"Index build failed: {e}")
        return None, None
def search_index(query: str, index, model, texts: List[str], k: int = 4) -> List[Dict]:
    """Return the indexed texts most similar to ``query``.

    Args:
        query: Search string.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(scores, ids)`` arrays, e.g. a FAISS index.
        model: Encoder exposing ``encode(list_of_str, normalize_embeddings=...)``.
        texts: The texts the index was built from, in index order.
        k: Number of neighbours to request.

    Returns:
        A list of ``{"score": float, "text": str}`` dicts in the order the
        index returned them. Ids outside ``texts`` are skipped. Any failure
        while encoding or searching yields an empty list.
    """
    if index is None or model is None or not texts:
        return []
    try:
        # Unit-length query so inner-product scores are cosine similarities.
        q_emb = model.encode([query], normalize_embeddings=True)
        scores, ids = index.search(q_emb.astype("float32"), k)
    except Exception:
        return []
    return [
        {"score": float(score), "text": texts[int(idx)]}
        for idx, score in zip(ids[0], scores[0])
        if 0 <= idx < len(texts)
    ]
def esm2_embed(seq: str, model_name: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
    """Compute a mean-pooled ESM-2 embedding for a protein sequence.

    All whitespace is removed from ``seq`` before tokenising, so sequences
    pasted across several lines are treated as one sequence.

    Args:
        seq: Protein sequence.
        model_name: Hugging Face model id to load.

    Returns:
        ``{"embedding": list[float], "size": int}`` on success, where the
        vector is the mean of the last hidden layer over all token positions
        the tokenizer produced. ``{"error": str}`` when the input is empty,
        the libraries are unavailable, or loading/inference fails.
    """
    cleaned = "".join(seq.split()) if seq else ""
    if not cleaned:
        return {"error": "Empty sequence"}
    if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
        return {"error": "PyTorch/Transformers not available"}
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForMaskedLM.from_pretrained(model_name)
        model.eval()
        with torch.no_grad():
            inputs = tokenizer(cleaned, return_tensors="pt")
            outputs = model(**inputs, output_hidden_states=True)
            # Last layer, averaged over the sequence axis, batch axis dropped.
            hidden = outputs.hidden_states[-1].mean(dim=1).squeeze(0)
            vec = hidden.detach().cpu().numpy()
        return {
            "embedding": vec.tolist(),
            "size": vec.shape[0],
        }
    except Exception as e:
        return {"error": str(e)}
def dna_embed(seq: str, model_name: str = "zhihan1996/DNABERT-2-117M") -> Dict:
    """Compute a mean-pooled embedding for a DNA sequence.

    All whitespace is removed from ``seq`` before tokenising, and input is
    truncated to 512 tokens.

    Args:
        seq: DNA sequence.
        model_name: Hugging Face model id to load.

    Returns:
        ``{"embedding": list[float], "size": int}`` on success, where the
        vector is the mean of ``last_hidden_state`` over token positions.
        ``{"error": str}`` when the input is empty, the libraries are
        unavailable, or loading/inference fails.
    """
    cleaned = "".join(seq.split()) if seq else ""
    if not cleaned:
        return {"error": "Empty sequence"}
    if not TORCH_AVAILABLE or not TRANSFORMERS_AVAILABLE:
        return {"error": "PyTorch/Transformers not available"}

    # SECURITY: model_name is filled from a free-text field in the UI.
    # trust_remote_code lets the model repository run its own Python on this
    # machine, so it is enabled only for the repo this app ships as default.
    # Other repos load only if they work without custom code; otherwise the
    # loader's error is returned to the caller.
    trusted_repos = ("zhihan1996/DNABERT-2-117M",)
    allow_remote_code = model_name in trusted_repos

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=allow_remote_code
        )
        model = AutoModel.from_pretrained(
            model_name, trust_remote_code=allow_remote_code
        )
        model.eval()
        with torch.no_grad():
            inputs = tokenizer(cleaned, return_tensors="pt", truncation=True, max_length=512)
            outputs = model(**inputs)
            hidden = outputs.last_hidden_state.mean(dim=1).squeeze(0)
            vec = hidden.detach().cpu().numpy()
        return {
            "embedding": vec.tolist(),
            "size": vec.shape[0],
        }
    except Exception as e:
        return {"error": str(e)}
def build_context(query: str, docs: List[str], index, model, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
    """Collect context snippets for a question from uploaded files and the web.

    Args:
        query: The user's question.
        docs: Text chunks the vector index was built from.
        index: Vector index over ``docs``, or None.
        model: Encoder paired with ``index``, or None.
        use_web: Whether to query Brave Search.
        web_k: Number of web results to request.

    Returns:
        ``(context, sources)``. ``context`` is the snippets joined with
        ``---`` separators and cut to 4000 characters. ``sources`` lists one
        dict per file hit and per web entry.
    """
    pieces = []
    sources = []

    # File search
    if index is not None and model is not None and docs:
        for hit in search_index(query, index, model, docs, k=4):
            body = hit["text"]
            pieces.append(f"[FILE] {body[:500]}")
            sources.append({"type": "file", "text": body[:100]})

    # Web search
    if use_web:
        for item in brave_search(query, count=web_k):
            title = item.get("title", "")
            url = item.get("url", "")
            sources.append({"type": "web", "title": title, "url": url})
            # brave_search reports a missing key, a request error or an empty
            # result set as an entry with an empty url. Those stay in
            # `sources` so the problem is visible, but are kept out of the
            # text handed to the LLM.
            if url:
                pieces.append(f"[WEB] {title}\n{item.get('snippet', '')}")

    context = "\n\n---\n\n".join(pieces)[:4000]
    return context, sources
def answer_question(query: str, context: str) -> str:
    """Ask the LLM to answer ``query`` using ``context`` as background.

    Builds a system message plus one user message containing the context and
    the question, then returns whatever ``call_llm`` returns for them
    (temperature 0.4, at most 1000 tokens).
    """
    system_prompt = (
        "You are a bioinformatics assistant. Be concise and factual. "
        "Never give medical advice. Answer in the user's language."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
    ]
    reply = call_llm(conversation, temperature=0.4, max_tokens=1000)
    return reply
 # --------------- Streamlit UI ---------------
 st.title(APP_TITLE)
 st.caption(DISCLAIMER)
# Seed the per-session keys the rest of the script reads, leaving any value
# that already exists untouched.
for _state_key, _initial in (("docs", []), ("index", None), ("model", None)):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial
# Sidebar
with st.sidebar:
    st.header("Configuration")

    # SECURITY: the key fields start empty. Pre-filling them with
    # get_secret(...) would put the server-side key into the widget value
    # delivered to every visitor; type="password" only masks how it is shown.
    # A configured key is still picked up by get_secret() when the field is
    # left blank.
    fw_key = st.text_input(
        "FIREWORKS_API_KEY",
        value="",
        type="password",
        help="Leave blank to use the key configured on the server.",
    )
    brave_key = st.text_input(
        "BRAVE_API_KEY",
        value="",
        type="password",
        help="Leave blank to use the key configured on the server.",
    )

    # NOTE(review): os.environ is process-wide, so a key typed here is
    # presumably visible to every session served by this process. Consider
    # st.session_state if per-user keys are intended.
    if fw_key:
        os.environ["FIREWORKS_API_KEY"] = fw_key
    if brave_key:
        os.environ["BRAVE_API_KEY"] = brave_key

    st.divider()
    esm_model = st.text_input(
        "ESM-2 Model",
        value="facebook/esm2_t6_8M_UR50D",
    )
    dna_model = st.text_input(
        "DNA Model",
        value="zhihan1996/DNABERT-2-117M",
    )
    use_web = st.checkbox("Enable web search", value=True)
    web_results = st.slider("Web results", 1, 10, 3)
# Tabs
tab1, tab2, tab3, tab4 = st.tabs(["Chat", "Protein", "DNA", "About"])

# File upload
with st.expander("📁 Upload Files", expanded=True):
    files = st.file_uploader(
        "Upload text/FASTA files",
        # .faa/.fna are included because load_file_text treats them as FASTA.
        type=["txt", "fa", "fasta", "faa", "fna", "csv", "json"],
        accept_multiple_files=True,
    )

    # Streamlit re-runs this script on every interaction. Fingerprint the
    # current selection and only re-read / re-index when it changes.
    # NOTE(review): `file_id` is assumed to be stable across reruns where it
    # exists; name and size are the fallback identity.
    upload_sig = tuple(
        (f.name, f.size, getattr(f, "file_id", None)) for f in (files or [])
    )
    if st.session_state.get("upload_sig") != upload_sig:
        docs = []
        for f in files or []:
            try:
                text = load_file_text(f)
                if text:
                    docs.extend(chunk_text(text))
            except Exception as e:
                st.error(f"Error reading {f.name}: {e}")

        # Replace the previous state completely so removed files stop
        # contributing chunks or index hits.
        st.session_state.docs = docs
        st.session_state.index = None
        st.session_state.model = None
        if docs and SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
            with st.spinner("Building index..."):
                index, model = build_index(docs)
                if index is not None:
                    st.session_state.index = index
                    st.session_state.model = model
        st.session_state["upload_sig"] = upload_sig

    if files and st.session_state.docs:
        st.success(f"Loaded {len(st.session_state.docs)} chunks")
 # Chat tab
with tab1:
    st.subheader("💬 Chat Assistant")
    question = st.text_area(
        "Ask about proteins, DNA, or bioinformatics:",
        value="What is the role of ESM-2 embeddings in protein analysis?",
        height=100,
    )
    if st.button("Get Answer", type="primary"):
        if not question.strip():
            # Do not spend a search and an LLM call on a blank question.
            st.warning("Please enter a question")
        elif not get_secret("FIREWORKS_API_KEY"):
            st.error("Please set FIREWORKS_API_KEY")
        else:
            with st.spinner("Thinking..."):
                context, sources = build_context(
                    question,
                    st.session_state.docs,
                    st.session_state.index,
                    st.session_state.model,
                    use_web,
                    web_results,
                )
                answer = answer_question(question, context)

            st.markdown("### Answer")
            st.write(answer)
            if sources:
                st.markdown("### Sources")
                for s in sources:
                    if s["type"] == "web":
                        if s.get("url"):
                            st.write(f"- 🌐 [{s['title']}]({s['url']})")
                        else:
                            # Entries without a url (e.g. search status
                            # messages) are shown as text, not as an empty link.
                            st.write(f"- 🌐 {s['title']}")
                    elif s["type"] == "file":
                        st.write(f"- 📄 File: {s['text'][:80]}...")
 # Protein tab
with tab2:
    st.subheader("🧬 Protein Analysis")
    protein_seq = st.text_area(
        "Enter protein sequence:",
        value="MKTIIALSYIFCLVFA",
        height=100,
    )
    col1, col2 = st.columns(2)
    with col1:
        if st.button("Analyze Protein"):
            # Drop every whitespace character, not just leading/trailing ones,
            # so line breaks in a pasted sequence do not count as residues.
            seq = "".join(protein_seq.split()).upper()
            if not seq:
                st.warning("Please enter a protein sequence")
            else:
                # Basic stats
                st.write(f"**Length:** {len(seq)}")
                st.write(f"**Unique AAs:** {len(set(seq))}")
                # ESM-2 embedding
                if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
                    with st.spinner("Computing embedding..."):
                        result = esm2_embed(seq, esm_model)
                    if "error" in result:
                        st.error(result["error"])
                    else:
                        st.success(f"Embedding size: {result['size']}")
                        st.json({"preview": result["embedding"][:5]})
                else:
                    # Either library may be the missing one; name both.
                    st.warning("PyTorch/Transformers not available for embeddings")
    with col2:
        st.info("Amino acid composition and structure prediction features coming soon")
 # DNA tab
with tab3:
    st.subheader("🧬 DNA Analysis")
    dna_seq = st.text_area(
        "Enter DNA sequence:",
        value="ATGCGATCGTAGC",
        height=100,
    )
    col1, col2 = st.columns(2)
    with col1:
        if st.button("Analyze DNA"):
            # Drop every whitespace character so line breaks in a pasted
            # sequence do not inflate the length or dilute the GC fraction.
            seq = "".join(dna_seq.split()).upper()
            if not seq:
                st.warning("Please enter a DNA sequence")
            else:
                # GC content
                gc = (seq.count("G") + seq.count("C")) / len(seq)
                st.write(f"**Length:** {len(seq)}")
                st.write(f"**GC Content:** {gc:.2%}")
                # DNA embedding
                if TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE:
                    with st.spinner("Computing embedding..."):
                        result = dna_embed(seq, dna_model)
                    if "error" in result:
                        st.error(result["error"])
                    else:
                        st.success(f"Embedding size: {result['size']}")
                        st.json({"preview": result["embedding"][:5]})
                else:
                    # Either library may be the missing one; name both.
                    st.warning("PyTorch/Transformers not available for embeddings")
    with col2:
        st.info("Motif analysis and structure prediction coming soon")
 # About tab
with tab4:
    st.subheader("ℹ️ About")
    # The DNA model line names the repository this app loads by default
    # rather than a vendor: the previous "(Microsoft)" attribution did not
    # match that repository.
    st.markdown("""
    ### Features
    - 💬 RAG-based chat for bioinformatics questions
    - 🧬 Protein sequence analysis with ESM-2
    - 🧬 DNA sequence analysis with DNABERT-2
    - 🔍 Web search integration via Brave API
    - 📁 File upload and vector search
    ### Models
    - **Proteins:** ESM-2 (Facebook)
    - **DNA:** DNABERT-2 (zhihan1996/DNABERT-2-117M)
    - **LLM:** Llama 3.1 70B (via Fireworks)
    ### Disclaimer
    This tool is for research and educational purposes only.
    Not for medical diagnosis or treatment decisions.
    """)

    # Dependency check: one status line per optional library flag.
    st.divider()
    st.subheader("System Status")
    deps = {
        "PyTorch": TORCH_AVAILABLE,
        "Transformers": TRANSFORMERS_AVAILABLE,
        "Sentence Transformers": SENTENCE_TRANSFORMERS_AVAILABLE,
        "FAISS": FAISS_AVAILABLE,
        "BioPython": BIOPYTHON_AVAILABLE,
        "Datasets": DATASETS_AVAILABLE,
    }
    for name, available in deps.items():
        if available:
            st.success(f"✅ {name}")
        else:
            st.warning(f"⚠️ {name} not available")