Spaces:

openfree
/

BIOseq

Sleeping

App Files Community

openfree committed on Aug 26

Commit

cb1dc3c

verified ·

1 Parent(s): 8c47e15

Update app.py

Browse files

Files changed (1) hide show

app.py +261 -219

app.py CHANGED Viewed

@@ -1,56 +1,60 @@
 import os
 import json
 import time
-import hashlib
 from typing import List, Dict, Tuple
 import streamlit as st
 import requests
-# Optional heavy deps; guard imports so the app still loads
 try:
     import torch
     from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
-except Exception as e:
-    torch = None
-    AutoTokenizer = None
-    AutoModel = None
-    AutoModelForMaskedLM = None
 try:
     from datasets import load_dataset
 except Exception:
-    load_dataset = None
 try:
     from sentence_transformers import SentenceTransformer
 except Exception:
-    SentenceTransformer = None
 try:
-    import faiss                   # faiss-cpu
 except Exception:
-    faiss = None
 try:
     from Bio import SeqIO
 except Exception:
-    SeqIO = None
 APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
 DISCLAIMER = (
     "This tool is for research/education and is not a medical device. "
     "Do not use outputs for diagnosis or treatment decisions."
 )
-# --------------- Helpers ---------------
 def get_secret(name: str, fallback: str = "") -> str:
     """Get secret from st.secrets, environment, or fallback"""
     try:
-        return st.secrets.get(name, os.environ.get(name, fallback))
-    except Exception:
-        return os.environ.get(name, fallback)
 def brave_search(query: str, count: int = 5) -> List[Dict]:
     """Search using Brave Search API"""
@@ -79,9 +83,7 @@ def brave_search(query: str, count: int = 5) -> List[Dict]:
                 "url": item.get("url", ""),
                 "snippet": item.get("description", ""),
             })
-        if not results:
-            results = [{"title": "No results", "url": "", "snippet": "Query returned no results."}]
-        return results
     except Exception as e:
         return [{"title": "Search error", "url": "", "snippet": str(e)}]
@@ -123,11 +125,11 @@ def load_text_from_file(upload) -> str:
     try:
         text = content.decode("utf-8", errors="ignore")
-    except Exception:
         text = str(content)
-    # FASTA quick parse
-    if name.endswith((".fa", ".fasta", ".faa", ".fna")) and SeqIO is not None:
         upload.seek(0)
         try:
             records = list(SeqIO.parse(upload, "fasta"))
@@ -135,14 +137,14 @@ def load_text_from_file(upload) -> str:
             for r in records:
                 seqs.append(f">{r.id}\n{str(r.seq)}")
             return "\n\n".join(seqs)
-        except Exception:
-            return text
     return text
 def build_vector_index(texts: List[str], embedder_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
     """Build FAISS vector index from texts"""
-    if SentenceTransformer is None or faiss is None:
         return None, None, None
     try:
@@ -169,15 +171,18 @@ def search_index(query: str, index, model, texts: List[str], k: int = 4):
             if 0 <= idx < len(texts):
                 hits.append({"score": float(score), "text": texts[idx]})
         return hits
-    except Exception:
         return []
 def esm2_embed(seq: str, model_id: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
     """Generate ESM-2 embedding for protein sequence"""
-    if AutoTokenizer is None or AutoModelForMaskedLM is None or torch is None:
-        return {"error": "Transformers/torch not available"}
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
         model = AutoModelForMaskedLM.from_pretrained(model_id, trust_remote_code=True)
         model.eval()
@@ -185,18 +190,21 @@ def esm2_embed(seq: str, model_id: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
         with torch.no_grad():
             toks = tokenizer(seq, return_tensors="pt")
             out = model(**toks, output_hidden_states=True)
-            hidden = out.hidden_states[-1].mean(dim=1).squeeze(0)  # [hidden_size]
             vec = hidden.detach().cpu().numpy()
             return {"embedding": vec.tolist(), "hidden_size": vec.shape[0]}
     except Exception as e:
         return {"error": str(e)}
 def dna_embed(seq: str, model_id: str = "zhihan1996/DNABERT-2-117M") -> Dict:
-    """Generate DNABERT-2 or Nucleotide Transformer embedding for DNA sequence"""
-    if AutoTokenizer is None or AutoModel is None or torch is None:
-        return {"error": "Transformers/torch not available"}
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
         model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
         model.eval()
@@ -219,112 +227,13 @@ def chunk_text(text: str, chunk_size: int = 1200, overlap: int = 200) -> List[st
     while start < len(text):
         end = min(len(text), start + chunk_size)
         chunks.append(text[start:end])
-        start = end - overlap
-        if start < 0:
-            start = 0
         if end >= len(text):
             break
     return chunks
-def safe_len(obj, default=0):
-    """Safely get length of object"""
-    try:
-        return len(obj)
-    except Exception:
-        return default
-# --------------- UI ---------------
-st.set_page_config(page_title=APP_TITLE, page_icon="🧬", layout="wide")
-st.title(APP_TITLE)
-st.caption(DISCLAIMER)
-# Sidebar configuration
-with st.sidebar:
-    st.header("Keys and settings")
-    fw_key = st.text_input("FIREWORKS_API_KEY", value=get_secret("FIREWORKS_API_KEY", ""), type="password")
-    brave_key = st.text_input("BRAVE_API_KEY", value=get_secret("BRAVE_API_KEY", ""), type="password")
-    if fw_key:
-        os.environ["FIREWORKS_API_KEY"] = fw_key
-    if brave_key:
-        os.environ["BRAVE_API_KEY"] = brave_key
-    st.markdown("### Model selections")
-    esm2_id = st.text_input(
-        "Protein model (ESM-2)",
-        value="facebook/esm2_t6_8M_UR50D",
-        help="Try larger models like facebook/esm2_t33_650M_UR50D if resources allow."
-    )
-    dna_id = st.text_input(
-        "DNA model",
-        value="zhihan1996/DNABERT-2-117M",
-        help="Alternative: InstaDeepAI/nucleotide-transformer-500m-human-ref"
-    )
-    use_web = st.checkbox("Use Brave web search for context", value=True)
-    web_k = st.slider("Web results", 1, 10, 4)
-    st.markdown("### Datasets (optional)")
-    ds_hint = "Enter a Hugging Face dataset repo id, e.g., 'genomics-benchmark/jaspar_motifs'"
-    dataset_ids = st.text_area("Datasets to load (one per line)", value="", help=ds_hint)
-    st.divider()
-    st.markdown("Files you upload are indexed locally and used for answers.")
-# Main tabs
-tabs = st.tabs(["Chat", "Protein", "DNA", "Examples", "About"])
-# File upload and indexing
-with st.expander("Upload files for context (txt/csv/json/fasta/vcf)", expanded=True):
-    uploads = st.file_uploader(
-        "Add files",
-        type=["txt", "md", "csv", "tsv", "json", "fa", "fasta", "faa", "fna", "vcf"],
-        accept_multiple_files=True
-    )
-    docs = []
-    if uploads:
-        for up in uploads:
-            try:
-                txt = load_text_from_file(up)
-                docs.extend(chunk_text(txt))
-            except Exception as e:
-                st.warning(f"Failed to read {up.name}: {e}")
-    st.caption(f"Indexed chunks: {len(docs)}")
-# Build vector index
-index = None
-index_model = None
-if docs:
-    with st.spinner("Building vector index..."):
-        index, emb, index_model = build_vector_index(docs)
-# Load datasets
-loaded_datasets = []
-if dataset_ids.strip():
-    if load_dataset is None:
-        st.warning("datasets library not available")
-    else:
-        for rid in [x.strip() for x in dataset_ids.splitlines() if x.strip()]:
-            with st.spinner(f"Loading dataset {rid} ..."):
-                try:
-                    ds = load_dataset(rid)
-                    # Show a sample without materializing fully
-                    sample = ""
-                    for split in ds.keys():
-                        try:
-                            row = ds[split][0]
-                            sample = json.dumps(row, ensure_ascii=False)[:500]
-                            break
-                        except Exception:
-                            pass
-                    loaded_datasets.append((rid, sample))
-                    st.success(f"Loaded {rid}")
-                except Exception as e:
-                    st.error(f"Failed to load {rid}: {e}")
-def build_context(user_query: str) -> Tuple[str, List[Dict]]:
     """Build context from various sources"""
     pieces = []
     sources = []
@@ -355,9 +264,9 @@ def build_context(user_query: str) -> Tuple[str, List[Dict]]:
     context = "\n\n---\n\n".join(pieces)[:6000]
     return context, sources
-def chat_answer(user_query: str) -> Tuple[str, List[Dict]]:
     """Generate chat answer with context"""
-    context, sources = build_context(user_query)
     system = (
         "You are a concise, careful bioinformatics assistant for protein and DNA. "
         "Answer with factual, verifiable statements. "
@@ -373,93 +282,226 @@ def chat_answer(user_query: str) -> Tuple[str, List[Dict]]:
     answer = call_fireworks(messages, temperature=0.4, max_tokens=1200)
     return answer, sources
-# Chat tab
-with tabs[0]:
-    st.subheader("Chat")
-    q = st.text_area("Ask a question about protein/DNA", value="ESM-2 임베딩은 단백질 기능 해석에 어떻게 도움되나요?")
-    if st.button("Answer", type="primary"):
-        with st.spinner("Thinking..."):
-            ans, srcs = chat_answer(q)
-        st.write(ans)
-        if srcs:
-            st.markdown("#### Sources")
-            for s in srcs:
-                if s.get("type") == "web" and s.get("url"):
-                    st.markdown(f"- {s.get('title','web')}: {s.get('url')}")
-                elif s.get("type") == "dataset":
-                    st.markdown(f"- dataset: {s.get('id')}")
-                elif s.get("type") == "file":
-                    snippet = s.get("text", "")
-                    st.markdown(f"- file snippet: {snippet[:120]}...")
-# Protein tab
-with tabs[1]:
-    st.subheader("Protein analysis")
-    seq = st.text_area("Protein sequence (FASTA seq only; single sequence)", value="MKTIIALSYIFCLVFADYKDDDDK")
-    col1, col2 = st.columns(2)
-    with col1:
-        st.caption("ESM-2 embedding")
-        if st.button("Run ESM-2", key="run_esm2"):
-            with st.spinner("Computing ESM-2 embedding..."):
-                out = esm2_embed(seq, esm2_id)
-            if "error" in out:
-                st.error(out["error"])
-            else:
-                st.success(f"Vector size: {out['hidden_size']}")
-                st.json({"embedding_preview": out["embedding"][:8]})
-    with col2:
-        st.caption("Quick stats")
-        s = seq.replace("\n", "").replace(" ", "")
-        length = len(s)
-        aa_set = sorted(set(list(s)))
-        st.write(f"Length: {length}")
-        st.write(f"Unique AAs: {''.join(aa_set)[:30]}")
-# DNA tab
-with tabs[2]:
-    st.subheader("DNA analysis")
-    dseq = st.text_area("DNA sequence (ACGT only)", value="ATGCGTACGTAGCTAGCTAGCTAGGCTAGC")
-    col3, col4 = st.columns(2)
-    with col3:
-        st.caption("DNABERT-2 / Nucleotide Transformer embedding")
-        if st.button("Run DNA embed", key="run_dna"):
-            with st.spinner("Computing DNA embedding..."):
-                out = dna_embed(dseq, dna_id)
-            if "error" in out:
-                st.error(out["error"])
             else:
-                st.success(f"Vector size: {out['hidden_size']}")
-                st.json({"embedding_preview": out["embedding"][:8]})
-    with col4:
-        st.caption("GC content")
-        s = dseq.upper().replace("N", "")
-        if len(s) > 0:
-            gc = (s.count("G") + s.count("C")) / len(s)
-        else:
-            gc = 0
-        st.write(f"Length: {len(s)}")
-        st.write(f"GC: {gc:.3f}")
-# Examples tab
-with tabs[3]:
-    st.subheader("Examples")
-    st.markdown("- 업로드한 FASTA에서 특정 단백질의 기능 요약과 변이 영향 질문")
-    st.markdown("- DNA 서열에서 프로모터 가능성과 전사인자 모티프 관련 근거 요청")
-    st.markdown("- Enzyme active site 근접 변이의 리스크 해석(연구 관점)")
-    st.markdown("- ENCODE/UniProt/AlphaFold 개념 설명 요청")
-    st.markdown("- RAG 기반으로 문서 인용과 함께 간략 답변 요청")
-# About tab
-with tabs[4]:
-    st.subheader("About this Space")
-    st.write("Models suggested: ESM-2 for proteins; DNABERT-2 or Nucleotide Transformer for DNA.")
-    st.write("Datasets commonly used: UniProtKB, AlphaFoldDB, ENCODE, JASPAR, ClinVar.")
-    st.write("Web search powered by Brave Search if API key is provided.")
-    st.write("")
-    st.info(DISCLAIMER)

 import os
 import json
 import time
 from typing import List, Dict, Tuple
 import streamlit as st
 import requests
+# Guard imports for optional dependencies
 try:
     import torch
     from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
+    TORCH_AVAILABLE = True
+except Exception:
+    TORCH_AVAILABLE = False
 try:
     from datasets import load_dataset
+    DATASETS_AVAILABLE = True
 except Exception:
+    DATASETS_AVAILABLE = False
 try:
     from sentence_transformers import SentenceTransformer
+    SENTENCE_TRANSFORMERS_AVAILABLE = True
 except Exception:
+    SENTENCE_TRANSFORMERS_AVAILABLE = False
 try:
+    import faiss
+    FAISS_AVAILABLE = True
 except Exception:
+    FAISS_AVAILABLE = False
 try:
     from Bio import SeqIO
+    BIOPYTHON_AVAILABLE = True
 except Exception:
+    BIOPYTHON_AVAILABLE = False
+# Constants
 APP_TITLE = "BioSeq Chat: Protein & DNA Assistant"
 DISCLAIMER = (
     "This tool is for research/education and is not a medical device. "
     "Do not use outputs for diagnosis or treatment decisions."
 )
+# --------------- Helper Functions ---------------
 def get_secret(name: str, fallback: str = "") -> str:
     """Get secret from st.secrets, environment, or fallback"""
     try:
+        if hasattr(st, 'secrets'):
+            return st.secrets.get(name, os.environ.get(name, fallback))
+    except:
+        pass
+    return os.environ.get(name, fallback)
 def brave_search(query: str, count: int = 5) -> List[Dict]:
     """Search using Brave Search API"""
                 "url": item.get("url", ""),
                 "snippet": item.get("description", ""),
             })
+        return results if results else [{"title": "No results", "url": "", "snippet": "Query returned no results."}]
     except Exception as e:
         return [{"title": "Search error", "url": "", "snippet": str(e)}]
     try:
         text = content.decode("utf-8", errors="ignore")
+    except:
         text = str(content)
+    # FASTA file handling
+    if name.endswith((".fa", ".fasta", ".faa", ".fna")) and BIOPYTHON_AVAILABLE:
         upload.seek(0)
         try:
             records = list(SeqIO.parse(upload, "fasta"))
             for r in records:
                 seqs.append(f">{r.id}\n{str(r.seq)}")
             return "\n\n".join(seqs)
+        except:
+            pass
     return text
 def build_vector_index(texts: List[str], embedder_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
     """Build FAISS vector index from texts"""
+    if not SENTENCE_TRANSFORMERS_AVAILABLE or not FAISS_AVAILABLE:
         return None, None, None
     try:
             if 0 <= idx < len(texts):
                 hits.append({"score": float(score), "text": texts[idx]})
         return hits
+    except:
         return []
 def esm2_embed(seq: str, model_id: str = "facebook/esm2_t6_8M_UR50D") -> Dict:
     """Generate ESM-2 embedding for protein sequence"""
+    if not TORCH_AVAILABLE:
+        return {"error": "Transformers/torch not available. Please wait for dependencies to install."}
     try:
+        from transformers import AutoTokenizer, AutoModelForMaskedLM
+        import torch
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
         model = AutoModelForMaskedLM.from_pretrained(model_id, trust_remote_code=True)
         model.eval()
         with torch.no_grad():
             toks = tokenizer(seq, return_tensors="pt")
             out = model(**toks, output_hidden_states=True)
+            hidden = out.hidden_states[-1].mean(dim=1).squeeze(0)
             vec = hidden.detach().cpu().numpy()
             return {"embedding": vec.tolist(), "hidden_size": vec.shape[0]}
     except Exception as e:
         return {"error": str(e)}
 def dna_embed(seq: str, model_id: str = "zhihan1996/DNABERT-2-117M") -> Dict:
+    """Generate DNA embedding"""
+    if not TORCH_AVAILABLE:
+        return {"error": "Transformers/torch not available. Please wait for dependencies to install."}
     try:
+        from transformers import AutoTokenizer, AutoModel
+        import torch
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
         model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
         model.eval()
     while start < len(text):
         end = min(len(text), start + chunk_size)
         chunks.append(text[start:end])
         if end >= len(text):
             break
+        start = end - overlap
     return chunks
+def build_context(user_query: str, index, index_model, docs: List[str], loaded_datasets: List, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
     """Build context from various sources"""
     pieces = []
     sources = []
     context = "\n\n---\n\n".join(pieces)[:6000]
     return context, sources
+def chat_answer(user_query: str, index, index_model, docs: List[str], loaded_datasets: List, use_web: bool, web_k: int) -> Tuple[str, List[Dict]]:
     """Generate chat answer with context"""
+    context, sources = build_context(user_query, index, index_model, docs, loaded_datasets, use_web, web_k)
     system = (
         "You are a concise, careful bioinformatics assistant for protein and DNA. "
         "Answer with factual, verifiable statements. "
     answer = call_fireworks(messages, temperature=0.4, max_tokens=1200)
     return answer, sources
+# --------------- Main Application ---------------
+def main():
+    st.set_page_config(page_title=APP_TITLE, page_icon="🧬", layout="wide")
+    st.title(APP_TITLE)
+    st.caption(DISCLAIMER)
+    # Check dependencies status
+    if not TORCH_AVAILABLE:
+        st.warning("⏳ PyTorch is being installed. Some features may be unavailable initially. Please refresh in a minute.")
+    # Initialize session state
+    if 'docs' not in st.session_state:
+        st.session_state.docs = []
+    if 'index' not in st.session_state:
+        st.session_state.index = None
+    if 'index_model' not in st.session_state:
+        st.session_state.index_model = None
+    if 'loaded_datasets' not in st.session_state:
+        st.session_state.loaded_datasets = []
+    # Sidebar configuration
+    with st.sidebar:
+        st.header("Keys and settings")
+        fw_key = st.text_input("FIREWORKS_API_KEY", value=get_secret("FIREWORKS_API_KEY", ""), type="password")
+        brave_key = st.text_input("BRAVE_API_KEY", value=get_secret("BRAVE_API_KEY", ""), type="password")
+        if fw_key:
+            os.environ["FIREWORKS_API_KEY"] = fw_key
+        if brave_key:
+            os.environ["BRAVE_API_KEY"] = brave_key
+        st.markdown("### Model selections")
+        esm2_id = st.text_input(
+            "Protein model (ESM-2)",
+            value="facebook/esm2_t6_8M_UR50D",
+            help="Try larger models like facebook/esm2_t33_650M_UR50D if resources allow."
+        )
+        dna_id = st.text_input(
+            "DNA model",
+            value="zhihan1996/DNABERT-2-117M",
+            help="Alternative: InstaDeepAI/nucleotide-transformer-500m-human-ref"
+        )
+        use_web = st.checkbox("Use Brave web search for context", value=True)
+        web_k = st.slider("Web results", 1, 10, 4)
+        st.markdown("### Datasets (optional)")
+        dataset_ids = st.text_area(
+            "Datasets to load (one per line)",
+            value="",
+            help="Enter Hugging Face dataset repo ids, e.g., 'genomics-benchmark/jaspar_motifs'"
+        )
+        st.divider()
+        st.markdown("Files you upload are indexed locally and used for answers.")
+    # Main tabs
+    tabs = st.tabs(["Chat", "Protein", "DNA", "Examples", "About"])
+    # File upload section
+    with st.expander("Upload files for context (txt/csv/json/fasta/vcf)", expanded=True):
+        uploads = st.file_uploader(
+            "Add files",
+            type=["txt", "md", "csv", "tsv", "json", "fa", "fasta", "faa", "fna", "vcf"],
+            accept_multiple_files=True,
+            key="file_uploader"
+        )
+        if uploads:
+            docs = []
+            for up in uploads:
+                try:
+                    txt = load_text_from_file(up)
+                    docs.extend(chunk_text(txt))
+                except Exception as e:
+                    st.warning(f"Failed to read {up.name}: {e}")
+            st.session_state.docs = docs
+            st.caption(f"Indexed chunks: {len(docs)}")
+            # Build index if docs available
+            if docs and SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
+                with st.spinner("Building vector index..."):
+                    index, emb, index_model = build_vector_index(docs)
+                    st.session_state.index = index
+                    st.session_state.index_model = index_model
+        else:
+            st.caption("No files uploaded yet")
+    # Load datasets if specified
+    if dataset_ids.strip() and DATASETS_AVAILABLE:
+        dataset_list = [x.strip() for x in dataset_ids.splitlines() if x.strip()]
+        if dataset_list != [d[0] for d in st.session_state.loaded_datasets]:
+            st.session_state.loaded_datasets = []
+            for rid in dataset_list:
+                with st.spinner(f"Loading dataset {rid}..."):
+                    try:
+                        ds = load_dataset(rid)
+                        sample = ""
+                        for split in ds.keys():
+                            try:
+                                row = ds[split][0]
+                                sample = json.dumps(row, ensure_ascii=False)[:500]
+                                break
+                            except:
+                                pass
+                        st.session_state.loaded_datasets.append((rid, sample))
+                        st.success(f"Loaded {rid}")
+                    except Exception as e:
+                        st.error(f"Failed to load {rid}: {e}")
+    # Chat tab
+    with tabs[0]:
+        st.subheader("Chat")
+        q = st.text_area("Ask a question about protein/DNA", value="ESM-2 임베딩은 단백질 기능 해석에 어떻게 도움되나요?")
+        if st.button("Answer", type="primary"):
+            with st.spinner("Thinking..."):
+                ans, srcs = chat_answer(
+                    q,
+                    st.session_state.index,
+                    st.session_state.index_model,
+                    st.session_state.docs,
+                    st.session_state.loaded_datasets,
+                    use_web,
+                    web_k
+                )
+            st.write(ans)
+            if srcs:
+                st.markdown("#### Sources")
+                for s in srcs:
+                    if s.get("type") == "web" and s.get("url"):
+                        st.markdown(f"- {s.get('title', 'web')}: {s.get('url')}")
+                    elif s.get("type") == "dataset":
+                        st.markdown(f"- dataset: {s.get('id')}")
+                    elif s.get("type") == "file":
+                        snippet = s.get("text", "")
+                        st.markdown(f"- file snippet: {snippet[:120]}...")
+    # Protein tab
+    with tabs[1]:
+        st.subheader("Protein analysis")
+        seq = st.text_area("Protein sequence (amino acids only)", value="MKTIIALSYIFCLVFADYKDDDDK")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.caption("ESM-2 embedding")
+            if st.button("Run ESM-2", key="run_esm2"):
+                with st.spinner("Computing ESM-2 embedding..."):
+                    out = esm2_embed(seq.strip(), esm2_id)
+                if "error" in out:
+                    st.error(out["error"])
+                else:
+                    st.success(f"Vector size: {out['hidden_size']}")
+                    st.json({"embedding_preview": out["embedding"][:8]})
+        with col2:
+            st.caption("Quick stats")
+            s = seq.replace("\n", "").replace(" ", "").upper()
+            length = len(s)
+            aa_set = sorted(set(list(s)))
+            st.write(f"Length: {length}")
+            st.write(f"Unique AAs: {''.join(aa_set)[:30]}")
+    # DNA tab
+    with tabs[2]:
+        st.subheader("DNA analysis")
+        dseq = st.text_area("DNA sequence (ACGT only)", value="ATGCGTACGTAGCTAGCTAGCTAGGCTAGC")
+        col3, col4 = st.columns(2)
+        with col3:
+            st.caption("DNA embedding")
+            if st.button("Run DNA embed", key="run_dna"):
+                with st.spinner("Computing DNA embedding..."):
+                    out = dna_embed(dseq.strip(), dna_id)
+                if "error" in out:
+                    st.error(out["error"])
+                else:
+                    st.success(f"Vector size: {out['hidden_size']}")
+                    st.json({"embedding_preview": out["embedding"][:8]})
+        with col4:
+            st.caption("GC content")
+            s = dseq.upper().replace("N", "").replace(" ", "").replace("\n", "")
+            if len(s) > 0:
+                gc = (s.count("G") + s.count("C")) / len(s)
             else:
+                gc = 0
+            st.write(f"Length: {len(s)}")
+            st.write(f"GC: {gc:.3f}")
+    # Examples tab
+    with tabs[3]:
+        st.subheader("Examples")
+        st.markdown("### Example questions you can ask:")
+        st.markdown("- 업로드한 FASTA에서 특정 단백질의 기능 요약과 변이 영향 질문")
+        st.markdown("- DNA 서열에서 프로모터 가능성과 전사인자 모티프 관련 근거 요청")
+        st.markdown("- Enzyme active site 근접 변이의 리스크 해석 (연구 관점)")
+        st.markdown("- ENCODE/UniProt/AlphaFold 개념 설명 요청")
+        st.markdown("- RAG 기반으로 문서 인용과 함께 간략 답변 요청")
+    # About tab
+    with tabs[4]:
+        st.subheader("About this Space")
+        st.write("**Models suggested:**")
+        st.write("- ESM-2 for proteins")
+        st.write("- DNABERT-2 or Nucleotide Transformer for DNA")
+        st.write("")
+        st.write("**Common datasets:**")
+        st.write("- UniProtKB, AlphaFoldDB, ENCODE, JASPAR, ClinVar")
+        st.write("")
+        st.write("**Features:**")
+        st.write("- Web search powered by Brave Search API")
+        st.write("- LLM powered by Fireworks AI")
+        st.write("- Vector search with FAISS")
+        st.write("")
+        st.info(DISCLAIMER)
+# Run the app
+if __name__ == "__main__":
+    main()