Spaces:

openfree
/

BIOseq

Sleeping

App Files Files Community

openfree committed on Aug 26, 2025

Commit

29ce347

verified ·

1 Parent(s): cb1dc3c

Update app.py

Browse files

Files changed (1) hide show

app.py +204 -209

app.py CHANGED Viewed

@@ -95,7 +95,7 @@ def call_fireworks(messages: List[Dict], temperature: float = 0.6, max_tokens: i
     url = "https://api.fireworks.ai/inference/v1/chat/completions"
     payload = {
-        "model": "accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
         "max_tokens": max_tokens,
         "top_p": 1,
         "top_k": 40,
@@ -282,226 +282,221 @@ def chat_answer(user_query: str, index, index_model, docs: List[str], loaded_dat
     answer = call_fireworks(messages, temperature=0.4, max_tokens=1200)
     return answer, sources
-# --------------- Main Application ---------------
-def main():
-    st.set_page_config(page_title=APP_TITLE, page_icon="🧬", layout="wide")
-    st.title(APP_TITLE)
-    st.caption(DISCLAIMER)
-    # Check dependencies status
-    if not TORCH_AVAILABLE:
-        st.warning("⏳ PyTorch is being installed. Some features may be unavailable initially. Please refresh in a minute.")
-    # Initialize session state
-    if 'docs' not in st.session_state:
-        st.session_state.docs = []
-    if 'index' not in st.session_state:
-        st.session_state.index = None
-    if 'index_model' not in st.session_state:
-        st.session_state.index_model = None
-    if 'loaded_datasets' not in st.session_state:
-        st.session_state.loaded_datasets = []
-    # Sidebar configuration
-    with st.sidebar:
-        st.header("Keys and settings")
-        fw_key = st.text_input("FIREWORKS_API_KEY", value=get_secret("FIREWORKS_API_KEY", ""), type="password")
-        brave_key = st.text_input("BRAVE_API_KEY", value=get_secret("BRAVE_API_KEY", ""), type="password")
-        if fw_key:
-            os.environ["FIREWORKS_API_KEY"] = fw_key
-        if brave_key:
-            os.environ["BRAVE_API_KEY"] = brave_key
-        st.markdown("### Model selections")
-        esm2_id = st.text_input(
-            "Protein model (ESM-2)",
-            value="facebook/esm2_t6_8M_UR50D",
-            help="Try larger models like facebook/esm2_t33_650M_UR50D if resources allow."
-        )
-        dna_id = st.text_input(
-            "DNA model",
-            value="zhihan1996/DNABERT-2-117M",
-            help="Alternative: InstaDeepAI/nucleotide-transformer-500m-human-ref"
-        )
-        use_web = st.checkbox("Use Brave web search for context", value=True)
-        web_k = st.slider("Web results", 1, 10, 4)
-        st.markdown("### Datasets (optional)")
-        dataset_ids = st.text_area(
-            "Datasets to load (one per line)",
-            value="",
-            help="Enter Hugging Face dataset repo ids, e.g., 'genomics-benchmark/jaspar_motifs'"
-        )
-        st.divider()
-        st.markdown("Files you upload are indexed locally and used for answers.")
-    # Main tabs
-    tabs = st.tabs(["Chat", "Protein", "DNA", "Examples", "About"])
-    # File upload section
-    with st.expander("Upload files for context (txt/csv/json/fasta/vcf)", expanded=True):
-        uploads = st.file_uploader(
-            "Add files",
-            type=["txt", "md", "csv", "tsv", "json", "fa", "fasta", "faa", "fna", "vcf"],
-            accept_multiple_files=True,
-            key="file_uploader"
-        )
-        if uploads:
-            docs = []
-            for up in uploads:
                 try:
-                    txt = load_text_from_file(up)
-                    docs.extend(chunk_text(txt))
                 except Exception as e:
-                    st.warning(f"Failed to read {up.name}: {e}")
-            st.session_state.docs = docs
-            st.caption(f"Indexed chunks: {len(docs)}")
-            # Build index if docs available
-            if docs and SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
-                with st.spinner("Building vector index..."):
-                    index, emb, index_model = build_vector_index(docs)
-                    st.session_state.index = index
-                    st.session_state.index_model = index_model
-        else:
-            st.caption("No files uploaded yet")
-    # Load datasets if specified
-    if dataset_ids.strip() and DATASETS_AVAILABLE:
-        dataset_list = [x.strip() for x in dataset_ids.splitlines() if x.strip()]
-        if dataset_list != [d[0] for d in st.session_state.loaded_datasets]:
-            st.session_state.loaded_datasets = []
-            for rid in dataset_list:
-                with st.spinner(f"Loading dataset {rid}..."):
-                    try:
-                        ds = load_dataset(rid)
-                        sample = ""
-                        for split in ds.keys():
-                            try:
-                                row = ds[split][0]
-                                sample = json.dumps(row, ensure_ascii=False)[:500]
-                                break
-                            except:
-                                pass
-                        st.session_state.loaded_datasets.append((rid, sample))
-                        st.success(f"Loaded {rid}")
-                    except Exception as e:
-                        st.error(f"Failed to load {rid}: {e}")
-    # Chat tab
-    with tabs[0]:
-        st.subheader("Chat")
-        q = st.text_area("Ask a question about protein/DNA", value="ESM-2 임베딩은 단백질 기능 해석에 어떻게 도움되나요?")
-        if st.button("Answer", type="primary"):
-            with st.spinner("Thinking..."):
-                ans, srcs = chat_answer(
-                    q,
-                    st.session_state.index,
-                    st.session_state.index_model,
-                    st.session_state.docs,
-                    st.session_state.loaded_datasets,
-                    use_web,
-                    web_k
-                )
-            st.write(ans)
-            if srcs:
-                st.markdown("#### Sources")
-                for s in srcs:
-                    if s.get("type") == "web" and s.get("url"):
-                        st.markdown(f"- {s.get('title', 'web')}: {s.get('url')}")
-                    elif s.get("type") == "dataset":
-                        st.markdown(f"- dataset: {s.get('id')}")
-                    elif s.get("type") == "file":
-                        snippet = s.get("text", "")
-                        st.markdown(f"- file snippet: {snippet[:120]}...")
-    # Protein tab
-    with tabs[1]:
-        st.subheader("Protein analysis")
-        seq = st.text_area("Protein sequence (amino acids only)", value="MKTIIALSYIFCLVFADYKDDDDK")
-        col1, col2 = st.columns(2)
-        with col1:
-            st.caption("ESM-2 embedding")
-            if st.button("Run ESM-2", key="run_esm2"):
-                with st.spinner("Computing ESM-2 embedding..."):
-                    out = esm2_embed(seq.strip(), esm2_id)
-                if "error" in out:
-                    st.error(out["error"])
-                else:
-                    st.success(f"Vector size: {out['hidden_size']}")
-                    st.json({"embedding_preview": out["embedding"][:8]})
-        with col2:
-            st.caption("Quick stats")
-            s = seq.replace("\n", "").replace(" ", "").upper()
-            length = len(s)
-            aa_set = sorted(set(list(s)))
-            st.write(f"Length: {length}")
-            st.write(f"Unique AAs: {''.join(aa_set)[:30]}")
-    # DNA tab
-    with tabs[2]:
-        st.subheader("DNA analysis")
-        dseq = st.text_area("DNA sequence (ACGT only)", value="ATGCGTACGTAGCTAGCTAGCTAGGCTAGC")
-        col3, col4 = st.columns(2)
-        with col3:
-            st.caption("DNA embedding")
-            if st.button("Run DNA embed", key="run_dna"):
-                with st.spinner("Computing DNA embedding..."):
-                    out = dna_embed(dseq.strip(), dna_id)
-                if "error" in out:
-                    st.error(out["error"])
-                else:
-                    st.success(f"Vector size: {out['hidden_size']}")
-                    st.json({"embedding_preview": out["embedding"][:8]})
-        with col4:
-            st.caption("GC content")
-            s = dseq.upper().replace("N", "").replace(" ", "").replace("\n", "")
-            if len(s) > 0:
-                gc = (s.count("G") + s.count("C")) / len(s)
             else:
-                gc = 0
-            st.write(f"Length: {len(s)}")
-            st.write(f"GC: {gc:.3f}")
-    # Examples tab
-    with tabs[3]:
-        st.subheader("Examples")
-        st.markdown("### Example questions you can ask:")
-        st.markdown("- 업로드한 FASTA에서 특정 단백질의 기능 요약과 변이 영향 질문")
-        st.markdown("- DNA 서열에서 프로모터 가능성과 전사인자 모티프 관련 근거 요청")
-        st.markdown("- Enzyme active site 근접 변이의 리스크 해석 (연구 관점)")
-        st.markdown("- ENCODE/UniProt/AlphaFold 개념 설명 요청")
-        st.markdown("- RAG 기반으로 문서 인용과 함께 간략 답변 요청")
-    # About tab
-    with tabs[4]:
-        st.subheader("About this Space")
-        st.write("**Models suggested:**")
-        st.write("- ESM-2 for proteins")
-        st.write("- DNABERT-2 or Nucleotide Transformer for DNA")
-        st.write("")
-        st.write("**Common datasets:**")
-        st.write("- UniProtKB, AlphaFoldDB, ENCODE, JASPAR, ClinVar")
-        st.write("")
-        st.write("**Features:**")
-        st.write("- Web search powered by Brave Search API")
-        st.write("- LLM powered by Fireworks AI")
-        st.write("- Vector search with FAISS")
-        st.write("")
-        st.info(DISCLAIMER)
-# Run the app
-if __name__ == "__main__":
-    main()

     url = "https://api.fireworks.ai/inference/v1/chat/completions"
     payload = {
+        "model": "accounts/fireworks/models/llama-v3p1-70b-instruct",
         "max_tokens": max_tokens,
         "top_p": 1,
         "top_k": 40,
     answer = call_fireworks(messages, temperature=0.4, max_tokens=1200)
     return answer, sources
+# --------------- Streamlit UI ---------------
+st.set_page_config(page_title=APP_TITLE, page_icon="🧬", layout="wide")
+st.title(APP_TITLE)
+st.caption(DISCLAIMER)
+# Check dependencies status
+if not TORCH_AVAILABLE:
+    st.warning("⏳ PyTorch is being installed. Some features may be unavailable initially. Please refresh in a minute.")
+# Initialize session state
+if 'docs' not in st.session_state:
+    st.session_state.docs = []
+if 'index' not in st.session_state:
+    st.session_state.index = None
+if 'index_model' not in st.session_state:
+    st.session_state.index_model = None
+if 'loaded_datasets' not in st.session_state:
+    st.session_state.loaded_datasets = []
+# Sidebar configuration
+with st.sidebar:
+    st.header("Keys and settings")
+    fw_key = st.text_input("FIREWORKS_API_KEY", value=get_secret("FIREWORKS_API_KEY", ""), type="password")
+    brave_key = st.text_input("BRAVE_API_KEY", value=get_secret("BRAVE_API_KEY", ""), type="password")
+    if fw_key:
+        os.environ["FIREWORKS_API_KEY"] = fw_key
+    if brave_key:
+        os.environ["BRAVE_API_KEY"] = brave_key
+    st.markdown("### Model selections")
+    esm2_id = st.text_input(
+        "Protein model (ESM-2)",
+        value="facebook/esm2_t6_8M_UR50D",
+        help="Try larger models like facebook/esm2_t33_650M_UR50D if resources allow."
+    )
+    dna_id = st.text_input(
+        "DNA model",
+        value="zhihan1996/DNABERT-2-117M",
+        help="Alternative: InstaDeepAI/nucleotide-transformer-500m-human-ref"
+    )
+    use_web = st.checkbox("Use Brave web search for context", value=True)
+    web_k = st.slider("Web results", 1, 10, 4)
+    st.markdown("### Datasets (optional)")
+    dataset_ids = st.text_area(
+        "Datasets to load (one per line)",
+        value="",
+        help="Enter Hugging Face dataset repo ids, e.g., 'genomics-benchmark/jaspar_motifs'"
+    )
+    st.divider()
+    st.markdown("Files you upload are indexed locally and used for answers.")
+# Main tabs
+tabs = st.tabs(["Chat", "Protein", "DNA", "Examples", "About"])
+# File upload section
+with st.expander("Upload files for context (txt/csv/json/fasta/vcf)", expanded=True):
+    uploads = st.file_uploader(
+        "Add files",
+        type=["txt", "md", "csv", "tsv", "json", "fa", "fasta", "faa", "fna", "vcf"],
+        accept_multiple_files=True,
+        key="file_uploader"
+    )
+    if uploads:
+        docs = []
+        for up in uploads:
+            try:
+                txt = load_text_from_file(up)
+                docs.extend(chunk_text(txt))
+            except Exception as e:
+                st.warning(f"Failed to read {up.name}: {e}")
+        st.session_state.docs = docs
+        st.caption(f"Indexed chunks: {len(docs)}")
+        # Build index if docs available
+        if docs and SENTENCE_TRANSFORMERS_AVAILABLE and FAISS_AVAILABLE:
+            with st.spinner("Building vector index..."):
+                index, emb, index_model = build_vector_index(docs)
+                st.session_state.index = index
+                st.session_state.index_model = index_model
+    else:
+        st.caption("No files uploaded yet")
+# Load datasets if specified
+if dataset_ids.strip() and DATASETS_AVAILABLE:
+    dataset_list = [x.strip() for x in dataset_ids.splitlines() if x.strip()]
+    if dataset_list != [d[0] for d in st.session_state.loaded_datasets]:
+        st.session_state.loaded_datasets = []
+        for rid in dataset_list:
+            with st.spinner(f"Loading dataset {rid}..."):
                 try:
+                    ds = load_dataset(rid)
+                    sample = ""
+                    for split in ds.keys():
+                        try:
+                            row = ds[split][0]
+                            sample = json.dumps(row, ensure_ascii=False)[:500]
+                            break
+                        except:
+                            pass
+                    st.session_state.loaded_datasets.append((rid, sample))
+                    st.success(f"Loaded {rid}")
                 except Exception as e:
+                    st.error(f"Failed to load {rid}: {e}")
+# Chat tab
+with tabs[0]:
+    st.subheader("Chat")
+    q = st.text_area("Ask a question about protein/DNA", value="ESM-2 임베딩은 단백질 기능 해석에 어떻게 도움되나요?")
+    if st.button("Answer", type="primary"):
+        with st.spinner("Thinking..."):
+            ans, srcs = chat_answer(
+                q,
+                st.session_state.index,
+                st.session_state.index_model,
+                st.session_state.docs,
+                st.session_state.loaded_datasets,
+                use_web,
+                web_k
+            )
+        st.write(ans)
+        if srcs:
+            st.markdown("#### Sources")
+            for s in srcs:
+                if s.get("type") == "web" and s.get("url"):
+                    st.markdown(f"- {s.get('title', 'web')}: {s.get('url')}")
+                elif s.get("type") == "dataset":
+                    st.markdown(f"- dataset: {s.get('id')}")
+                elif s.get("type") == "file":
+                    snippet = s.get("text", "")
+                    st.markdown(f"- file snippet: {snippet[:120]}...")
+# Protein tab
+with tabs[1]:
+    st.subheader("Protein analysis")
+    seq = st.text_area("Protein sequence (amino acids only)", value="MKTIIALSYIFCLVFADYKDDDDK")
+    col1, col2 = st.columns(2)
+    with col1:
+        st.caption("ESM-2 embedding")
+        if st.button("Run ESM-2", key="run_esm2"):
+            with st.spinner("Computing ESM-2 embedding..."):
+                out = esm2_embed(seq.strip(), esm2_id)
+            if "error" in out:
+                st.error(out["error"])
             else:
+                st.success(f"Vector size: {out['hidden_size']}")
+                st.json({"embedding_preview": out["embedding"][:8]})
+    with col2:
+        st.caption("Quick stats")
+        s = seq.replace("\n", "").replace(" ", "").upper()
+        length = len(s)
+        aa_set = sorted(set(list(s)))
+        st.write(f"Length: {length}")
+        st.write(f"Unique AAs: {''.join(aa_set)[:30]}")
+# DNA tab: embed the entered sequence on demand and show length / GC fraction.
+with tabs[2]:
+    st.subheader("DNA analysis")
+    dseq = st.text_area("DNA sequence (ACGT only)", value="ATGCGTACGTAGCTAGCTAGCTAGGCTAGC")
+    col3, col4 = st.columns(2)
+    with col3:
+        st.caption("DNA embedding")
+        if st.button("Run DNA embed", key="run_dna"):
+            with st.spinner("Computing DNA embedding..."):
+                out = dna_embed(dseq.strip(), dna_id)
+            # dna_embed reports failure through an "error" key in its result.
+            if "error" in out:
+                st.error(out["error"])
+            else:
+                st.success(f"Vector size: {out['hidden_size']}")
+                # Show only the first 8 values of the embedding as a preview.
+                st.json({"embedding_preview": out["embedding"][:8]})
+    with col4:
+        st.caption("GC content")
+        # Strip N characters, spaces and newlines before counting bases.
+        s = dseq.upper().replace("N", "").replace(" ", "").replace("\n", "")
+        if len(s) > 0:
+            gc = (s.count("G") + s.count("C")) / len(s)
+        else:
+            # Empty input: avoid dividing by zero.
+            gc = 0
+        st.write(f"Length: {len(s)}")
+        st.write(f"GC: {gc:.3f}")
+# Examples tab
+with tabs[3]:
+    st.subheader("Examples")
+    st.markdown("### Example questions you can ask:")
+    st.markdown("- 업로드한 FASTA에서 특정 단백질의 기능 요약과 변이 영향 질문")
+    st.markdown("- DNA 서열에서 프로모터 가능성과 전사인자 모티프 관련 근거 요청")
+    st.markdown("- Enzyme active site 근접 변이의 리스크 해석 (연구 관점)")
+    st.markdown("- ENCODE/UniProt/AlphaFold 개념 설명 요청")
+    st.markdown("- RAG 기반으로 문서 인용과 함께 간략 답변 요청")
+# About tab
+with tabs[4]:
+    st.subheader("About this Space")
+    st.write("**Models suggested:**")
+    st.write("- ESM-2 for proteins")
+    st.write("- DNABERT-2 or Nucleotide Transformer for DNA")
+    st.write("")
+    st.write("**Common datasets:**")
+    st.write("- UniProtKB, AlphaFoldDB, ENCODE, JASPAR, ClinVar")
+    st.write("")
+    st.write("**Features:**")
+    st.write("- Web search powered by Brave Search API")
+    st.write("- LLM powered by Fireworks AI")
+    st.write("- Vector search with FAISS")
+    st.write("")
+    st.info(DISCLAIMER)