"""Streamlit front-end for semantic search over arXiv papers.

Loads a gensim Word2Vec model (from the Hugging Face Hub) and a FAISS
index, exposes a query box, and renders the top-k results as cards.
`run_semantic_search` is still a placeholder returning dummy results.
"""

import faiss
import dask.dataframe as dd
import pandas as pd
import streamlit as st
from gensim.models import Word2Vec
from huggingface_hub import hf_hub_download


@st.cache_resource
def get_dask_df(df_path='bin/data.parquet'):
    """Return a lazy Dask DataFrame over the paper metadata parquet.

    Uses ``st.cache_resource`` (not ``cache_data``): a Dask frame is a
    lazy task-graph handle, not a plain picklable value.
    """
    return dd.read_parquet(df_path)


@st.cache_data
def query_rows(rows: list):
    """Materialize the metadata rows at the given positional indices.

    Bug fixed: the original assigned ``df`` and fell off the end,
    always returning ``None``.
    """
    df = get_dask_df()
    # NOTE(review): assumes `rows` are index labels of the parquet
    # frame — confirm against how callers produce FAISS result ids.
    return df.loc[rows].compute()


@st.cache_resource
def get_model():
    """Download and load the Word2Vec skip-gram model from the Hub.

    Uses ``st.cache_resource``: a gensim model is a heavyweight object
    that should be shared across sessions, not pickled per rerun.
    """
    model_path = hf_hub_download(
        repo_id="nullHawk/word2vec-skipgram-arxive",
        filename="word2vec_arxiv_skipgram.model",
    )
    # The two .npy sidecar files are never referenced by name, but they
    # must sit next to the .model file on disk for Word2Vec.load() to
    # memory-map the large arrays — download them into the same cache.
    _syn1neg_path = hf_hub_download(
        repo_id="nullHawk/word2vec-skipgram-arxive",
        filename="word2vec_arxiv_skipgram.model.syn1neg.npy",
    )
    _wv_vectors_path = hf_hub_download(
        repo_id="nullHawk/word2vec-skipgram-arxive",
        filename="word2vec_arxiv_skipgram.model.wv.vectors.npy",
    )
    return Word2Vec.load(model_path)


@st.cache_resource
def get_faiss_index():
    """Load the prebuilt FAISS index (shared, unpicklable resource)."""
    return faiss.read_index("faiss_index.bin")


# --------------------------------------------------------------
# Placeholder: You will plug your search code here.
# Should return a list of paper dicts with:
# { "title": ..., "authors": ..., "abstract": ..., "url": ... }
# --------------------------------------------------------------
def run_semantic_search(query, top_k):
    """Return ``top_k`` result dicts for ``query`` (dummy data for now)."""
    # ---- Replace with your search logic ----
    # Build distinct dicts: the original `[{...}] * top_k` repeated ONE
    # shared dict, so mutating any result would mutate all of them.
    return [
        {
            "title": "Example Paper Title",
            "authors": "John Doe, Jane Smith",
            "abstract": "This is a sample abstract describing the research paper...",
            "url": "https://arxiv.org/abs/1234.5678",
        }
        for _ in range(top_k)
    ]


# ----------------------------------
# Streamlit Page Setup
# ----------------------------------
st.set_page_config(page_title="ArXiv Semantic Search", layout="wide")

st.title("🔎 ArXiv Semantic Search Engine")
st.write("Search over millions of research papers using semantic similarity.")

# Sidebar
st.sidebar.header("⚙️ Search Options")
top_k = st.sidebar.slider("Top K Results", 5, 50, 10)

# Main Search Bar
query = st.text_input(
    "Enter your search query:",
    placeholder="e.g. diffusion models for text-to-image, graph neural networks, LLM alignment...",
)

search_button = st.button("Search")

# --------------------------------------------------------------
# Handle search click
# --------------------------------------------------------------
if search_button and query.strip():
    with st.spinner("Searching... 🚀"):
        results = run_semantic_search(query, top_k)

    st.subheader(f"Top {top_k} Results")

    # ----------------------------------------------------------
    # Display results (card-style)
    # ----------------------------------------------------------
    for i, paper in enumerate(results, start=1):
        st.markdown(f"### **{i}. {paper['title']}**")
        st.markdown(f"**Authors:** {paper['authors']}")
        st.markdown(f"[🔗 View on arXiv]({paper['url']})")
        with st.expander("Abstract Preview"):
            abstract = paper["abstract"]
            # Only append an ellipsis when the preview actually truncates.
            preview = abstract[:600] + ("..." if len(abstract) > 600 else "")
            st.write(preview)
        st.markdown("---")