# Hugging Face Spaces app (the "Spaces: Running" badge text from the captured
# page has been removed; the runnable code starts below).
# Third-party
import dask.dataframe as dd
import faiss
import pandas as pd
import streamlit as st
from gensim.models import Word2Vec
from huggingface_hub import hf_hub_download
def get_dask_df(df_path='bin/data.parquet'):
    """Lazily open the paper metadata table stored as Parquet.

    Parameters
    ----------
    df_path : str
        Location of the Parquet file; defaults to ``'bin/data.parquet'``.

    Returns
    -------
    dask.dataframe.DataFrame
        A lazy frame — nothing is read from disk until it is computed.
    """
    frame = dd.read_parquet(df_path)
    return frame
def query_rows(rows: list):
    """Fetch the metadata records for the given row labels.

    Fixes a defect in the original, which opened the Dask frame, never
    used ``rows``, and implicitly returned ``None``.

    Parameters
    ----------
    rows : list
        Row index labels to look up.
        # NOTE(review): presumably these are FAISS result ids matching the
        # parquet index — confirm against the search pipeline.

    Returns
    -------
    pandas.DataFrame
        The selected rows, materialized from the lazy Dask frame.
    """
    df = get_dask_df()
    # .loc accepts a list of labels; .compute() materializes to pandas.
    return df.loc[rows].compute()
def get_model():
    """Download the Word2Vec model and its numpy side files, then load it.

    gensim stores large arrays in separate ``.npy`` files next to the main
    ``.model`` file, and ``Word2Vec.load()`` expects to find them in the
    same directory.  The two extra downloads below land in the same
    snapshot cache directory as the main file; the original bound their
    return paths to unused locals, which are removed here.

    Returns
    -------
    gensim.models.Word2Vec
        The fully loaded skip-gram model.
    """
    model_path = hf_hub_download(
        repo_id="nullHawk/word2vec-skipgram-arxive",
        filename="word2vec_arxiv_skipgram.model",
    )
    # Side files required by gensim's lazy array loading — only their
    # presence on disk matters, the returned paths are not needed.
    hf_hub_download(
        repo_id="nullHawk/word2vec-skipgram-arxive",
        filename="word2vec_arxiv_skipgram.model.syn1neg.npy",
    )
    hf_hub_download(
        repo_id="nullHawk/word2vec-skipgram-arxive",
        filename="word2vec_arxiv_skipgram.model.wv.vectors.npy",
    )
    return Word2Vec.load(model_path)
def get_faiss_index():
    """Load the prebuilt FAISS index from disk.

    Returns
    -------
    faiss.Index
        The deserialized index read from ``faiss_index.bin``.
    """
    index_path = "faiss_index.bin"
    return faiss.read_index(index_path)
# --------------------------------------------------------------
# Placeholder: You will plug your search code here.
# Should return a list of paper dicts with:
#   { "title": ..., "authors": ..., "abstract": ..., "url": ... }
# --------------------------------------------------------------
def run_semantic_search(query, top_k):
    """Placeholder search hook — returns ``top_k`` dummy paper records.

    Replace the body with the real Word2Vec + FAISS lookup; keep the same
    return shape (a list of dicts with "title", "authors", "abstract",
    and "url" keys).
    """
    sample_paper = {
        "title": "Example Paper Title",
        "authors": "John Doe, Jane Smith",
        "abstract": "This is a sample abstract describing the research paper...",
        "url": "https://arxiv.org/abs/1234.5678",
    }
    # The same dummy record repeated, exactly as the original placeholder did.
    return [sample_paper] * top_k
# ----------------------------------
# Streamlit Page Setup
# ----------------------------------
st.set_page_config(page_title="ArXiv Semantic Search", layout="wide")
# NOTE(review): the original UI strings contained mojibake ("π", "βοΈ") from
# a bad encoding round-trip; restored to the most likely intended emoji —
# confirm against the deployed Space.
st.title("🔍 ArXiv Semantic Search Engine")
st.write("Search over millions of research papers using semantic similarity.")

# Sidebar controls
st.sidebar.header("⚙️ Search Options")
top_k = st.sidebar.slider("Top K Results", 5, 50, 10)

# Main search bar
query = st.text_input(
    "Enter your search query:",
    placeholder="e.g. diffusion models for text-to-image, graph neural networks, LLM alignment..."
)
search_button = st.button("Search")
# --------------------------------------------------------------
# Handle search click
# --------------------------------------------------------------
if search_button and query.strip():
    # NOTE(review): spinner text contained mojibake ("π"); restored to emoji.
    with st.spinner("Searching... 🔍"):
        results = run_semantic_search(query, top_k)

    st.subheader(f"Top {top_k} Results")

    # ----------------------------------------------------------
    # Display results (card-style)
    # ----------------------------------------------------------
    for i, paper in enumerate(results, start=1):
        st.markdown(f"### **{i}. {paper['title']}**")
        st.markdown(f"**Authors:** {paper['authors']}")
        st.markdown(f"[🔗 View on arXiv]({paper['url']})")
        with st.expander("Abstract Preview"):
            # Only append an ellipsis when the abstract was actually
            # truncated (the original unconditionally added "...").
            abstract = paper["abstract"]
            st.write(abstract[:600] + "..." if len(abstract) > 600 else abstract)
        st.markdown("---")