# Spaces: nullHawk / arxive-semantic-search
# Status: Sleeping
# File size: 3,090 Bytes
# Revision: 2f9fb02

from huggingface_hub import hf_hub_download
from gensim.models import Word2Vec

import faiss

import streamlit as st
import pandas as pd
import dask.dataframe as dd

@st.cache_data
def get_dask_df(df_path='bin/data.parquet'):
    """Open the Parquet dataset at ``df_path`` as a Dask DataFrame.

    The function is wrapped in ``st.cache_data``, so Streamlit reuses the
    result for repeated calls with the same ``df_path``.

    Args:
        df_path: Location of the Parquet data passed to ``dd.read_parquet``.

    Returns:
        Whatever ``dd.read_parquet`` returns for ``df_path``.
    """
    frame = dd.read_parquet(df_path)
    return frame


@st.cache_data
def query_rows(rows: list):
    """Unfinished row-lookup helper.

    At present this only loads the Dask DataFrame via ``get_dask_df()``;
    ``rows`` is accepted but not used, and the function returns ``None``.

    Args:
        rows: Row identifiers to look up (currently ignored).
    """
    # TODO: select `rows` from the dataframe and return them; until then the
    # call below only triggers loading of the dataframe.
    get_dask_df()

@st.cache_resource
def get_model():
    """Download the Word2Vec skip-gram model from the Hugging Face Hub and load it.

    Cached with ``st.cache_resource`` so a single loaded model instance is
    shared across reruns and sessions. ``st.cache_data`` would pickle the
    model and return a new copy on every call, which is wasteful for a large
    model object.

    Returns:
        The model object returned by ``Word2Vec.load``.
    """
    repo_id = "nullHawk/word2vec-skipgram-arxive"
    model_file = "word2vec_arxiv_skipgram.model"

    model_path = hf_hub_download(repo_id=repo_id, filename=model_file)

    # The companion .npy arrays are fetched only for their side effect: the
    # returned paths are not needed. NOTE(review): this presumably places them
    # next to the main model file so Word2Vec.load can find them - confirm.
    for sidecar in (
        "word2vec_arxiv_skipgram.model.syn1neg.npy",
        "word2vec_arxiv_skipgram.model.wv.vectors.npy",
    ):
        hf_hub_download(repo_id=repo_id, filename=sidecar)

    return Word2Vec.load(model_path)

@st.cache_resource
def get_faiss_index(index_path="faiss_index.bin"):
    """Read the FAISS index from disk and keep one shared instance in memory.

    Cached with ``st.cache_resource`` because the index is a large native
    object: ``st.cache_data`` would have to serialize it and hand out a copy
    on every call.

    Args:
        index_path: Path of the serialized index file. Defaults to the
            original hard-coded ``"faiss_index.bin"``.

    Returns:
        The index object returned by ``faiss.read_index``.
    """
    return faiss.read_index(index_path)



# --------------------------------------------------------------
# Placeholder: You will plug your search code here.
# Should return a list of paper dicts with:
# { "title": ..., "authors": ..., "abstract": ..., "url": ... }
# --------------------------------------------------------------
def run_semantic_search(query, top_k):
    """Return placeholder search results (dummy data until real search is wired in).

    Args:
        query: The user's search text. Unused by this placeholder.
        top_k: Number of results to return; zero or a negative value yields
            an empty list.

    Returns:
        A list of ``top_k`` paper dicts, each with the keys ``"title"``,
        ``"authors"``, ``"abstract"`` and ``"url"``.
    """
    # ---- Replace with your search logic ----
    # Example dummy results. Each entry is a separate dict: `[d] * top_k`
    # would repeat one shared object, so editing one result would change all.
    return [
        {
            "title": "Example Paper Title",
            "authors": "John Doe, Jane Smith",
            "abstract": "This is a sample abstract describing the research paper...",
            "url": "https://arxiv.org/abs/1234.5678",
        }
        for _ in range(top_k)
    ]

# ----------------------------------
# Streamlit Page Setup
# ----------------------------------
# Page-level configuration comes first, before anything else is rendered.
st.set_page_config(
    page_title="ArXiv Semantic Search",
    layout="wide",
)

# Page heading and short description.
st.title("🔎 ArXiv Semantic Search Engine")
st.write("Search over millions of research papers using semantic similarity.")

# Sidebar: how many results to show (range 5-50, default 10).
st.sidebar.header("⚙️ Search Options")
top_k = st.sidebar.slider(
    "Top K Results",
    min_value=5,
    max_value=50,
    value=10,
)

# Main area: free-text query box followed by the button that triggers search.
query_hint = "e.g. diffusion models for text-to-image, graph neural networks, LLM alignment..."
query = st.text_input("Enter your search query:", placeholder=query_hint)

search_button = st.button("Search")


# --------------------------------------------------------------
# Handle search click
# --------------------------------------------------------------
if search_button and query.strip():
    with st.spinner("Searching... 🚀"):
        results = run_semantic_search(query, top_k)

    if not results:
        # Give explicit feedback instead of a heading with nothing under it.
        st.info("No results found. Try a different query.")
    else:
        # Report the number of results actually returned, which may be
        # smaller than the requested top_k.
        st.subheader(f"Top {len(results)} Results")

        # Longest abstract prefix shown inside the expander.
        max_preview_chars = 600

        # ------------------------------------------------------
        # Display results (card-style)
        # ------------------------------------------------------
        for i, paper in enumerate(results, start=1):
            st.markdown(f"### **{i}. {paper['title']}**")

            st.markdown(f"**Authors:** {paper['authors']}")
            st.markdown(f"[🔗 View on arXiv]({paper['url']})")

            abstract = paper["abstract"]
            # Append the ellipsis only when the text was really cut off.
            if len(abstract) > max_preview_chars:
                preview = abstract[:max_preview_chars] + "..."
            else:
                preview = abstract

            with st.expander("Abstract Preview"):
                st.write(preview)

            st.markdown("---")
elif search_button:
    # The button was pressed with an empty or whitespace-only query.
    st.warning("Please enter a search query.")