File size: 3,090 Bytes
2f9fb02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from huggingface_hub import hf_hub_download
from gensim.models import Word2Vec

import faiss

import streamlit as st
import pandas as pd
import dask.dataframe as dd

@st.cache_data
def get_dask_df(df_path='bin/data.parquet'):
    """Open the paper metadata parquet dataset as a Dask dataframe.

    The result is memoized by Streamlit's ``st.cache_data`` decorator, keyed
    on ``df_path``.

    Args:
        df_path: Location of the parquet data handed to
            ``dask.dataframe.read_parquet``.

    Returns:
        Whatever ``dd.read_parquet`` returns for ``df_path`` (a Dask
        dataframe).
    """
    papers_frame = dd.read_parquet(df_path)
    return papers_frame


@st.cache_data
def query_rows(rows: list):
    """Load the Dask dataframe; currently returns nothing.

    As written, this only calls ``get_dask_df()`` and discards the result.
    The ``rows`` argument is not used and the function returns ``None``.

    Args:
        rows: Not used by the current implementation.

    Returns:
        None.
    """
    # NOTE(review): this looks unfinished - presumably it is meant to select
    # `rows` from the dataframe. Confirm the intended lookup before using it.
    get_dask_df()
    return None

@st.cache_resource
def get_model():
    """Download and load the Word2Vec skip-gram model from the Hugging Face Hub.

    The model is cached with ``st.cache_resource`` rather than
    ``st.cache_data``. ``cache_data`` serialises the return value and hands a
    fresh copy to every caller, which is wasteful for a large model object.
    ``cache_resource`` keeps a single shared instance for the whole app, so
    callers must treat the returned model as read-only.

    Returns:
        The object returned by ``Word2Vec.load`` for the downloaded model file.
    """
    repo_id = "nullHawk/word2vec-skipgram-arxive"
    model_filename = "word2vec_arxiv_skipgram.model"

    model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)

    # The sidecar .npy arrays are fetched only for their side effect: they are
    # never passed to gensim directly. Presumably Word2Vec.load() picks them
    # up because they sit next to the main model file in the hub cache -
    # verify if the repo layout changes.
    for sidecar_suffix in (".syn1neg.npy", ".wv.vectors.npy"):
        hf_hub_download(repo_id=repo_id, filename=model_filename + sidecar_suffix)

    return Word2Vec.load(model_path)

@st.cache_resource
def get_faiss_index():
    """Load the FAISS index from ``faiss_index.bin`` once per app process.

    The index is cached with ``st.cache_resource`` rather than
    ``st.cache_data``. ``cache_data`` has to serialise the return value and
    copies it on every call, whereas ``cache_resource`` stores the object
    itself. That is the appropriate mode for a large, native-backed resource
    that is shared by all sessions, so callers must not mutate it.

    Returns:
        The index object returned by ``faiss.read_index``.
    """
    return faiss.read_index("faiss_index.bin")



# --------------------------------------------------------------
# Placeholder: You will plug your search code here.
# Should return a list of paper dicts with:
# { "title": ..., "authors": ..., "abstract": ..., "url": ... }
# --------------------------------------------------------------
def run_semantic_search(query, top_k):
    """Return placeholder search results (to be replaced by the real search).

    Args:
        query: The user's search text. Not used by the placeholder.
        top_k: Number of results to produce. Zero or a negative value yields
            an empty list.

    Returns:
        A list of ``top_k`` paper dicts, each with the keys ``"title"``,
        ``"authors"``, ``"abstract"`` and ``"url"``.
    """
    # ---- Replace with your search logic ----
    # Example dummy results. Each entry is built as its own dict: the previous
    # ``[paper] * top_k`` form repeated one shared dict, so changing one
    # result silently changed all of them.
    return [
        {
            "title": "Example Paper Title",
            "authors": "John Doe, Jane Smith",
            "abstract": "This is a sample abstract describing the research paper...",
            "url": "https://arxiv.org/abs/1234.5678",
        }
        for _ in range(top_k)
    ]

# ----------------------------------
# Streamlit Page Setup
# ----------------------------------
# Emoji are written as \N{...} escapes so the UI text stays intact even if the
# file is saved or transferred with the wrong encoding (the literals had been
# corrupted into mojibake).
_ICON_SEARCH = "\N{RIGHT-POINTING MAGNIFYING GLASS}"
_ICON_SETTINGS = "\N{GEAR}\N{VARIATION SELECTOR-16}"
_ICON_ROCKET = "\N{ROCKET}"
_ICON_LINK = "\N{LINK SYMBOL}"

# Maximum number of abstract characters shown inside the preview expander.
_ABSTRACT_PREVIEW_CHARS = 600

st.set_page_config(page_title="ArXiv Semantic Search", layout="wide")

st.title(f"{_ICON_SEARCH} ArXiv Semantic Search Engine")
st.write("Search over millions of research papers using semantic similarity.")

# Sidebar
st.sidebar.header(f"{_ICON_SETTINGS} Search Options")
top_k = st.sidebar.slider("Top K Results", 5, 50, 10)

# Main Search Bar
query = st.text_input(
    "Enter your search query:",
    placeholder="e.g. diffusion models for text-to-image, graph neural networks, LLM alignment..."
)

search_button = st.button("Search")


# --------------------------------------------------------------
# Handle search click
# --------------------------------------------------------------
if search_button and not query.strip():
    # Give feedback instead of silently ignoring a click on an empty query.
    st.warning("Please enter a search query.")
elif search_button:
    with st.spinner(f"Searching... {_ICON_ROCKET}"):
        results = run_semantic_search(query, top_k)

    if not results:
        st.info("No results found.")
    else:
        # Report the number of results actually returned, which can be
        # smaller than the requested top_k.
        st.subheader(f"Top {len(results)} Results")

    # ----------------------------------------------------------
    # Display results (card-style)
    # ----------------------------------------------------------
    for i, paper in enumerate(results, start=1):
        st.markdown(f"### **{i}. {paper['title']}**")

        st.markdown(f"**Authors:** {paper['authors']}")
        st.markdown(f"[{_ICON_LINK} View on arXiv]({paper['url']})")

        with st.expander("Abstract Preview"):
            abstract = paper["abstract"]
            preview = abstract[:_ABSTRACT_PREVIEW_CHARS]
            # Only add an ellipsis when text was actually cut off.
            if len(abstract) > _ABSTRACT_PREVIEW_CHARS:
                preview += "..."
            st.write(preview)

        st.markdown("---")