nullHawk committed on
Commit
2f9fb02
·
verified ·
1 Parent(s): c184121

add: streamlit app

Browse files
Files changed (1) hide show
  1. app.py +102 -0
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import hf_hub_download
2
+ from gensim.models import Word2Vec
3
+
4
+ import faiss
5
+
6
+ import streamlit as st
7
+ import pandas as pd
8
+ import dask.dataframe as dd
9
+
10
@st.cache_data
def get_dask_df(df_path='bin/data.parquet'):
    """Open the Parquet dataset at ``df_path`` with Dask.

    The call is wrapped in ``st.cache_data``, so the value is cached per
    distinct ``df_path`` argument.

    Args:
        df_path: Path handed straight to ``dd.read_parquet``.

    Returns:
        The object produced by ``dd.read_parquet(df_path)``.
    """
    frame = dd.read_parquet(df_path)
    return frame
13
+
14
+
15
@st.cache_data
def query_rows(rows: list):
    """Placeholder for fetching the given ``rows`` from the Dask DataFrame.

    NOTE(review): the body only loads the DataFrame. ``rows`` is never used
    and the function returns ``None``; the row-selection logic and the
    intended return shape still need to be defined.

    Args:
        rows: Row identifiers to look up (currently ignored).

    Returns:
        None.
    """
    _frame = get_dask_df()  # loaded, but not consulted yet
    return None
18
+
19
@st.cache_resource
def get_model():
    """Download the arXiv skip-gram Word2Vec model from the Hub and load it.

    Cached with ``st.cache_resource`` rather than ``st.cache_data``: the model
    is a large, read-only object that should be held as one shared instance
    instead of being serialised and copied for every caller.

    Returns:
        The model returned by ``Word2Vec.load`` for the downloaded file.
    """
    repo_id = "nullHawk/word2vec-skipgram-arxive"
    model_file = "word2vec_arxiv_skipgram.model"

    model_path = hf_hub_download(repo_id=repo_id, filename=model_file)

    # The companion .npy arrays are fetched purely for the side effect of
    # having them on disk; their returned paths are not needed.
    # NOTE(review): presumably Word2Vec.load() expects them next to the main
    # model file -- confirm against the gensim save/load documentation.
    for companion in (
        f"{model_file}.syn1neg.npy",
        f"{model_file}.wv.vectors.npy",
    ):
        hf_hub_download(repo_id=repo_id, filename=companion)

    return Word2Vec.load(model_path)
35
+
36
@st.cache_resource
def get_faiss_index(index_path="faiss_index.bin"):
    """Read the FAISS index from disk once and share it across reruns.

    Cached with ``st.cache_resource`` rather than ``st.cache_data``: the index
    is a native object meant to be held as a single shared instance, not
    serialised and copied on every call.

    Args:
        index_path: File to read with ``faiss.read_index``. Defaults to the
            previously hard-coded ``"faiss_index.bin"``.

    Returns:
        The index object returned by ``faiss.read_index(index_path)``.
    """
    return faiss.read_index(index_path)
39
+
40
+
41
+
42
# --------------------------------------------------------------
# Placeholder: plug the real search code in here.
# Should return a list of paper dicts with:
# { "title": ..., "authors": ..., "abstract": ..., "url": ... }
# --------------------------------------------------------------
def run_semantic_search(query, top_k):
    """Return ``top_k`` dummy paper records as a stand-in for real search.

    Args:
        query: The user's search text. Ignored by this placeholder.
        top_k: Number of records to return; values <= 0 yield an empty list.

    Returns:
        A list of ``top_k`` independent dicts, each with the keys
        ``"title"``, ``"authors"``, ``"abstract"`` and ``"url"``.
    """
    # ---- Replace with your search logic ----
    # Build a fresh dict per result. ``[record] * top_k`` would repeat one
    # shared dict, so mutating any result would silently change all of them.
    return [
        {
            "title": "Example Paper Title",
            "authors": "John Doe, Jane Smith",
            "abstract": "This is a sample abstract describing the research paper...",
            "url": "https://arxiv.org/abs/1234.5678",
        }
        for _ in range(top_k)
    ]
58
+
59
# ----------------------------------
# Streamlit Page Setup
# ----------------------------------
st.set_page_config(page_title="ArXiv Semantic Search", layout="wide")

# The emoji below were mis-encoded (mojibake) and are restored here.
st.title("🔎 ArXiv Semantic Search Engine")
st.write("Search over millions of research papers using semantic similarity.")

# Sidebar
st.sidebar.header("⚙️ Search Options")
# Slider arguments: label, minimum, maximum, initial value.
top_k = st.sidebar.slider("Top K Results", 5, 50, 10)

# Main Search Bar
query = st.text_input(
    "Enter your search query:",
    placeholder="e.g. diffusion models for text-to-image, graph neural networks, LLM alignment...",
)

search_button = st.button("Search")
78
+
79
+
80
# --------------------------------------------------------------
# Handle search click
# --------------------------------------------------------------
# Maximum number of abstract characters shown inside the expander.
ABSTRACT_PREVIEW_CHARS = 600

if search_button and query.strip():
    with st.spinner("Searching... 🚀"):
        results = run_semantic_search(query, top_k)

    if not results:
        st.info("No results found. Try a different query.")
    else:
        # Report the number actually returned, which may be below top_k.
        st.subheader(f"Top {len(results)} Results")

    # ----------------------------------------------------------
    # Display results (card-style)
    # ----------------------------------------------------------
    for i, paper in enumerate(results, start=1):
        st.markdown(f"### **{i}. {paper['title']}**")

        st.markdown(f"**Authors:** {paper['authors']}")
        st.markdown(f"[🔗 View on arXiv]({paper['url']})")

        with st.expander("Abstract Preview"):
            abstract = paper["abstract"]
            # Only add an ellipsis when the abstract was really cut short.
            if len(abstract) > ABSTRACT_PREVIEW_CHARS:
                abstract = abstract[:ABSTRACT_PREVIEW_CHARS] + "..."
            st.write(abstract)

        st.markdown("---")
elif search_button:
    # The button was pressed with an empty or whitespace-only query.
    st.warning("Please enter a search query.")
102
+