nullHawk committed on
Commit
bcdf559
·
verified ·
1 Parent(s): 1ae1fe8

fix: optimized search by caching

Browse files
Files changed (1) hide show
  1. app.py +25 -19
app.py CHANGED
@@ -1,6 +1,7 @@
1
  from huggingface_hub import hf_hub_download
2
  from gensim.models import Word2Vec
3
  from nltk import word_tokenize
 
4
 
5
  import faiss
6
  import duckdb
@@ -16,7 +17,8 @@ def get_db(path='arxiv.db'):
16
 
17
 
18
  def query_neighbours(rows: list):
19
- con = get_db()
 
20
  rows = [int(x) for x in rows] # Convert numpy.int64 → Python int
21
  placeholders = ",".join("?" for _ in rows)
22
  df = con.execute(
@@ -26,7 +28,7 @@ def query_neighbours(rows: list):
26
 
27
  return df.to_dict("records")
28
 
29
- @st.cache_data
30
  def get_model():
31
  model_path = hf_hub_download(
32
  repo_id="nullHawk/word2vec-skipgram-arxive",
@@ -43,20 +45,17 @@ def get_model():
43
 
44
  return Word2Vec.load(model_path)
45
 
46
- @st.cache_data
47
  def get_faiss_index():
48
  return faiss.read_index("bin/faiss_search_index.bin")
49
 
50
 
51
 
52
- # --------------------------------------------------------------
53
- # Placeholder: You will plug your search code here.
54
- # Should return a list of paper dicts with:
55
- # { "title": ..., "authors": ..., "abstract": ..., "url": ... }
56
- # --------------------------------------------------------------
57
  def run_semantic_search(query, top_k):
58
- model = get_model()
59
- index = get_faiss_index()
 
 
60
 
61
  words = word_tokenize(query.lower())
62
  vecs = []
@@ -72,16 +71,25 @@ def run_semantic_search(query, top_k):
72
 
73
  return query_neighbours(neighbors[0])
74
 
 
 
 
 
 
 
 
 
 
75
  # ----------------------------------
76
  # Streamlit Page Setup
77
  # ----------------------------------
78
  st.set_page_config(page_title="ArXiv Semantic Search", layout="wide")
79
 
80
- st.title("πŸ”Ž ArXiv Semantic Search Engine")
81
  st.write("Search over millions of research papers using semantic similarity.")
82
 
83
  # Sidebar
84
- st.sidebar.header("βš™οΈ Search Options")
85
  top_k = st.sidebar.slider("Top K Results", 5, 50, 10)
86
 
87
  # Main Search Bar
@@ -97,22 +105,20 @@ search_button = st.button("Search")
97
  # Handle search click
98
  # --------------------------------------------------------------
99
  if search_button and query.strip():
100
- with st.spinner("Searching... πŸš€"):
101
  results = run_semantic_search(query, top_k)
102
 
103
- st.subheader(f"Top {top_k} Results")
104
 
105
  # ----------------------------------------------------------
106
  # Display results (card-style)
107
  # ----------------------------------------------------------
108
  for i, paper in enumerate(results, start=1):
109
- st.markdown(f"### **{i}. {paper['title']}**")
110
 
111
  st.markdown(f"**Categories:** {paper['categories']}")
112
- st.markdown(f"[πŸ”— View on arXiv](https://arxiv.org/abs/{paper['id']})")
113
-
114
- with st.expander("Abstract Preview"):
115
- st.write(paper["abstract"][:600] + "...")
116
 
117
  st.markdown("---")
118
 
 
1
  from huggingface_hub import hf_hub_download
2
  from gensim.models import Word2Vec
3
  from nltk import word_tokenize
4
+ from pylatexenc.latex2text import LatexNodes2Text
5
 
6
  import faiss
7
  import duckdb
 
17
 
18
 
19
  def query_neighbours(rows: list):
20
+ global db
21
+ con = db
22
  rows = [int(x) for x in rows] # Convert numpy.int64 → Python int
23
  placeholders = ",".join("?" for _ in rows)
24
  df = con.execute(
 
28
 
29
  return df.to_dict("records")
30
 
31
+ @st.cache_resource
32
  def get_model():
33
  model_path = hf_hub_download(
34
  repo_id="nullHawk/word2vec-skipgram-arxive",
 
45
 
46
  return Word2Vec.load(model_path)
47
 
48
+ @st.cache_resource
49
  def get_faiss_index():
50
  return faiss.read_index("bin/faiss_search_index.bin")
51
 
52
 
53
 
 
 
 
 
 
54
  def run_semantic_search(query, top_k):
55
+ global model
56
+ global faiss_index
57
+
58
+ index = faiss_index
59
 
60
  words = word_tokenize(query.lower())
61
  vecs = []
 
71
 
72
  return query_neighbours(neighbors[0])
73
 
74
+
75
+ #-----------------------------------
76
+ # Global Variables
77
+ #-----------------------------------
78
+
79
+ model = get_model()
80
+ faiss_index = get_faiss_index()
81
+ db = get_db()
82
+
83
  # ----------------------------------
84
  # Streamlit Page Setup
85
  # ----------------------------------
86
  st.set_page_config(page_title="ArXiv Semantic Search", layout="wide")
87
 
88
+ st.title("ArXiv Semantic Search Engine")
89
  st.write("Search over millions of research papers using semantic similarity.")
90
 
91
  # Sidebar
92
+ st.sidebar.header("Search Options")
93
  top_k = st.sidebar.slider("Top K Results", 5, 50, 10)
94
 
95
  # Main Search Bar
 
105
  # Handle search click
106
  # --------------------------------------------------------------
107
  if search_button and query.strip():
108
+ with st.spinner("Searching..."):
109
  results = run_semantic_search(query, top_k)
110
 
111
+ st.header(f"Top {top_k} Results")
112
 
113
  # ----------------------------------------------------------
114
  # Display results (card-style)
115
  # ----------------------------------------------------------
116
  for i, paper in enumerate(results, start=1):
117
+ st.markdown(f"### **{i}. {LatexNodes2Text().latex_to_text(paper['title'].replace('\n', ' ').strip())}**")
118
 
119
  st.markdown(f"**Categories:** {paper['categories']}")
120
+ st.markdown(f"**Abstract:** {LatexNodes2Text().latex_to_text(paper['abstract'][:600])}...")
121
+ st.markdown(f"[View on arXiv](https://arxiv.org/abs/{paper['id']})")
 
 
122
 
123
  st.markdown("---")
124