Spaces:

nullHawk
/

arxive-semantic-search

Sleeping

App Files Files Community

nullHawk committed on Nov 17, 2025

Commit

bcdf559

verified ·

1 Parent(s): 1ae1fe8

fix: optimized search by caching

Browse files

Files changed (1) hide show

app.py +25 -19

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from huggingface_hub import hf_hub_download
 from gensim.models import Word2Vec
 from nltk import word_tokenize
 import faiss
 import duckdb
@@ -16,7 +17,8 @@ def get_db(path='arxiv.db'):
 def query_neighbours(rows: list):
-    con = get_db()
     rows = [int(x) for x in rows] # Convert numpy.int64 → Python int
     placeholders = ",".join("?" for _ in rows)
     df = con.execute(
@@ -26,7 +28,7 @@ def query_neighbours(rows: list):
     return df.to_dict("records")
-@st.cache_data
 def get_model():
     model_path = hf_hub_download(
         repo_id="nullHawk/word2vec-skipgram-arxive",
@@ -43,20 +45,17 @@ def get_model():
     return Word2Vec.load(model_path)
-@st.cache_data
 def get_faiss_index():
     return faiss.read_index("bin/faiss_search_index.bin")
-# --------------------------------------------------------------
-# Placeholder: You will plug your search code here.
-# Should return a list of paper dicts with:
-# { "title": ..., "authors": ..., "abstract": ..., "url": ... }
-# --------------------------------------------------------------
 def run_semantic_search(query, top_k):
-    model = get_model()
-    index = get_faiss_index()
     words = word_tokenize(query.lower())
     vecs = []
@@ -72,16 +71,25 @@ def run_semantic_search(query, top_k):
     return query_neighbours(neighbors[0])
 # ----------------------------------
 # Streamlit Page Setup
 # ----------------------------------
 st.set_page_config(page_title="ArXiv Semantic Search", layout="wide")
-st.title("🔎 ArXiv Semantic Search Engine")
 st.write("Search over millions of research papers using semantic similarity.")
 # Sidebar
-st.sidebar.header("⚙️ Search Options")
 top_k = st.sidebar.slider("Top K Results", 5, 50, 10)
 # Main Search Bar
@@ -97,22 +105,20 @@ search_button = st.button("Search")
 # Handle search click
 # --------------------------------------------------------------
 if search_button and query.strip():
-    with st.spinner("Searching... 🚀"):
         results = run_semantic_search(query, top_k)
-    st.subheader(f"Top {top_k} Results")
     # ----------------------------------------------------------
     # Display results (card-style)
     # ----------------------------------------------------------
     for i, paper in enumerate(results, start=1):
-        st.markdown(f"### **{i}. {paper['title']}**")
         st.markdown(f"**Categories:** {paper['categories']}")
-        st.markdown(f"[🔗 View on arXiv](https://arxiv.org/abs/{paper['id']})")
-        with st.expander("Abstract Preview"):
-            st.write(paper["abstract"][:600] + "...")
         st.markdown("---")

 from huggingface_hub import hf_hub_download
 from gensim.models import Word2Vec
 from nltk import word_tokenize
+from pylatexenc.latex2text import LatexNodes2Text
 import faiss
 import duckdb
 def query_neighbours(rows: list):
+    global db
+    con = db
     rows = [int(x) for x in rows] # Convert numpy.int64 → Python int
     placeholders = ",".join("?" for _ in rows)
     df = con.execute(
     return df.to_dict("records")
+@st.cache_resource
 def get_model():
     model_path = hf_hub_download(
         repo_id="nullHawk/word2vec-skipgram-arxive",
     return Word2Vec.load(model_path)
+@st.cache_resource
 def get_faiss_index():
+    """Read the FAISS index stored at bin/faiss_search_index.bin and return it.
+
+    The function is wrapped in ``st.cache_resource``; presumably this keeps
+    the loaded index from being re-read on every call (verify against the
+    Streamlit caching documentation).
+    """
     return faiss.read_index("bin/faiss_search_index.bin")
 def run_semantic_search(query, top_k):
+    global model
+    global faiss_index
+    index = faiss_index
     words = word_tokenize(query.lower())
     vecs = []
     return query_neighbours(neighbors[0])
+#-----------------------------------
+# Global Variables
+#-----------------------------------
+# Module-level handles shared by the helpers above: query_neighbours uses
+# `db` as its connection, and run_semantic_search declares `model` and
+# `faiss_index` as globals (it searches through `faiss_index`).
+model = get_model()
+faiss_index = get_faiss_index()
+db = get_db()
 # ----------------------------------
 # Streamlit Page Setup
 # ----------------------------------
 st.set_page_config(page_title="ArXiv Semantic Search", layout="wide")
+st.title("ArXiv Semantic Search Engine")
 st.write("Search over millions of research papers using semantic similarity.")
 # Sidebar
+st.sidebar.header("Search Options")
 top_k = st.sidebar.slider("Top K Results", 5, 50, 10)
 # Main Search Bar
 # Handle search click
 # --------------------------------------------------------------
 if search_button and query.strip():
+    # Only search when the button was pressed and the query is not blank.
+    with st.spinner("Searching..."):
         results = run_semantic_search(query, top_k)
+    st.header(f"Top {top_k} Results")
     # ----------------------------------------------------------
     # Display results (card-style)
     # ----------------------------------------------------------
+    # One converter serves every result; it turns LaTeX markup into plain text.
+    latex_to_text = LatexNodes2Text().latex_to_text
     for i, paper in enumerate(results, start=1):
+        # Build the display strings outside the f-strings: reusing the
+        # enclosing quote character, or a backslash, inside an f-string
+        # replacement field is a SyntaxError before Python 3.12 (PEP 701).
+        title = latex_to_text(paper["title"].replace("\n", " ").strip())
+        # Only the first 600 raw characters of the abstract are converted.
+        abstract = latex_to_text(paper["abstract"][:600])
+        st.markdown(f"### **{i}. {title}**")
         st.markdown(f"**Categories:** {paper['categories']}")
+        st.markdown(f"**Abstract:** {abstract}...")
+        st.markdown(f"[View on arXiv](https://arxiv.org/abs/{paper['id']})")
         st.markdown("---")