feat: semantic_search
app.py CHANGED
@@ -1,25 +1,30 @@
 from huggingface_hub import hf_hub_download
 from gensim.models import Word2Vec
+from nltk import word_tokenize

 import faiss
 import duckdb

 import streamlit as st
+import numpy as np
 import pandas as pd
 import dask.dataframe as dd

-@st.
+@st.cache_resource
 def get_db(path='arxiv.db'):
-
+    return duckdb.connect(path)


 def query_neighbours(rows: list):
     con = get_db()
+    rows = [int(x) for x in rows]  # Convert numpy.int64 → Python int
     placeholders = ",".join("?" for _ in rows)
-
+    df = con.execute(
         f"SELECT * FROM arxiv WHERE column0 IN ({placeholders})",
         rows,
-    ).
+    ).fetchdf()
+
+    return df.to_dict("records")

 @st.cache_data
 def get_model():
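The `get_db`/`query_neighbours` pair binds a variable-length `IN` list by expanding one positional `?` placeholder per id, which is DuckDB's standard parameterized-query style. A minimal standalone sketch of the same pattern, with a throwaway in-memory table (the table contents here are illustrative, not the Space's data):

import duckdb

con = duckdb.connect()  # in-memory database, stands in for arxiv.db
con.execute(
    "CREATE TABLE arxiv AS SELECT * FROM "
    "(VALUES (0, 'paper a'), (1, 'paper b'), (2, 'paper c')) t(column0, title)"
)

rows = [0, 2]
placeholders = ",".join("?" for _ in rows)  # "?,?" — one slot per id
df = con.execute(f"SELECT * FROM arxiv WHERE column0 IN ({placeholders})", rows).fetchdf()
print(df.to_dict("records"))  # [{'column0': 0, 'title': 'paper a'}, {'column0': 2, 'title': 'paper c'}]

Using `@st.cache_resource` for the connection (rather than `@st.cache_data`) fits Streamlit's guidance: a live database handle is a shared resource, not serializable data.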
@@ -40,7 +45,7 @@ def get_model():

 @st.cache_data
 def get_faiss_index():
-    return faiss.read_index("
+    return faiss.read_index("bin/faiss_search_index.bin")


@@ -50,16 +55,22 @@ def get_faiss_index():
 # { "title": ..., "authors": ..., "abstract": ..., "url": ... }
 # --------------------------------------------------------------
 def run_semantic_search(query, top_k):
-
-
-
-
-
-
-
-
-
-
+    model = get_model()
+    index = get_faiss_index()
+
+    words = word_tokenize(query.lower())
+    vecs = []
+
+    for w in words:
+        if w in model.wv:
+            vecs.append(model.wv[w])
+    if len(vecs) == 0:
+        return []
+    qvec = np.mean(vecs, axis=0).astype('float32').reshape(1, -1)
+    faiss.normalize_L2(qvec)
+    scores, neighbors = index.search(qvec, top_k)
+
+    return query_neighbours(neighbors[0])

 # ----------------------------------
 # Streamlit Page Setup
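`run_semantic_search` L2-normalizes the query vector and passes FAISS's raw neighbour ids straight to `query_neighbours` as `column0` keys. That implies the index at `bin/faiss_search_index.bin` stores L2-normalized abstract vectors in an inner-product (cosine) index whose row order matches the DuckDB table. The build step isn't part of this diff; a minimal sketch under those assumptions, where `abstract_vecs` is a hypothetical (N, dim) float32 matrix of mean word2vec embeddings:

import faiss
import numpy as np

# Hypothetical stand-in: one mean-word2vec embedding per paper.
# Row i must correspond to column0 == i in arxiv.db.
abstract_vecs = np.random.rand(1000, 100).astype('float32')

faiss.normalize_L2(abstract_vecs)                  # unit vectors: inner product == cosine
index = faiss.IndexFlatIP(abstract_vecs.shape[1])  # exact inner-product search
index.add(abstract_vecs)
faiss.write_index(index, "bin/faiss_search_index.bin")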
@@ -97,8 +108,8 @@ if search_button and query.strip():
 for i, paper in enumerate(results, start=1):
     st.markdown(f"### **{i}. {paper['title']}**")

-    st.markdown(f"**
-    st.markdown(f"[🔗 View on arXiv]({paper['
+    st.markdown(f"**Categories:** {paper['categories']}")
+    st.markdown(f"[🔗 View on arXiv](https://arxiv.org/abs/{paper['id']})")

     with st.expander("Abstract Preview"):
         st.write(paper["abstract"][:600] + "...")
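End to end, the new pipeline is: tokenize the query, average its word2vec vectors, cosine-search the FAISS index, then look the neighbour ids up in DuckDB. With app.py's functions in scope it can be exercised outside Streamlit (the query string and k are arbitrary):

results = run_semantic_search("quantum error correction", top_k=5)
for paper in results:
    print(paper['title'], f"https://arxiv.org/abs/{paper['id']}")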
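One runtime detail the diff doesn't show: `word_tokenize` needs NLTK's punkt tokenizer data, which is not bundled with the package, so the Space presumably fetches it at startup. A typical guard (not present in this diff) looks like:

import nltk

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')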