nullHawk committed on
Commit
1ae1fe8
·
verified ·
1 Parent(s): 3f1d954

feat: semantic_search

Browse files
Files changed (1) hide show
  1. app.py +28 -17
app.py CHANGED
@@ -1,25 +1,30 @@
1
  from huggingface_hub import hf_hub_download
2
  from gensim.models import Word2Vec
 
3
 
4
  import faiss
5
  import duckdb
6
 
7
  import streamlit as st
 
8
  import pandas as pd
9
  import dask.dataframe as dd
10
 
11
- @st.cache_data
12
  def get_db(path='arxiv.db'):
13
- con = duckdb.connect(path)
14
 
15
 
16
  def query_neighbours(rows: list):
17
  con = get_db()
 
18
  placeholders = ",".join("?" for _ in rows)
19
- return con.execute(
20
  f"SELECT * FROM arxiv WHERE column0 IN ({placeholders})",
21
  rows,
22
- ).fetchall()
 
 
23
 
24
  @st.cache_data
25
  def get_model():
@@ -40,7 +45,7 @@ def get_model():
40
 
41
  @st.cache_data
42
  def get_faiss_index():
43
- return faiss.read_index("faiss_index.bin")
44
 
45
 
46
 
@@ -50,16 +55,22 @@ def get_faiss_index():
50
  # { "title": ..., "authors": ..., "abstract": ..., "url": ... }
51
  # --------------------------------------------------------------
52
  def run_semantic_search(query, top_k):
53
- # ---- Replace with your search logic ----
54
- # Example dummy results:
55
- return [
56
- {
57
- "title": "Example Paper Title",
58
- "authors": "John Doe, Jane Smith",
59
- "abstract": "This is a sample abstract describing the research paper...",
60
- "url": "https://arxiv.org/abs/1234.5678"
61
- }
62
- ] * top_k
 
 
 
 
 
 
63
 
64
  # ----------------------------------
65
  # Streamlit Page Setup
@@ -97,8 +108,8 @@ if search_button and query.strip():
97
  for i, paper in enumerate(results, start=1):
98
  st.markdown(f"### **{i}. {paper['title']}**")
99
 
100
- st.markdown(f"**Authors:** {paper['authors']}")
101
- st.markdown(f"[🔗 View on arXiv]({paper['url']})")
102
 
103
  with st.expander("Abstract Preview"):
104
  st.write(paper["abstract"][:600] + "...")
 
1
  from huggingface_hub import hf_hub_download
2
  from gensim.models import Word2Vec
3
+ from nltk import word_tokenize
4
 
5
  import faiss
6
  import duckdb
7
 
8
  import streamlit as st
9
+ import numpy as np
10
  import pandas as pd
11
  import dask.dataframe as dd
12
 
13
@st.cache_resource
def get_db(path='arxiv.db'):
    """Return a cached DuckDB connection to the database at *path*.

    Cached with ``st.cache_resource`` so a single live connection object is
    reused across Streamlit reruns (a connection is a resource, not
    picklable data, so ``st.cache_data`` would not be appropriate here).
    """
    return duckdb.connect(path)
16
 
17
 
18
def query_neighbours(rows: list):
    """Fetch the ``arxiv`` rows whose ``column0`` id is in *rows*.

    Parameters
    ----------
    rows : list
        Row ids to look up. May contain numpy integers (e.g. ids coming
        straight out of a FAISS search result).

    Returns
    -------
    list[dict]
        One dict per matching row (column name -> value). Note that SQL
        ``IN`` does not preserve the ranking order of *rows*.
    """
    # Guard: zero placeholders would render as "IN ()", which is a SQL
    # syntax error — and there is nothing to fetch anyway.
    if not rows:
        return []

    con = get_db()
    # DuckDB parameter binding rejects numpy.int64; coerce to Python int.
    ids = [int(x) for x in rows]
    placeholders = ",".join("?" for _ in ids)
    # Only the placeholder *count* is interpolated; the ids themselves are
    # bound as parameters, so this is not an injection risk.
    df = con.execute(
        f"SELECT * FROM arxiv WHERE column0 IN ({placeholders})",
        ids,
    ).fetchdf()

    return df.to_dict("records")
28
 
29
  @st.cache_data
30
  def get_model():
 
45
 
46
@st.cache_resource
def get_faiss_index():
    """Load and cache the FAISS index used for semantic search.

    Uses ``st.cache_resource`` (matching ``get_db``) rather than
    ``st.cache_data``: a ``faiss.Index`` is a live native object that
    cache_data's pickle-based serializer cannot handle.
    """
    return faiss.read_index("bin/faiss_search_index.bin")
49
 
50
 
51
 
 
55
  # { "title": ..., "authors": ..., "abstract": ..., "url": ... }
56
  # --------------------------------------------------------------
57
def run_semantic_search(query, top_k):
    """Embed *query* with the Word2Vec model and return the top_k papers.

    Parameters
    ----------
    query : str
        Free-text search query.
    top_k : int
        Maximum number of results to return.

    Returns
    -------
    list[dict]
        Row dicts from the ``arxiv`` table — possibly fewer than *top_k*,
        and ``[]`` when no query token is in the model vocabulary.
    """
    model = get_model()
    index = get_faiss_index()

    # Average the word vectors of the in-vocabulary query tokens.
    tokens = word_tokenize(query.lower())
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    if not vecs:
        return []

    qvec = np.mean(vecs, axis=0).astype('float32').reshape(1, -1)
    # L2-normalize the query — presumably the index stores normalized
    # vectors with inner-product metric so scores are cosine; TODO confirm.
    faiss.normalize_L2(qvec)
    _scores, neighbors = index.search(qvec, top_k)

    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; drop the sentinels before hitting the database.
    ids = [i for i in neighbors[0] if i != -1]
    return query_neighbours(ids)
74
 
75
  # ----------------------------------
76
  # Streamlit Page Setup
 
108
  for i, paper in enumerate(results, start=1):
109
  st.markdown(f"### **{i}. {paper['title']}**")
110
 
111
+ st.markdown(f"**Categories:** {paper['categories']}")
112
+ st.markdown(f"[🔗 View on arXiv](https://arxiv.org/abs/{paper['id']})")
113
 
114
  with st.expander("Abstract Preview"):
115
  st.write(paper["abstract"][:600] + "...")