Jan Mühlnikel
committed on
Commit
·
139b395
1
Parent(s):
5ae2590
added semantic search engine
Browse files
__pycache__/similarity.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/similarity.cpython-310.pyc and b/__pycache__/similarity.cpython-310.pyc differ
|
|
|
similarity.py
CHANGED
|
@@ -8,7 +8,11 @@ Page for similarities
|
|
| 8 |
import streamlit as st
|
| 9 |
import pandas as pd
|
| 10 |
from scipy.sparse import load_npz
|
|
|
|
|
|
|
|
|
|
| 11 |
import utils.similarity_table as similarity_table
|
|
|
|
| 12 |
import psutil
|
| 13 |
import os
|
| 14 |
|
|
@@ -40,14 +44,39 @@ def load_projects():
|
|
| 40 |
|
| 41 |
return projects_df
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
# LOAD DATA
|
| 44 |
sim_matrix = load_sim_matrix()
|
| 45 |
projects_df = load_projects()
|
|
|
|
|
|
|
| 46 |
|
| 47 |
def show_page():
|
| 48 |
st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
|
| 49 |
st.write("Similarities")
|
| 50 |
|
|
|
|
|
|
|
| 51 |
df_subset = projects_df.head(10)
|
| 52 |
selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id'])
|
| 53 |
|
|
|
|
| 8 |
import streamlit as st
|
| 9 |
import pandas as pd
|
| 10 |
from scipy.sparse import load_npz
|
| 11 |
+
import pickle
|
| 12 |
+
import faiss
|
| 13 |
+
from sentence_transformers import SentenceTransformer
|
| 14 |
import utils.similarity_table as similarity_table
|
| 15 |
+
import utils.semantic_search as semantic_search
|
| 16 |
import psutil
|
| 17 |
import os
|
| 18 |
|
|
|
|
| 44 |
|
| 45 |
return projects_df
|
| 46 |
|
@st.cache_resource
def load_model():
    """Load and cache the sentence-transformer encoder used for semantic search.

    Cached with ``st.cache_resource`` so the model is loaded once per
    Streamlit server process and shared across sessions.

    Returns:
        SentenceTransformer: the 'all-MiniLM-L6-v2' encoder.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')
| 51 |
+
|
| 52 |
+
# LOAD EMBEDDINGS
|
| 53 |
+
@st.cache_data
|
| 54 |
+
def load_embeddings_and_index():
|
| 55 |
+
# Load embeddings
|
| 56 |
+
with open("..\synergy-app\src\embeddings.pkl", "rb") as fIn:
|
| 57 |
+
stored_data = pickle.load(fIn)
|
| 58 |
+
sentences = stored_data["sentences"]
|
| 59 |
+
embeddings = stored_data["embeddings"]
|
| 60 |
+
|
| 61 |
+
# Load or create FAISS index
|
| 62 |
+
dimension = embeddings.shape[1]
|
| 63 |
+
faiss_index = faiss.IndexFlatL2(dimension)
|
| 64 |
+
faiss_index.add(embeddings)
|
| 65 |
+
|
| 66 |
+
return sentences, embeddings, faiss_index
|
| 67 |
+
|
| 68 |
# LOAD DATA
# Executed once at module import; each loader is cached by Streamlit, so
# repeated reruns of the script reuse the cached objects.
sim_matrix = load_sim_matrix()
projects_df = load_projects()
# Model and FAISS search state for the semantic search widget.
model = load_model()
sentences, embeddings, faiss_index = load_embeddings_and_index()
| 73 |
|
| 74 |
def show_page():
|
| 75 |
st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
|
| 76 |
st.write("Similarities")
|
| 77 |
|
| 78 |
+
semantic_search.show_search(model, faiss_index, sentences)
|
| 79 |
+
|
| 80 |
df_subset = projects_df.head(10)
|
| 81 |
selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id'])
|
| 82 |
|
utils/__pycache__/semantic_search.cpython-310.pyc
ADDED
|
Binary file (825 Bytes). View file
|
|
|
utils/semantic_search.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pickle
|
| 2 |
+
import faiss
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
|
| 6 |
+
def show_search(model, faiss_index, sentences):
|
| 7 |
+
query = st.text_input("Enter your search query:")
|
| 8 |
+
|
| 9 |
+
if query:
|
| 10 |
+
# Convert query to embedding
|
| 11 |
+
query_embedding = model.encode([query])[0].reshape(1, -1)
|
| 12 |
+
|
| 13 |
+
# Perform search
|
| 14 |
+
D, I = faiss_index.search(query_embedding, k=5) # Search for top 5 similar items
|
| 15 |
+
|
| 16 |
+
# Display results
|
| 17 |
+
st.write("Top results:")
|
| 18 |
+
for i in I[0]:
|
| 19 |
+
st.write(sentences[i])
|