Jan Mühlnikel
committed on
Commit
·
71de22d
1
Parent(s):
e3302f1
added semantic search function
Browse files- __pycache__/similarity_page.cpython-310.pyc +0 -0
- functions/__pycache__/calc_matches.cpython-310.pyc +0 -0
- functions/__pycache__/filter_projects.cpython-310.pyc +0 -0
- functions/__pycache__/semantic_search.cpython-310.pyc +0 -0
- functions/calc_matches.py +2 -2
- functions/filter_projects.py +10 -2
- {modules → functions}/semantic_search.py +15 -11
- modules/__pycache__/semantic_search.cpython-310.pyc +0 -0
- similarity_page.py +13 -12
__pycache__/similarity_page.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/similarity_page.cpython-310.pyc and b/__pycache__/similarity_page.cpython-310.pyc differ
|
|
|
functions/__pycache__/calc_matches.cpython-310.pyc
CHANGED
|
Binary files a/functions/__pycache__/calc_matches.cpython-310.pyc and b/functions/__pycache__/calc_matches.cpython-310.pyc differ
|
|
|
functions/__pycache__/filter_projects.cpython-310.pyc
CHANGED
|
Binary files a/functions/__pycache__/filter_projects.cpython-310.pyc and b/functions/__pycache__/filter_projects.cpython-310.pyc differ
|
|
|
functions/__pycache__/semantic_search.cpython-310.pyc
ADDED
|
Binary file (1.07 kB). View file
|
|
|
functions/calc_matches.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
import numpy as np
|
| 3 |
|
| 4 |
-
def calc_matches(filtered_df, project_df, similarity_matrix):
|
| 5 |
# matching project2 can be any project
|
| 6 |
# indices (rows) = project1
|
| 7 |
# columns = project2
|
|
@@ -14,7 +14,7 @@ def calc_matches(filtered_df, project_df, similarity_matrix):
|
|
| 14 |
match_matrix = similarity_matrix[filtered_df_indecies_list]
|
| 15 |
|
| 16 |
# get row (project1) and column (project2) with highest similarity in filtered df
|
| 17 |
-
top_indices = np.unravel_index(np.argsort(match_matrix, axis=None)[-
|
| 18 |
|
| 19 |
# get the corresponding similarity values
|
| 20 |
top_values = match_matrix[top_indices]
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import numpy as np
|
| 3 |
|
| 4 |
+
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
|
| 5 |
# matching project2 can be any project
|
| 6 |
# indices (rows) = project1
|
| 7 |
# columns = project2
|
|
|
|
| 14 |
match_matrix = similarity_matrix[filtered_df_indecies_list]
|
| 15 |
|
| 16 |
# get row (project1) and column (project2) with highest similarity in filtered df
|
| 17 |
+
top_indices = np.unravel_index(np.argsort(match_matrix, axis=None)[-top_x:], match_matrix.shape)
|
| 18 |
|
| 19 |
# get the corresponding similarity values
|
| 20 |
top_values = match_matrix[top_indices]
|
functions/filter_projects.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
| 1 |
import pandas as pd
|
|
|
|
| 2 |
|
| 3 |
def contains_code(crs_codes, code_list):
|
| 4 |
codes = str(crs_codes).split(';')
|
| 5 |
return any(code in code_list for code in codes)
|
| 6 |
|
| 7 |
-
def filter_projects(df, crs3_list, crs5_list, sdg_str, country_code_list, orga_code_list):
|
| 8 |
# Check if filters where not all should be selected are empty
|
| 9 |
-
if crs3_list != [] or crs5_list != [] or sdg_str != "":
|
| 10 |
|
| 11 |
# FILTER CRS
|
| 12 |
if crs3_list and not crs5_list:
|
|
@@ -33,6 +34,13 @@ def filter_projects(df, crs3_list, crs5_list, sdg_str, country_code_list, orga_c
|
|
| 33 |
if orga_code_list != []:
|
| 34 |
df = df[df['orga_abbreviation'].isin(orga_code_list)]
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
return df
|
| 38 |
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
+
from functions.semantic_search import search
|
| 3 |
|
| 4 |
def contains_code(crs_codes, code_list):
|
| 5 |
codes = str(crs_codes).split(';')
|
| 6 |
return any(code in code_list for code in codes)
|
| 7 |
|
| 8 |
+
def filter_projects(df, crs3_list, crs5_list, sdg_str, country_code_list, orga_code_list, query, model, embeddings, TOP_X_PROJECTS=30):
|
| 9 |
# Check if filters where not all should be selected are empty
|
| 10 |
+
if crs3_list != [] or crs5_list != [] or sdg_str != "" or query != "":
|
| 11 |
|
| 12 |
# FILTER CRS
|
| 13 |
if crs3_list and not crs5_list:
|
|
|
|
| 34 |
if orga_code_list != []:
|
| 35 |
df = df[df['orga_abbreviation'].isin(orga_code_list)]
|
| 36 |
|
| 37 |
+
# FILTER QUERY
|
| 38 |
+
if query != "" and len(df) > 0:
|
| 39 |
+
if len(df) < TOP_X_PROJECTS:
|
| 40 |
+
TOP_X_PROJECTS = len(df)
|
| 41 |
+
df = search(query, model, embeddings, df, TOP_X_PROJECTS)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
|
| 45 |
return df
|
| 46 |
|
{modules → functions}/semantic_search.py
RENAMED
|
@@ -2,22 +2,26 @@ import pickle
|
|
| 2 |
import faiss
|
| 3 |
import streamlit as st
|
| 4 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 5 |
|
| 6 |
-
def
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
if query:
|
| 10 |
# Convert query to embedding
|
| 11 |
query_embedding = model.encode([query])[0].reshape(1, -1)
|
| 12 |
|
| 13 |
# Perform search
|
| 14 |
-
D, I = faiss_index.search(query_embedding, k=
|
| 15 |
|
| 16 |
# Extract the sentences corresponding to the top indices
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
# Optionally, do something with the selected sentence
|
| 23 |
-
st.write("You selected:", selected_sentence)
|
|
|
|
| 2 |
import faiss
|
| 3 |
import streamlit as st
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
+
import pandas as pd
|
| 6 |
|
| 7 |
+
def search(query, model, embeddings, filtered_df, top_x=30):
    """Return the rows of ``filtered_df`` that are closest to ``query``.

    The embeddings belonging to the rows of ``filtered_df`` are put into a
    temporary FAISS ``IndexFlatL2`` index, the query is encoded with
    ``model`` and the nearest ``top_x`` rows are returned in the order
    FAISS reports them.

    Args:
        query: Free-text search string passed to ``model.encode``.
        model: Object exposing ``encode(list_of_str)``; the first encoded
            row is used as the query vector.
        embeddings: Array indexed with ``filtered_df.index``.
            NOTE(review): this assumes the index labels of ``filtered_df``
            are row positions in ``embeddings`` -- confirm against caller.
        filtered_df: DataFrame of candidate projects.
        top_x: Maximum number of rows to return.

    Returns:
        A DataFrame with at most ``top_x`` rows taken from ``filtered_df``.
        An empty slice of ``filtered_df`` is returned when there is nothing
        to search or ``top_x`` is not positive.
    """
    # Nothing to rank: return an empty frame that keeps the columns.
    if len(filtered_df) == 0 or top_x <= 0:
        return filtered_df.iloc[0:0]

    # Restrict the embeddings to the rows that survived the filters.
    filtered_df_indecies_list = filtered_df.index
    filtered_embeddings = embeddings[filtered_df_indecies_list]

    # Build a fresh index over the filtered embeddings only.
    dimension = filtered_embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(filtered_embeddings)

    # Convert query to embedding
    query_embedding = model.encode([query])[0].reshape(1, -1)

    # Never ask for more neighbours than there are rows: FAISS pads the
    # result with label -1, which ``iloc`` would read as "last row".
    k = min(top_x, len(filtered_df))
    _, neighbour_ids = faiss_index.search(query_embedding, k=k)

    # Drop any remaining padding labels before positional selection.
    top_indecies = [i for i in neighbour_ids[0] if i != -1]

    return filtered_df.iloc[top_indecies]
|
|
|
|
|
|
|
|
|
modules/__pycache__/semantic_search.cpython-310.pyc
CHANGED
|
Binary files a/modules/__pycache__/semantic_search.cpython-310.pyc and b/modules/__pycache__/semantic_search.cpython-310.pyc differ
|
|
|
similarity_page.py
CHANGED
|
@@ -12,7 +12,6 @@ import pickle
|
|
| 12 |
import faiss
|
| 13 |
from sentence_transformers import SentenceTransformer
|
| 14 |
from modules.result_table import show_table
|
| 15 |
-
import modules.semantic_search as semantic_search
|
| 16 |
from functions.filter_projects import filter_projects
|
| 17 |
from functions.calc_matches import calc_matches
|
| 18 |
import psutil
|
|
@@ -107,12 +106,7 @@ def load_embeddings_and_index():
|
|
| 107 |
sentences = stored_data["sentences"]
|
| 108 |
embeddings = stored_data["embeddings"]
|
| 109 |
|
| 110 |
-
|
| 111 |
-
dimension = embeddings.shape[1]
|
| 112 |
-
faiss_index = faiss.IndexFlatL2(dimension)
|
| 113 |
-
faiss_index.add(embeddings)
|
| 114 |
-
|
| 115 |
-
return sentences, embeddings, faiss_index
|
| 116 |
|
| 117 |
# USE CACHE FUNCTIONS
|
| 118 |
sim_matrix = load_sim_matrix()
|
|
@@ -124,8 +118,9 @@ SDG_NAMES = getSDG()
|
|
| 124 |
|
| 125 |
COUNTRY_OPTION_LIST = getCountry()
|
| 126 |
|
|
|
|
| 127 |
model = load_model()
|
| 128 |
-
sentences, embeddings
|
| 129 |
|
| 130 |
def show_page():
|
| 131 |
st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
|
|
@@ -168,8 +163,6 @@ def show_page():
|
|
| 168 |
|
| 169 |
with col2:
|
| 170 |
# COUNTRY SELECTION
|
| 171 |
-
|
| 172 |
-
|
| 173 |
country_option = st.multiselect(
|
| 174 |
'Country / Countries',
|
| 175 |
COUNTRY_OPTION_LIST,
|
|
@@ -186,6 +179,9 @@ def show_page():
|
|
| 186 |
orga_list,
|
| 187 |
placeholder="Select"
|
| 188 |
)
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
|
| 191 |
# CRS CODE LIST
|
|
@@ -205,10 +201,15 @@ def show_page():
|
|
| 205 |
orga_code_list = [option.split("(")[1][:-1].lower() for option in orga_option]
|
| 206 |
|
| 207 |
# FILTER DF WITH SELECTED FILTER OPTIONS
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
# FIND MATCHES
|
| 211 |
-
p1_df, p2_df = calc_matches(filtered_df, projects_df, sim_matrix)
|
| 212 |
|
| 213 |
# SHOW THE RESULT
|
| 214 |
show_table(p1_df, p2_df)
|
|
|
|
| 12 |
import faiss
|
| 13 |
from sentence_transformers import SentenceTransformer
|
| 14 |
from modules.result_table import show_table
|
|
|
|
| 15 |
from functions.filter_projects import filter_projects
|
| 16 |
from functions.calc_matches import calc_matches
|
| 17 |
import psutil
|
|
|
|
| 106 |
sentences = stored_data["sentences"]
|
| 107 |
embeddings = stored_data["embeddings"]
|
| 108 |
|
| 109 |
+
return sentences, embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
# USE CACHE FUNCTIONS
|
| 112 |
sim_matrix = load_sim_matrix()
|
|
|
|
| 118 |
|
| 119 |
COUNTRY_OPTION_LIST = getCountry()
|
| 120 |
|
| 121 |
+
# LOAD MODEL FROM CACHE FOR SEMANTIC SEARCH
|
| 122 |
model = load_model()
|
| 123 |
+
sentences, embeddings = load_embeddings_and_index()
|
| 124 |
|
| 125 |
def show_page():
|
| 126 |
st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
|
|
|
|
| 163 |
|
| 164 |
with col2:
|
| 165 |
# COUNTRY SELECTION
|
|
|
|
|
|
|
| 166 |
country_option = st.multiselect(
|
| 167 |
'Country / Countries',
|
| 168 |
COUNTRY_OPTION_LIST,
|
|
|
|
| 179 |
orga_list,
|
| 180 |
placeholder="Select"
|
| 181 |
)
|
| 182 |
+
|
| 183 |
+
# SEARCH BOX
|
| 184 |
+
query = st.text_input("Enter your search query:")
|
| 185 |
|
| 186 |
|
| 187 |
# CRS CODE LIST
|
|
|
|
| 201 |
orga_code_list = [option.split("(")[1][:-1].lower() for option in orga_option]
|
| 202 |
|
| 203 |
# FILTER DF WITH SELECTED FILTER OPTIONS
|
| 204 |
+
|
| 205 |
+
TOP_X_PROJECTS = 30
|
| 206 |
+
filtered_df = filter_projects(projects_df, crs3_list, crs5_list, sdg_str, country_code_list, orga_code_list, query, model, embeddings, TOP_X_PROJECTS)
|
| 207 |
+
#with col2:
|
| 208 |
+
# Semantic Search
|
| 209 |
+
#searched_filtered_df = semantic_search.show_search(model, embeddings, sentences, filtered_df, TOP_X_PROJECTS)
|
| 210 |
|
| 211 |
# FIND MATCHES
|
| 212 |
+
p1_df, p2_df = calc_matches(filtered_df, projects_df, sim_matrix, TOP_X_PROJECTS)
|
| 213 |
|
| 214 |
# SHOW THE RESULT
|
| 215 |
show_table(p1_df, p2_df)
|