Spaces:

GIZ
/

Development-Project-Synergy-Finder

Sleeping

App Files Files Community

Jan Mühlnikel committed on Mar 19, 2024

Commit

71de22d

1 Parent(s): e3302f1

added semantic search function

Browse files

Files changed (9) hide show

__pycache__/similarity_page.cpython-310.pyc +0 -0
functions/__pycache__/calc_matches.cpython-310.pyc +0 -0
functions/__pycache__/filter_projects.cpython-310.pyc +0 -0
functions/__pycache__/semantic_search.cpython-310.pyc +0 -0
functions/calc_matches.py +2 -2
functions/filter_projects.py +10 -2
{modules → functions}/semantic_search.py +15 -11
modules/__pycache__/semantic_search.cpython-310.pyc +0 -0
similarity_page.py +13 -12

__pycache__/similarity_page.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/similarity_page.cpython-310.pyc and b/__pycache__/similarity_page.cpython-310.pyc differ

functions/__pycache__/calc_matches.cpython-310.pyc CHANGED Viewed

Binary files a/functions/__pycache__/calc_matches.cpython-310.pyc and b/functions/__pycache__/calc_matches.cpython-310.pyc differ

functions/__pycache__/filter_projects.cpython-310.pyc CHANGED Viewed

Binary files a/functions/__pycache__/filter_projects.cpython-310.pyc and b/functions/__pycache__/filter_projects.cpython-310.pyc differ

functions/__pycache__/semantic_search.cpython-310.pyc ADDED Viewed

Binary file (1.07 kB). View file

functions/calc_matches.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
-def calc_matches(filtered_df, project_df, similarity_matrix):
     # matching project2 can be any project
     # indices (rows) = project1
     # columns = project2
@@ -14,7 +14,7 @@ def calc_matches(filtered_df, project_df, similarity_matrix):
     match_matrix = similarity_matrix[filtered_df_indecies_list]
     # get row (project1) and column (project2) with highest similarity in filtered df
-    top_indices = np.unravel_index(np.argsort(match_matrix, axis=None)[-30:], match_matrix.shape)
     # get the corresponding similarity values
     top_values = match_matrix[top_indices]

 import pandas as pd
 import numpy as np
+def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
     # matching project2 can be any project
     # indices (rows) = project1
     # columns = project2
     match_matrix = similarity_matrix[filtered_df_indecies_list]
     # get row (project1) and column (project2) with highest similarity in filtered df
+    top_indices = np.unravel_index(np.argsort(match_matrix, axis=None)[-top_x:], match_matrix.shape)
     # get the corresponding similarity values
     top_values = match_matrix[top_indices]

functions/filter_projects.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import pandas as pd
 def contains_code(crs_codes, code_list):
     codes = str(crs_codes).split(';')
     return any(code in code_list for code in codes)
-def filter_projects(df, crs3_list, crs5_list, sdg_str, country_code_list, orga_code_list):
     # Check whether the filters for which an empty selection must not mean "select all" are empty
-    if crs3_list != [] or crs5_list != [] or sdg_str != "":
         # FILTER CRS
         if crs3_list and not crs5_list:
@@ -33,6 +34,13 @@ def filter_projects(df, crs3_list, crs5_list, sdg_str, country_code_list, orga_c
         if orga_code_list != []:
             df = df[df['orga_abbreviation'].isin(orga_code_list)]
         return df

 import pandas as pd
+from functions.semantic_search import search
 def contains_code(crs_codes, code_list):
     """Return True if any ';'-separated entry of ``crs_codes`` is in ``code_list``.

     ``crs_codes`` is converted with ``str()`` before splitting, so non-string
     values are accepted as well.
     """
     for candidate in str(crs_codes).split(';'):
         if candidate in code_list:
             return True
     return False
+def filter_projects(df, crs3_list, crs5_list, sdg_str, country_code_list, orga_code_list, query, model, embeddings, TOP_X_PROJECTS=30):
     # Check whether the filters for which an empty selection must not mean "select all" are empty
+    if crs3_list != [] or crs5_list != [] or sdg_str != "" or query != "":
         # FILTER CRS
         if crs3_list and not crs5_list:
         if orga_code_list != []:
             df = df[df['orga_abbreviation'].isin(orga_code_list)]
+        # FILTER QUERY
+        if query != "" and len(df) > 0:
+            if len(df) < TOP_X_PROJECTS:
+                TOP_X_PROJECTS = len(df)
+            df = search(query, model, embeddings, df, TOP_X_PROJECTS)
         return df

{modules → functions}/semantic_search.py RENAMED Viewed

@@ -2,22 +2,26 @@ import pickle
 import faiss
 import streamlit as st
 from sentence_transformers import SentenceTransformer
-def show_search(model, faiss_index, sentences):
-    query = st.text_input("Enter your search query:")
-    if query:
         # Convert query to embedding
         query_embedding = model.encode([query])[0].reshape(1, -1)
         # Perform search
-        D, I = faiss_index.search(query_embedding, k=5)  # Search for top 5 similar items
         # Extract the sentences corresponding to the top indices
-        top_sentences = [sentences[i] for i in I[0]]
-        # Display results as a selection list
-        selected_sentence = st.selectbox("Top results:", top_sentences)
-        # Optionally, do something with the selected sentence
-        st.write("You selected:", selected_sentence)

 import faiss
 import streamlit as st
 from sentence_transformers import SentenceTransformer
+import pandas as pd
def search(query, model, embeddings, filtered_df, top_x=30):
    """Return the rows of ``filtered_df`` that are closest to ``query``.

    A flat L2 FAISS index is built over the embeddings of the filtered rows
    only, the query is encoded with ``model.encode`` and the nearest
    neighbours are looked up in that index.

    Args:
        query: Search text; passed to ``model.encode``.
        model: Object exposing ``encode(list_of_texts)``.
        embeddings: Array indexed with ``filtered_df.index``.
            NOTE(review): this assumes the frame's index labels are row
            positions in ``embeddings`` -- confirm against the caller.
        filtered_df: DataFrame holding the candidate rows.
        top_x: Maximum number of rows to return.

    Returns:
        A DataFrame with at most ``top_x`` rows of ``filtered_df``, in the
        order FAISS returns them. An empty slice of ``filtered_df`` is
        returned when there are no candidates or ``top_x`` is not positive.
    """
    # Never request more neighbours than there are candidates: FAISS pads
    # missing results with the label -1, which ``iloc`` would silently
    # interpret as "last row" and return wrong projects.
    k = min(top_x, len(filtered_df))
    if k <= 0:
        return filtered_df.iloc[0:0]

    # Restrict the search space to the embeddings of the filtered rows.
    filtered_embeddings = embeddings[filtered_df.index]

    # Build a throw-away exact (brute-force L2) index for this subset.
    faiss_index = faiss.IndexFlatL2(filtered_embeddings.shape[1])
    faiss_index.add(filtered_embeddings)

    # Convert the query to a (1, dimension) embedding.
    query_embedding = model.encode([query])[0].reshape(1, -1)

    # Distances are not needed; only the positions inside the subset are.
    _, neighbour_positions = faiss_index.search(query_embedding, k=k)

    # Positions refer to rows of ``filtered_embeddings`` and therefore to
    # positional rows of ``filtered_df``.
    return filtered_df.iloc[neighbour_positions[0]]

modules/__pycache__/semantic_search.cpython-310.pyc CHANGED Viewed

Binary files a/modules/__pycache__/semantic_search.cpython-310.pyc and b/modules/__pycache__/semantic_search.cpython-310.pyc differ

similarity_page.py CHANGED Viewed

@@ -12,7 +12,6 @@ import pickle
 import faiss
 from sentence_transformers import SentenceTransformer
 from modules.result_table import show_table
-import modules.semantic_search as semantic_search
 from functions.filter_projects import filter_projects
 from functions.calc_matches import calc_matches
 import psutil
@@ -107,12 +106,7 @@ def load_embeddings_and_index():
     sentences = stored_data["sentences"]
     embeddings = stored_data["embeddings"]
-    # Load or create FAISS index
-    dimension = embeddings.shape[1]
-    faiss_index = faiss.IndexFlatL2(dimension)
-    faiss_index.add(embeddings)
-    return sentences, embeddings, faiss_index
 # USE CACHE FUNCTIONS
 sim_matrix = load_sim_matrix()
@@ -124,8 +118,9 @@ SDG_NAMES = getSDG()
 COUNTRY_OPTION_LIST = getCountry()
 model = load_model()
-sentences, embeddings, faiss_index = load_embeddings_and_index()
 def show_page():
     st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
@@ -168,8 +163,6 @@ def show_page():
     with col2:
         # COUNTRY SELECTION
         country_option = st.multiselect(
                 'Country / Countries',
                 COUNTRY_OPTION_LIST,
@@ -186,6 +179,9 @@ def show_page():
                 orga_list,
                 placeholder="Select"
                 )
     # CRS CODE LIST
@@ -205,10 +201,15 @@ def show_page():
     orga_code_list = [option.split("(")[1][:-1].lower() for option in orga_option]
     # FILTER DF WITH SELECTED FILTER OPTIONS
-    filtered_df = filter_projects(projects_df, crs3_list, crs5_list, sdg_str, country_code_list, orga_code_list)
     # FIND MATCHES
-    p1_df, p2_df = calc_matches(filtered_df, projects_df, sim_matrix)
     # SHOW THE RESULT
     show_table(p1_df, p2_df)

 import faiss
 from sentence_transformers import SentenceTransformer
 from modules.result_table import show_table
 from functions.filter_projects import filter_projects
 from functions.calc_matches import calc_matches
 import psutil
     sentences = stored_data["sentences"]
     embeddings = stored_data["embeddings"]
+    return sentences, embeddings
 # USE CACHE FUNCTIONS
 sim_matrix = load_sim_matrix()
 COUNTRY_OPTION_LIST = getCountry()
+# LOAD MODEL FROM CACHE FOR SEMANTIC SEARCH
 model = load_model()
+sentences, embeddings = load_embeddings_and_index()
 def show_page():
     st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
     with col2:
         # COUNTRY SELECTION
         country_option = st.multiselect(
                 'Country / Countries',
                 COUNTRY_OPTION_LIST,
                 orga_list,
                 placeholder="Select"
                 )
+        # SEARCH BOX
+        query = st.text_input("Enter your search query:")
     # CRS CODE LIST
     orga_code_list = [option.split("(")[1][:-1].lower() for option in orga_option]
     # FILTER DF WITH SELECTED FILTER OPTIONS
+    TOP_X_PROJECTS = 30
+    filtered_df = filter_projects(projects_df, crs3_list, crs5_list, sdg_str, country_code_list, orga_code_list, query, model, embeddings, TOP_X_PROJECTS)
+    #with col2:
+        # Semantic Search
+        #searched_filtered_df = semantic_search.show_search(model, embeddings, sentences, filtered_df, TOP_X_PROJECTS)
     # FIND MATCHES
+    p1_df, p2_df = calc_matches(filtered_df, projects_df, sim_matrix, TOP_X_PROJECTS)
     # SHOW THE RESULT
     show_table(p1_df, p2_df)