Jan Mühlnikel
committed on
Commit
·
7d8805d
1
Parent(s):
09c16ce
enhanced documentation
Browse files
functions/{calc_matches.py → multi_project_matching.py}
RENAMED
|
@@ -1,24 +1,35 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
import numpy as np
|
| 3 |
-
from scipy.sparse import csr_matrix
|
| 4 |
-
import streamlit as st
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
if not isinstance(similarity_matrix, csr_matrix):
|
| 10 |
similarity_matrix = csr_matrix(similarity_matrix)
|
| 11 |
|
|
|
|
| 12 |
filtered_indices = filtered_df.index.to_list()
|
| 13 |
project_indices = project_df.index.to_list()
|
| 14 |
|
|
|
|
| 15 |
match_matrix = similarity_matrix[project_indices, :][:, filtered_indices] # row / column
|
| 16 |
-
|
| 17 |
dense_match_matrix = match_matrix.toarray()
|
| 18 |
-
|
| 19 |
flat_matrix = dense_match_matrix.flatten()
|
| 20 |
|
| 21 |
-
#
|
| 22 |
top_15_indices = np.argsort(flat_matrix)[-top_x:]
|
| 23 |
|
| 24 |
# Convert flat indices back to 2D indices
|
|
@@ -28,7 +39,6 @@ def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
|
|
| 28 |
top_15_values = flat_matrix[top_15_indices]
|
| 29 |
|
| 30 |
# Prepare the result with row and column indices from original dataframes
|
| 31 |
-
top_15_matches = []
|
| 32 |
org_rows = []
|
| 33 |
org_cols = []
|
| 34 |
for value, row, col in zip(top_15_values, top_15_2d_indices[0], top_15_2d_indices[1]):
|
|
@@ -36,14 +46,24 @@ def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
|
|
| 36 |
original_col_index = filtered_indices[col]
|
| 37 |
org_rows.append(original_row_index)
|
| 38 |
org_cols.append(original_col_index)
|
| 39 |
-
top_15_matches.append((value, original_row_index, original_col_index))
|
| 40 |
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
p1_df = filtered_df.loc[org_cols].copy()
|
| 43 |
p1_df['similarity'] = top_15_values
|
| 44 |
|
| 45 |
p2_df = project_df.loc[org_rows].copy()
|
| 46 |
p2_df['similarity'] = top_15_values
|
| 47 |
-
print("finished calc matches")
|
| 48 |
|
|
|
|
| 49 |
return p1_df, p2_df
|
|
|
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
+
from scipy.sparse import csr_matrix
|
|
|
|
| 3 |
|
| 4 |
+
"""
|
| 5 |
+
Function to calculate the multi project matching results
|
| 6 |
+
|
| 7 |
+
The Multi-Project Matching Feature uncovers synergy opportunities among various development banks and organizations by facilitating the search for similar projects
|
| 8 |
+
within a selected filter setting (filtered_df) and all projects (project_df).
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
def calc_multi_matches(filtered_df, project_df, similarity_matrix, top_x):
|
| 12 |
+
"""
|
| 13 |
+
filtered_df: df with applied filters
|
| 14 |
+
project_df: df with all projects
|
| 15 |
+
similarity_matrix: scipy sparse matrix with all pairwise similarities between projects
|
| 16 |
+
top_x: number of top-scoring matches which should be displayed
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
# convert npz sparse matrix into csr matrix
|
| 20 |
if not isinstance(similarity_matrix, csr_matrix):
|
| 21 |
similarity_matrix = csr_matrix(similarity_matrix)
|
| 22 |
|
| 23 |
+
# extract indices of the projects
|
| 24 |
filtered_indices = filtered_df.index.to_list()
|
| 25 |
project_indices = project_df.index.to_list()
|
| 26 |
|
| 27 |
+
# size down the matrix to only projects within the filter and convert to dense matrix and flatten it
|
| 28 |
match_matrix = similarity_matrix[project_indices, :][:, filtered_indices] # row / column
|
|
|
|
| 29 |
dense_match_matrix = match_matrix.toarray()
|
|
|
|
| 30 |
flat_matrix = dense_match_matrix.flatten()
|
| 31 |
|
| 32 |
+
# get the indices of the top_x highest values in the flattened matrix
|
| 33 |
top_15_indices = np.argsort(flat_matrix)[-top_x:]
|
| 34 |
|
| 35 |
# Convert flat indices back to 2D indices
|
|
|
|
| 39 |
top_15_values = flat_matrix[top_15_indices]
|
| 40 |
|
| 41 |
# Prepare the result with row and column indices from original dataframes
|
|
|
|
| 42 |
org_rows = []
|
| 43 |
org_cols = []
|
| 44 |
for value, row, col in zip(top_15_values, top_15_2d_indices[0], top_15_2d_indices[1]):
|
|
|
|
| 46 |
original_col_index = filtered_indices[col]
|
| 47 |
org_rows.append(original_row_index)
|
| 48 |
org_cols.append(original_col_index)
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
+
# create two result dataframes
|
| 52 |
+
|
| 53 |
+
"""
|
| 54 |
+
p1_df: first results of match
|
| 55 |
+
p2_df: matching result
|
| 56 |
+
|
| 57 |
+
matches are displayed through the indices of p1 and p2 dfs
|
| 58 |
+
|
| 59 |
+
match1 p1_df.iloc[0] & p2_df.iloc[0]
|
| 60 |
+
match2 p1_df.iloc[1] & p2_df.iloc[1]
|
| 61 |
+
"""
|
| 62 |
p1_df = filtered_df.loc[org_cols].copy()
|
| 63 |
p1_df['similarity'] = top_15_values
|
| 64 |
|
| 65 |
p2_df = project_df.loc[org_rows].copy()
|
| 66 |
p2_df['similarity'] = top_15_values
|
|
|
|
| 67 |
|
| 68 |
+
# return both result dfs with matching projects
|
| 69 |
return p1_df, p2_df
|
similarity_page.py
CHANGED
|
@@ -14,7 +14,7 @@ from modules.multimatch_result_table import show_multi_table
|
|
| 14 |
from modules.singlematch_result_table import show_single_table
|
| 15 |
from functions.filter_projects import filter_projects
|
| 16 |
from functions.filter_single import filter_single
|
| 17 |
-
from functions.
|
| 18 |
from functions.same_country_filter import same_country_filter
|
| 19 |
from functions.single_similar import find_similar
|
| 20 |
#import psutil
|
|
@@ -30,29 +30,14 @@ def get_process_memory():
|
|
| 30 |
# Catch DATA
|
| 31 |
|
| 32 |
# Load Similarity matrix
|
| 33 |
-
"""
|
| 34 |
-
@st.cache_data
|
| 35 |
-
def load_sim_matrix():
|
| 36 |
-
loaded_matrix = load_npz("src/extended_similarities.npz")
|
| 37 |
-
dense_matrix = loaded_matrix.toarray().astype('float16')
|
| 38 |
-
|
| 39 |
-
return dense_matrix
|
| 40 |
-
"""
|
| 41 |
@st.cache_data
|
| 42 |
def load_sim_matrix():
|
| 43 |
loaded_matrix = load_npz("src/extended_similarities.npz")
|
| 44 |
#dense_matrix = loaded_matrix.toarray().astype('float16')
|
| 45 |
|
| 46 |
return loaded_matrix
|
| 47 |
-
# Load Non Similar Orga Matrix
|
| 48 |
-
"""
|
| 49 |
-
@st.cache_data
|
| 50 |
-
def load_nonsameorga_sim_matrix():
|
| 51 |
-
loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
|
| 52 |
-
dense_matrix = loaded_matrix.toarray().astype('float16')
|
| 53 |
|
| 54 |
-
|
| 55 |
-
"""
|
| 56 |
def load_nonsameorga_sim_matrix():
|
| 57 |
loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
|
| 58 |
#dense_matrix = loaded_matrix.toarray().astype('float16')
|
|
@@ -272,10 +257,10 @@ def show_multi_matching_page():
|
|
| 272 |
## if show only different orgas checkbox is activated
|
| 273 |
if different_orga_checkbox:
|
| 274 |
with st.spinner('Please wait...'):
|
| 275 |
-
p1_df, p2_df =
|
| 276 |
else:
|
| 277 |
with st.spinner('Please wait...'):
|
| 278 |
-
p1_df, p2_df =
|
| 279 |
|
| 280 |
# SHOW THE RESULT
|
| 281 |
show_multi_table(p1_df, p2_df)
|
|
|
|
| 14 |
from modules.singlematch_result_table import show_single_table
|
| 15 |
from functions.filter_projects import filter_projects
|
| 16 |
from functions.filter_single import filter_single
|
| 17 |
+
from functions.multi_project_matching import calc_multi_matches
|
| 18 |
from functions.same_country_filter import same_country_filter
|
| 19 |
from functions.single_similar import find_similar
|
| 20 |
#import psutil
|
|
|
|
| 30 |
# Catch DATA
|
| 31 |
|
| 32 |
# Load Similarity matrix
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
@st.cache_data
|
| 34 |
def load_sim_matrix():
|
| 35 |
loaded_matrix = load_npz("src/extended_similarities.npz")
|
| 36 |
#dense_matrix = loaded_matrix.toarray().astype('float16')
|
| 37 |
|
| 38 |
return loaded_matrix
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
+
# Load Non Similar Orga Matrix
|
|
|
|
| 41 |
def load_nonsameorga_sim_matrix():
|
| 42 |
loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
|
| 43 |
#dense_matrix = loaded_matrix.toarray().astype('float16')
|
|
|
|
| 257 |
## if show only different orgas checkbox is activated
|
| 258 |
if different_orga_checkbox:
|
| 259 |
with st.spinner('Please wait...'):
|
| 260 |
+
p1_df, p2_df = calc_multi_matches(filtered_df, compare_df, nonsameorgas_sim_matrix, TOP_X_PROJECTS)
|
| 261 |
else:
|
| 262 |
with st.spinner('Please wait...'):
|
| 263 |
+
p1_df, p2_df = calc_multi_matches(filtered_df, compare_df, sim_matrix, TOP_X_PROJECTS)
|
| 264 |
|
| 265 |
# SHOW THE RESULT
|
| 266 |
show_multi_table(p1_df, p2_df)
|