Jan Mühlnikel
committed on
Commit
·
8250706
1
Parent(s):
2eaf511
experiment
Browse files- functions/calc_matches.py +22 -5
functions/calc_matches.py
CHANGED
|
@@ -5,10 +5,6 @@ import streamlit as st
|
|
| 5 |
|
| 6 |
# multi_project_matching
|
| 7 |
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
|
| 8 |
-
st.write(filtered_df.shape)
|
| 9 |
-
st.write(project_df.shape)
|
| 10 |
-
st.write(similarity_matrix.shape)
|
| 11 |
-
|
| 12 |
# Ensure the matrix is in a suitable format for manipulation
|
| 13 |
if not isinstance(similarity_matrix, csr_matrix):
|
| 14 |
similarity_matrix = csr_matrix(similarity_matrix)
|
|
@@ -16,12 +12,33 @@ def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
|
|
| 16 |
filtered_indices = filtered_df.index.to_list()
|
| 17 |
project_indices = project_df.index.to_list()
|
| 18 |
|
| 19 |
-
match_matrix = similarity_matrix[project_indices, :][:, filtered_indices]
|
| 20 |
|
| 21 |
dense_match_matrix = match_matrix.toarray()
|
| 22 |
|
| 23 |
st.write(dense_match_matrix.shape)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"""
|
| 26 |
p1_df = filtered_df.loc[top_col_indices].copy()
|
| 27 |
p1_df['similarity'] = top_values
|
|
|
|
| 5 |
|
| 6 |
# multi_project_matching
|
| 7 |
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
# Ensure the matrix is in a suitable format for manipulation
|
| 9 |
if not isinstance(similarity_matrix, csr_matrix):
|
| 10 |
similarity_matrix = csr_matrix(similarity_matrix)
|
|
|
|
| 12 |
filtered_indices = filtered_df.index.to_list()
|
| 13 |
project_indices = project_df.index.to_list()
|
| 14 |
|
| 15 |
+
match_matrix = similarity_matrix[project_indices, :][:, filtered_indices] # row / column
|
| 16 |
|
| 17 |
dense_match_matrix = match_matrix.toarray()
|
| 18 |
|
| 19 |
st.write(dense_match_matrix.shape)
|
| 20 |
|
| 21 |
+
flat_matrix = dense_match_matrix.flatten()
|
| 22 |
+
|
| 23 |
+
# Get the indices of the top_x largest values in the flattened matrix (descending order)
|
| 24 |
+
top_15_indices = np.argsort(flat_matrix)[-top_x:][::-1]
|
| 25 |
+
|
| 26 |
+
# Convert flat indices back to 2D indices
|
| 27 |
+
top_15_2d_indices = np.unravel_index(top_15_indices, dense_match_matrix.shape)
|
| 28 |
+
|
| 29 |
+
# Extract the corresponding values
|
| 30 |
+
top_15_values = flat_matrix[top_15_indices]
|
| 31 |
+
|
| 32 |
+
# Prepare the result with row and column indices from original dataframes
|
| 33 |
+
top_15_matches = []
|
| 34 |
+
for value, row, col in zip(top_15_values, top_15_2d_indices[0], top_15_2d_indices[1]):
|
| 35 |
+
original_row_index = project_indices[row]
|
| 36 |
+
original_col_index = filtered_indices[col]
|
| 37 |
+
top_15_matches.append((value, original_row_index, original_col_index))
|
| 38 |
+
|
| 39 |
+
st.write(top_15_matches)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
"""
|
| 43 |
p1_df = filtered_df.loc[top_col_indices].copy()
|
| 44 |
p1_df['similarity'] = top_values
|