James McCool
committed on
Commit
·
fa7b819
1
Parent(s):
50ccf3c
Trying to fix the rerun of diversity score
Browse files
- app.py +0 -1
- global_func/recalc_diversity.py +30 -24
app.py
CHANGED
|
@@ -1774,7 +1774,6 @@ if selected_tab == 'Manage Portfolio':
|
|
| 1774 |
with recalc_div_col:
|
| 1775 |
if st.button("Recalculate Diversity"):
|
| 1776 |
st.session_state['display_frame']['Diversity'] = recalc_diversity(st.session_state['display_frame'], st.session_state['player_columns'])
|
| 1777 |
-
st.rerun()
|
| 1778 |
with set_base_col:
|
| 1779 |
with st.popover("New Base Setting"):
|
| 1780 |
st.markdown("Name of new base:")
|
|
|
|
| 1774 |
with recalc_div_col:
|
| 1775 |
if st.button("Recalculate Diversity"):
|
| 1776 |
st.session_state['display_frame']['Diversity'] = recalc_diversity(st.session_state['display_frame'], st.session_state['player_columns'])
|
|
|
|
| 1777 |
with set_base_col:
|
| 1778 |
with st.popover("New Base Setting"):
|
| 1779 |
st.markdown("Name of new base:")
|
global_func/recalc_diversity.py
CHANGED
|
@@ -5,53 +5,59 @@ import time
|
|
| 5 |
import math
|
| 6 |
from difflib import SequenceMatcher
|
| 7 |
|
| 8 |
-
def recalc_diversity(portfolio, player_columns):
|
| 9 |
"""
|
| 10 |
-
|
| 11 |
"""
|
| 12 |
-
#
|
| 13 |
player_data = portfolio[player_columns].astype(str).fillna('').values
|
| 14 |
|
| 15 |
-
# Get all unique players and create a mapping to numeric IDs
|
| 16 |
all_players = set()
|
| 17 |
for row in player_data:
|
| 18 |
for val in row:
|
| 19 |
if isinstance(val, str) and val.strip() != '':
|
| 20 |
all_players.add(val)
|
| 21 |
|
| 22 |
-
# Create player ID mapping
|
| 23 |
player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
|
| 24 |
|
| 25 |
-
# Convert each row to a binary vector (1 if player is present, 0 if not)
|
| 26 |
n_players = len(all_players)
|
| 27 |
n_rows = len(portfolio)
|
| 28 |
binary_matrix = np.zeros((n_rows, n_players), dtype=np.int8)
|
| 29 |
|
| 30 |
-
# Vectorized binary matrix creation
|
| 31 |
for i, row in enumerate(player_data):
|
| 32 |
for val in row:
|
| 33 |
if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id:
|
| 34 |
binary_matrix[i, player_to_id[str(val)]] = 1
|
| 35 |
|
| 36 |
-
#
|
| 37 |
-
|
| 38 |
-
row_sums = np.sum(binary_matrix, axis=1)
|
| 39 |
-
union_matrix = row_sums[:, np.newaxis] + row_sums - intersection_matrix
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
# Exclude self-comparison and calculate average distance for each row
|
| 50 |
-
np.fill_diagonal(jaccard_distance, 0)
|
| 51 |
-
row_counts = n_rows - 1
|
| 52 |
-
similarity_scores = np.sum(jaccard_distance, axis=1) / row_counts
|
| 53 |
-
|
| 54 |
-
# Normalize to 0-1 scale
|
| 55 |
score_range = similarity_scores.max() - similarity_scores.min()
|
| 56 |
if score_range > 0:
|
| 57 |
similarity_scores = (similarity_scores - similarity_scores.min()) / score_range
|
|
|
|
| 5 |
import math
|
| 6 |
from difflib import SequenceMatcher
|
| 7 |
|
| 8 |
+
def recalc_diversity(portfolio, player_columns, chunk_size=1000):
|
| 9 |
"""
|
| 10 |
+
Memory-efficient version that processes similarities in chunks
|
| 11 |
"""
|
| 12 |
+
# Same setup as before
|
| 13 |
player_data = portfolio[player_columns].astype(str).fillna('').values
|
| 14 |
|
|
|
|
| 15 |
all_players = set()
|
| 16 |
for row in player_data:
|
| 17 |
for val in row:
|
| 18 |
if isinstance(val, str) and val.strip() != '':
|
| 19 |
all_players.add(val)
|
| 20 |
|
|
|
|
| 21 |
player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
|
| 22 |
|
|
|
|
| 23 |
n_players = len(all_players)
|
| 24 |
n_rows = len(portfolio)
|
| 25 |
binary_matrix = np.zeros((n_rows, n_players), dtype=np.int8)
|
| 26 |
|
|
|
|
| 27 |
for i, row in enumerate(player_data):
|
| 28 |
for val in row:
|
| 29 |
if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id:
|
| 30 |
binary_matrix[i, player_to_id[str(val)]] = 1
|
| 31 |
|
| 32 |
+
# Process similarities in chunks to avoid massive matrices
|
| 33 |
+
similarity_scores = np.zeros(n_rows)
|
|
|
|
|
|
|
| 34 |
|
| 35 |
+
for i in range(0, n_rows, chunk_size):
|
| 36 |
+
end_i = min(i + chunk_size, n_rows)
|
| 37 |
+
chunk_binary = binary_matrix[i:end_i]
|
| 38 |
+
|
| 39 |
+
# Calculate similarities for this chunk only
|
| 40 |
+
intersection = np.dot(chunk_binary, binary_matrix.T)
|
| 41 |
+
chunk_row_sums = np.sum(chunk_binary, axis=1)
|
| 42 |
+
all_row_sums = np.sum(binary_matrix, axis=1)
|
| 43 |
+
|
| 44 |
+
union = chunk_row_sums[:, np.newaxis] + all_row_sums - intersection
|
| 45 |
+
|
| 46 |
+
with np.errstate(divide='ignore', invalid='ignore'):
|
| 47 |
+
jaccard_sim = np.divide(intersection, union,
|
| 48 |
+
out=np.zeros_like(intersection, dtype=float),
|
| 49 |
+
where=union != 0)
|
| 50 |
+
|
| 51 |
+
jaccard_dist = 1 - jaccard_sim
|
| 52 |
+
|
| 53 |
+
# Exclude self-comparison and calculate average
|
| 54 |
+
for j in range(len(jaccard_dist)):
|
| 55 |
+
actual_idx = i + j
|
| 56 |
+
jaccard_dist[j, actual_idx] = 0 # Exclude self
|
| 57 |
+
|
| 58 |
+
similarity_scores[i:end_i] = np.sum(jaccard_dist, axis=1) / (n_rows - 1)
|
| 59 |
|
| 60 |
+
# Normalize
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
score_range = similarity_scores.max() - similarity_scores.min()
|
| 62 |
if score_range > 0:
|
| 63 |
similarity_scores = (similarity_scores - similarity_scores.min()) / score_range
|