James McCool committed on
Commit
fa7b819
·
1 Parent(s): 50ccf3c

Trying to fix the rerun of diversity score

Browse files
Files changed (2) hide show
  1. app.py +0 -1
  2. global_func/recalc_diversity.py +30 -24
app.py CHANGED
@@ -1774,7 +1774,6 @@ if selected_tab == 'Manage Portfolio':
1774
  with recalc_div_col:
1775
  if st.button("Recalculate Diversity"):
1776
  st.session_state['display_frame']['Diversity'] = recalc_diversity(st.session_state['display_frame'], st.session_state['player_columns'])
1777
- st.rerun()
1778
  with set_base_col:
1779
  with st.popover("New Base Setting"):
1780
  st.markdown("Name of new base:")
 
1774
  with recalc_div_col:
1775
  if st.button("Recalculate Diversity"):
1776
  st.session_state['display_frame']['Diversity'] = recalc_diversity(st.session_state['display_frame'], st.session_state['player_columns'])
 
1777
  with set_base_col:
1778
  with st.popover("New Base Setting"):
1779
  st.markdown("Name of new base:")
global_func/recalc_diversity.py CHANGED
@@ -5,53 +5,59 @@ import time
5
  import math
6
  from difflib import SequenceMatcher
7
 
8
- def recalc_diversity(portfolio, player_columns):
9
  """
10
- Vectorized version of recalc_diversity using NumPy operations.
11
  """
12
- # Extract player data and convert to string array
13
  player_data = portfolio[player_columns].astype(str).fillna('').values
14
 
15
- # Get all unique players and create a mapping to numeric IDs
16
  all_players = set()
17
  for row in player_data:
18
  for val in row:
19
  if isinstance(val, str) and val.strip() != '':
20
  all_players.add(val)
21
 
22
- # Create player ID mapping
23
  player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
24
 
25
- # Convert each row to a binary vector (1 if player is present, 0 if not)
26
  n_players = len(all_players)
27
  n_rows = len(portfolio)
28
  binary_matrix = np.zeros((n_rows, n_players), dtype=np.int8)
29
 
30
- # Vectorized binary matrix creation
31
  for i, row in enumerate(player_data):
32
  for val in row:
33
  if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id:
34
  binary_matrix[i, player_to_id[str(val)]] = 1
35
 
36
- # Vectorized Jaccard distance calculation
37
- intersection_matrix = np.dot(binary_matrix, binary_matrix.T)
38
- row_sums = np.sum(binary_matrix, axis=1)
39
- union_matrix = row_sums[:, np.newaxis] + row_sums - intersection_matrix
40
 
41
- # Calculate Jaccard distance: 1 - (intersection / union)
42
- with np.errstate(divide='ignore', invalid='ignore'):
43
- jaccard_similarity = np.divide(intersection_matrix, union_matrix,
44
- out=np.zeros_like(intersection_matrix, dtype=float),
45
- where=union_matrix != 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- jaccard_distance = 1 - jaccard_similarity
48
-
49
- # Exclude self-comparison and calculate average distance for each row
50
- np.fill_diagonal(jaccard_distance, 0)
51
- row_counts = n_rows - 1
52
- similarity_scores = np.sum(jaccard_distance, axis=1) / row_counts
53
-
54
- # Normalize to 0-1 scale
55
  score_range = similarity_scores.max() - similarity_scores.min()
56
  if score_range > 0:
57
  similarity_scores = (similarity_scores - similarity_scores.min()) / score_range
 
5
  import math
6
  from difflib import SequenceMatcher
7
 
8
+ def recalc_diversity(portfolio, player_columns, chunk_size=1000):
9
  """
10
+ Memory-efficient version that processes similarities in chunks
11
  """
12
+ # Same setup as before
13
  player_data = portfolio[player_columns].astype(str).fillna('').values
14
 
 
15
  all_players = set()
16
  for row in player_data:
17
  for val in row:
18
  if isinstance(val, str) and val.strip() != '':
19
  all_players.add(val)
20
 
 
21
  player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
22
 
 
23
  n_players = len(all_players)
24
  n_rows = len(portfolio)
25
  binary_matrix = np.zeros((n_rows, n_players), dtype=np.int8)
26
 
 
27
  for i, row in enumerate(player_data):
28
  for val in row:
29
  if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id:
30
  binary_matrix[i, player_to_id[str(val)]] = 1
31
 
32
+ # Process similarities in chunks to avoid massive matrices
33
+ similarity_scores = np.zeros(n_rows)
 
 
34
 
35
+ for i in range(0, n_rows, chunk_size):
36
+ end_i = min(i + chunk_size, n_rows)
37
+ chunk_binary = binary_matrix[i:end_i]
38
+
39
+ # Calculate similarities for this chunk only
40
+ intersection = np.dot(chunk_binary, binary_matrix.T)
41
+ chunk_row_sums = np.sum(chunk_binary, axis=1)
42
+ all_row_sums = np.sum(binary_matrix, axis=1)
43
+
44
+ union = chunk_row_sums[:, np.newaxis] + all_row_sums - intersection
45
+
46
+ with np.errstate(divide='ignore', invalid='ignore'):
47
+ jaccard_sim = np.divide(intersection, union,
48
+ out=np.zeros_like(intersection, dtype=float),
49
+ where=union != 0)
50
+
51
+ jaccard_dist = 1 - jaccard_sim
52
+
53
+ # Exclude self-comparison and calculate average
54
+ for j in range(len(jaccard_dist)):
55
+ actual_idx = i + j
56
+ jaccard_dist[j, actual_idx] = 0 # Exclude self
57
+
58
+ similarity_scores[i:end_i] = np.sum(jaccard_dist, axis=1) / (n_rows - 1)
59
 
60
+ # Normalize
 
 
 
 
 
 
 
61
  score_range = similarity_scores.max() - similarity_scores.min()
62
  if score_range > 0:
63
  similarity_scores = (similarity_scores - similarity_scores.min()) / score_range