# DFS_Portfolio_Manager/global_func/recalc_diversity.py
# James McCool
# Trying to fix the rerun of diversity score
# fa7b819
import streamlit as st
import numpy as np
import pandas as pd
import time
import math
from difflib import SequenceMatcher
def recalc_diversity(portfolio, player_columns, chunk_size=1000):
    """
    Score every lineup by its mean Jaccard distance to all other lineups.

    Each row of ``portfolio[player_columns]`` is treated as a set of player
    names. Rows are one-hot encoded over the distinct names, and pairwise
    Jaccard distances are computed ``chunk_size`` rows at a time so that the
    largest temporary is ``chunk_size x n_rows`` rather than
    ``n_rows x n_rows``.

    Args:
        portfolio: DataFrame holding one lineup per row.
        player_columns: Column labels in ``portfolio`` that contain player
            names.
        chunk_size: Number of rows compared against the whole portfolio per
            iteration. Must be >= 1.

    Returns:
        A 1-D float ``numpy.ndarray`` of length ``len(portfolio)``. Higher
        values mean the lineup shares fewer players with the rest of the
        portfolio. Scores are min-max scaled to [0, 1] when they are not all
        equal; otherwise the raw mean distances are returned. Portfolios
        with zero or one row yield all zeros because there is nothing to
        compare against.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    frame = portfolio[player_columns]
    n_rows = len(portfolio)
    if n_rows <= 1:
        # No other lineup exists, so the mean distance is undefined; report 0
        # rather than dividing by (n_rows - 1) == 0.
        return np.zeros(n_rows)

    # Record missing cells BEFORE stringifying: astype(str) turns NaN/None
    # into the literal text 'nan'/'None', which would otherwise be counted
    # as a player shared between lineups.
    missing = frame.isna().to_numpy()
    player_data = frame.astype(str).to_numpy(dtype=object, copy=True)
    player_data[missing] = ''

    # Keep only non-blank string cells; the raw (unstripped) text is the key.
    lineups = [
        [val for val in row if isinstance(val, str) and val.strip()]
        for row in player_data
    ]
    all_players = sorted({name for lineup in lineups for name in lineup})
    player_to_id = {player: idx for idx, player in enumerate(all_players)}

    # One-hot membership matrix: rows are lineups, columns are players.
    binary_matrix = np.zeros((n_rows, len(all_players)), dtype=np.int8)
    for row_idx, lineup in enumerate(lineups):
        if lineup:
            binary_matrix[row_idx, [player_to_id[name] for name in lineup]] = 1

    # Set sizes per lineup; computed once because they do not change per chunk.
    all_row_sums = np.sum(binary_matrix, axis=1, dtype=np.int64)

    # np.dot keeps the operand dtype, and an intersection count can be as
    # large as the biggest lineup. Widen the matrix when int8 could overflow.
    if all_row_sums.max() > np.iinfo(np.int8).max:
        binary_matrix = binary_matrix.astype(np.int32)

    # Despite the name, this holds mean Jaccard *distances* (diversity).
    similarity_scores = np.zeros(n_rows)
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)

        # |A ∩ B| for every (chunk row, portfolio row) pair.
        intersection = np.dot(binary_matrix[start:stop], binary_matrix.T)
        # |A ∪ B| = |A| + |B| - |A ∩ B|
        union = all_row_sums[start:stop, np.newaxis] + all_row_sums - intersection

        with np.errstate(divide='ignore', invalid='ignore'):
            # Pairs of two empty lineups (union == 0) get similarity 0.
            jaccard_sim = np.divide(
                intersection,
                union,
                out=np.zeros_like(intersection, dtype=float),
                where=union != 0,
            )
        jaccard_dist = 1 - jaccard_sim

        # Exclude each lineup's comparison with itself from its own average.
        local_idx = np.arange(stop - start)
        jaccard_dist[local_idx, start + local_idx] = 0

        similarity_scores[start:stop] = np.sum(jaccard_dist, axis=1) / (n_rows - 1)

    # Min-max normalize; skipped when every score is identical.
    lowest = similarity_scores.min()
    score_range = similarity_scores.max() - lowest
    if score_range > 0:
        similarity_scores = (similarity_scores - lowest) / score_range
    return similarity_scores