def recalc_diversity(portfolio, player_columns, chunk_size=1000):
    """Score each portfolio row by how different its player set is from the others.

    Every row is reduced to the set of distinct, non-blank player names found
    in ``player_columns``. A row's raw score is its mean Jaccard distance
    (``1 - |A & B| / |A | B|``) to every other row; the raw scores are then
    min-max normalised to ``[0, 1]`` when they are not all equal.

    The pairwise comparison is done ``chunk_size`` rows at a time, so the
    largest temporary matrix is ``chunk_size x n_rows`` rather than
    ``n_rows x n_rows``.

    Args:
        portfolio: DataFrame with one lineup per row.
        player_columns: List of column labels that hold player names.
        chunk_size: Number of rows compared against the full portfolio per
            iteration. Must be at least 1.

    Returns:
        1-D float ``numpy`` array with one score per row, in row order.
        Higher means the row shares fewer players with the rest of the
        portfolio. If every row has the same raw score the un-normalised
        scores are returned. Portfolios with fewer than two rows have nothing
        to compare against and yield all zeros.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    n_rows = len(portfolio)
    if n_rows < 2:
        # No "other" rows exist, so the average distance is undefined;
        # report zero diversity instead of NaN / an empty-array error.
        return np.zeros(n_rows)

    # Blank out missing cells *after* the string conversion: astype(str)
    # turns NaN/None into the literal text 'nan'/'None', which a later
    # fillna('') can no longer detect and which would otherwise be counted
    # as a player shared by every row with an empty slot.
    frame = portfolio[player_columns]
    player_data = frame.astype(str).mask(frame.isna(), '').values

    all_players = {
        val
        for row in player_data
        for val in row
        if isinstance(val, str) and val.strip() != ''
    }
    player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}

    # Row-by-player membership matrix (1 = player appears in that row).
    binary_matrix = np.zeros((n_rows, len(player_to_id)), dtype=np.int8)
    for i, row in enumerate(player_data):
        for val in row:
            col = player_to_id.get(val)
            if col is not None:
                binary_matrix[i, col] = 1

    # Set sizes are the same for every chunk, so compute them once.
    # np.sum accumulates small integer dtypes in the platform integer.
    all_row_sums = np.sum(binary_matrix, axis=1)

    # An intersection can be as large as the biggest row; widen the matrix
    # if that count would not fit in int8, otherwise the dot product wraps.
    if all_row_sums.max() > np.iinfo(np.int8).max:
        binary_matrix = binary_matrix.astype(np.int32)

    diversity_scores = np.zeros(n_rows)
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)

        intersection = np.dot(binary_matrix[start:stop], binary_matrix.T)
        union = all_row_sums[start:stop, np.newaxis] + all_row_sums - intersection

        # Pairs of two empty rows have union == 0; they keep similarity 0
        # (i.e. distance 1) instead of triggering a division by zero.
        jaccard_sim = np.divide(
            intersection,
            union,
            out=np.zeros(intersection.shape, dtype=float),
            where=union != 0,
        )
        jaccard_dist = 1.0 - jaccard_sim

        # Drop each row's comparison with itself from the average.
        local_idx = np.arange(stop - start)
        jaccard_dist[local_idx, start + local_idx] = 0.0

        diversity_scores[start:stop] = np.sum(jaccard_dist, axis=1) / (n_rows - 1)

    # Min-max normalise; skipped when all scores are equal (range of zero).
    lowest = diversity_scores.min()
    score_range = diversity_scores.max() - lowest
    if score_range > 0:
        diversity_scores = (diversity_scores - lowest) / score_range

    return diversity_scores