James McCool
committed on
Commit
·
4390bf0
1
Parent(s):
c283108
Refactor player similarity score calculation in predict_dupes function
Browse files
This update enhances the calculate_player_similarity_score function by replacing the previous Jaccard distance method with SequenceMatcher for improved accuracy in measuring lineup similarity. Each lineup is now represented as a string, allowing for a more efficient comparison process. This change simplifies the logic and ensures consistent handling of player selections.
- global_func/predict_dupes.py +16 -38
global_func/predict_dupes.py
CHANGED
|
@@ -4,6 +4,7 @@ import pandas as pd
|
|
| 4 |
import time
|
| 5 |
from fuzzywuzzy import process
|
| 6 |
import math
|
|
|
|
| 7 |
|
| 8 |
def calculate_weighted_ownership(row_ownerships):
|
| 9 |
"""
|
|
@@ -40,7 +41,7 @@ def calculate_weighted_ownership(row_ownerships):
|
|
| 40 |
def calculate_player_similarity_score(portfolio, player_columns):
|
| 41 |
"""
|
| 42 |
Calculate a similarity score that measures how different each row is from all other rows
|
| 43 |
-
based on actual player selection
|
| 44 |
Higher scores indicate more unique/different lineups.
|
| 45 |
|
| 46 |
Args:
|
|
@@ -50,52 +51,29 @@ def calculate_player_similarity_score(portfolio, player_columns):
|
|
| 50 |
Returns:
|
| 51 |
Series: Similarity scores for each row
|
| 52 |
"""
|
| 53 |
-
# Extract player data and
|
| 54 |
player_data = portfolio[player_columns].fillna('')
|
| 55 |
|
| 56 |
-
#
|
| 57 |
-
|
| 58 |
-
for
|
| 59 |
-
#
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
all_players.add(val)
|
| 64 |
|
| 65 |
-
#
|
| 66 |
-
all_players = sorted(list(all_players))
|
| 67 |
-
|
| 68 |
-
# If no valid players found, return zeros
|
| 69 |
-
if len(all_players) == 0:
|
| 70 |
-
return np.zeros(len(portfolio))
|
| 71 |
-
|
| 72 |
-
# Create a binary matrix: 1 if player is in lineup, 0 if not
|
| 73 |
-
binary_matrix = np.zeros((len(portfolio), len(all_players)))
|
| 74 |
-
|
| 75 |
-
for i, row in player_data.iterrows():
|
| 76 |
-
for j, player in enumerate(all_players):
|
| 77 |
-
if player in row.values:
|
| 78 |
-
binary_matrix[i, j] = 1
|
| 79 |
-
|
| 80 |
-
# Calculate Jaccard distance between all pairs of lineups
|
| 81 |
-
# Jaccard distance = 1 - (intersection / union)
|
| 82 |
similarity_scores = []
|
| 83 |
|
| 84 |
for i in range(len(portfolio)):
|
| 85 |
distances = []
|
| 86 |
for j in range(len(portfolio)):
|
| 87 |
if i != j:
|
| 88 |
-
#
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
if union == 0:
|
| 94 |
-
jaccard_distance = 1.0 # Completely different if both are empty
|
| 95 |
-
else:
|
| 96 |
-
jaccard_distance = 1 - (intersection / union)
|
| 97 |
-
|
| 98 |
-
distances.append(jaccard_distance)
|
| 99 |
|
| 100 |
# Average distance to all other lineups
|
| 101 |
avg_distance = np.mean(distances) if distances else 0
|
|
|
|
| 4 |
import time
|
| 5 |
from fuzzywuzzy import process
|
| 6 |
import math
|
| 7 |
+
from difflib import SequenceMatcher
|
| 8 |
|
| 9 |
def calculate_weighted_ownership(row_ownerships):
|
| 10 |
"""
|
|
|
|
| 41 |
def calculate_player_similarity_score(portfolio, player_columns):
|
| 42 |
"""
|
| 43 |
Calculate a similarity score that measures how different each row is from all other rows
|
| 44 |
+
based on actual player selection. Converts each row to a string and uses SequenceMatcher.
|
| 45 |
Higher scores indicate more unique/different lineups.
|
| 46 |
|
| 47 |
Args:
|
|
|
|
| 51 |
Returns:
|
| 52 |
Series: Similarity scores for each row
|
| 53 |
"""
|
| 54 |
+
# Extract player data and convert each row to a string
|
| 55 |
player_data = portfolio[player_columns].fillna('')
|
| 56 |
|
| 57 |
+
# Convert each row to a string representation
|
| 58 |
+
row_strings = []
|
| 59 |
+
for _, row in player_data.iterrows():
|
| 60 |
+
# Sort the players to ensure consistent ordering
|
| 61 |
+
players = sorted([str(val) for val in row.values if str(val).strip() != ''])
|
| 62 |
+
row_string = '|'.join(players) # Use pipe as separator
|
| 63 |
+
row_strings.append(row_string)
|
|
|
|
| 64 |
|
| 65 |
+
# Calculate similarity scores using SequenceMatcher
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
similarity_scores = []
|
| 67 |
|
| 68 |
for i in range(len(portfolio)):
|
| 69 |
distances = []
|
| 70 |
for j in range(len(portfolio)):
|
| 71 |
if i != j:
|
| 72 |
+
# Use SequenceMatcher to compare the two row strings
|
| 73 |
+
similarity_ratio = SequenceMatcher(None, row_strings[i], row_strings[j]).ratio()
|
| 74 |
+
# Convert similarity to distance (1 - similarity)
|
| 75 |
+
distance = 1 - similarity_ratio
|
| 76 |
+
distances.append(distance)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
# Average distance to all other lineups
|
| 79 |
avg_distance = np.mean(distances) if distances else 0
|