"""
Embeddings Module

Creates team and player embeddings for deep learning models.
Part of the complete blueprint implementation.
"""

import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np

logger = logging.getLogger(__name__)

# PyTorch is optional: when it is missing, TeamEmbeddings falls back to a
# plain NumPy table instead of an ``nn.Embedding`` layer.
try:
    import torch
    import torch.nn as nn
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False


class TeamEmbeddings:
    """
    Creates and manages team embeddings.

    Features:
    - Learnable embeddings
    - Pre-trained loading
    - Similarity calculations
    """

    def __init__(
        self,
        embedding_dim: int = 64,
        num_teams: int = 500
    ):
        """Create an empty registry and a randomly initialised table.

        Args:
            embedding_dim: Length of each team vector.
            num_teams: Number of rows in the table, i.e. the maximum number
                of distinct teams that can be registered.
        """
        self.embedding_dim = embedding_dim
        self.num_teams = num_teams
        self.team_to_idx: Dict[str, int] = {}
        self.idx_to_team: Dict[int, str] = {}

        if TORCH_AVAILABLE:
            self.embeddings = nn.Embedding(num_teams, embedding_dim)
        else:
            self.embeddings = np.random.randn(num_teams, embedding_dim) * 0.1

    def register_team(self, team: str) -> int:
        """Register a team and get its index.

        A team that is already known keeps its existing index. When the
        table is full, a warning is logged, the team is NOT recorded, and
        index 0 is returned, so the caller ends up sharing the first
        registered team's row.

        Args:
            team: Team name.

        Returns:
            The row index assigned to ``team`` (or 0 when the table is full).
        """
        if team in self.team_to_idx:
            return self.team_to_idx[team]

        idx = len(self.team_to_idx)
        if idx >= self.num_teams:
            logger.warning(f"Max teams ({self.num_teams}) reached")
            return 0

        self.team_to_idx[team] = idx
        self.idx_to_team[idx] = team
        return idx

    def get_embedding(self, team: str) -> np.ndarray:
        """Get embedding vector for a team.

        Unknown teams are registered on the fly (see ``register_team``).

        Args:
            team: Team name.

        Returns:
            A 1-D array of length ``embedding_dim``. With the NumPy backend
            this is a view into the table row, not a copy.
        """
        idx = self.team_to_idx.get(team)
        if idx is None:
            idx = self.register_team(team)

        if TORCH_AVAILABLE:
            # no_grad keeps the lookup out of autograd so .numpy() is legal.
            with torch.no_grad():
                idx_tensor = torch.tensor([idx])
                return self.embeddings(idx_tensor).numpy()[0]
        return self.embeddings[idx]

    def get_match_embedding(
        self,
        home_team: str,
        away_team: str
    ) -> np.ndarray:
        """Get combined embedding for a match.

        Args:
            home_team: Name of the home team.
            away_team: Name of the away team.

        Returns:
            The home vector followed by the away vector, concatenated into
            one 1-D array of length ``2 * embedding_dim``.
        """
        home_emb = self.get_embedding(home_team)
        away_emb = self.get_embedding(away_team)
        return np.concatenate([home_emb, away_emb])

    @staticmethod
    def _cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
        """Return the cosine similarity of two vectors (0.0 if either is zero)."""
        norm_a = np.linalg.norm(vec_a)
        norm_b = np.linalg.norm(vec_b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))

    def get_similarity(self, team1: str, team2: str) -> float:
        """Calculate cosine similarity between teams.

        Args:
            team1: First team name.
            team2: Second team name.

        Returns:
            Cosine similarity of the two team vectors, or 0.0 when either
            vector has zero norm.
        """
        return self._cosine_similarity(
            self.get_embedding(team1),
            self.get_embedding(team2),
        )

    def find_similar_teams(
        self,
        team: str,
        n: int = 5
    ) -> List[Tuple[str, float]]:
        """Find most similar teams.

        Args:
            team: Reference team name (registered on the fly if unknown).
            n: Maximum number of results.

        Returns:
            Up to ``n`` ``(team_name, similarity)`` pairs for the other
            registered teams, ordered by descending cosine similarity.
        """
        # Look the reference vector up once instead of once per candidate.
        target_emb = self.get_embedding(team)

        similarities = [
            (other, self._cosine_similarity(target_emb, self.get_embedding(other)))
            for other in self.team_to_idx
            if other != team
        ]
        similarities.sort(key=lambda pair: pair[1], reverse=True)
        return similarities[:n]

    def save(self, path: str):
        """Save embeddings and mappings.

        Writes ``team_mapping.json`` plus either ``embeddings.pt`` (PyTorch
        state dict) or ``embeddings.npy`` (NumPy table) into ``path``,
        creating the directory if needed.

        Args:
            path: Target directory.
        """
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)

        # Save mappings
        with open(path / 'team_mapping.json', 'w', encoding='utf-8') as f:
            json.dump(self.team_to_idx, f)

        # Save embeddings
        if TORCH_AVAILABLE:
            torch.save(self.embeddings.state_dict(), path / 'embeddings.pt')
        else:
            np.save(path / 'embeddings.npy', self.embeddings)

    def load(self, path: str):
        """Load embeddings and mappings.

        Each file is optional; anything missing from ``path`` leaves the
        corresponding in-memory state untouched.

        Args:
            path: Directory previously written by ``save``.
        """
        path = Path(path)

        # Load mappings
        mapping_file = path / 'team_mapping.json'
        if mapping_file.exists():
            with open(mapping_file, encoding='utf-8') as f:
                self.team_to_idx = json.load(f)
            self.idx_to_team = {v: k for k, v in self.team_to_idx.items()}

        # Load embeddings
        if TORCH_AVAILABLE:
            pt_file = path / 'embeddings.pt'
            if pt_file.exists():
                self.embeddings.load_state_dict(torch.load(pt_file, weights_only=True))
        else:
            npy_file = path / 'embeddings.npy'
            if npy_file.exists():
                loaded = np.load(npy_file)
                self.embeddings = loaded
                # Keep the capacity check in register_team consistent with
                # the table that was actually loaded.
                if loaded.ndim == 2:
                    self.num_teams, self.embedding_dim = loaded.shape


class PositionalEncoding:
    """Positional encoding for sequence models."""

    def __init__(self, d_model: int, max_len: int = 100):
        """Precompute the encoding table.

        Args:
            d_model: Width of each position vector.
            max_len: Number of positions to precompute.
        """
        self.d_model = d_model
        self.max_len = max_len
        self.pe = self._create_encoding()

    def _create_encoding(self) -> np.ndarray:
        """Create positional encoding matrix.

        Even columns hold ``sin(position * div_term)`` and odd columns hold
        the matching ``cos`` values.

        Returns:
            Array of shape ``(max_len, d_model)``.
        """
        pe = np.zeros((self.max_len, self.d_model))
        position = np.arange(0, self.max_len)[:, np.newaxis]
        div_term = np.exp(
            np.arange(0, self.d_model, 2) * (-np.log(10000.0) / self.d_model)
        )
        angles = position * div_term

        pe[:, 0::2] = np.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing to broadcast.
        pe[:, 1::2] = np.cos(angles[:, : self.d_model // 2])
        return pe

    def encode(self, x: np.ndarray) -> np.ndarray:
        """Add positional encoding to input.

        Args:
            x: Input array; its first axis is taken as the sequence length
                (a 0-d input is treated as length 1).

        Returns:
            ``x`` plus the first ``seq_len`` rows of the encoding table,
            combined under NumPy broadcasting rules.
        """
        seq_len = x.shape[0] if len(x.shape) >= 1 else 1
        return x + self.pe[:seq_len]


class MatchSequenceEmbedding:
    """Creates embeddings for sequences of matches."""

    def __init__(
        self,
        match_dim: int = 32,
        seq_len: int = 10
    ):
        """Configure the encoder.

        Args:
            match_dim: Width of each encoded match vector. Must be at least
                6, because ``encode_match_result`` writes slots 0-5.
            seq_len: Number of most recent matches kept per sequence.
        """
        self.match_dim = match_dim
        self.seq_len = seq_len
        self.pos_encoding = PositionalEncoding(match_dim, seq_len)

    def encode_match_result(
        self,
        goals_for: int,
        goals_against: int
    ) -> np.ndarray:
        """Encode a single match result.

        Args:
            goals_for: Goals scored.
            goals_against: Goals conceded.

        Returns:
            Array of length ``match_dim``. Slots 0-5 hold goals for, goals
            against, goal difference, result (1 win / 0.5 draw / 0 loss),
            both-teams-scored flag and over-2.5-goals flag. The remaining
            slots are zero.
        """
        features = np.zeros(self.match_dim)

        if goals_for > goals_against:
            result = 1
        elif goals_for == goals_against:
            result = 0.5
        else:
            result = 0

        # Basic features
        features[0] = goals_for
        features[1] = goals_against
        features[2] = goals_for - goals_against
        features[3] = result
        features[4] = 1 if goals_for > 0 and goals_against > 0 else 0  # BTTS
        features[5] = 1 if goals_for + goals_against > 2.5 else 0  # Over 2.5

        return features

    def encode_match_sequence(
        self,
        matches: List[Dict]
    ) -> np.ndarray:
        """Encode a sequence of matches.

        Only the last ``seq_len`` matches are used. Shorter histories are
        left-padded with zero rows so the most recent match always lands in
        the final row. The positional encoding is then added to every row,
        padding included.

        Args:
            matches: Match dicts read via the ``'goals_for'`` and
                ``'goals_against'`` keys (each defaulting to 0).

        Returns:
            Array of shape ``(seq_len, match_dim)``.
        """
        sequence = np.zeros((self.seq_len, self.match_dim))

        # ``matches[-0:]`` would be the whole list, so guard seq_len == 0.
        recent = matches[-self.seq_len:] if self.seq_len > 0 else []
        offset = self.seq_len - len(recent)

        for i, match in enumerate(recent):
            sequence[offset + i] = self.encode_match_result(
                match.get('goals_for', 0),
                match.get('goals_against', 0)
            )

        # Add positional encoding
        return self.pos_encoding.encode(sequence)


# Global instances
_team_embeddings: Optional[TeamEmbeddings] = None


def get_team_embeddings() -> TeamEmbeddings:
    """Get or create team embeddings.

    Returns:
        The module-wide ``TeamEmbeddings`` instance, created with default
        arguments on first use.
    """
    global _team_embeddings
    if _team_embeddings is None:
        _team_embeddings = TeamEmbeddings()
    return _team_embeddings


def get_match_embedding(home_team: str, away_team: str) -> np.ndarray:
    """Get embedding for a match.

    Args:
        home_team: Name of the home team.
        away_team: Name of the away team.

    Returns:
        The concatenated home and away vectors from the module-wide
        ``TeamEmbeddings`` instance.
    """
    return get_team_embeddings().get_match_embedding(home_team, away_team)