| """FootballBERT Tokenizer""" |
|
|
| import json |
| import pickle |
| from typing import Dict, List, Optional, Union |
| import numpy as np |
| import pandas as pd |
| from transformers import PreTrainedTokenizer |
|
|
|
|
class FootballBERTTokenizer(PreTrainedTokenizer):
    """
    Tokenizer for FootballBERT models.

    This tokenizer converts football match data into model inputs, treating players as tokens
    and matches (two teams' lineups) as sequences.

    Args:
        player_vocab (`Dict[str, int]`): Mapping from player IDs to token IDs
        position_vocab (`Dict[str, int]`): Mapping from position labels to position IDs
        season_vocab (`Dict[str, int]`): Mapping from season names to season IDs
        special_tokens_map_file (`Dict[str, int]`): Mapping of special-token names
            (e.g. ``"PLAYER_PAD_TOKEN_ID"``) to their IDs. Despite the name, this is
            an in-memory dict, not a file path.
        pad_token (`str`, *optional*, defaults to "[PAD]"): The padding token
        mask_token (`str`, *optional*, defaults to "[MASK]"): The mask token
        max_length (`int`, *optional*, defaults to 32): Maximum sequence length (2 teams)
    """

    # File names written by `save_vocabulary` and read back by `from_pretrained`.
    # NOTE(review): "special_tokens_map.json" collides with the file name the
    # Hugging Face base tokenizer itself saves — confirm this is intentional.
    vocab_files_names = {
        "player_vocab_file": "player_vocab.json",
        "position_vocab_file": "position_vocab.json",
        "season_vocab_file": "season_vocab.json",
        "special_tokens_map_file": "special_tokens_map.json"
    }

    model_input_names = [
        "input_ids",
        "attention_mask",
        "position_ids",
        "team_affiliation_ids",
        "season_ids"
    ]

    def __init__(
        self,
        player_vocab: Dict[str, int],
        position_vocab: Dict[str, int],
        season_vocab: Dict[str, int],
        special_tokens_map_file: Dict[str, int],
        pad_token: str = "[PAD]",
        mask_token: str = "[MASK]",
        max_length: int = 32,
        **kwargs
    ):
        self.player_vocab = player_vocab
        self.position_vocab = position_vocab
        self.season_vocab = season_vocab
        self.special_tokens_map_file = special_tokens_map_file
        self._max_length = max_length

        # Reverse lookup tables for id -> token conversion.
        self.player_ids_to_tokens = {v: k for k, v in player_vocab.items()}
        self.position_ids_to_tokens = {v: k for k, v in position_vocab.items()}
        self.season_ids_to_tokens = {v: k for k, v in season_vocab.items()}

        # Special-token IDs used for padding/masking each feature stream.
        # The numeric fallbacks presumably match the production vocab sizes —
        # TODO confirm against the training pipeline.
        self.player_pad_token_id = special_tokens_map_file.get("PLAYER_PAD_TOKEN_ID", 99944)
        self.player_mask_token_id = special_tokens_map_file.get("PLAYER_MASK_TOKEN_ID", 99943)
        self.position_pad_token_id = special_tokens_map_file.get("POSITION_PAD_TOKEN_ID", 1419)
        self.season_pad_token_id = special_tokens_map_file.get("SEASON_PAD_TOKEN_ID", 23)

        # Vocab attributes must exist before super().__init__(), which may call
        # get_vocab()/vocab_size on recent transformers versions.
        super().__init__(
            pad_token=pad_token,
            mask_token=mask_token,
            max_length=max_length,
            **kwargs
        )

    @property
    def vocab_size(self) -> int:
        """Returns the size of the player vocabulary."""
        return len(self.player_vocab)

    def get_vocab(self) -> Dict[str, int]:
        """Returns a copy of the player vocabulary."""
        return self.player_vocab.copy()

    def _convert_token_to_id(self, token: str) -> int:
        """Converts a player token (string) to an id using the vocab.

        Unknown players fall back to the player PAD id (not a dedicated
        UNK id) — deliberate per the current design; kept as-is.
        """
        return self.player_vocab.get(token, self.player_pad_token_id)

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an id to a player token (string) using the vocab."""
        return self.player_ids_to_tokens.get(index, self.pad_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Converts a sequence of player tokens to a single space-joined string."""
        return " ".join(tokens)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Save the three vocabularies and the special-tokens map as JSON files.

        Args:
            save_directory: Existing directory to write into.
            filename_prefix: Optional prefix prepended (with a "-") to each file name.

        Returns:
            Tuple of the four file paths written.

        Raises:
            ValueError: If `save_directory` is not an existing directory.
        """
        import os

        if not os.path.isdir(save_directory):
            raise ValueError(f"Vocabulary path ({save_directory}) should be a directory")

        prefix = filename_prefix + "-" if filename_prefix else ""

        player_vocab_file = os.path.join(save_directory, prefix + self.vocab_files_names["player_vocab_file"])
        position_vocab_file = os.path.join(save_directory, prefix + self.vocab_files_names["position_vocab_file"])
        season_vocab_file = os.path.join(save_directory, prefix + self.vocab_files_names["season_vocab_file"])
        special_tokens_map_file = os.path.join(save_directory, prefix + self.vocab_files_names["special_tokens_map_file"])

        with open(player_vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.player_vocab, f, ensure_ascii=False, indent=2)

        with open(position_vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.position_vocab, f, ensure_ascii=False, indent=2)

        with open(season_vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.season_vocab, f, ensure_ascii=False, indent=2)

        with open(special_tokens_map_file, "w", encoding="utf-8") as f:
            json.dump(self.special_tokens_map_file, f, ensure_ascii=False, indent=2)

        return (player_vocab_file, position_vocab_file, season_vocab_file, special_tokens_map_file)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load the four vocabulary JSON files (locally or from the Hub via
        `cached_file`) and instantiate the tokenizer.

        Extra `kwargs` are forwarded to `__init__` (e.g. `max_length`).
        """
        from transformers.utils import cached_file

        player_vocab_file = cached_file(pretrained_model_name_or_path, "player_vocab.json")
        position_vocab_file = cached_file(pretrained_model_name_or_path, "position_vocab.json")
        season_vocab_file = cached_file(pretrained_model_name_or_path, "season_vocab.json")
        special_tokens_map_file = cached_file(pretrained_model_name_or_path, "special_tokens_map.json")

        with open(player_vocab_file, "r", encoding="utf-8") as f:
            player_vocab = json.load(f)

        with open(position_vocab_file, "r", encoding="utf-8") as f:
            position_vocab = json.load(f)

        with open(season_vocab_file, "r", encoding="utf-8") as f:
            season_vocab = json.load(f)

        with open(special_tokens_map_file, "r", encoding="utf-8") as f:
            special_tokens_map = json.load(f)

        return cls(
            player_vocab=player_vocab,
            position_vocab=position_vocab,
            season_vocab=season_vocab,
            special_tokens_map_file=special_tokens_map,
            **kwargs
        )

    def __call__(
        self,
        match_data: Union[pd.DataFrame, Dict],
        max_length: Optional[int] = None,
        padding: bool = True,
        return_tensors: Optional[str] = None,
        **kwargs
    ) -> Dict:
        """
        Tokenize match data into model inputs.

        Args:
            match_data: Either a DataFrame with columns [player_id, positions, season_name, team_name]
                or a dict with lists for each field
            max_length: Maximum sequence length (defaults to tokenizer's max_length)
            padding: Whether to pad sequences
            return_tensors: 'pt' for PyTorch tensors, 'np' for numpy arrays, None for lists

        Returns:
            Dictionary with input_ids, attention_mask, position_ids, team_affiliation_ids, season_ids
            (each wrapped in a batch dimension of 1 when tensors are requested).

        Raises:
            ValueError: If `match_data` does not contain exactly 2 teams.
        """
        if max_length is None:
            max_length = self._max_length

        if isinstance(match_data, dict):
            match_data = pd.DataFrame(match_data)

        teams = match_data['team_name'].unique()
        if len(teams) != 2:
            raise ValueError(f"Match data must contain exactly 2 teams, found {len(teams)}")

        # Group all of team 0's rows before team 1's so team affiliation is contiguous.
        match_data = pd.concat([
            match_data[match_data['team_name'] == teams[0]],
            match_data[match_data['team_name'] == teams[1]]
        ], ignore_index=True)

        # Unknown players/positions/seasons fall back to the respective PAD ids.
        # NOTE(review): assumes vocab keys and the DataFrame column values have
        # matching types (e.g. both str) — verify against the data pipeline.
        input_ids = [self.player_vocab.get(pid, self.player_pad_token_id)
                     for pid in match_data['player_id'].values]
        position_ids = [self.position_vocab.get(pos, self.position_pad_token_id)
                        for pos in match_data['positions'].values]
        season_ids = [self.season_vocab.get(season, self.season_pad_token_id)
                      for season in match_data['season_name'].values]

        # 0 = first team, 1 = second team (2 is reserved for padding below).
        team_1_size = len(match_data[match_data['team_name'] == teams[0]])
        team_affiliation_ids = [0] * team_1_size + [1] * (len(match_data) - team_1_size)

        attention_mask = [1] * len(input_ids)

        # FIX: truncate over-long matches; previously sequences longer than
        # max_length were returned untruncated, producing inputs the model
        # (and any fixed-length batching) could not consume.
        if len(input_ids) > max_length:
            input_ids = input_ids[:max_length]
            position_ids = position_ids[:max_length]
            season_ids = season_ids[:max_length]
            team_affiliation_ids = team_affiliation_ids[:max_length]
            attention_mask = attention_mask[:max_length]

        if padding and len(input_ids) < max_length:
            pad_length = max_length - len(input_ids)
            input_ids.extend([self.player_pad_token_id] * pad_length)
            position_ids.extend([self.position_pad_token_id] * pad_length)
            season_ids.extend([self.season_pad_token_id] * pad_length)
            team_affiliation_ids.extend([2] * pad_length)
            attention_mask.extend([0] * pad_length)

        if return_tensors == "pt":
            import torch
            return {
                "input_ids": torch.tensor([input_ids], dtype=torch.long),
                "attention_mask": torch.tensor([attention_mask], dtype=torch.long),
                "position_ids": torch.tensor([position_ids], dtype=torch.long),
                "team_affiliation_ids": torch.tensor([team_affiliation_ids], dtype=torch.long),
                "season_ids": torch.tensor([season_ids], dtype=torch.long)
            }
        elif return_tensors == "np":
            return {
                "input_ids": np.array([input_ids]),
                "attention_mask": np.array([attention_mask]),
                "position_ids": np.array([position_ids]),
                "team_affiliation_ids": np.array([team_affiliation_ids]),
                "season_ids": np.array([season_ids])
            }
        else:
            return {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "position_ids": position_ids,
                "team_affiliation_ids": team_affiliation_ids,
                "season_ids": season_ids
            }

    def batch_encode_matches(
        self,
        matches_data: List[pd.DataFrame],
        max_length: Optional[int] = None,
        padding: bool = True,
        return_tensors: Optional[str] = None
    ) -> Dict:
        """
        Encode multiple matches at once.

        Args:
            matches_data: List of DataFrames, one per match
            max_length: Maximum sequence length
            padding: Whether to pad sequences
            return_tensors: 'pt' for PyTorch, 'np' for numpy, None for lists

        Returns:
            Dictionary with batched tensors

        Raises:
            ValueError: If `matches_data` is empty.
        """
        # FIX: fail with a clear message instead of an IndexError on
        # `all_encodings[0]` when given an empty batch.
        if not matches_data:
            raise ValueError("matches_data must contain at least one match")

        all_encodings = [self(match, max_length=max_length, padding=padding, return_tensors=None)
                         for match in matches_data]

        # Transpose list-of-dicts into dict-of-lists keyed by model input name.
        batch = {
            key: [enc[key] for enc in all_encodings]
            for key in all_encodings[0].keys()
        }

        if return_tensors == "pt":
            import torch
            return {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
        elif return_tensors == "np":
            return {k: np.array(v) for k, v in batch.items()}

        return batch