# footballbert-base / tokenization_footballbert.py
"""FootballBERT Tokenizer"""
import json
from typing import Dict, List, Optional, Union
import numpy as np
import pandas as pd
from transformers import PreTrainedTokenizer
class FootballBERTTokenizer(PreTrainedTokenizer):
"""
Tokenizer for FootballBERT models.
This tokenizer converts football match data into model inputs, treating players as tokens
and matches as sequences.
Args:
player_vocab (`Dict[str, int]`): Mapping from player IDs to token IDs
position_vocab (`Dict[str, int]`): Mapping from position labels to position IDs
        season_vocab (`Dict[str, int]`): Mapping from season names to season IDs
        special_tokens_map_file (`Dict[str, int]`): Mapping of special token names
            (e.g. `PLAYER_PAD_TOKEN_ID`) to their integer IDs
pad_token (`str`, *optional*, defaults to "[PAD]"): The padding token
mask_token (`str`, *optional*, defaults to "[MASK]"): The mask token
max_length (`int`, *optional*, defaults to 32): Maximum sequence length (2 teams)
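
    Example (a minimal sketch with toy vocabularies; the real vocabulary and
    special-token files ship with the checkpoint, and the IDs below are placeholders):

        >>> player_vocab = {"[PAD]": 0, "[MASK]": 1, "player_a": 2, "player_b": 3}
        >>> position_vocab = {"GK": 0, "DF": 1, "MF": 2, "FW": 3}
        >>> season_vocab = {"2023/2024": 0}
        >>> special_tokens = {
        ...     "PLAYER_PAD_TOKEN_ID": 0,
        ...     "PLAYER_MASK_TOKEN_ID": 1,
        ...     "POSITION_PAD_TOKEN_ID": 0,
        ...     "SEASON_PAD_TOKEN_ID": 0,
        ... }
        >>> tokenizer = FootballBERTTokenizer(
        ...     player_vocab=player_vocab,
        ...     position_vocab=position_vocab,
        ...     season_vocab=season_vocab,
        ...     special_tokens_map_file=special_tokens,
        ... )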
"""
vocab_files_names = {
"player_vocab_file": "player_vocab.json",
"position_vocab_file": "position_vocab.json",
"season_vocab_file": "season_vocab.json",
"special_tokens_map_file": "special_tokens_map.json"
}
model_input_names = [
"input_ids",
"attention_mask",
"position_ids",
"team_affiliation_ids",
"season_ids"
]
def __init__(
self,
player_vocab: Dict[str, int],
position_vocab: Dict[str, int],
season_vocab: Dict[str, int],
special_tokens_map_file: Dict[str, int],
pad_token: str = "[PAD]",
mask_token: str = "[MASK]",
max_length: int = 32,
**kwargs
):
self.player_vocab = player_vocab
self.position_vocab = position_vocab
self.season_vocab = season_vocab
self.special_tokens_map_file = special_tokens_map_file
self._max_length = max_length
# Reverse mappings
self.player_ids_to_tokens = {v: k for k, v in player_vocab.items()}
self.position_ids_to_tokens = {v: k for k, v in position_vocab.items()}
self.season_ids_to_tokens = {v: k for k, v in season_vocab.items()}
# Special token IDs
self.player_pad_token_id = special_tokens_map_file.get("PLAYER_PAD_TOKEN_ID", 99944)
self.player_mask_token_id = special_tokens_map_file.get("PLAYER_MASK_TOKEN_ID", 99943)
self.position_pad_token_id = special_tokens_map_file.get("POSITION_PAD_TOKEN_ID", 1419)
self.season_pad_token_id = special_tokens_map_file.get("SEASON_PAD_TOKEN_ID", 23)
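
        # The special_tokens_map.json shipped with a checkpoint is expected to look
        # roughly like the following (the exact IDs depend on the trained
        # vocabularies; the defaults above are only fallbacks):
        #   {
        #     "PLAYER_PAD_TOKEN_ID": 99944,
        #     "PLAYER_MASK_TOKEN_ID": 99943,
        #     "POSITION_PAD_TOKEN_ID": 1419,
        #     "SEASON_PAD_TOKEN_ID": 23
        #   }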
super().__init__(
pad_token=pad_token,
mask_token=mask_token,
max_length=max_length,
**kwargs
)
@property
def vocab_size(self) -> int:
"""Returns the size of the player vocabulary."""
return len(self.player_vocab)
def get_vocab(self) -> Dict[str, int]:
"""Returns the player vocabulary."""
return self.player_vocab.copy()
def _convert_token_to_id(self, token: str) -> int:
"""Converts a player token (string) to an id using the vocab."""
return self.player_vocab.get(token, self.player_pad_token_id)
def _convert_id_to_token(self, index: int) -> str:
"""Converts an id to a player token (string) using the vocab."""
return self.player_ids_to_tokens.get(index, self.pad_token)
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of player tokens to a single string."""
return " ".join(tokens)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
"""Save the vocabularies to files."""
import os
if not os.path.isdir(save_directory):
raise ValueError(f"Vocabulary path ({save_directory}) should be a directory")
prefix = filename_prefix + "-" if filename_prefix else ""
player_vocab_file = os.path.join(save_directory, prefix + self.vocab_files_names["player_vocab_file"])
position_vocab_file = os.path.join(save_directory, prefix + self.vocab_files_names["position_vocab_file"])
season_vocab_file = os.path.join(save_directory, prefix + self.vocab_files_names["season_vocab_file"])
special_tokens_map_file = os.path.join(save_directory, prefix + self.vocab_files_names["special_tokens_map_file"])
with open(player_vocab_file, "w", encoding="utf-8") as f:
json.dump(self.player_vocab, f, ensure_ascii=False, indent=2)
with open(position_vocab_file, "w", encoding="utf-8") as f:
json.dump(self.position_vocab, f, ensure_ascii=False, indent=2)
with open(season_vocab_file, "w", encoding="utf-8") as f:
json.dump(self.season_vocab, f, ensure_ascii=False, indent=2)
with open(special_tokens_map_file, "w", encoding="utf-8") as f:
json.dump(self.special_tokens_map_file, f, ensure_ascii=False, indent=2)
return (player_vocab_file, position_vocab_file, season_vocab_file, special_tokens_map_file)
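
    # Typical saving (a sketch): the target directory must already exist, otherwise
    # save_vocabulary raises a ValueError:
    #
    #   tokenizer.save_vocabulary("path/to/save_directory")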
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
"""Load vocabularies and instantiate tokenizer."""
        from transformers.utils import cached_file

        # Download the vocabulary files from the Hub, or resolve them from the local cache/directory
player_vocab_file = cached_file(pretrained_model_name_or_path, "player_vocab.json")
position_vocab_file = cached_file(pretrained_model_name_or_path, "position_vocab.json")
season_vocab_file = cached_file(pretrained_model_name_or_path, "season_vocab.json")
special_tokens_map_file = cached_file(pretrained_model_name_or_path, "special_tokens_map.json")
with open(player_vocab_file, "r", encoding="utf-8") as f:
player_vocab = json.load(f)
with open(position_vocab_file, "r", encoding="utf-8") as f:
position_vocab = json.load(f)
with open(season_vocab_file, "r", encoding="utf-8") as f:
season_vocab = json.load(f)
with open(special_tokens_map_file, "r", encoding="utf-8") as f:
special_tokens_map = json.load(f)
return cls(
player_vocab=player_vocab,
position_vocab=position_vocab,
season_vocab=season_vocab,
special_tokens_map_file=special_tokens_map,
**kwargs
)
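
    # Typical loading (a sketch; assumes the target repo or local directory contains
    # the four JSON files listed in `vocab_files_names`, and that the Hub repo id is
    # something like "achadj/footballbert-base"):
    #
    #   tokenizer = FootballBERTTokenizer.from_pretrained("achadj/footballbert-base")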
def __call__(
self,
match_data: Union[pd.DataFrame, Dict],
max_length: Optional[int] = None,
padding: bool = True,
return_tensors: Optional[str] = None,
**kwargs
) -> Dict:
"""
Tokenize match data into model inputs.
Args:
match_data: Either a DataFrame with columns [player_id, positions, season_name, team_name]
or a dict with lists for each field
max_length: Maximum sequence length (defaults to tokenizer's max_length)
padding: Whether to pad sequences
return_tensors: 'pt' for PyTorch tensors, 'np' for numpy arrays, None for lists
Returns:
Dictionary with input_ids, attention_mask, position_ids, team_affiliation_ids, season_ids
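
        Example (a sketch with made-up player, position and season labels; real inputs
        should use identifiers present in the loaded vocabularies):

            >>> import pandas as pd
            >>> match_df = pd.DataFrame({
            ...     "player_id": ["player_a", "player_b", "player_c", "player_d"],
            ...     "positions": ["GK", "FW", "GK", "FW"],
            ...     "season_name": ["2023/2024"] * 4,
            ...     "team_name": ["Home FC", "Home FC", "Away FC", "Away FC"],
            ... })
            >>> encoding = tokenizer(match_df, return_tensors="pt")
            >>> sorted(encoding.keys())
            ['attention_mask', 'input_ids', 'position_ids', 'season_ids', 'team_affiliation_ids']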
"""
if max_length is None:
max_length = self._max_length
# Convert dict to DataFrame if needed
if isinstance(match_data, dict):
match_data = pd.DataFrame(match_data)
# Get unique teams
teams = match_data['team_name'].unique()
if len(teams) != 2:
raise ValueError(f"Match data must contain exactly 2 teams, found {len(teams)}")
        # Group players by team so each team's rows are contiguous and ordering is consistent
match_data = pd.concat([
match_data[match_data['team_name'] == teams[0]],
match_data[match_data['team_name'] == teams[1]]
], ignore_index=True)
# Convert to token IDs
input_ids = [self.player_vocab.get(pid, self.player_pad_token_id)
for pid in match_data['player_id'].values]
position_ids = [self.position_vocab.get(pos, self.position_pad_token_id)
for pos in match_data['positions'].values]
season_ids = [self.season_vocab.get(season, self.season_pad_token_id)
for season in match_data['season_name'].values]
# Team affiliation (0 for team 1, 1 for team 2)
team_1_size = len(match_data[match_data['team_name'] == teams[0]])
team_affiliation_ids = [0] * team_1_size + [1] * (len(match_data) - team_1_size)
# Attention mask (1 for real tokens, 0 for padding)
attention_mask = [1] * len(input_ids)
# Padding
if padding and len(input_ids) < max_length:
pad_length = max_length - len(input_ids)
input_ids.extend([self.player_pad_token_id] * pad_length)
position_ids.extend([self.position_pad_token_id] * pad_length)
season_ids.extend([self.season_pad_token_id] * pad_length)
team_affiliation_ids.extend([2] * pad_length) # 2 for padding
attention_mask.extend([0] * pad_length)
# Convert to appropriate tensor type
if return_tensors == "pt":
import torch
return {
"input_ids": torch.tensor([input_ids], dtype=torch.long),
"attention_mask": torch.tensor([attention_mask], dtype=torch.long),
"position_ids": torch.tensor([position_ids], dtype=torch.long),
"team_affiliation_ids": torch.tensor([team_affiliation_ids], dtype=torch.long),
"season_ids": torch.tensor([season_ids], dtype=torch.long)
}
elif return_tensors == "np":
return {
"input_ids": np.array([input_ids]),
"attention_mask": np.array([attention_mask]),
"position_ids": np.array([position_ids]),
"team_affiliation_ids": np.array([team_affiliation_ids]),
"season_ids": np.array([season_ids])
}
else:
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"position_ids": position_ids,
"team_affiliation_ids": team_affiliation_ids,
"season_ids": season_ids
}
def batch_encode_matches(
self,
matches_data: List[pd.DataFrame],
max_length: Optional[int] = None,
padding: bool = True,
return_tensors: Optional[str] = None
) -> Dict:
"""
Encode multiple matches at once.
Args:
matches_data: List of DataFrames, one per match
max_length: Maximum sequence length
padding: Whether to pad sequences
return_tensors: 'pt' for PyTorch, 'np' for numpy, None for lists
Returns:
Dictionary with batched tensors
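
        Example (a sketch; `match_df_1` and `match_df_2` are DataFrames in the format
        accepted by `__call__`, each padded to `max_length`, 32 by default):

            >>> batch = tokenizer.batch_encode_matches([match_df_1, match_df_2], return_tensors="pt")
            >>> batch["input_ids"].shape
            torch.Size([2, 32])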
"""
all_encodings = [self(match, max_length=max_length, padding=padding, return_tensors=None)
for match in matches_data]
# Stack encodings
batch = {
key: [enc[key] for enc in all_encodings]
for key in all_encodings[0].keys()
}
if return_tensors == "pt":
import torch
return {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
elif return_tensors == "np":
return {k: np.array(v) for k, v in batch.items()}
return batch