# footballbert-base / tokenization_footballbert.py
"""FootballBERT Tokenizer"""
import json
from typing import Dict, List, Optional, Union
import numpy as np
import pandas as pd
from transformers import PreTrainedTokenizer
class FootballBERTTokenizer(PreTrainedTokenizer):
"""
Tokenizer for FootballBERT models.
This tokenizer converts football match data into model inputs, treating players as tokens
and matches as sequences.
Args:
player_vocab (`Dict[str, int]`): Mapping from player IDs to token IDs
position_vocab (`Dict[str, int]`): Mapping from position labels to position IDs
        season_vocab (`Dict[str, int]`): Mapping from season names to season IDs
        special_tokens_map_file (`Dict[str, int]`): Mapping of special token names
            (e.g. `PLAYER_PAD_TOKEN_ID`) to their integer IDs
pad_token (`str`, *optional*, defaults to "[PAD]"): The padding token
mask_token (`str`, *optional*, defaults to "[MASK]"): The mask token
max_length (`int`, *optional*, defaults to 32): Maximum sequence length (2 teams)
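
    Example (a minimal sketch with toy vocabularies; the real vocabulary and
    special-token files ship with the checkpoint, and the IDs below are placeholders):

        >>> player_vocab = {"[PAD]": 0, "[MASK]": 1, "player_a": 2, "player_b": 3}
        >>> position_vocab = {"GK": 0, "DF": 1, "MF": 2, "FW": 3}
        >>> season_vocab = {"2023/2024": 0}
        >>> special_tokens = {
        ...     "PLAYER_PAD_TOKEN_ID": 0,
        ...     "PLAYER_MASK_TOKEN_ID": 1,
        ...     "POSITION_PAD_TOKEN_ID": 0,
        ...     "SEASON_PAD_TOKEN_ID": 0,
        ... }
        >>> tokenizer = FootballBERTTokenizer(
        ...     player_vocab=player_vocab,
        ...     position_vocab=position_vocab,
        ...     season_vocab=season_vocab,
        ...     special_tokens_map_file=special_tokens,
        ... )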
"""
vocab_files_names = {
"player_vocab_file": "player_vocab.json",
"position_vocab_file": "position_vocab.json",
"season_vocab_file": "season_vocab.json",
"special_tokens_map_file": "special_tokens_map.json"
}
model_input_names = [
"input_ids",
"attention_mask",
"position_ids",
"team_affiliation_ids",
"season_ids"
]
def __init__(
self,
player_vocab: Dict[str, int],
position_vocab: Dict[str, int],
season_vocab: Dict[str, int],
special_tokens_map_file: Dict[str, int],
pad_token: str = "[PAD]",
mask_token: str = "[MASK]",
max_length: int = 32,
**kwargs
):
self.player_vocab = player_vocab
self.position_vocab = position_vocab
self.season_vocab = season_vocab
self.special_tokens_map_file = special_tokens_map_file
self._max_length = max_length
# Reverse mappings
self.player_ids_to_tokens = {v: k for k, v in player_vocab.items()}
self.position_ids_to_tokens = {v: k for k, v in position_vocab.items()}
self.season_ids_to_tokens = {v: k for k, v in season_vocab.items()}
# Special token IDs
self.player_pad_token_id = special_tokens_map_file.get("PLAYER_PAD_TOKEN_ID", 99944)
self.player_mask_token_id = special_tokens_map_file.get("PLAYER_MASK_TOKEN_ID", 99943)
self.position_pad_token_id = special_tokens_map_file.get("POSITION_PAD_TOKEN_ID", 1419)
self.season_pad_token_id = special_tokens_map_file.get("SEASON_PAD_TOKEN_ID", 23)
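
        # The special_tokens_map.json shipped with a checkpoint is expected to look
        # roughly like the following (the exact IDs depend on the trained
        # vocabularies; the defaults above are only fallbacks):
        #   {
        #     "PLAYER_PAD_TOKEN_ID": 99944,
        #     "PLAYER_MASK_TOKEN_ID": 99943,
        #     "POSITION_PAD_TOKEN_ID": 1419,
        #     "SEASON_PAD_TOKEN_ID": 23
        #   }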
super().__init__(
pad_token=pad_token,
mask_token=mask_token,
max_length=max_length,
**kwargs
)
@property
def vocab_size(self) -> int:
"""Returns the size of the player vocabulary."""
return len(self.player_vocab)
def get_vocab(self) -> Dict[str, int]:
"""Returns the player vocabulary."""
return self.player_vocab.copy()
def _convert_token_to_id(self, token: str) -> int:
"""Converts a player token (string) to an id using the vocab."""
return self.player_vocab.get(token, self.player_pad_token_id)
def _convert_id_to_token(self, index: int) -> str:
"""Converts an id to a player token (string) using the vocab."""
return self.player_ids_to_tokens.get(index, self.pad_token)
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of player tokens to a single string."""
return " ".join(tokens)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
"""Save the vocabularies to files."""
import os
if not os.path.isdir(save_directory):
raise ValueError(f"Vocabulary path ({save_directory}) should be a directory")
prefix = filename_prefix + "-" if filename_prefix else ""
player_vocab_file = os.path.join(save_directory, prefix + self.vocab_files_names["player_vocab_file"])
position_vocab_file = os.path.join(save_directory, prefix + self.vocab_files_names["position_vocab_file"])
season_vocab_file = os.path.join(save_directory, prefix + self.vocab_files_names["season_vocab_file"])
special_tokens_map_file = os.path.join(save_directory, prefix + self.vocab_files_names["special_tokens_map_file"])
with open(player_vocab_file, "w", encoding="utf-8") as f:
json.dump(self.player_vocab, f, ensure_ascii=False, indent=2)
with open(position_vocab_file, "w", encoding="utf-8") as f:
json.dump(self.position_vocab, f, ensure_ascii=False, indent=2)
with open(season_vocab_file, "w", encoding="utf-8") as f:
json.dump(self.season_vocab, f, ensure_ascii=False, indent=2)
with open(special_tokens_map_file, "w", encoding="utf-8") as f:
json.dump(self.special_tokens_map_file, f, ensure_ascii=False, indent=2)
return (player_vocab_file, position_vocab_file, season_vocab_file, special_tokens_map_file)
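
    # Typical saving (a sketch): the target directory must already exist, otherwise
    # save_vocabulary raises a ValueError:
    #
    #   tokenizer.save_vocabulary("path/to/save_directory")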
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
"""Load vocabularies and instantiate tokenizer."""
        from transformers.utils import cached_file

        # Download the vocabulary files from the Hub, or resolve them from the local cache/directory
player_vocab_file = cached_file(pretrained_model_name_or_path, "player_vocab.json")
position_vocab_file = cached_file(pretrained_model_name_or_path, "position_vocab.json")
season_vocab_file = cached_file(pretrained_model_name_or_path, "season_vocab.json")
special_tokens_map_file = cached_file(pretrained_model_name_or_path, "special_tokens_map.json")
with open(player_vocab_file, "r", encoding="utf-8") as f:
player_vocab = json.load(f)
with open(position_vocab_file, "r", encoding="utf-8") as f:
position_vocab = json.load(f)
with open(season_vocab_file, "r", encoding="utf-8") as f:
season_vocab = json.load(f)
with open(special_tokens_map_file, "r", encoding="utf-8") as f:
special_tokens_map = json.load(f)
return cls(
player_vocab=player_vocab,
position_vocab=position_vocab,
season_vocab=season_vocab,
special_tokens_map_file=special_tokens_map,
**kwargs
)
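
    # Typical loading (a sketch; assumes the target repo or local directory contains
    # the four JSON files listed in `vocab_files_names`, and that the Hub repo id is
    # something like "achadj/footballbert-base"):
    #
    #   tokenizer = FootballBERTTokenizer.from_pretrained("achadj/footballbert-base")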
def __call__(
self,
match_data: Union[pd.DataFrame, Dict],
max_length: Optional[int] = None,
padding: bool = True,
return_tensors: Optional[str] = None,
**kwargs
) -> Dict:
"""
Tokenize match data into model inputs.
Args:
match_data: Either a DataFrame with columns [player_id, positions, season_name, team_name]
or a dict with lists for each field
max_length: Maximum sequence length (defaults to tokenizer's max_length)
padding: Whether to pad sequences
return_tensors: 'pt' for PyTorch tensors, 'np' for numpy arrays, None for lists
Returns:
Dictionary with input_ids, attention_mask, position_ids, team_affiliation_ids, season_ids
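
        Example (a sketch with made-up player, position and season labels; real inputs
        should use identifiers present in the loaded vocabularies):

            >>> import pandas as pd
            >>> match_df = pd.DataFrame({
            ...     "player_id": ["player_a", "player_b", "player_c", "player_d"],
            ...     "positions": ["GK", "FW", "GK", "FW"],
            ...     "season_name": ["2023/2024"] * 4,
            ...     "team_name": ["Home FC", "Home FC", "Away FC", "Away FC"],
            ... })
            >>> encoding = tokenizer(match_df, return_tensors="pt")
            >>> sorted(encoding.keys())
            ['attention_mask', 'input_ids', 'position_ids', 'season_ids', 'team_affiliation_ids']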
"""
if max_length is None:
max_length = self._max_length
# Convert dict to DataFrame if needed
if isinstance(match_data, dict):
match_data = pd.DataFrame(match_data)
# Get unique teams
teams = match_data['team_name'].unique()
if len(teams) != 2:
raise ValueError(f"Match data must contain exactly 2 teams, found {len(teams)}")
        # Group players by team so each team's rows are contiguous and ordering is consistent
match_data = pd.concat([
match_data[match_data['team_name'] == teams[0]],
match_data[match_data['team_name'] == teams[1]]
], ignore_index=True)
# Convert to token IDs
input_ids = [self.player_vocab.get(pid, self.player_pad_token_id)
for pid in match_data['player_id'].values]
position_ids = [self.position_vocab.get(pos, self.position_pad_token_id)
for pos in match_data['positions'].values]
season_ids = [self.season_vocab.get(season, self.season_pad_token_id)
for season in match_data['season_name'].values]
# Team affiliation (0 for team 1, 1 for team 2)
team_1_size = len(match_data[match_data['team_name'] == teams[0]])
team_affiliation_ids = [0] * team_1_size + [1] * (len(match_data) - team_1_size)
# Attention mask (1 for real tokens, 0 for padding)
attention_mask = [1] * len(input_ids)
# Padding
if padding and len(input_ids) < max_length:
pad_length = max_length - len(input_ids)
input_ids.extend([self.player_pad_token_id] * pad_length)
position_ids.extend([self.position_pad_token_id] * pad_length)
season_ids.extend([self.season_pad_token_id] * pad_length)
team_affiliation_ids.extend([2] * pad_length) # 2 for padding
attention_mask.extend([0] * pad_length)
# Convert to appropriate tensor type
if return_tensors == "pt":
import torch
return {
"input_ids": torch.tensor([input_ids], dtype=torch.long),
"attention_mask": torch.tensor([attention_mask], dtype=torch.long),
"position_ids": torch.tensor([position_ids], dtype=torch.long),
"team_affiliation_ids": torch.tensor([team_affiliation_ids], dtype=torch.long),
"season_ids": torch.tensor([season_ids], dtype=torch.long)
}
elif return_tensors == "np":
return {
"input_ids": np.array([input_ids]),
"attention_mask": np.array([attention_mask]),
"position_ids": np.array([position_ids]),
"team_affiliation_ids": np.array([team_affiliation_ids]),
"season_ids": np.array([season_ids])
}
else:
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"position_ids": position_ids,
"team_affiliation_ids": team_affiliation_ids,
"season_ids": season_ids
}
def batch_encode_matches(
self,
matches_data: List[pd.DataFrame],
max_length: Optional[int] = None,
padding: bool = True,
return_tensors: Optional[str] = None
) -> Dict:
"""
Encode multiple matches at once.
Args:
matches_data: List of DataFrames, one per match
max_length: Maximum sequence length
padding: Whether to pad sequences
return_tensors: 'pt' for PyTorch, 'np' for numpy, None for lists
Returns:
Dictionary with batched tensors
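
        Example (a sketch; `match_df_1` and `match_df_2` are DataFrames in the format
        accepted by `__call__`, each padded to `max_length`, 32 by default):

            >>> batch = tokenizer.batch_encode_matches([match_df_1, match_df_2], return_tensors="pt")
            >>> batch["input_ids"].shape
            torch.Size([2, 32])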
"""
all_encodings = [self(match, max_length=max_length, padding=padding, return_tensors=None)
for match in matches_data]
# Stack encodings
batch = {
key: [enc[key] for enc in all_encodings]
for key in all_encodings[0].keys()
}
if return_tensors == "pt":
import torch
return {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
elif return_tensors == "np":
return {k: np.array(v) for k, v in batch.items()}
return batch