ChessGPT_d12 / agents /uci_tokenizers.py

Create agents/uci_tokenizers.py

8e4af20 verified about 1 year ago

11.3 kB

	from typing import List

	import chess
	import tiktoken
	import tokenizers
	from tokenizers import models, pre_tokenizers, processors
	from torch import Tensor as TT
	from transformers import PreTrainedTokenizerFast
	from transformers.tokenization_utils_fast import BatchEncoding


	def getTiktokenizer() -> tiktoken.Encoding:
	"""
	Defines a tiktoken-based BPE encoder for UCI chess moves. This
	tokenizer effectively tokenizes UCI moves by the square names.
	One notable variation is that promotions must be in upper-case.

	Vocabulary:
	Special Tokens (4): "\<\|pad\|\>", "\<\|startoftext\|\>", "\<\|endoftext\|\>", "\<\|unknown\|\>"
	Square Tokens (64): a1 through h8
	Promote Tokens (4): Q, B, R, N
	UNUSED (8120): Need 8192-4-64-4=8120 unused tokens of the form <\|unused####\|>
	"""
	special_tokens = ["<\|pad\|>", "<\|startoftext\|>", "<\|endoftext\|>", "<\|unknown\|>"]
	unused_tokens = [f"<\|unused{i:04d}" for i in range(8120)]
	chess_vocab = special_tokens + chess.SQUARE_NAMES + list("QBRN") + unused_tokens
	mergeable_ranks = {k.encode():v for (v,k) in enumerate(chess_vocab)}
	chess_pat_str = r'[a-h][1-8]\|[QBRN]'

	enc = tiktoken.Encoding(
	name="chess_enc",
	pat_str=chess_pat_str, # or \d\|\s
	mergeable_ranks=mergeable_ranks,
	special_tokens={k:v for (v,k) in enumerate(special_tokens)},
	)

	return enc


	class UciTokenizer(PreTrainedTokenizerFast):
	_PAD_TOKEN: str
	_UNK_TOKEN: str
	_EOS_TOKEN: str
	_BOS_TOKEN: str


	stoi: dict[str, int]
	"""Integer to String mapping"""

	itos: dict[int, str]
	"""String to Integer Mapping. This is the vocab"""

	def __init__(
	self,
	stoi,
	itos,
	pad_token,
	unk_token,
	bos_token,
	eos_token,
	name_or_path,
	**kwargs
	):
	self.stoi = stoi
	self.itos = itos

	self._PAD_TOKEN = pad_token
	self._UNK_TOKEN = unk_token
	self._EOS_TOKEN = eos_token
	self._BOS_TOKEN = bos_token

	# Define the model
	tok_model = models.WordLevel(vocab=self.stoi, unk_token=self._UNK_TOKEN)

	slow_tokenizer = tokenizers.Tokenizer(tok_model)
	slow_tokenizer.pre_tokenizer = self._init_pretokenizer()

	# post processing adds special tokens unless explicitly ignored
	post_proc = processors.TemplateProcessing(
	single=f"{bos_token} $0",
	pair=None,
	special_tokens=[(bos_token, 1)],
	)
	slow_tokenizer.post_processor=post_proc

	super().__init__(
	tokenizer_object=slow_tokenizer,
	unk_token=self._UNK_TOKEN,
	bos_token=self._BOS_TOKEN,
	eos_token=self._EOS_TOKEN,
	pad_token=self._PAD_TOKEN,
	name_or_path=name_or_path,
	**kwargs
	)

	# Override the decode behavior to ensure spaces are correctly handled
	def _decode(
	token_ids: int \| List[int] \| dict \| TT,
	skip_special_tokens=False,
	clean_up_tokenization_spaces=False,
	) -> int \| List[int]:

	if isinstance(token_ids, int):
	return self.itos.get(token_ids, self._UNK_TOKEN)

	if isinstance(token_ids, dict):
	token_ids = token_ids["input_ids"]

	if isinstance(token_ids, TT):
	token_ids = token_ids.tolist()

	if isinstance(token_ids, list):
	tokens_str = [self.itos.get(xi, self._UNK_TOKEN) for xi in token_ids]
	processed_tokens = self._process_str_tokens(tokens_str)

	return " ".join(processed_tokens)

	raise ValueError(f"Unknown input type to decode() for argument 'token_ids'. Received: {type(token_ids)} ")


	self._decode = _decode

	def _init_pretokenizer(self) -> pre_tokenizers.PreTokenizer:
	raise NotImplementedError

	def _process_str_tokens(self, tokens_str: list[str], return_player_ids: bool) -> list[str]:
	raise NotImplementedError

	def get_id2square_list() -> list[int]:
	raise NotImplementedError


	class UciTileTokenizer(UciTokenizer):
	""" Uci tokenizer converting start/end tiles and promotion types each into individual tokens"""

	SPECIAL_TOKENS = ["<\|pad\|>", "<\|startoftext\|>", "<\|endoftext\|>", "<\|unknown\|>"]

	stoi = {
	tok: idx
	for tok, idx in list(
	zip(SPECIAL_TOKENS + chess.SQUARE_NAMES + list("QRBN"), range(72))
	)
	}

	itos = {
	idx: tok
	for tok, idx in list(
	zip(SPECIAL_TOKENS + chess.SQUARE_NAMES + list("QRBN"), range(72))
	)
	}

	id2square:List[int] = list(range(4,68))
	"""
	List mapping token IDs to squares on the chess board. Order is file then rank, i.e.:
	`A1, B1, C1, ..., F8, G8, H8`
	"""

	def get_id2square_list(self) -> List[int]:
	return self.id2square

	def __init__(self, **kwargs):
	super().__init__(
	self.stoi,
	self.itos,
	pad_token="<\|pad\|>",
	unk_token="<\|unknown\|>",
	bos_token="<\|startoftext\|>",
	eos_token="<\|endoftext\|>",
	name_or_path="austindavis/uci_tile_tokenizer",
	clean_up_tokenization_spaces=False,
	**kwargs
	)

	def _init_pretokenizer(self):
	# Pre-tokenizer to split input into UCI moves
	pattern = tokenizers.Regex(r"\d\|[QBRN]")
	pre_tokenizer = pre_tokenizers.Sequence(
	[
	pre_tokenizers.Whitespace(),
	pre_tokenizers.Split(pattern=pattern, behavior="merged_with_previous"),
	]
	)
	return pre_tokenizer

	def _process_str_tokens(self, token_str: list[str]):
	moves = []
	next_move = ""
	for token in token_str:

	# skip special tokens
	if token in self.all_special_tokens:
	continue

	# handle promotions
	if len(token) == 1:
	next_move += token
	continue

	# handle regular tokens if there's room
	if len(next_move) < 4:
	next_move += token
	continue

	moves.append(next_move)
	next_move = token

	moves.append(next_move)
	return moves

	@staticmethod
	def compute_players(encoding: BatchEncoding, according_to='output'):
	"""
	Determines which player (white=True, black=False) is associated with each token in the sequence.
	This method works based on chess move sequences tokenized using the UciTileTokenizer.

	# Parameters:
	----------
	`encoding` : BatchEncoding
	Tokenized input of a chess game, where each token represents a move or special token.

	`according_to` : str (optional, default='output')
	Specifies the perspective for associating players:
	- 'output': Returns the player whose next move is predicted by the sequence (the output move).
	- Otherwise: Returns the player associated with the input tokens (i.e., which player made each move).

	# Returns:
	-------
	List[bool]
	A list of boolean values indicating the player for each token:
	- True for white (player 1),
	- False for black (player 2).

	The list length corresponds to the number of tokens in the sequence, including special tokens if any.

	# Example Usage:
	```
	>>> tok = UciTileTokenizer()
	>>> encoding = tok('e2e4 d7d5 e4d5 e7e6 d5e6 d8g5 e6e7 g5f6 e7f8Q')
	>>> print(encoding['input_ids'])
	[1, 16, 32, 55, 39, 32, 39, 56, 48, 39, 48, 63, 42, 48, 56, 42, 49, 56, 65, 68]
	>>> tok.compute_players(encoding)
	[True, True, False, False, True, True, False, False, True, True, False, False, True, True, False, False, True, True, True, False]
	>>> tok.compute_players(encoding, according_to='input')
	[True, True, True, False, False, True, True, False, False, True, True, False, False, True, True, False, False, True, True, True]
	```

	# Notes:
	-------
	This method does not rely on board position calculations. Therefore, when
	using `according_to='output'`, it cannot reliably predict which player is
	responsible for selecting the final token of the sequence. For instance,
	if a pawn is moved to the back rank (e.g., 'e7e8'), then white must select
	the promotion class on the next token; however, this algorithm will predict
	that black is responsible for selecting the next token instead of white.
	"""

	return [UciTileTokenizer._compute_players_single(encoding[i].ids) for i in range(len(encoding['input_ids']))]



	@staticmethod
	def _compute_players_single(input_ids: list[int], according_to: str='output'):
	players = [] if according_to == "output" else [True]
	current_player = False
	num_tokens_in_ply = 0
	has_specials = False

	for i, token_id in enumerate(input_ids):
	if token_id == 1:
	has_specials = True
	continue

	if num_tokens_in_ply == 0:
	# check if promotion OR unknown token ID
	if token_id > 67 or token_id == 3:
	players.append(current_player)
	num_tokens_in_ply = 0
	else:
	num_tokens_in_ply += 1
	current_player = not current_player
	players.append(current_player)
	elif num_tokens_in_ply == 1:
	num_tokens_in_ply = 0
	players.append(current_player)
	else:
	raise ValueError("Illegal move sequence")

	if according_to == "output":
	# anticipate what output should be based on the final input token
	# see notes for more detail
	if num_tokens_in_ply == 0:
	if token_id > 67:
	players.append(not current_player)
	else:
	players.append(current_player)
	else:
	players.append(current_player)

	return players if has_specials else players[1:]

	if __name__ == "__main__":
	tok = UciTileTokenizer()
	encoding = tok('e2e4Q b7b8N e2e7 a1',add_special_tokens=True)
	print(f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='output')=}")
	print(f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='input')=}")

	encoding = tok('e2e4Q b7b8N e2e7 a1',add_special_tokens=False)
	print(f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='output')=}")
	print(f"{encoding['input_ids']=}\n{tok.compute_players(encoding, according_to='input')=}")

	encoding = tok('e2e4 d7d5 e4d5 e7e6 d5e6 d8g5 e6e7 g5f6 e7f8Q')
	print(encoding['input_ids'])
	print(tok.compute_players(encoding))
	print(tok.compute_players(encoding, according_to='input'))