# ricl/src/openpi/models/tokenizer.py
import logging

import etils.epath as epath
import numpy as np
import sentencepiece
from transformers import AutoProcessor

import openpi.shared.download as download


class PaligemmaTokenizer:
    def __init__(self, max_len: int = 48):
        self._max_len = max_len

        # Locate the PaliGemma tokenizer model: prefer a local copy, then the cluster
        # copy, then download from the public GCS bucket.
        local_path = epath.Path("assets/paligemma_tokenizer.model")
        hf_path = epath.Path(
            "/projects/extern/kisski/kisski-spath/dir.project/VLA_Groot/in_context_learning/VLA-Humanoid/paligemma-3b-pt-224/tokenizer.model"
        )
        if local_path.exists():
            path = local_path
        elif hf_path.exists():
            path = hf_path
        else:
            path = download.maybe_download("gs://big_vision/paligemma_tokenizer.model", gs={"token": "anon"})
        with path.open("rb") as f:
            self._tokenizer = sentencepiece.SentencePieceProcessor(model_proto=f.read())

    def tokenize(self, prompt: str) -> tuple[np.ndarray, np.ndarray]:
        cleaned_text = prompt.strip().replace("_", " ").replace("\n", " ")
        # Tokenize "\n" separately as the "start of answer" token.
        tokens = self._tokenizer.encode(cleaned_text, add_bos=True) + self._tokenizer.encode("\n")
        tokens_len = len(tokens)
        if tokens_len < self._max_len:
            # Pad with False (numerically 0) so the same list pads both tokens and mask.
            padding = [False] * (self._max_len - tokens_len)
            mask = [True] * tokens_len + padding
            tokens = tokens + padding
        else:
            if len(tokens) > self._max_len:
                logging.warning(
                    f"Token length ({len(tokens)}) exceeds max length ({self._max_len}), truncating. "
                    "Consider increasing the `max_token_len` in your model config if this happens frequently."
                )
            tokens = tokens[: self._max_len]
            mask = [True] * self._max_len
        return np.asarray(tokens), np.asarray(mask)
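

# Example usage (illustrative; assumes the tokenizer model is available locally,
# on the cluster, or via GCS download):
#
#   tokenizer = PaligemmaTokenizer(max_len=48)
#   tokens, mask = tokenizer.tokenize("pick up the red block")
#   assert tokens.shape == (48,) and mask.shape == (48,)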


class FASTTokenizer:
    def __init__(self, max_len: int = 256, fast_tokenizer_path: str = "physical-intelligence/fast"):
        self._max_len = max_len

        # Download the base PaliGemma tokenizer: prefer a local copy, then the cluster
        # copy, then the public GCS bucket.
        local_path = epath.Path("assets/paligemma_tokenizer.model")
        hf_path = epath.Path(
            "/projects/extern/kisski/kisski-spath/dir.project/VLA_Groot/in_context_learning/VLA-Humanoid/paligemma-3b-pt-224/tokenizer.model"
        )
        if local_path.exists():
            path = local_path
        elif hf_path.exists():
            path = hf_path
        else:
            path = download.maybe_download("gs://big_vision/paligemma_tokenizer.model", gs={"token": "anon"})
        with path.open("rb") as f:
            self._paligemma_tokenizer = sentencepiece.SentencePieceProcessor(model_proto=f.read())

        # Instantiate the FAST tokenizer, checking for a local copy first.
        local_fast_path = epath.Path("fast")
        if local_fast_path.exists():
            fast_tokenizer_path = str(local_fast_path)
        self._fast_tokenizer = AutoProcessor.from_pretrained(fast_tokenizer_path, trust_remote_code=True)

        # Skip the last 128 tokens in the PaliGemma vocab since they are special tokens.
        self._fast_skip_tokens = 128
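
    # With the standard PaliGemma sentencepiece vocab (257,152 entries; assumed here),
    # FAST action token 0 maps to id 257152 - 1 - 128 = 257023, and larger action
    # tokens map to progressively smaller ids in the tail of the vocab.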

    def tokenize(
        self,
        prompt: str,
        state: np.ndarray,
        actions: np.ndarray | None,
        dont_pad: bool = False,
        dont_loss: bool = False,
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        cleaned_text = prompt.lower().strip().replace("_", " ")

        # Convention: state is discretized into 256 bins (assumed range after normalization: [-1, 1]).
        discretized_state = np.digitize(state, bins=np.linspace(-1, 1, 256 + 1)[:-1]) - 1

        # Convention: the prefix is the prompt plus a string representation of the state, followed by ';'.
        state_str = " ".join(map(str, discretized_state))
        prefix = f"Task: {cleaned_text}, State: {state_str};\n"
        prefix_tokens = self._paligemma_tokenizer.encode(prefix, add_bos=True)

        if actions is not None:
            # Tokenize actions with the FAST tokenizer --> map to the last tokens in the PaliGemma vocab.
            action_tokens = self._fast_tokenizer(actions[None])[0]
            action_tokens_in_pg = self._act_tokens_to_paligemma_tokens(action_tokens)
            # Convention: the postfix contains 'Action:' followed by the FAST tokens, followed by '|'.
            postfix_tokens = (
                self._paligemma_tokenizer.encode("Action: ")
                + action_tokens_in_pg.tolist()
                + self._paligemma_tokenizer.encode("|")
            )
        else:
            postfix_tokens = []

        # Create the output token sequence & masks.
        # The AR mask is 0 on the prefix (bidirectional attention) and 1 on the postfix
        # (causal attention to all previous tokens).
        tokens = prefix_tokens + postfix_tokens
        token_mask = [True] * len(tokens)
        ar_mask = [0] * len(prefix_tokens) + [1] * len(postfix_tokens)
        if dont_loss:
            # No loss on either prefix or postfix.
            loss_mask = [False] * (len(prefix_tokens) + len(postfix_tokens))
        else:
            # Loss on the postfix only.
            loss_mask = [False] * len(prefix_tokens) + [True] * len(postfix_tokens)

        # Pad tokens to the max length.
        tokens_len = len(tokens)
        if tokens_len < self._max_len:
            if dont_pad:
                # Return the unpadded sequences when padding is not desired.
                return np.asarray(tokens), np.asarray(token_mask), np.asarray(ar_mask), np.asarray(loss_mask)
            padding = [False] * (self._max_len - tokens_len)
            tokens = tokens + padding
            token_mask = token_mask + padding
            ar_mask = ar_mask + padding
            loss_mask = loss_mask + padding
        else:
            if len(tokens) > self._max_len:
                logging.warning(
                    f"Token length ({len(tokens)}) exceeds max length ({self._max_len}), truncating. "
                    "Consider increasing the `max_token_len` in your model config if this happens frequently."
                )
            tokens = tokens[: self._max_len]
            token_mask = token_mask[: self._max_len]
            ar_mask = ar_mask[: self._max_len]
            loss_mask = loss_mask[: self._max_len]
        return np.asarray(tokens), np.asarray(token_mask), np.asarray(ar_mask), np.asarray(loss_mask)
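
    # Illustrative layout of the sequence produced above (contents assumed):
    #   prefix  "Task: ..., State: ...;\n" -> token_mask=True,  ar_mask=0, loss_mask=False
    #   postfix "Action: <fast tokens> |"  -> token_mask=True,  ar_mask=1, loss_mask=True
    #   padding                            -> token_mask=False, ar_mask=0, loss_mask=False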

    def extract_actions(self, tokens: np.ndarray, action_horizon: int, action_dim: int) -> np.ndarray:
        # Decode the predicted output tokens.
        decoded_tokens = self._paligemma_tokenizer.decode(tokens.tolist())

        # Extract actions from FAST model outputs.
        if "Action: " not in decoded_tokens:
            logging.warning(f"No `Action: ` found in decoded tokens: {decoded_tokens}; returning zeros.")
            return np.zeros((action_horizon, action_dim), dtype=np.float32)

        # Re-encode the substring between 'Action: ' and '|', then map back to FAST tokens.
        raw_action_tokens = np.array(
            self._paligemma_tokenizer.encode(decoded_tokens.split("Action: ")[1].split("|")[0].strip())
        )
        action_tokens = self._act_tokens_to_paligemma_tokens(raw_action_tokens)
        return self._fast_tokenizer.decode(
            [action_tokens.tolist()], time_horizon=action_horizon, action_dim=action_dim
        )[0]

    def _act_tokens_to_paligemma_tokens(self, tokens: np.ndarray | list[int]) -> np.ndarray:
        # The mapping t -> vocab_size - 1 - skip - t is an involution: applying it twice
        # recovers the original ids, so the same method converts in both directions.
        if isinstance(tokens, list):
            tokens = np.array(tokens)
        return self._paligemma_tokenizer.vocab_size() - 1 - self._fast_skip_tokens - tokens
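

# Example usage (illustrative; token ids depend on the downloaded tokenizer assets,
# and the FAST processor needs network access on first use):
#
#   tok = FASTTokenizer(max_len=256)
#   state = np.zeros(8)                     # normalized to [-1, 1]
#   actions = np.zeros((10, 8))             # (action_horizon, action_dim)
#   tokens, token_mask, ar_mask, loss_mask = tok.tokenize("pick up the block", state, actions)
#   recovered = tok.extract_actions(tokens, action_horizon=10, action_dim=8)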


class FASTTokenizerRicl:
    def __init__(
        self,
        max_len: int = 256,
        fast_tokenizer_path: str = "physical-intelligence/fast",
        action_horizon: int = 10,
        action_dim: int = 8,
    ):
        self._max_len = max_len
        self._action_horizon = action_horizon
        self._action_dim = action_dim

        # Download the base PaliGemma tokenizer: prefer a local copy, then the cluster
        # copy, then the public GCS bucket.
        local_path = epath.Path("assets/paligemma_tokenizer.model")
        hf_path = epath.Path(
            "/projects/extern/kisski/kisski-spath/dir.project/VLA_Groot/in_context_learning/VLA-Humanoid/paligemma-3b-pt-224/tokenizer.model"
        )
        if local_path.exists():
            path = local_path
        elif hf_path.exists():
            path = hf_path
        else:
            path = download.maybe_download("gs://big_vision/paligemma_tokenizer.model", gs={"token": "anon"})
        with path.open("rb") as f:
            self._paligemma_tokenizer = sentencepiece.SentencePieceProcessor(model_proto=f.read())

        # Instantiate the FAST tokenizer, checking for a local copy first.
        local_fast_path = epath.Path("fast")
        if local_fast_path.exists():
            fast_tokenizer_path = str(local_fast_path)
        self._fast_tokenizer = AutoProcessor.from_pretrained(fast_tokenizer_path, trust_remote_code=True)

        # Skip the last 128 tokens in the PaliGemma vocab since they are special tokens.
        self._fast_skip_tokens = 128

    def tokenize(
        self,
        prompt: str,
        state: np.ndarray,
        actions: np.ndarray | None,
        dont_pad: bool = False,
        dont_loss: bool = False,
    ) -> tuple[np.ndarray, np.ndarray | None, np.ndarray, np.ndarray, np.ndarray]:
        cleaned_text = prompt.lower().strip().replace("_", " ")

        # Convention: state is discretized into 256 bins (assumed range after normalization: [-1, 1]).
        discretized_state = np.digitize(state, bins=np.linspace(-1, 1, 256 + 1)[:-1]) - 1

        # Convention: the prefix is the prompt plus a string representation of the state, followed by ';'.
        state_str = " ".join(map(str, discretized_state))
        prefix = f"Task: {cleaned_text}, State: {state_str};\n"
        prefix_tokens = self._paligemma_tokenizer.encode(prefix, add_bos=True)

        if actions is not None:
            # Tokenize actions with the FAST tokenizer --> map to the last tokens in the PaliGemma vocab.
            assert actions.shape == (self._action_horizon, self._action_dim), f"{actions.shape=}"
            action_tokens = self._fast_tokenizer(actions[None])[0]
            action_tokens_in_pg = self._act_tokens_to_paligemma_tokens(action_tokens)
            # Convention: the postfix contains 'Action:' followed by the FAST tokens, followed by '|'.
            postfix_tokens = (
                self._paligemma_tokenizer.encode("Action: ")
                + action_tokens_in_pg.tolist()
                + self._paligemma_tokenizer.encode("|")
            )
        else:
            postfix_tokens = []

        # Always pad the prefix tokens to 1/2 the max length.
        assert self._max_len % 2 == 0, (
            "max_len must be divisible by 2 to pad prefix tokens to 1/2 the max length "
            "and postfix tokens to the rest"
        )
        if len(prefix_tokens) < self._max_len // 2:
            prefix_padding = [False] * (self._max_len // 2 - len(prefix_tokens))
        else:
            raise ValueError(
                f"Prefix tokens length ({len(prefix_tokens)}) exceeds 1/2 the max length "
                f"({self._max_len // 2})! Increase the `max_token_len` in your model config."
            )

        # Pad the postfix tokens unless dont_pad is set.
        if dont_pad:
            postfix_padding = []
        else:
            postfix_padding = [False] * (self._max_len - len(prefix_tokens) - len(prefix_padding) - len(postfix_tokens))

        # Create the output token sequence & masks.
        # The AR mask is 0 on the prefix (bidirectional attention) and 1 on the postfix
        # (causal attention to all previous tokens).
        tokens_len = len(prefix_tokens) + len(prefix_padding) + len(postfix_tokens) + len(postfix_padding)
        if not dont_pad:
            assert tokens_len == self._max_len
        token_mask = (
            [True] * len(prefix_tokens)
            + [False] * len(prefix_padding)
            + [True] * len(postfix_tokens)
            + [False] * len(postfix_padding)
        )
        ar_mask = (
            [0] * len(prefix_tokens)
            + [0] * len(prefix_padding)
            + [1] * len(postfix_tokens)
            + [0] * len(postfix_padding)
        )
        if dont_loss:
            # No loss on either prefix or postfix.
            loss_mask = [False] * tokens_len
        else:
            # Loss on postfix_tokens only.
            loss_mask = (
                [False] * len(prefix_tokens)
                + [False] * len(prefix_padding)
                + [True] * len(postfix_tokens)
                + [False] * len(postfix_padding)
            )

        # Pad the prefix and postfix tokens.
        prefix_tokens = prefix_tokens + prefix_padding
        postfix_tokens = postfix_tokens + postfix_padding
        if len(postfix_tokens) == 0:
            # Happens at inference time, when actions are not provided and dont_pad is True.
            postfix_tokens = None
        else:
            postfix_tokens = np.asarray(postfix_tokens)
        return np.asarray(prefix_tokens), postfix_tokens, np.asarray(token_mask), np.asarray(ar_mask), np.asarray(loss_mask)
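
    # Illustrative layout of the returned split (contents assumed):
    #   prefix:  "Task: ..., State: ...;\n" padded to exactly max_len // 2 tokens
    #   postfix: "Action: <fast tokens> |"  padded to the remaining max_len // 2 tokens
    # The masks cover the concatenation prefix + postfix, in that order.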

    def extract_actions(self, tokens: np.ndarray, action_horizon: int, action_dim: int) -> np.ndarray:
        assert action_horizon == self._action_horizon and action_dim == self._action_dim, (
            f"{action_horizon=}, {action_dim=}, {self._action_horizon=}, {self._action_dim=}"
        )

        # Decode the predicted output tokens.
        decoded_tokens = self._paligemma_tokenizer.decode(tokens.tolist())

        # Extract actions from FAST model outputs.
        if "Action: " not in decoded_tokens:
            logging.warning(f"No `Action: ` found in decoded tokens: {decoded_tokens}; returning zeros.")
            return np.zeros((action_horizon, action_dim), dtype=np.float32)

        # Re-encode the substring between 'Action: ' and '|', then map back to FAST tokens.
        logging.debug(f"decoded_tokens: {decoded_tokens}")
        raw_action_tokens = np.array(
            self._paligemma_tokenizer.encode(decoded_tokens.split("Action: ")[1].split("|")[0].strip())
        )
        logging.debug(f"raw_action_tokens: {raw_action_tokens}")
        action_tokens = self._act_tokens_to_paligemma_tokens(raw_action_tokens)
        logging.debug(f"action_tokens: {action_tokens}")
        outputs = self._fast_tokenizer.decode(
            [action_tokens.tolist()], time_horizon=action_horizon, action_dim=action_dim
        )
        assert outputs.shape == (1, action_horizon, action_dim), f"{outputs.shape=}"
        outputs = outputs[0]
        logging.debug(f"outputs before normalization: {outputs}")
        return outputs

    def _act_tokens_to_paligemma_tokens(self, tokens: np.ndarray | list[int]) -> np.ndarray:
        # Same involution as in FASTTokenizer: applying the mapping twice recovers
        # the original ids, so it converts in both directions.
        if isinstance(tokens, list):
            tokens = np.array(tokens)
        return self._paligemma_tokenizer.vocab_size() - 1 - self._fast_skip_tokens - tokens
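

if __name__ == "__main__":
    # Minimal smoke test (illustrative): requires the PaliGemma tokenizer assets to be
    # available locally, on the cluster, or via GCS download.
    tokenizer = PaligemmaTokenizer(max_len=48)
    tokens, mask = tokenizer.tokenize("pick up the red block")
    print(tokens.shape, mask.shape)  # expected: (48,) (48,)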