fcxfcx
/

owlv2

Model card Files Files and versions

owlv2 / scenic /projects /t5 /tokenizer.py

fcxfcx's picture

Upload 2446 files

1327f34 verified 5 days ago

history blame contribute delete

3.69 kB

	# Copyright 2025 The Scenic Authors.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Constructor functions for the pretrained SentencePiece tokenizer.

	This module provides constructor functions for creating the pretrained
	SentencePiece tokenizer.

	The current DMVR SentencePiece tokenizer always sets `prepend_bos` when
	initializing the tensorflow processor and returns a sliced tensor when called
	with `prepend_bos=False`. This is problematic when the sentencepiece model is
	not trained with the BOS token (predefined and hard-coded as '<S>'), which is
	the case for the T5 tokenizer. This module contains a wrapper for the DMVR
	SentencePiece tokenizer to initialize the tensorflow processor without
	prepending BOS. Instead, it prepends a custom BOS token given as an argument.
	"""

	from collections.abc import Sequence
	from typing import Optional
	from typing import Union

	from dmvr import tokenizers
	import tensorflow as tf
	import tensorflow_text

	# Default SentencePiece model file (a `gs://` path); used as the default
	# `model_path` argument of `build_dmvr_sp_model`.
	# pylint: disable=line-too-long
	SP_MODEL_PATH = 'gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model'
	# pylint: enable=line-too-long


	class SentencePieceTokenizer(tokenizers.SentencePieceTokenizer):
	  """DMVR SentencePiece tokenizer that prepends a caller-supplied BOS id.

	  The current DMVR SentencePiece tokenizer always sets `prepend_bos` when
	  initializing the tensorflow processor and returns a sliced tensor when called
	  with `prepend_bos=False`. This is problematic when the sentencepiece model is
	  not trained with the BOS token (predefined and hard-coded as '<S>'), which is
	  the case for the T5 tokenizer. This class wraps the DMVR tokenizer so that
	  the tensorflow processor is initialized without BOS; when a BOS is requested
	  at call time, the `bos_id` given to the constructor is written instead.
	  """

	  def __init__(self,
	               model_path: str,
	               bos_id: int = 0):
	    """Stores the custom BOS id and delegates to the DMVR constructor.

	    Args:
	      model_path: Path of the SentencePiece model file, forwarded unchanged to
	        the base class constructor.
	      bos_id: Token id placed in the first position of every row when
	        `string_tensor_to_indices` is called with `prepend_bos=True`.
	    """
	    self.bos_id = bos_id
	    super().__init__(model_path)

	  def initialize(self):
	    """Creates the tensorflow_text processor from the model file.

	    Reads the bytes at `self._model_path` through `tf.io.gfile` and builds a
	    `tensorflow_text.SentencepieceTokenizer` that emits int32 ids, never adds
	    BOS and always adds EOS. The EOS is stripped again at call time when the
	    caller does not ask for it.
	    """
	    with tf.io.gfile.GFile(self._model_path, 'rb') as f:
	      self._tf_sp_model = tensorflow_text.SentencepieceTokenizer(
	          model=f.read(),
	          out_type=tf.int32,
	          add_bos=False,
	          add_eos=True)

	  def string_tensor_to_indices(self,
	                               string_tensor: Union[tf.Tensor, Sequence[str]],
	                               prepend_bos: bool = False,
	                               append_eos: bool = False,
	                               max_num_tokens: Optional[int] = 32) -> tf.Tensor:
	    """Tokenizes strings into a dense, padded tensor of token ids.

	    Args:
	      string_tensor: Strings to tokenize. NOTE(review): the result of
	        `tokenize` must expose `to_tensor`, so this presumably has to be a
	        batch of strings rather than a scalar -- confirm against callers.
	      prepend_bos: Whether to put `self.bos_id` in the first column.
	      append_eos: Whether to keep the EOS id emitted by the processor.
	      max_num_tokens: Fixed number of columns of the output. Rows are padded
	        with `self._pad_token` (an attribute not defined in this class) or
	        truncated to this width. If `None`, rows are only padded up to the
	        longest row of the batch.

	    Returns:
	      A dense tensor of token ids. With a fixed `max_num_tokens` it has exactly
	      that many columns; in that case `prepend_bos=True` shifts the tokens one
	      position to the right and the last column (which can be the EOS of a
	      full row) falls off. With `max_num_tokens=None` and `prepend_bos=True`
	      the output is one column wider than the longest tokenized row and no
	      token is dropped.

	    Raises:
	      RuntimeError: If `initialize` has not been called yet.
	    """
	    if self._tf_sp_model is None:
	      raise RuntimeError('Model was not initialized. Call `initialize` method.')

	    tokenized = self._tf_sp_model.tokenize(string_tensor)
	    # The processor is built with `add_eos=True`; strip the EOS on request.
	    if not append_eos:
	      tokenized = tokenized[..., :-1]

	    # Pad to `max_num_tokens`.
	    shape = None if max_num_tokens is None else [None, max_num_tokens]
	    tokenized = tokenized.to_tensor(default_value=self._pad_token, shape=shape)

	    if prepend_bos:
	      bos_column = tf.zeros_like(tokenized[..., 0:1]) + self.bos_id
	      # Only a fixed width requires dropping the last column to make room for
	      # the BOS. Without a width limit, dropping it would silently remove the
	      # final token (possibly the EOS) of the longest row.
	      if max_num_tokens is None:
	        remainder = tokenized
	      else:
	        remainder = tokenized[..., :-1]
	      tokenized = tf.concat([bos_column, remainder], -1)
	    return tokenized


	def build_dmvr_sp_model(model_path: str = SP_MODEL_PATH):
	  """Constructs the BOS-wrapping SentencePiece tokenizer for a model file.

	  Args:
	    model_path: Path of the SentencePiece model file. Defaults to
	      `SP_MODEL_PATH`.

	  Returns:
	    A `SentencePieceTokenizer` created with its default `bos_id`. This
	    function does not call `initialize()` on it; `string_tensor_to_indices`
	    raises until that has been done.
	  """
	  tokenizer = SentencePieceTokenizer(model_path)
	  return tokenizer