Spaces:

osherr
/

Prior2DSM

Sleeping

App Files Files Community

Prior2DSM / src /dinov3 /eval /text /tokenizer.py

osherr

Upload 222 files

bc90483 verified 5 months ago

raw

history blame contribute delete

2.42 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	#
	# This software may be used and distributed in accordance with
	# the terms of the DINOv3 License Agreement.

	from typing import List, Union

	import torch
	from dinov3.thirdparty.CLIP.clip.simple_tokenizer import SimpleTokenizer


	class Tokenizer(SimpleTokenizer):
	def __init__(self, vocab_path: str):
	SimpleTokenizer.__init__(self, bpe_path=vocab_path)

	def tokenize(
	self, texts: Union[str, List[str]], context_length: int = 77
	) -> torch.LongTensor:
	"""
	Returns the tokenized representation of given input string(s)

	Parameters
	----------
	texts : Union[str, List[str]]
	An input string or a list of input strings to tokenize
	context_length : int
	The context length to use; all CLIP models use 77 as the context length

	Returns
	-------
	A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
	"""
	if isinstance(texts, str):
	texts = [texts]
	sot_token = self.encoder["<\|startoftext\|>"]
	eot_token = self.encoder["<\|endoftext\|>"]
	all_tokens = [[sot_token] + self.encode(text) + [eot_token] for text in texts]
	result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

	for i, tokens in enumerate(all_tokens):
	if len(tokens) > context_length:
	tokens = tokens[:context_length] # Truncate
	tokens[-1] = eot_token
	result[i, : len(tokens)] = torch.tensor(tokens)

	return result


	def get_tokenizer(bpe_path_or_url: str) -> Tokenizer \| None:
	import urllib
	from io import BytesIO

	from .tokenizer import Tokenizer

	if urllib.parse.urlparse(bpe_path_or_url).scheme:
	try:
	with urllib.request.urlopen(bpe_path_or_url) as response:
	file_buf = BytesIO(response.read())
	return Tokenizer(vocab_path=file_buf)
	except Exception as e:
	raise FileNotFoundError(
	f"Failed to download file from url {bpe_path_or_url} with error last: {e}"
	)
	else:
	with open(bpe_path_or_url, "rb") as f:
	file_buf = BytesIO(f.read())
	return Tokenizer(vocab_path=file_buf)