Prior2DSM / src /dinov3 /eval /text /tokenizer.py
osherr's picture
Upload 222 files
bc90483 verified
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This software may be used and distributed in accordance with
# the terms of the DINOv3 License Agreement.
from typing import List, Union
import torch
from dinov3.thirdparty.CLIP.clip.simple_tokenizer import SimpleTokenizer
class Tokenizer(SimpleTokenizer):
    """BPE tokenizer built on CLIP's ``SimpleTokenizer`` that produces
    fixed-width, zero-padded tensors of token ids."""

    def __init__(self, vocab_path: str):
        """Set up the underlying ``SimpleTokenizer``.

        Parameters
        ----------
        vocab_path : str
            BPE vocabulary source, forwarded unchanged to
            ``SimpleTokenizer`` as its ``bpe_path`` argument.
        """
        super().__init__(bpe_path=vocab_path)

    def tokenize(
        self, texts: Union[str, List[str]], context_length: int = 77
    ) -> torch.LongTensor:
        """
        Encode one or more strings into a padded tensor of token ids.

        Every sequence is wrapped in the start-of-text and end-of-text
        tokens. Sequences longer than ``context_length`` are cut to that
        length and their final slot is overwritten with the end-of-text
        token; shorter sequences are right-padded with zeros.

        Parameters
        ----------
        texts : Union[str, List[str]]
            A single string, or a list of strings, to tokenize
        context_length : int
            Width of each output row (77 by default, the CLIP context length)

        Returns
        -------
        A two-dimensional ``torch.long`` tensor of shape
        [number of input strings, context_length]
        """
        batch = [texts] if isinstance(texts, str) else texts

        start_id = self.encoder["<|startoftext|>"]
        end_id = self.encoder["<|endoftext|>"]

        # Encode everything up front so the output can be allocated in one go.
        encoded = [[start_id, *self.encode(text), end_id] for text in batch]
        padded = torch.zeros(len(encoded), context_length, dtype=torch.long)

        for row, ids in enumerate(encoded):
            if len(ids) > context_length:
                # Drop the overflow, then make sure the row still ends
                # with the end-of-text marker.
                del ids[context_length:]
                ids[-1] = end_id
            padded[row, : len(ids)] = torch.tensor(ids)

        return padded
def get_tokenizer(bpe_path_or_url: str) -> Tokenizer:
    """Build a ``Tokenizer`` from a BPE vocabulary stored locally or at a URL.

    The vocabulary is read fully into memory and handed to ``Tokenizer``
    as an in-memory binary buffer.

    Parameters
    ----------
    bpe_path_or_url : str
        Either a filesystem path or a URL. The argument is treated as a URL
        when it parses with a scheme longer than one character; a
        one-letter scheme is taken to be a Windows drive letter and the
        argument is opened as a local file.

    Returns
    -------
    A ``Tokenizer`` initialised from the vocabulary contents.

    Raises
    ------
    FileNotFoundError
        If anything fails while fetching the URL or building the tokenizer
        from the downloaded data (the original error is chained as the
        cause). Opening a missing local path raises the usual ``OSError``
        subclass from ``open``.
    """
    # Import the submodules explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` and ``urllib.request`` are loaded.
    import urllib.parse
    import urllib.request
    from io import BytesIO

    scheme = urllib.parse.urlparse(bpe_path_or_url).scheme

    # A single-letter "scheme" comes from a drive-letter path such as
    # ``C:\vocab.gz``; it can never be a fetchable URL, so read it from disk.
    if len(scheme) > 1:
        try:
            with urllib.request.urlopen(bpe_path_or_url) as response:
                file_buf = BytesIO(response.read())
            return Tokenizer(vocab_path=file_buf)
        except Exception as e:
            raise FileNotFoundError(
                f"Failed to download file from url {bpe_path_or_url} with error last: {e}"
            ) from e

    with open(bpe_path_or_url, "rb") as f:
        file_buf = BytesIO(f.read())
    return Tokenizer(vocab_path=file_buf)