File size: 2,421 Bytes
bc90483
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This software may be used and distributed in accordance with
# the terms of the DINOv3 License Agreement.

from typing import List, Union

import torch
from dinov3.thirdparty.CLIP.clip.simple_tokenizer import SimpleTokenizer


class Tokenizer(SimpleTokenizer):
    """Thin wrapper over the CLIP ``SimpleTokenizer`` that produces padded id tensors."""

    def __init__(self, vocab_path: str):
        # The vocabulary location is forwarded to the base class as ``bpe_path``.
        super().__init__(bpe_path=vocab_path)

    def tokenize(self, texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor:
        """
        Encode one or more strings into a fixed-width tensor of token ids.

        Each text is wrapped in the start-of-text / end-of-text tokens. Rows
        shorter than ``context_length`` are right-padded with zeros; longer
        rows are cut to ``context_length`` and their last id is overwritten
        with the end-of-text token.

        Parameters
        ----------
        texts : Union[str, List[str]]
            A single input string, or a list of input strings.
        context_length : int
            Width of every output row (77 by default).

        Returns
        -------
        A two-dimensional ``torch.long`` tensor of shape
        [number of input strings, context_length].
        """
        batch = [texts] if isinstance(texts, str) else texts

        start_id = self.encoder["<|startoftext|>"]
        end_id = self.encoder["<|endoftext|>"]

        # Encode everything up front so the row count is known before allocating.
        encoded = [[start_id, *self.encode(entry), end_id] for entry in batch]
        padded = torch.zeros(len(encoded), context_length, dtype=torch.long)

        for row, ids in enumerate(encoded):
            if len(ids) > context_length:
                # Truncate, then make sure the sequence still ends with end-of-text.
                ids = ids[:context_length]
                ids[-1] = end_id
            padded[row, : len(ids)] = torch.tensor(ids)

        return padded


def get_tokenizer(bpe_path_or_url: str) -> Tokenizer | None:
    """
    Build a ``Tokenizer`` from a BPE vocabulary given as a local path or a URL.

    The vocabulary is read fully into memory and handed to ``Tokenizer`` as an
    in-memory binary buffer.

    Parameters
    ----------
    bpe_path_or_url : str
        Location of the vocabulary file. A value whose parsed URL scheme has
        more than one character is fetched with ``urllib.request.urlopen``;
        anything else is opened as a local file.

    Returns
    -------
    The constructed ``Tokenizer``. No code path returns ``None``, although
    the annotation allows it.

    Raises
    ------
    FileNotFoundError
        If fetching the URL, or building the tokenizer from the fetched
        bytes, fails for any reason (the original error is chained).
        Errors from opening a local path propagate unchanged.
    """
    # Import the submodules explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` and ``urllib.request`` are loaded.
    import urllib.parse
    import urllib.request
    from io import BytesIO

    scheme = urllib.parse.urlparse(bpe_path_or_url).scheme
    # A one-letter "scheme" is a Windows drive letter (``C:\...``), not a URL.
    if len(scheme) > 1:
        try:
            with urllib.request.urlopen(bpe_path_or_url) as response:
                file_buf = BytesIO(response.read())
                return Tokenizer(vocab_path=file_buf)
        except Exception as e:
            raise FileNotFoundError(
                f"Failed to download file from url {bpe_path_or_url} with error last: {e}"
            ) from e

    with open(bpe_path_or_url, "rb") as f:
        file_buf = BytesIO(f.read())
    return Tokenizer(vocab_path=file_buf)