Transformers · English

pszemraj committed · Commit 035d06d · verified · 0 Parent(s)

Super-squash branch 'main' using huggingface_hub

Files changed (5)
  1. .gitattributes +35 -0
  2. README.md +117 -0
  3. special_tokens_map.json +51 -0
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +55 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,117 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ datasets:
+ - pints-ai/Expository-Prose-V1
+ language:
+ - en
+ ---
+
+ # BPE tokenizer with byte fallback: 24k vocab
+
+ BPE tokenizer with byte fallback, built for encoders and the MLM objective:
+
+ - Trained on `pints-ai/Expository-Prose-V1`; it primarily targets English text and code.
+ - The tokenizer is cased: "HELLO WORLD" is tokenized **differently** from "hello world".
+ - `model_max_length` is set to 1e9 to avoid hidden truncation issues. **Set `tokenizer.model_max_length` to your model's max position embeddings** when training; see the sketch below.
+
+
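+ A minimal sketch of that setup (the `512` is a stand-in for your model's actual `max_position_embeddings`):
+
+ ```py
+ from transformers import AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("pszemraj/bytebpe-tokenizer-24k-en_code-mlm")
+ tok.model_max_length = 512  # match your model's max position embeddings
+
+ # cased: upper- and lower-case inputs yield different token ids
+ print(tok("HELLO WORLD").input_ids)
+ print(tok("hello world").input_ids)
+ ```
+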
+ ## visualize
+
+ The snippet below counts tokens and renders the `EncodingVisualizer` output for a short code sample.
+
+ ### code
+
+ ```py
+ from typing import Any, Callable, Optional, Union
+
+ from tokenizers import Tokenizer as RustTokenizer
+ from tokenizers.tools import EncodingVisualizer
+ from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+ SAMPLE_TEXT = """class DyT(nn.Module):
+     def __init__(self, num_features, alpha_init_value=0.5):
+         super().__init__()
+         self.alpha = nn.Parameter(torch.ones(1) * alpha_init_value)
+         self.weight = nn.Parameter(torch.ones(num_features))
+         self.bias = nn.Parameter(torch.zeros(num_features))
+
+     def forward(self, x):
+         x = torch.tanh(self.alpha * x)
+         return x * self.weight + self.bias"""
+
+
+ def tokenizer_report_and_visualize(
+     tk: Union[PreTrainedTokenizerBase, RustTokenizer],
+     sample_text: str = SAMPLE_TEXT,
+     *,
+     default_to_notebook: bool = True,
+     annotation_converter: Optional[Callable[[Any], Any]] = None,
+     add_special_tokens: bool = True,
+     n_first_tokens: int = 15,
+ ):
+     """
+     Count tokens and render the tokenization visualization for the given tokenizer.
+
+     tk:
+         Either a Hugging Face *fast* tokenizer (PreTrainedTokenizerFast subclass)
+         or a low-level `tokenizers.Tokenizer` (RustTokenizer).
+     """
+     # Resolve a Rust-backed tokenizer for the visualizer and do the encoding.
+     if isinstance(tk, PreTrainedTokenizerBase):
+         name = getattr(tk, "name_or_path", tk.__class__.__name__)
+         backend = getattr(tk, "backend_tokenizer", None)
+         if backend is None:
+             raise ValueError(
+                 "EncodingVisualizer requires a *fast* tokenizer. "
+                 "Reload with `use_fast=True` or pass a `tokenizers.Tokenizer`."
+             )
+         # encode with the high-level `.encode(...)` API, then map ids back to token strings
+         input_ids = tk.encode(
+             sample_text,
+             add_special_tokens=add_special_tokens,
+             padding=False,
+             truncation=False,
+         )
+         tokens = tk.convert_ids_to_tokens(input_ids, skip_special_tokens=False)
+         rust_tok = backend  # this is a tokenizers.Tokenizer
+     elif isinstance(tk, RustTokenizer):
+         name = "tokenizers.Tokenizer"
+         enc = tk.encode(sample_text)
+         input_ids = enc.ids
+         tokens = enc.tokens
+         rust_tok = tk
+     else:
+         raise TypeError(
+             "`tk` must be a HF *fast* tokenizer or a `tokenizers.Tokenizer`."
+         )
+
+     num_tokens = len(input_ids)
+     print(f"tokenizer ({name}): {num_tokens} tokens")
+     print(
+         "first tokens:",
+         tokens[:n_first_tokens] + (["..."] if len(tokens) > n_first_tokens else []),
+     )
+
+     # render the visualization (requires the Rust-backed tokenizer)
+     viz = EncodingVisualizer(
+         tokenizer=rust_tok,
+         default_to_notebook=default_to_notebook,
+         annotation_converter=annotation_converter,
+     )
+     viz(sample_text)
+
+     return {
+         "tokenizer_name": name,
+         "num_tokens": num_tokens,
+         "input_ids": input_ids,
+         "tokens": tokens,
+         "text": sample_text,
+     }
+
+
+ # --- example usage ---
+ repo_id = "pszemraj/bytebpe-tokenizer-24k-en_code-mlm"
+ tk = AutoTokenizer.from_pretrained(repo_id)
+ report = tokenizer_report_and_visualize(tk)
+ ```
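+
+ Outside a notebook the visualizer cannot display inline. A small follow-up sketch, continuing from the block above and assuming `EncodingVisualizer.__call__` returns the rendered HTML when `default_to_notebook=False` (worth verifying against your `tokenizers` version):
+
+ ```py
+ # hypothetical follow-up: write the visualization to an HTML file instead of displaying it
+ viz = EncodingVisualizer(tokenizer=tk.backend_tokenizer, default_to_notebook=False)
+ html = viz(SAMPLE_TEXT)
+ if html:  # only write if the call returned a string
+     with open("tokenization.html", "w", encoding="utf-8") as f:
+         f.write(html)
+ ```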
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "[CLS]",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000.0,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "[UNK]"
+ }
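
Taken together, `special_tokens_map.json` and `tokenizer_config.json` describe a `PreTrainedTokenizerFast` whose `[UNK]`, `[CLS]`, `[SEP]`, `[PAD]`, and `[MASK]` tokens sit at ids 0–4, with `[CLS]`/`[SEP]` doubling as BOS/EOS. A quick sanity-check sketch of that mapping:

```py
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("pszemraj/bytebpe-tokenizer-24k-en_code-mlm")

# ids should line up with added_tokens_decoder in tokenizer_config.json
assert tok.unk_token_id == 0
assert tok.cls_token_id == 1
assert tok.sep_token_id == 2
assert tok.pad_token_id == 3
assert tok.mask_token_id == 4
```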