rockylynnstein committed
Commit 11cd52b · verified · 1 Parent(s): 049afc0

Upload 4 files

tevunahai_quant_info.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "quant_method": "fp8",
+   "fp8_dtype": "e4m3",
+   "per_channel": true,
+   "calibration_samples": 2048,
+   "quantized_by": "TevunahAi",
+   "timestamp": "2025-12-05T01:53:52.287326"
+ }
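
For orientation, a minimal sketch (not part of this commit) of how the metadata above might be read downstream. Only the filename and field names come from this upload; the checks themselves are illustrative assumptions about what a consumer could verify.

import json

# Read the FP8 quantization metadata uploaded in this commit.
with open("tevunahai_quant_info.json", "r", encoding="utf-8") as f:
    quant_info = json.load(f)

# Illustrative checks: weights are expected to be FP8 e4m3 with per-channel scales.
assert quant_info["quant_method"] == "fp8"
assert quant_info["fp8_dtype"] == "e4m3"
assert quant_info["per_channel"] is True

print(f"Quantized by {quant_info['quantized_by']} with "
      f"{quant_info['calibration_samples']} calibration samples")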
tokenization_grok2.py ADDED
@@ -0,0 +1,164 @@
+ """
+ Grok-2 Tokenizer
+
+ Custom tokenizer for xAI's Grok-2 model using the .tok.json format.
+ """
+
+ import json
+ from typing import List, Optional, Union
+ from transformers import PreTrainedTokenizer
+
+
+ class Grok2Tokenizer(PreTrainedTokenizer):
+     """
+     Tokenizer for the Grok-2 model.
+
+     This tokenizer uses a byte-level BPE vocabulary stored in tokenizer.tok.json format.
+     """
+
+     vocab_files_names = {"vocab_file": "tokenizer.tok.json"}
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         vocab_file: str,
+         bos_token: str = "<|begin_of_text|>",
+         eos_token: str = "<|end_of_text|>",
+         pad_token: str = "<|pad|>",
+         unk_token: str = "<|unk|>",
+         **kwargs,
+     ):
+         # Load the vocabulary
+         with open(vocab_file, "r", encoding="utf-8") as f:
+             vocab_data = json.load(f)
+
+         self.reserved_tokens = vocab_data.get("reserved_tokens", 128)
+         self.regular_tokens = vocab_data.get("regular_tokens", [])
+
+         # Build byte to token mapping
+         self._byte_to_token = {}
+         self._token_to_bytes = {}
+
+         for entry in self.regular_tokens:
+             token_id = entry["token"]
+             byte_seq = bytes(entry["bytes"])
+             self._byte_to_token[byte_seq] = token_id
+             self._token_to_bytes[token_id] = byte_seq
+
+         # Build vocabulary dict for compatibility
+         self._vocab = {}
+         for entry in self.regular_tokens:
+             token_id = entry["token"]
+             # Create a string representation
+             byte_seq = bytes(entry["bytes"])
+             try:
+                 text = byte_seq.decode("utf-8")
+             except UnicodeDecodeError:
+                 text = f"<0x{byte_seq.hex()}>"
+             self._vocab[text] = token_id
+
+         # Add special tokens to vocab
+         self._vocab[bos_token] = 1
+         self._vocab[eos_token] = 2
+         self._vocab[pad_token] = 0
+         self._vocab[unk_token] = 3
+
+         self._id_to_token = {v: k for k, v in self._vocab.items()}
+
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             pad_token=pad_token,
+             unk_token=unk_token,
+             **kwargs,
+         )
+
+         self.vocab_file = vocab_file
+
+     @property
+     def vocab_size(self) -> int:
+         return 131072  # Grok-2 vocab size
+
+     def get_vocab(self):
+         return self._vocab.copy()
+
+     def _tokenize(self, text: str) -> List[str]:
+         """Tokenize text into subword tokens."""
+         # Simple byte-level tokenization
+         # For production, this should use proper BPE merges
+         tokens = []
+         text_bytes = text.encode("utf-8")
+
+         i = 0
+         while i < len(text_bytes):
+             # Try to find the longest matching byte sequence
+             found = False
+             for length in range(min(20, len(text_bytes) - i), 0, -1):
+                 byte_seq = text_bytes[i:i + length]
+                 if byte_seq in self._byte_to_token:
+                     token_id = self._byte_to_token[byte_seq]
+                     tokens.append(self._id_to_token.get(token_id, f"<{token_id}>"))
+                     i += length
+                     found = True
+                     break
+
+             if not found:
+                 # Fall back to a single byte
+                 single_byte = bytes([text_bytes[i]])
+                 if single_byte in self._byte_to_token:
+                     token_id = self._byte_to_token[single_byte]
+                     tokens.append(self._id_to_token.get(token_id, f"<{token_id}>"))
+                 else:
+                     tokens.append(self.unk_token)
+                 i += 1
+
+         return tokens
+
+     def _convert_token_to_id(self, token: str) -> int:
+         """Convert a token to its id."""
+         return self._vocab.get(token, self._vocab.get(self.unk_token, 3))
+
+     def _convert_id_to_token(self, index: int) -> str:
+         """Convert an id to its token."""
+         return self._id_to_token.get(index, self.unk_token)
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         """Convert tokens back to a string."""
+         byte_list = []
+         for token in tokens:
+             if token in self._vocab:
+                 token_id = self._vocab[token]
+                 if token_id in self._token_to_bytes:
+                     byte_list.extend(self._token_to_bytes[token_id])
+             elif token.startswith("<0x") and token.endswith(">"):
+                 # Hex-encoded byte sequence
+                 try:
+                     hex_str = token[3:-1]
+                     byte_list.extend(bytes.fromhex(hex_str))
+                 except ValueError:
+                     pass
+
+         try:
+             return bytes(byte_list).decode("utf-8", errors="replace")
+         except Exception:
+             return ""
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
+         """Save the vocabulary to a directory."""
+         import os
+         import shutil
+
+         if filename_prefix:
+             vocab_file = os.path.join(save_directory, f"{filename_prefix}-tokenizer.tok.json")
+         else:
+             vocab_file = os.path.join(save_directory, "tokenizer.tok.json")
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(vocab_file):
+             shutil.copy(self.vocab_file, vocab_file)
+
+         return (vocab_file,)
+
+
+ # For auto registration
+ def get_tokenizer_class():
+     return Grok2Tokenizer
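
A minimal usage sketch for the class above, assuming tokenizer.tok.json from this commit sits in the working directory; the sample text is arbitrary.

from tokenization_grok2 import Grok2Tokenizer

# Instantiate directly from the uploaded vocabulary file.
tokenizer = Grok2Tokenizer(vocab_file="tokenizer.tok.json")

# Round-trip a short string through the byte-level tokenizer.
ids = tokenizer.encode("Hello, Grok!")
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=True))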
tokenizer.tok.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "auto_map": {
+     "AutoTokenizer": "tokenization_grok2.Grok2Tokenizer"
+   },
+   "bos_token": "<|begin_of_text|>",
+   "eos_token": "<|end_of_text|>",
+   "pad_token": "<|pad|>",
+   "unk_token": "<|unk|>",
+   "model_max_length": 131072,
+   "clean_up_tokenization_spaces": false
+ }
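
Because auto_map points AutoTokenizer at tokenization_grok2.Grok2Tokenizer, the tokenizer can also be loaded through the standard transformers entry point. A sketch follows; the repository id is a placeholder to replace with the actual Hub repo, and trust_remote_code=True is required so the custom tokenizer code in this commit is executed.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "your-namespace/grok-2-fp8",  # hypothetical repo id, substitute the real one
    trust_remote_code=True,       # needed to load Grok2Tokenizer via auto_map
)

ids = tokenizer("Hello, Grok!")["input_ids"]
print(tokenizer.decode(ids))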