zkolter commited on
Commit
25eb6e2
·
verified ·
1 Parent(s): 8b30d41

Initial private upload of simplified Llama 3.2 1B Instruct checkpoint

Browse files
Files changed (6) hide show
  1. .gitattributes +2 -35
  2. README.md +10 -0
  3. consolidated.00.pth +3 -0
  4. params.json +13 -0
  5. tokenizer.model +3 -0
  6. tokenizer.py +229 -0
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ consolidated.00.pth filter=lfs diff=lfs merge=lfs -text
2
+ tokenizer.model filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Llama 3.2 1B Simplified
2
+
3
+ This repo contains a simplified variant of the Llama 3.2 1B Instruct model, built for the [Introduction to Modern AI](https://modernaicourse.org) course. The model is intended for instructional purposes only, specifically to test the implementation of a Transformer for Homework 4.
4
+
5
+ The differences with the normal Llama 3.2 1B model are:
6
+ 1. The model replaces RoPE with an absolute positional embedding. RoPE typically works slightly better, but is somewhat cumbersome and unintuitive to implement for an introductory class.
7
+ 2. The model uses normal multihead attention instead of grouped query attention. Grouped query attention is a minor architectural optimization that introduces marginal added complexity with little instructional value.
8
+
9
+ To build this model, we made these two architecture changes and then finetuned the model to recover the Llama 3.2 Instruct performance, matching it with a KL loss on a calibration set drawn from FineWebEDU and UltraChat200K.
10
+
consolidated.00.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bae99564f076d94ac4cfd420fdfe7897f7e3fa910a99d9a437c29235c9cb42ab
3
+ size 2689779681
params.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dim": 2048,
3
+ "n_layers": 16,
4
+ "n_heads": 32,
5
+ "vocab_size": 128256,
6
+ "ffn_dim_multiplier": 4,
7
+ "multiple_of": 256,
8
+ "norm_eps": 1e-05,
9
+ "rope_theta": 500000.0,
10
+ "use_scaled_rope": true,
11
+ "use_abs_pos_embeddings": true,
12
+ "abs_pos_max_position_embeddings": 4096
13
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82e9d31979e92ab929cd544440f129d9ecd797b69e327f80f17e1c50d5551b55
3
+ size 2183982
tokenizer.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
3
+
4
+ import os
5
+ from logging import getLogger
6
+ from pathlib import Path
7
+ from typing import (
8
+ AbstractSet,
9
+ cast,
10
+ Collection,
11
+ Dict,
12
+ Iterator,
13
+ List,
14
+ Literal,
15
+ Sequence,
16
+ TypedDict,
17
+ Union,
18
+ )
19
+
20
+ import tiktoken
21
+ from tiktoken.load import load_tiktoken_bpe
22
+
23
+
24
+ logger = getLogger(__name__)
25
+
26
+
27
# The speaker roles recognized by the chat format.
Role = Literal["system", "user", "assistant"]


class Message(TypedDict):
    # A single chat turn: who is speaking and the text of the turn.
    role: Role
    content: str


# An ordered conversation transcript.
Dialog = Sequence[Message]
36
+
37
+
38
class Tokenizer:
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.

    Wraps a tiktoken BPE model loaded from disk and augments it with the
    Llama 3 special tokens (BOS/EOS, header markers, end-of-turn, and
    reserved placeholders).
    """

    # Mapping from special-token text (e.g. "<|eot_id|>") to token ID;
    # populated in __init__ once the base BPE vocabulary size is known.
    special_tokens: Dict[str, int]

    # Number of token IDs reserved above the base vocabulary for special tokens.
    num_reserved_special_tokens = 256

    # Pre-tokenization split pattern (tiktoken regex syntax): contractions,
    # letter runs, digit groups of up to 3, punctuation, and whitespace.
    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501

    def __init__(self, model_path: str):
        """
        Initializes the Tokenizer with a Tiktoken model.

        Args:
            model_path (str): The path to the Tiktoken model file.
        """
        assert os.path.isfile(model_path), model_path

        mergeable_ranks = load_tiktoken_bpe(model_path)
        num_base_tokens = len(mergeable_ranks)
        # The first ten reserved slots are named tokens at fixed positions; the
        # remaining slots are filled with generic placeholders so that exactly
        # num_reserved_special_tokens special tokens exist in total.
        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",  # end of turn
        ] + [
            f"<|reserved_special_token_{i}|>"
            for i in range(5, self.num_reserved_special_tokens - 5)
        ]
        # Special tokens occupy the IDs directly after the base BPE vocabulary.
        self.special_tokens = {
            token: num_base_tokens + i for i, token in enumerate(special_tokens)
        }
        self.model = tiktoken.Encoding(
            name=Path(model_path).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info(f"Reloaded tiktoken model from {model_path}")

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
        self.eos_id: int = self.special_tokens["<|end_of_text|>"]
        self.pad_id: int = -1  # no dedicated pad token in the vocabulary
        # Token IDs at which generation should stop (end of text / end of turn).
        self.stop_tokens = {
            self.special_tokens["<|end_of_text|>"],
            self.special_tokens["<|eot_id|>"],
        }
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )

    def encode(
        self,
        s: str,
        *,
        bos: bool,
        eos: bool,
        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = (),
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            s (str): The input string to be encoded.
            bos (bool): Whether to prepend the beginning-of-sequence token.
            eos (bool): Whether to append the end-of-sequence token.
            allowed_special ("all"|set[str]): allowed special tokens in string
            disallowed_special ("all"|set[str]): special tokens that raise an error when in string

        Returns:
            list[int]: A list of token IDs.

        By default, setting disallowed_special=() encodes a string by ignoring
        special tokens. Specifically:
        - Setting `disallowed_special` to () will cause all text corresponding
          to special tokens to be encoded as natural text (instead of raising
          an error).
        - Setting `allowed_special` to "all" will treat all text corresponding
          to special tokens to be encoded as special tokens.
        """
        assert type(s) is str

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        # Lazily produce sub-strings sized so that tiktoken can encode each
        # one safely; results are concatenated below.
        substrs = (
            substr
            for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
            for substr in self._split_whitespaces_or_nonwhitespaces(
                s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
            )
        )
        t: List[int] = []
        for substr in substrs:
            t.extend(
                self.model.encode(
                    substr,
                    allowed_special=allowed_special,
                    disallowed_special=disallowed_special,
                )
            )
        if bos:
            t.insert(0, self.bos_id)
        if eos:
            t.append(self.eos_id)
        return t

    def decode(self, t: Sequence[int]) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            t (List[int]): The list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
        return self.model.decode(cast(List[int], t))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
        consecutive whitespaces or consecutive non-whitespaces.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                # Character class flipped (space <-> non-space): a new run starts here.
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    # Run exceeded the cap: emit everything before this character.
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]
200
+
201
+
202
class ChatFormat:
    """Renders chat dialogs into the Llama 3 instruct token layout.

    Each message is laid out as:
    <|start_header_id|> role <|end_header_id|>\\n\\n content <|eot_id|>
    """

    def __init__(self, tokenizer: Tokenizer):
        self.tokenizer = tokenizer

    def encode_header(self, message: Message) -> List[int]:
        """Tokenize the role header of a single message (content excluded)."""
        specials = self.tokenizer.special_tokens
        encode = self.tokenizer.encode
        return (
            [specials["<|start_header_id|>"]]
            + encode(message["role"], bos=False, eos=False)
            + [specials["<|end_header_id|>"]]
            + encode("\n\n", bos=False, eos=False)
        )

    def encode_message(self, message: Message) -> List[int]:
        """Tokenize one full message: header, stripped content, end-of-turn."""
        out = self.encode_header(message)
        out += self.tokenizer.encode(message["content"].strip(), bos=False, eos=False)
        out.append(self.tokenizer.special_tokens["<|eot_id|>"])
        return out

    def encode_dialog_prompt(self, dialog: Dialog) -> List[int]:
        """Tokenize an entire dialog as a generation prompt."""
        prompt = [self.tokenizer.special_tokens["<|begin_of_text|>"]]
        for msg in dialog:
            prompt += self.encode_message(msg)
        # Add the start of an assistant message for the model to complete.
        prompt += self.encode_header({"role": "assistant", "content": ""})
        return prompt