# Listing header (from the file viewer): file size 1,187 bytes; 55b60a8
from __future__ import annotations
from tokenizers import Tokenizer, models, pre_tokenizers
from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedTokenizerFast
def make_tiny_qwen2_model_and_tokenizer(
    *,
    n_layers: int = 3,
    d_model: int = 48,
    n_heads: int = 4,
    n_kv_heads: int = 2,
    max_pos: int = 128,
    vocab_size: int = 500,
):
    """Build a tiny Qwen2 causal LM plus a matching word-level tokenizer.

    The model is created from a config only (no checkpoint is loaded) and is
    switched to eval mode before being returned. The tokenizer vocabulary is
    the synthetic token set ``t0 .. t{vocab_size - 1}``, where token ``t{i}``
    maps to id ``i``, so every tokenizer id is a valid row of the model's
    embedding table.

    Args:
        n_layers: Number of hidden (decoder) layers.
        d_model: Hidden size; the MLP intermediate size is ``2 * d_model``.
        n_heads: Number of attention heads.
        n_kv_heads: Number of key/value heads.
        max_pos: Maximum number of position embeddings.
        vocab_size: Size of both the model vocabulary and the tokenizer
            vocabulary. Must be at least 3 because ``t0`` (unknown),
            ``t1`` (EOS) and ``t2`` (padding) are used as special tokens.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``vocab_size`` is smaller than 3.
    """
    # The special tokens t0/t1/t2 must exist in the vocabulary; otherwise the
    # tokenizer would refer to tokens the model has no embedding row for.
    if vocab_size < 3:
        raise ValueError(
            f"vocab_size must be >= 3 to hold the special tokens t0/t1/t2, got {vocab_size}"
        )

    config = AutoConfig.for_model(
        "qwen2",
        vocab_size=vocab_size,
        hidden_size=d_model,
        intermediate_size=d_model * 2,
        num_hidden_layers=n_layers,
        num_attention_heads=n_heads,
        num_key_value_heads=n_kv_heads,
        max_position_embeddings=max_pos,
        use_sliding_window=False,
        attn_implementation="eager",
    )
    model = AutoModelForCausalLM.from_config(config, attn_implementation="eager")
    model.eval()

    # One vocabulary size drives both the config above and the tokenizer below
    # so the two can never disagree.
    vocab = {f"t{i}": i for i in range(vocab_size)}
    backend = Tokenizer(models.WordLevel(vocab=vocab, unk_token="t0"))
    backend.pre_tokenizer = pre_tokenizers.Whitespace()

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=backend,
        eos_token="t1",
        pad_token="t2",
    )
    # Minimal chat template: concatenates the message contents as-is, with no
    # role markers or separators.
    tokenizer.chat_template = "{% for m in messages %}{{ m['content'] }}{% endfor %}"
    return model, tokenizer
|