File size: 1,366 Bytes
bd01d05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from transformers import AutoTokenizer
import numpy as np

# Tokenization probe: tokenize `text` (optionally behind a preprompt), then
# print the per-token strings with the preprompt's tokens removed.
tokenizer = AutoTokenizer.from_pretrained("unsloth/gemma-4-E2B", extra_special_tokens={})
preprompt = "imagine"
text = "Japan's railways are the finest in the world.\xa0"

# The preprompt is joined to the text by a blank line; build that prefix once
# so the full text and the prefix-only tokenization below use the same string.
prefix = f"{preprompt}\n\n" if preprompt else ""
full_text = prefix + text
inputs = tokenizer(full_text, return_tensors="pt")
first_row = inputs["input_ids"][0]

# Number of leading tokens attributed to the preprompt.
if preprompt:
    # Counted by tokenizing the prefix on its own, so any special tokens the
    # tokenizer adds to that call are included in the count.
    # NOTE(review): assumes the prefix splits into the same tokens when it is
    # followed by `text` -- confirm for this tokenizer.
    num_preprompt_tokens = len(tokenizer(prefix)["input_ids"])
elif (
    tokenizer.bos_token_id is not None
    and len(first_row) > 0
    and first_row[0] == tokenizer.bos_token_id
):
    # No preprompt: a leading BOS is the only leading token counted.
    num_preprompt_tokens = 1
else:
    num_preprompt_tokens = 0

input_ids = first_row.tolist()
tokens = tokenizer.convert_ids_to_tokens(input_ids)
has_bos = bool(input_ids) and input_ids[0] == tokenizer.bos_token_id

# One 1.0 per token; not read again anywhere in this script.
normalized_scores = np.ones(len(tokens))


def _is_hex_byte_piece(piece):
    """Return True for token strings of the form ``<0x..>``."""
    # presumably these are byte-fallback pieces from the tokenizer -- verify
    return piece.startswith('<0x') and piece.endswith('>')


result = []
for token_id, piece in zip(input_ids, tokens):
    decoded = tokenizer.decode([token_id])
    if _is_hex_byte_piece(piece):
        # Such pieces are blanked out in both the raw and the decoded field.
        piece = decoded = ""
    # U+2581 in the raw token string is swapped for a plain space.
    result.append({"token": piece.replace('\u2581', ' '), "word": decoded})

# Drop the preprompt tokens, keeping the BOS entry at the front when present.
# The strict upper bound means nothing is stripped when the sequence is no
# longer than the preprompt count (i.e. the text contributed no tokens).
if 0 < num_preprompt_tokens < len(result):
    kept_head = result[:1] if has_bos else []
    result = kept_head + result[num_preprompt_tokens:]

print([entry["token"] for entry in result])