File size: 1,178 Bytes
a433a25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from transformers import AutoTokenizer
# Smoke test for the BPE tokenizer: tokenize, encode and decode a handful of
# sample strings and print every intermediate result for manual inspection.

# Load the tokenizer from the local `./Tokenizer/BPE` path.
tok = AutoTokenizer.from_pretrained("./Tokenizer/BPE")


# Sample inputs, one per kind of text being eyeballed.
SAMPLE_TEXTS = (
    # Plain English mixed with `<user>` / `</s>` markers
    # (presumably special tokens of this tokenizer -- confirm in its config).
    "Hello world! <user> write code </s>",
    # camelCase and snake_case code identifiers.
    "myHTTPRequestHandler is calling process_payment_v2",
    # Long, rare technical vocabulary.
    "methylphenidate hydrochloride dopamine reuptake modulation",
    # Emoji outside the Basic Multilingual Plane. The literal previously held
    # mojibake (UTF-8 bytes mis-decoded by a legacy code page), not the emoji.
    "hello 🔥🔥🔥💀💀",
    # URL with punctuation, a hyphen and a query string.
    "https://github.com/Avinash-MiniLLM?tab=repos",
)


# 1) Echo the raw inputs.
for text in SAMPLE_TEXTS:
    print(text)

# 2) Show the token strings each input is split into.
for text in SAMPLE_TEXTS:
    print(tok.tokenize(text))


# 3) Encode every input exactly once (the ids are reused for all prints below).
encoded = [tok.encode(text) for text in SAMPLE_TEXTS]

# 4) For each input: the ids, the full decode, and the decode with special
#    tokens stripped, so the round trip can be compared against the echo above.
for position, ids in enumerate(encoded):
    if position:
        # Blank separator between samples, as in the original layout of the
        # script's source; nothing is printed here to keep stdout unchanged.
        pass
    print(ids)
    print(tok.decode(ids))
    print(tok.decode(ids, skip_special_tokens=True))