# File size: 1,178 Bytes
# Revision: a433a25
from transformers import AutoTokenizer
# Smoke test for the tokenizer stored under ./Tokenizer/BPE: print each
# sample string, its token pieces, its token ids, and the text decoded back
# from those ids (with and without special tokens).
tok = AutoTokenizer.from_pretrained("./Tokenizer/BPE")

# Sample inputs, one per kind of text.
text1 = "Hello world! <user> write code </s>"  # prose with <user> / </s> markers
text2 = "myHTTPRequestHandler is calling process_payment_v2"  # camelCase + snake_case identifiers
text3 = "methylphenidate hydrochloride dopamine reuptake modulation"  # long technical words
# NOTE(review): the non-ASCII run below looks like mis-decoded emoji
# (mojibake); confirm the intended characters against the original file.
text4 = "hello ๐ฅ๐ฅ๐ฅ๐๐"
text5 = "https://github.com/Avinash-MiniLLM?tab=repos"  # URL with a query string

texts = (text1, text2, text3, text4, text5)

# Pass 1: echo the raw samples.
for text in texts:
    print(text)

# Pass 2: show the token pieces produced for each sample.
for text in texts:
    print(tok.tokenize(text))

# Pass 3: encode every sample once, before any ids are printed, then show
# the ids followed by the decoded text with and without special tokens.
encoded = [tok.encode(text) for text in texts]
for token_ids in encoded:
    print(token_ids)
    print(tok.decode(token_ids))
    print(tok.decode(token_ids, skip_special_tokens=True))