| from transformers import AutoTokenizer | |
| tok = AutoTokenizer.from_pretrained("./Tokenizer/BPE") | |
| text1 = "Hello world! <user> write code </s>" | |
| text2 = "myHTTPRequestHandler is calling process_payment_v2" | |
| text3 = "methylphenidate hydrochloride dopamine reuptake modulation" | |
| text4 = "hello π₯π₯π₯ππ" | |
| text5 = "https://github.com/Avinash-MiniLLM?tab=repos" | |
| print(text1) | |
| print(text2) | |
| print(text3) | |
| print(text4) | |
| print(text5) | |
| print(tok.tokenize(text1)) | |
| print(tok.tokenize(text2)) | |
| print(tok.tokenize(text3)) | |
| print(tok.tokenize(text4)) | |
| print(tok.tokenize(text5)) | |
| ids1 = tok.encode(text1) | |
| ids2 = tok.encode(text2) | |
| ids3 = tok.encode(text3) | |
| ids4 = tok.encode(text4) | |
| ids5 = tok.encode(text5) | |
| print(ids1) | |
| print(tok.decode(ids1)) | |
| print(tok.decode(ids1, skip_special_tokens=True)) | |
| print(ids2) | |
| print(tok.decode(ids2)) | |
| print(tok.decode(ids2, skip_special_tokens=True)) | |
| print(ids3) | |
| print(tok.decode(ids3)) | |
| print(tok.decode(ids3, skip_special_tokens=True)) | |
| ids4 = tok.encode(text4) | |
| print(ids4) | |
| print(tok.decode(ids4)) | |
| print(tok.decode(ids4, skip_special_tokens=True)) | |
| ids5 = tok.encode(text5) | |
| print(ids5) | |
| print(tok.decode(ids5)) | |
| print(tok.decode(ids5, skip_special_tokens=True)) |