"""Tokenize a preprompt-prefixed sentence and print the tokens of the text part.

The script joins ``preprompt`` and ``text`` with a blank line, tokenizes the
combined string, builds one display entry per token, drops the entries that
belong to the preprompt (a leading BOS entry is kept when present) and prints
the remaining token strings.
"""
from transformers import AutoTokenizer
import numpy as np


tokenizer = AutoTokenizer.from_pretrained("unsloth/gemma-4-E2B", extra_special_tokens={})
preprompt = "imagine"
text = "Japan's railways are the finest in the world.\xa0"

full_text = f"{preprompt}\n\n{text}" if preprompt else text

# No ``return_tensors``: the ids are only ever used as a plain Python list, so
# there is no reason to require a tensor backend just to call ``.tolist()``.
inputs = tokenizer(full_text)
input_ids = list(inputs["input_ids"])
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# True only when the sequence is non-empty and starts with the tokenizer's BOS
# id (comparing against a ``None`` bos_token_id is simply False).
has_bos = bool(input_ids) and input_ids[0] == tokenizer.bos_token_id

# Number of leading entries that are not part of ``text``.
if preprompt:
    # The prefix is tokenized on its own, with the tokenizer's default special
    # tokens, so a leading BOS is included in this count.
    # NOTE(review): this assumes the prefix tokenizes identically in isolation
    # and inside ``full_text`` (no merge across the "\n\n" boundary, no
    # trailing special token) -- confirm for the tokenizer in use.
    num_preprompt_tokens = len(tokenizer(f"{preprompt}\n\n")["input_ids"])
elif has_bos:
    num_preprompt_tokens = 1
else:
    num_preprompt_tokens = 0

# One score per token, all ones. Not read anywhere below in this script.
normalized_scores = np.ones(len(tokens))

result = []
for token_id, token in zip(input_ids, tokens):
    if token.startswith("<0x") and token.endswith(">"):
        # Byte-fallback pieces such as "<0xC2>" are blanked out entirely.
        result.append({"token": "", "word": ""})
        continue
    result.append(
        {
            # U+2581 is the SentencePiece-style word-boundary marker; show it
            # as a plain space.
            "token": token.replace("\u2581", " "),
            "word": tokenizer.decode([token_id]),
        }
    )

# Drop the preprompt entries, but only when something remains after them.
if num_preprompt_tokens > 0 and len(result) > num_preprompt_tokens:
    leading_bos = result[:1] if has_bos else []
    result = leading_bos + result[num_preprompt_tokens:]

print([r["token"] for r in result])