| |
|
|
| from crayon import CrayonVocab |
|
|
def main():
    """Run a small end-to-end demo of the Crayon tokenizer.

    Loads the "lite" profile into a ``CrayonVocab``, tokenizes a fixed
    sample sentence, prints the resulting token IDs, prints each ID next to
    the text it decodes to on its own, and finally decodes the whole ID list
    and reports whether it equals the original sentence.
    """
    print("Crayon Tokenizer Demo")
    print("=======================\n")

    # device="auto" is handed to CrayonVocab unchanged; presumably the
    # library selects the backend itself -- confirm against the crayon docs.
    vocab = CrayonVocab(device="auto")
    vocab.load_profile("lite")
    print(f"Loaded Profile: 'lite' on {vocab.device.upper()}")

    text = "Hello, Crayon! This is a simple test."
    token_ids = vocab.tokenize(text)

    print(f"\nInput Text: '{text}'")
    print(f"Token IDs: {token_ids}")
    print(f"Count: {len(token_ids)} tokens\n")

    # Two-column table: token ID on the left, its decoded text on the right.
    print("Token Breakdown:")
    print(f"{'ID':<8} | {'Substring':<20}")
    print("-" * 30)

    for token_id in token_ids:
        # Decode a one-element list so each ID is rendered in isolation.
        print(f"{token_id:<8} | '{vocab.decode([token_id])}'")

    # Round trip: decode the full ID sequence and compare with the input.
    round_trip = vocab.decode(token_ids)
    print(f"\nFull Decode check: '{round_trip}'")

    verdict = (
        "[MATCH] Exact Match!"
        if text == round_trip
        else "[MISMATCH] Mismatch (canonicalization might differ)"
    )
    print(verdict)
|
|
# Run the demo only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()
|
|