print("PythaiNLP (Dictionary-based) word-tokenizer test:") from pythainlp.tokenize import word_tokenize test_cases = [ "วันนี้ทำอะไรบ้าง", "กำหนดการวันนี้", "ตื่นกี่โมง", "ไปที่ไหนต่อ", "เดินทางยังไง", "ออกเดินทางกี่โมง", "กำหนดการวันที่ 29 พ.ค.", "วันที่29พ.ค.ทำอะไรบ้าง", ] for text in test_cases: tokens = word_tokenize(text, engine="newmm", keep_whitespace=False) print(f"Input : {text}") print(f"Tokens: {tokens}") print() print("Transformers wangchanberta (SentencePiece Model) tokenizer test:") from transformers import AutoTokenizer # 1. Load the tokenizer for WangchanBERTa tokenizer = AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased") # 2. Your raw text (no special tokens added by you!) raw_text = "ผมกำลังเตรียมตัวแข่งแฮกกาธอน" # raw_text = "วันนี้ทำอะไรบ้าง" # 3. Pass the text through the tokenizer # We'll ask it to return the actual string tokens so we can read them tokens = tokenizer.tokenize(raw_text) # tokenizer() adds special tokens automatically, then convert IDs back to readable strings encoding = tokenizer(raw_text, add_special_tokens=True) tokens_with_special = tokenizer.convert_ids_to_tokens(encoding["input_ids"]) print("Original Text :", raw_text) print("Subword tokens :", tokens) print("With / :", tokens_with_special) print("input_ids :", encoding["input_ids"]) print("attention_mask :", encoding["attention_mask"])