File size: 1,764 Bytes
f8a2f75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# --- Demo 1: word segmentation with pythainlp's word_tokenize --------------
# The banner is printed before the import on purpose, so it is already on
# screen when the library starts loading.
print("PythaiNLP (Dictionary-based) word-tokenizer test:")
from pythainlp.tokenize import word_tokenize

# Sample Thai inputs. The last two contain the digits "29" and the dotted
# token "พ.ค.", once separated by spaces and once run together.
test_cases = [
    "วันนี้ทำอะไรบ้าง",
    "กำหนดการวันนี้",
    "ตื่นกี่โมง",
    "ไปที่ไหนต่อ",
    "เดินทางยังไง",
    "ออกเดินทางกี่โมง",
    "กำหนดการวันที่ 29 พ.ค.",
    "วันที่29พ.ค.ทำอะไรบ้าง",
]

for sentence in test_cases:
    # Segment with the "newmm" engine, asking it not to keep whitespace tokens.
    words = word_tokenize(sentence, engine="newmm", keep_whitespace=False)
    # One call emits the input line, the token line and a trailing blank line.
    print(f"Input : {sentence}", f"Tokens: {words}", "", sep="\n")


# --- Demo 2: subword tokenization with the WangchanBERTa tokenizer ---------
# The banner is printed before the import on purpose, so it is already on
# screen when the library starts loading.
print("Transformers wangchanberta (SentencePiece Model) tokenizer test:")
from transformers import AutoTokenizer

# Load the tokenizer registered under this model name.
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Plain input text; no special tokens are written by hand.
# Alternative sample: "วันนี้ทำอะไรบ้าง"
raw_text = "ผมกำลังเตรียมตัวแข่งแฮกกาธอน"

# Human-readable subword pieces for the raw text.
subword_tokens = tokenizer.tokenize(raw_text)

# Full encoding with special tokens requested, then the IDs mapped back to
# token strings so the added markers can be seen.
encoding = tokenizer(raw_text, add_special_tokens=True)
input_ids = encoding["input_ids"]
tokens_with_special = tokenizer.convert_ids_to_tokens(input_ids)

# Report everything side by side; labels are padded to line up the colons.
print("Original Text  :", raw_text)
print("Subword tokens :", subword_tokens)
print("With <s>/</s>  :", tokens_with_special)
print("input_ids      :", input_ids)
print("attention_mask :", encoding["attention_mask"])