File size: 2,760 Bytes
f8a2f75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
570280a
f8a2f75
 
 
570280a
f8a2f75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ── 1. Load model and tokenizer from the same fine-tuned checkpoint ──────────
# Checkpoint identifier passed to both from_pretrained() calls below, so the
# tokenizer and the model always come from the same source.
MODEL_NAME = "Porameht/wangchanberta-thainer-corpus-v2-2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Token-classification (NER) model; per the original author's note it was
# fine-tuned on ThaiNER Corpus v2.2 (Thainer Wangchan).
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# ── 2. Show available labels ─────────────────────────────────────────────────
# Print every (class id, tag) pair from the model config so the reader can see
# which tags this checkpoint is able to emit.
print("Labels this model can detect:")
# The loop variables are deliberately not called `id`: at module scope that
# would shadow the builtin id() for the rest of the script.
for label_id, label_name in model.config.id2label.items():
    print(f"  {label_id:2}: {label_name}")
print()

# ── 3. Build NER pipeline ────────────────────────────────────────────────────
# With aggregation_strategy="simple" the pipeline groups sub-word tokens into
# whole entity spans instead of returning one result per token.
ner = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# ── 4. Test sentences ────────────────────────────────────────────────────────
# Thai sample inputs fed to the pipeline one at a time. The English glosses
# are approximate and only meant to help non-Thai readers.
test_sentences = [
    # "Go to Matsumoto on 29 May"
    "ไปมัตสึโมโตวันที่ 29 พฤษภาคม",
    # Same request; the place name and the month look deliberately misspelled
    "ไปมัตสึโมตวันที่ 29 พฤษาคม",
    # "On 30 May (abbreviated month), depart from Hakuba to Kamikochi"
    "วันที่ 30 พ.ค. ออกเดินทางจากฮาคุบะไปคามิโคจิ",
    # "Check in at the hotel in Tokyo at 15:00"
    "เช็คอินโรงแรมที่โตเกียวตอน 15:00 น.",
    # "What is there to do on the 29th?"
    "วันที่ 29 ทำอะไรบ้าง",
    # Bare number with no surrounding context
    "29",
]

# Entity groups that get a " ✓" marker when the results are printed.
USEFUL_LABELS = {
    "LOCATION",
    "DATE",
    "TIME",
    "FACILITY",
}

# Run each sample through the pipeline and print one line per detected entity,
# followed by a blank separator line.
for text in test_sentences:
    print(f"Input : {text}")
    entities = ner(text)

    if entities:
        for entity in entities:
            group = entity["entity_group"]
            surface = entity["word"]
            confidence = entity["score"]
            # Tick only the entity groups listed in USEFUL_LABELS.
            tick = " ✓" if group in USEFUL_LABELS else ""
            print(f"  [{group}] '{surface}'  (score: {confidence:.3f}){tick}")
    else:
        print("  (no entities found)")
    print()


# Reference: id-to-label mapping for this checkpoint (appears to be a copy of model.config.id2label):
# {0: 'B-PERSON', 1: 'I-PERSON', 2: 'O', 3: 'B-ORGANIZATION', 4: 'B-LOCATION', 5: 'I-ORGANIZATION', 6: 'I-LOCATION', 7: 'B-DATE', 8: 'I-DATE', 9: 'B-TIME', 10: 'I-TIME', 11: 'B-MONEY', 12: 'I-MONEY', 13: 'B-FACILITY', 14: 'I-FACILITY', 15: 'B-URL', 16: 'I-URL', 17: 'B-PERCENT', 18: 'I-PERCENT', 19: 'B-LEN', 20: 'I-LEN', 21: 'B-AGO', 22: 'I-AGO', 23: 'B-LAW', 24: 'I-LAW', 25: 'B-PHONE', 26: 'I-PHONE', 27: 'B-EMAIL', 28: 'I-EMAIL', 29: 'B-ZIP', 30: 'B-TEMPERATURE', 31: 'I-TEMPERATURE'}