Spaces:

TaipongK
/

trip-bot

Sleeping

trip-bot / dev_tools /test_thainerwangchan.py

Pongsatorn Kanjanasantisak

using LLM typhoon-v2.5-30b-a3b-instruct

570280a 3 months ago

2.76 kB

	from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

	# ── 1. Load model and tokenizer from the same fine-tuned checkpoint ──────────
	# NOTE(review): this script does all of its work at import time (checkpoint
	# load, prints). If the module is ever imported — e.g. collected by pytest
	# because of a test_ file-name prefix — that work runs too. Consider moving
	# it under a main() guard.
	MODEL_NAME = "Porameht/wangchanberta-thainer-corpus-v2-2"

	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	# NER model fine-tuned on ThaiNER Corpus v2.2 (Thainer Wangchan)
	model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

	# ── 2. Show available labels ─────────────────────────────────────────────────
	print("Labels this model can detect:")
	# Named `label_id` rather than `id` so the builtin id() is not shadowed
	# at module scope for the rest of the script.
	for label_id, label in model.config.id2label.items():
		print(f" {label_id:2}: {label}")
	print()  # blank line after the label listing

	# ── 3. Build NER pipeline ────────────────────────────────────────────────────
	# aggregation_strategy="simple" merges subword pieces into full entity spans
	ner = pipeline(
		"ner",
		model=model,
		tokenizer=tokenizer,
		aggregation_strategy="simple",
	)

	# ── 4. Test sentences ────────────────────────────────────────────────────────
	# Sample inputs fed to the pipeline one by one. The second entry is a
	# near-duplicate of the first with different spelling — presumably deliberate
	# typos to probe robustness, so it must not be "corrected". The last entry is
	# the bare number "29".
	test_sentences = [
		"ไปมัตสึโมโตวันที่ 29 พฤษภาคม",
		"ไปมัตสึโมตวันที่ 29 พฤษาคม",
		"วันที่ 30 พ.ค. ออกเดินทางจากฮาคุบะไปคามิโคจิ",
		"เช็คอินโรงแรมที่โตเกียวตอน 15:00 น.",
		"วันที่ 29 ทำอะไรบ้าง",
		"29",
	]

	# Entity groups that get a " ✓" marker in the printed report.
	USEFUL_LABELS = {"LOCATION", "DATE", "TIME", "FACILITY"}

	# Run every sample through the pipeline and print one line per entity,
	# followed by a blank line separating it from the next sample.
	for sentence in test_sentences:
		print(f"Input : {sentence}")
		entities = ner(sentence)

		if entities:
			for entity in entities:
				group = entity["entity_group"]
				text = entity["word"]
				confidence = entity["score"]
				if group in USEFUL_LABELS:
					tick = " ✓"
				else:
					tick = ""
				print(f" [{group}] '{text}' (score: {confidence:.3f}){tick}")
		else:
			print(" (no entities found)")
		print()


	# Reference: what looks like the id2label mapping of this checkpoint (it matches the shape of the label listing printed in section 2) — verify against the model config before relying on it.
	# {0: 'B-PERSON', 1: 'I-PERSON', 2: 'O', 3: 'B-ORGANIZATION', 4: 'B-LOCATION', 5: 'I-ORGANIZATION', 6: 'I-LOCATION', 7: 'B-DATE', 8: 'I-DATE', 9: 'B-TIME', 10: 'I-TIME', 11: 'B-MONEY', 12: 'I-MONEY', 13: 'B-FACILITY', 14: 'I-FACILITY', 15: 'B-URL', 16: 'I-URL', 17: 'B-PERCENT', 18: 'I-PERCENT', 19: 'B-LEN', 20: 'I-LEN', 21: 'B-AGO', 22: 'I-AGO', 23: 'B-LAW', 24: 'I-LAW', 25: 'B-PHONE', 26: 'I-PHONE', 27: 'B-EMAIL', 28: 'I-EMAIL', 29: 'B-ZIP', 30: 'B-TEMPERATURE', 31: 'I-TEMPERATURE'}