"""
Test script comparing the python-crfsuite and underthesea-core CRF trainers
on a tiny dataset to validate that both produce correct results.
"""
|
|
| import time |
|
|
|
|
def create_tiny_dataset():
    """Return a hand-written three-sentence (tokens, tags) dataset for smoke tests."""
    sentences = [
        ["Tôi", "yêu", "Việt_Nam"],
        ["Hà_Nội", "đẹp"],
        ["Tôi", "ở", "Hà_Nội"],
    ]
    tag_sequences = [
        ["PRON", "VERB", "PROPN"],
        ["PROPN", "ADJ"],
        ["PRON", "VERB", "PROPN"],
    ]
    return list(zip(sentences, tag_sequences))
|
|
|
|
def create_medium_dataset(num_sentences=100):
    """Load up to *num_sentences* (tokens, upos) pairs from the UDD-1 train split.

    Rows with missing or length-mismatched token/tag lists are skipped, so the
    returned list may hold fewer than *num_sentences* pairs.
    """
    from datasets import load_dataset

    train_split = load_dataset("undertheseanlp/UDD-1")["train"]

    pairs = []
    for index, row in enumerate(train_split):
        if index >= num_sentences:
            break
        tokens, tags = row["tokens"], row["upos"]
        # Keep only well-formed rows: non-empty and token/tag counts agree.
        if tokens and tags and len(tokens) == len(tags):
            pairs.append((tokens, tags))

    return pairs
|
|
|
|
def extract_features(tokens, position):
    """Build a minimal feature dict (word, lowercased word, prev/next token) for one position.

    Sentence boundaries are marked with the sentinels "__BOS__" and "__EOS__".
    """
    token = tokens[position]
    prev_token = tokens[position - 1] if position > 0 else "__BOS__"
    next_token = tokens[position + 1] if position + 1 < len(tokens) else "__EOS__"
    return {
        "word": token,
        "lower": token.lower(),
        "prev": prev_token,
        "next": next_token,
    }
|
|
|
|
def sentence_to_features(tokens):
    """Render each token position of *tokens* as a list of "key=value" feature strings."""
    feature_lists = []
    for position in range(len(tokens)):
        feats = extract_features(tokens, position)
        feature_lists.append([f"{key}={value}" for key, value in feats.items()])
    return feature_lists
|
|
|
|
def test_python_crfsuite(data, max_iter=10):
    """Train and evaluate a python-crfsuite model on *data*.

    Args:
        data: list of (tokens, tags) pairs used for both training and evaluation.
        max_iter: iteration cap passed to the trainer.

    Returns:
        The opened pycrfsuite.Tagger (accuracy is measured on the training data,
        since this script only validates trainer correctness, not generalization).
    """
    import pycrfsuite

    X_train = [sentence_to_features(tokens) for tokens, _ in data]
    y_train = [tags for _, tags in data]

    print("\n=== Python-CRFsuite ===")
    print(f"Training data: {len(data)} sentences")

    trainer = pycrfsuite.Trainer(verbose=True)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    # Regularization matches the underthesea-core run so the two trainers
    # are compared on equal footing (c1 = L1 penalty, c2 = L2 penalty).
    trainer.set_params({
        "c1": 0.1,
        "c2": 0.01,
        "max_iterations": max_iter,
        "feature.possible_transitions": True,
    })

    start = time.time()
    trainer.train("/tmp/test_pycrfsuite.model")
    elapsed = time.time() - start
    print(f"Training time: {elapsed:.4f}s")

    tagger = pycrfsuite.Tagger()
    tagger.open("/tmp/test_pycrfsuite.model")

    correct = 0
    total = 0
    for tokens, gold in data:
        features = sentence_to_features(tokens)
        pred = tagger.tag(features)
        for p, g in zip(pred, gold):
            if p == g:
                correct += 1
            total += 1

    # Guard against ZeroDivisionError when *data* is empty.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {correct}/{total} = {accuracy:.4f}")

    return tagger
|
|
|
|
def test_underthesea_core(data, max_iter=10):
    """Train and evaluate an underthesea-core CRF model on *data*.

    Mirrors test_python_crfsuite (same penalties, same iteration cap) so the
    two trainers can be compared head-to-head. The trained model is round-tripped
    through save/load to exercise serialization as well.

    Returns:
        The CRFTagger, or None when underthesea-core is not installed.
    """
    # The native extension exposes its classes either at the package root or
    # in the inner module, depending on how the wheel was built.
    try:
        from underthesea_core import CRFTrainer, CRFModel, CRFTagger
    except ImportError:
        try:
            from underthesea_core.underthesea_core import CRFTrainer, CRFModel, CRFTagger
        except ImportError:
            print("\n=== Underthesea-core ===")
            print("ERROR: CRFTrainer not available")
            return None

    X_train = [sentence_to_features(tokens) for tokens, _ in data]
    y_train = [tags for _, tags in data]

    print("\n=== Underthesea-core ===")
    print(f"Training data: {len(data)} sentences")

    # Same regularization as the python-crfsuite run above.
    trainer = CRFTrainer(
        loss_function="lbfgs",
        l1_penalty=0.1,
        l2_penalty=0.01,
        max_iterations=max_iter,
        verbose=1,
    )

    start = time.time()
    model = trainer.train(X_train, y_train)
    elapsed = time.time() - start
    print(f"Training time: {elapsed:.4f}s")

    # Round-trip through disk to validate model serialization.
    model.save("/tmp/test_underthesea.crf")
    model = CRFModel.load("/tmp/test_underthesea.crf")
    tagger = CRFTagger.from_model(model)

    correct = 0
    total = 0
    for tokens, gold in data:
        features = sentence_to_features(tokens)
        pred = tagger.tag(features)
        for p, g in zip(pred, gold):
            if p == g:
                correct += 1
            total += 1

    # Guard against ZeroDivisionError when *data* is empty (consistent with
    # the python-crfsuite evaluation).
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {correct}/{total} = {accuracy:.4f}")

    return tagger
|
|
|
|
def main():
    """Run both trainers on the same dataset and print timing plus accuracy."""
    import sys

    # Optional first CLI argument selects the dataset size.
    num_sentences = int(sys.argv[1]) if len(sys.argv) > 1 else 100

    banner = "=" * 60
    print(banner)
    print(f"Comparing CRF Trainers on {num_sentences} sentences")
    print(banner)

    # Tiny requests use the hand-written fixture; larger ones pull from UDD-1.
    if num_sentences <= 3:
        data = create_tiny_dataset()
    else:
        data = create_medium_dataset(num_sentences)

    token_count = sum(len(tokens) for tokens, _ in data)
    print(f"Total tokens: {token_count}")

    max_iter = 100

    test_python_crfsuite(data, max_iter)
    test_underthesea_core(data, max_iter)

    print("\n" + banner)
|
|
|
|
# Entry point: run the comparison only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|