|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
Test script to compare python-crfsuite and underthesea-core trainers |
|
|
on a tiny dataset to validate correctness. |
|
|
""" |
|
|
|
|
|
import time |
|
|
|
|
|
|
|
|
def create_tiny_dataset():
    """Build a minimal hand-written Vietnamese POS-tagging dataset.

    Returns:
        A list of ``(tokens, tags)`` pairs, where ``tokens`` and ``tags``
        are parallel lists of equal length.
    """
    sentences = [
        ["Tôi", "yêu", "Việt_Nam"],
        ["Hà_Nội", "đẹp"],
        ["Tôi", "ở", "Hà_Nội"],
    ]
    tag_sequences = [
        ["PRON", "VERB", "PROPN"],
        ["PROPN", "ADJ"],
        ["PRON", "VERB", "PROPN"],
    ]
    return list(zip(sentences, tag_sequences))
|
|
|
|
|
|
|
|
def create_medium_dataset(num_sentences=100):
    """Load up to *num_sentences* training sentences from the UDD-1 corpus.

    Only sentences whose token and tag sequences are both non-empty and of
    equal length are kept, so fewer than *num_sentences* pairs may be
    returned.

    Args:
        num_sentences: maximum number of sentences to take from the
            training split.

    Returns:
        A list of ``(tokens, tags)`` pairs.
    """
    from datasets import load_dataset

    train_split = load_dataset("undertheseanlp/UDD-1")["train"]

    pairs = []
    for index, record in enumerate(train_split):
        if index >= num_sentences:
            break
        tokens = record["tokens"]
        tags = record["upos"]
        # Skip malformed records (empty or mismatched sequences).
        if tokens and tags and len(tokens) == len(tags):
            pairs.append((tokens, tags))

    return pairs
|
|
|
|
|
|
|
|
def extract_features(tokens, position):
    """Extract a small feature dict for the token at *position*.

    Features are the surface form, its lowercase form, and the previous
    and next tokens (with ``__BOS__`` / ``__EOS__`` sentinels at the
    sentence boundaries).
    """
    token = tokens[position]
    prev_token = tokens[position - 1] if position > 0 else "__BOS__"
    next_token = tokens[position + 1] if position + 1 < len(tokens) else "__EOS__"

    # Insertion order (word, lower, prev, next) is preserved by dicts and
    # determines the feature-string order downstream.
    return {
        "word": token,
        "lower": token.lower(),
        "prev": prev_token,
        "next": next_token,
    }
|
|
|
|
|
|
|
|
def sentence_to_features(tokens):
    """Convert a token list into per-token ``key=value`` feature lists."""
    feature_lists = []
    for index in range(len(tokens)):
        feats = extract_features(tokens, index)
        feature_lists.append([f"{name}={value}" for name, value in feats.items()])
    return feature_lists
|
|
|
|
|
|
|
|
def test_python_crfsuite(data, max_iter=10):
    """Train and evaluate a CRF with python-crfsuite.

    Args:
        data: list of ``(tokens, tags)`` pairs used for both training
            and (resubstitution) evaluation.
        max_iter: maximum number of training iterations.

    Returns:
        The opened ``pycrfsuite.Tagger``.
    """
    import os
    import tempfile

    import pycrfsuite

    X_train = [sentence_to_features(tokens) for tokens, _ in data]
    y_train = [tags for _, tags in data]

    print("\n=== Python-CRFsuite ===")
    print(f"Training data: {len(data)} sentences")

    trainer = pycrfsuite.Trainer(verbose=True)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        "c1": 0.1,   # L1 regularization coefficient
        "c2": 0.01,  # L2 regularization coefficient
        "max_iterations": max_iter,
        "feature.possible_transitions": True,
    })

    # Portable temp location instead of a hardcoded /tmp (fails on Windows).
    model_path = os.path.join(tempfile.gettempdir(), "test_pycrfsuite.model")

    start = time.time()
    trainer.train(model_path)
    elapsed = time.time() - start
    print(f"Training time: {elapsed:.4f}s")

    tagger = pycrfsuite.Tagger()
    tagger.open(model_path)

    # Evaluate token-level accuracy on the training data itself.
    correct = 0
    total = 0
    for tokens, gold in data:
        features = sentence_to_features(tokens)
        pred = tagger.tag(features)
        for p, g in zip(pred, gold):
            if p == g:
                correct += 1
            total += 1

    # Guard against empty data so we do not divide by zero.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {correct}/{total} = {accuracy:.4f}")

    return tagger
|
|
|
|
|
|
|
|
def test_underthesea_core(data, max_iter=10):
    """Train and evaluate a CRF with underthesea-core.

    Args:
        data: list of ``(tokens, tags)`` pairs used for both training
            and (resubstitution) evaluation.
        max_iter: maximum number of training iterations.

    Returns:
        The ``CRFTagger`` on success, or ``None`` when the native
        trainer bindings are unavailable.
    """
    import os
    import tempfile

    try:
        from underthesea_core import CRFTrainer, CRFModel, CRFTagger
    except ImportError:
        # Fall back to the inner native module layout used by some builds.
        try:
            from underthesea_core.underthesea_core import CRFTrainer, CRFModel, CRFTagger
        except ImportError:
            print("\n=== Underthesea-core ===")
            print("ERROR: CRFTrainer not available")
            return None

    X_train = [sentence_to_features(tokens) for tokens, _ in data]
    y_train = [tags for _, tags in data]

    print("\n=== Underthesea-core ===")
    print(f"Training data: {len(data)} sentences")

    trainer = CRFTrainer(
        loss_function="lbfgs",
        l1_penalty=0.1,
        l2_penalty=0.01,
        max_iterations=max_iter,
        verbose=1,
    )

    start = time.time()
    model = trainer.train(X_train, y_train)
    elapsed = time.time() - start
    print(f"Training time: {elapsed:.4f}s")

    # Portable temp location instead of a hardcoded /tmp (fails on Windows).
    model_path = os.path.join(tempfile.gettempdir(), "test_underthesea.crf")

    # Round-trip through disk to exercise save/load as well as training.
    model.save(model_path)
    model = CRFModel.load(model_path)
    tagger = CRFTagger.from_model(model)

    # Evaluate token-level accuracy on the training data itself.
    correct = 0
    total = 0
    for tokens, gold in data:
        features = sentence_to_features(tokens)
        pred = tagger.tag(features)
        for p, g in zip(pred, gold):
            if p == g:
                correct += 1
            total += 1

    # Guard against empty data so we do not divide by zero.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {correct}/{total} = {accuracy:.4f}")

    return tagger
|
|
|
|
|
|
|
|
def main():
    """Run both trainers on the same dataset and report timing/accuracy."""
    import sys

    # Optional positional argument: number of sentences to train on.
    num_sentences = int(sys.argv[1]) if len(sys.argv) > 1 else 100

    banner = "=" * 60
    print(banner)
    print(f"Comparing CRF Trainers on {num_sentences} sentences")
    print(banner)

    # Very small requests use the hand-written fixture; otherwise pull
    # sentences from the UDD-1 corpus.
    if num_sentences <= 3:
        data = create_tiny_dataset()
    else:
        data = create_medium_dataset(num_sentences)

    total_tokens = sum(len(tokens) for tokens, _ in data)
    print(f"Total tokens: {total_tokens}")

    max_iter = 100

    test_python_crfsuite(data, max_iter)
    test_underthesea_core(data, max_iter)

    print("\n" + banner)
|
|
|
|
|
|
# Run the trainer comparison only when executed as a script.
if __name__ == "__main__":
    main()
|
|
|