# /// script
# requires-python = ">=3.9"
# dependencies = [
# "python-crfsuite>=0.9.11",
# "datasets>=2.0.0",
# "underthesea-core @ file:///home/claude-user/projects/workspace_underthesea/underthesea-core-dev/extensions/underthesea_core/target/wheels/underthesea_core-1.0.7-cp312-cp312-manylinux_2_34_x86_64.whl",
# ]
# ///
"""
Test script to compare python-crfsuite and underthesea-core trainers
on a tiny dataset to validate correctness.
"""
import time
def create_tiny_dataset():
    """Return three hand-written (tokens, tags) sentence pairs for smoke testing."""
    sentences = [
        # "Tôi yêu Việt Nam"
        (["Tôi", "yêu", "Việt_Nam"], ["PRON", "VERB", "PROPN"]),
        # "Hà Nội đẹp"
        (["Hà_Nội", "đẹp"], ["PROPN", "ADJ"]),
        # "Tôi ở Hà Nội"
        (["Tôi", "ở", "Hà_Nội"], ["PRON", "VERB", "PROPN"]),
    ]
    return sentences
def create_medium_dataset(num_sentences=100):
    """Load up to *num_sentences* (tokens, upos-tags) pairs from the UDD-1 train split.

    Sentences with empty or length-mismatched token/tag lists are skipped,
    but still count against the *num_sentences* budget.
    """
    from datasets import load_dataset

    train_split = load_dataset("undertheseanlp/UDD-1")["train"]
    pairs = []
    for index, record in enumerate(train_split):
        if index >= num_sentences:
            break
        tokens = record["tokens"]
        tags = record["upos"]  # already string labels, no id->name mapping needed
        if tokens and tags and len(tokens) == len(tags):
            pairs.append((tokens, tags))
    return pairs
def extract_features(tokens, position):
    """Build a minimal feature dict for the token at *position*.

    Emits the surface word, its lowercase form, and the neighboring tokens,
    using __BOS__/__EOS__ sentinels at the sentence boundaries. Insertion
    order (word, lower, prev, next) is deliberate: downstream code renders
    the dict items in order.
    """
    word = tokens[position]
    prev_token = tokens[position - 1] if position > 0 else "__BOS__"
    next_token = tokens[position + 1] if position + 1 < len(tokens) else "__EOS__"
    return {
        "word": word,
        "lower": word.lower(),
        "prev": prev_token,
        "next": next_token,
    }
def sentence_to_features(tokens):
    """Convert a token list into per-token lists of "key=value" feature strings."""
    feature_rows = []
    for idx in range(len(tokens)):
        row = extract_features(tokens, idx)
        feature_rows.append([f"{key}={value}" for key, value in row.items()])
    return feature_rows
def test_python_crfsuite(data, max_iter=10):
    """Train a python-crfsuite model on *data*, report timing and self-accuracy.

    Returns the opened pycrfsuite.Tagger so callers can run further predictions.
    """
    import pycrfsuite

    print("\n=== Python-CRFsuite ===")
    print(f"Training data: {len(data)} sentences")

    trainer = pycrfsuite.Trainer(verbose=True)
    for tokens, tags in data:
        trainer.append(sentence_to_features(tokens), tags)
    trainer.set_params({
        "c1": 0.1,
        "c2": 0.01,
        "max_iterations": max_iter,
        "feature.possible_transitions": True,
    })

    started = time.time()
    trainer.train("/tmp/test_pycrfsuite.model")
    elapsed = time.time() - started
    print(f"Training time: {elapsed:.4f}s")

    # Self-accuracy on the training set: a sanity check, not an evaluation.
    tagger = pycrfsuite.Tagger()
    tagger.open("/tmp/test_pycrfsuite.model")
    correct = 0
    total = 0
    for tokens, gold in data:
        predicted = tagger.tag(sentence_to_features(tokens))
        for predicted_tag, gold_tag in zip(predicted, gold):
            correct += predicted_tag == gold_tag
            total += 1
    print(f"Accuracy: {correct}/{total} = {correct/total:.4f}")
    return tagger
def test_underthesea_core(data, max_iter=10):
    """Train an underthesea-core CRF on *data*, report timing and self-accuracy.

    Returns the CRFTagger on success, or None when the Rust extension is
    unavailable (both import paths tried).
    """
    try:
        from underthesea_core import CRFTrainer, CRFModel, CRFTagger
    except ImportError:
        try:
            from underthesea_core.underthesea_core import CRFTrainer, CRFModel, CRFTagger
        except ImportError:
            print("\n=== Underthesea-core ===")
            print("ERROR: CRFTrainer not available")
            return None

    X_train = [sentence_to_features(tokens) for tokens, _ in data]
    y_train = [tags for _, tags in data]

    print("\n=== Underthesea-core ===")
    print(f"Training data: {len(data)} sentences")

    # Match CRFsuite's iteration budget so the speed comparison is fair.
    trainer = CRFTrainer(
        loss_function="lbfgs",
        l1_penalty=0.1,
        l2_penalty=0.01,
        max_iterations=max_iter,
        verbose=1,  # Show sparse feature count
    )
    started = time.time()
    trained_model = trainer.train(X_train, y_train)
    elapsed = time.time() - started
    print(f"Training time: {elapsed:.4f}s")

    # Round-trip through disk to also exercise save/load.
    trained_model.save("/tmp/test_underthesea.crf")
    tagger = CRFTagger.from_model(CRFModel.load("/tmp/test_underthesea.crf"))

    # Self-accuracy on the training set: a sanity check, not an evaluation.
    correct = 0
    total = 0
    for tokens, gold in data:
        predicted = tagger.tag(sentence_to_features(tokens))
        for predicted_tag, gold_tag in zip(predicted, gold):
            correct += predicted_tag == gold_tag
            total += 1
    print(f"Accuracy: {correct}/{total} = {correct/total:.4f}")
    return tagger
def main():
    """Run both trainer backends on a tiny or UDD-1-sampled dataset and compare."""
    import sys

    # Optional CLI arg: sentence count (<=3 selects the hand-written tiny set).
    num_sentences = int(sys.argv[1]) if len(sys.argv) > 1 else 100

    banner = "=" * 60
    print(banner)
    print(f"Comparing CRF Trainers on {num_sentences} sentences")
    print(banner)

    if num_sentences <= 3:
        data = create_tiny_dataset()
    else:
        data = create_medium_dataset(num_sentences)
    total_tokens = sum(len(tokens) for tokens, _ in data)
    print(f"Total tokens: {total_tokens}")

    max_iter = 100
    # Run both backends on identical data and iteration budget.
    test_python_crfsuite(data, max_iter)
    test_underthesea_core(data, max_iter)
    print("\n" + banner)


if __name__ == "__main__":
    main()