"""
Test script comparing the python-crfsuite and underthesea-core CRF trainers
on a tiny dataset to validate that both produce correct results.
"""
|
|
| import time |
|
|
|
|
def create_tiny_dataset():
    """Return a hand-written three-sentence (tokens, tags) dataset for smoke tests."""
    sentences = [
        ["Tôi", "yêu", "Việt_Nam"],
        ["Hà_Nội", "đẹp"],
        ["Tôi", "ở", "Hà_Nội"],
    ]
    tag_sequences = [
        ["PRON", "VERB", "PROPN"],
        ["PROPN", "ADJ"],
        ["PRON", "VERB", "PROPN"],
    ]
    return list(zip(sentences, tag_sequences))
|
|
|
|
def create_medium_dataset(num_sentences=100):
    """Load up to *num_sentences* (tokens, upos) pairs from the UDD-1 train split.

    Rows with missing or length-mismatched token/tag lists are skipped, so the
    returned list may hold fewer than *num_sentences* pairs.
    """
    from datasets import load_dataset

    train_split = load_dataset("undertheseanlp/UDD-1")["train"]

    pairs = []
    for index, row in enumerate(train_split):
        if index >= num_sentences:
            break
        tokens, tags = row["tokens"], row["upos"]
        # Keep only well-formed rows: non-empty and token/tag counts agree.
        if tokens and tags and len(tokens) == len(tags):
            pairs.append((tokens, tags))

    return pairs
|
|
|
|
def extract_features(tokens, position):
    """Build a minimal feature dict (word, lowercased word, prev/next token) for one position.

    Sentence boundaries are marked with the sentinels "__BOS__" and "__EOS__".
    """
    token = tokens[position]
    prev_token = tokens[position - 1] if position > 0 else "__BOS__"
    next_token = tokens[position + 1] if position + 1 < len(tokens) else "__EOS__"
    return {
        "word": token,
        "lower": token.lower(),
        "prev": prev_token,
        "next": next_token,
    }
|
|
|
|
def sentence_to_features(tokens):
    """Render each token position of *tokens* as a list of "key=value" feature strings."""
    feature_lists = []
    for position in range(len(tokens)):
        feats = extract_features(tokens, position)
        feature_lists.append([f"{key}={value}" for key, value in feats.items()])
    return feature_lists
|
|
|
|
def test_python_crfsuite(data, max_iter=10):
    """Train and evaluate a python-crfsuite model on *data*.

    Args:
        data: list of (tokens, tags) pairs used for both training and evaluation.
        max_iter: iteration cap passed to the trainer.

    Returns:
        The opened pycrfsuite.Tagger (accuracy is measured on the training data,
        since this script only validates trainer correctness, not generalization).
    """
    import pycrfsuite

    X_train = [sentence_to_features(tokens) for tokens, _ in data]
    y_train = [tags for _, tags in data]

    print("\n=== Python-CRFsuite ===")
    print(f"Training data: {len(data)} sentences")

    trainer = pycrfsuite.Trainer(verbose=True)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    # Regularization matches the underthesea-core run so the two trainers
    # are compared on equal footing (c1 = L1 penalty, c2 = L2 penalty).
    trainer.set_params({
        "c1": 0.1,
        "c2": 0.01,
        "max_iterations": max_iter,
        "feature.possible_transitions": True,
    })

    start = time.time()
    trainer.train("/tmp/test_pycrfsuite.model")
    elapsed = time.time() - start
    print(f"Training time: {elapsed:.4f}s")

    tagger = pycrfsuite.Tagger()
    tagger.open("/tmp/test_pycrfsuite.model")

    correct = 0
    total = 0
    for tokens, gold in data:
        features = sentence_to_features(tokens)
        pred = tagger.tag(features)
        for p, g in zip(pred, gold):
            if p == g:
                correct += 1
            total += 1

    # Guard against ZeroDivisionError when *data* is empty.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {correct}/{total} = {accuracy:.4f}")

    return tagger
|
|
|
|
def test_underthesea_core(data, max_iter=10):
    """Train and evaluate an underthesea-core CRF model on *data*.

    Mirrors test_python_crfsuite (same penalties, same iteration cap) so the
    two trainers can be compared head-to-head. The trained model is round-tripped
    through save/load to exercise serialization as well.

    Returns:
        The CRFTagger, or None when underthesea-core is not installed.
    """
    # The native extension exposes its classes either at the package root or
    # in the inner module, depending on how the wheel was built.
    try:
        from underthesea_core import CRFTrainer, CRFModel, CRFTagger
    except ImportError:
        try:
            from underthesea_core.underthesea_core import CRFTrainer, CRFModel, CRFTagger
        except ImportError:
            print("\n=== Underthesea-core ===")
            print("ERROR: CRFTrainer not available")
            return None

    X_train = [sentence_to_features(tokens) for tokens, _ in data]
    y_train = [tags for _, tags in data]

    print("\n=== Underthesea-core ===")
    print(f"Training data: {len(data)} sentences")

    # Same regularization as the python-crfsuite run above.
    trainer = CRFTrainer(
        loss_function="lbfgs",
        l1_penalty=0.1,
        l2_penalty=0.01,
        max_iterations=max_iter,
        verbose=1,
    )

    start = time.time()
    model = trainer.train(X_train, y_train)
    elapsed = time.time() - start
    print(f"Training time: {elapsed:.4f}s")

    # Round-trip through disk to validate model serialization.
    model.save("/tmp/test_underthesea.crf")
    model = CRFModel.load("/tmp/test_underthesea.crf")
    tagger = CRFTagger.from_model(model)

    correct = 0
    total = 0
    for tokens, gold in data:
        features = sentence_to_features(tokens)
        pred = tagger.tag(features)
        for p, g in zip(pred, gold):
            if p == g:
                correct += 1
            total += 1

    # Guard against ZeroDivisionError when *data* is empty (consistent with
    # the python-crfsuite evaluation).
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {correct}/{total} = {accuracy:.4f}")

    return tagger
|
|
|
|
def main():
    """Run both trainers on the same dataset and print timing plus accuracy."""
    import sys

    # Optional first CLI argument selects the dataset size.
    num_sentences = int(sys.argv[1]) if len(sys.argv) > 1 else 100

    banner = "=" * 60
    print(banner)
    print(f"Comparing CRF Trainers on {num_sentences} sentences")
    print(banner)

    # Tiny requests use the hand-written fixture; larger ones pull from UDD-1.
    if num_sentences <= 3:
        data = create_tiny_dataset()
    else:
        data = create_medium_dataset(num_sentences)

    token_count = sum(len(tokens) for tokens, _ in data)
    print(f"Total tokens: {token_count}")

    max_iter = 100

    test_python_crfsuite(data, max_iter)
    test_underthesea_core(data, max_iter)

    print("\n" + banner)
|
|
|
|
# Entry point: run the comparison only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|