Add examples and test trainer script

Browse files

Files changed (3) hide show

examples.txt +2 -0
references/2018.naacl.vu/source/VnCoreNLP_Architecture.pdf +0 -0
scripts/test_trainers.py +203 -0

examples.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Trên thế giới, giá vàng đang được giao dịch ở mức 5.068 USD/ounce, mất thêm khoảng 280 đồng/USD so với phiên sáng. Nếu tính trong một phiên, giá vàng mất tổng cộng gần 500 USD/ounce (tương đương mức giảm khoảng 15 triệu đồng). Đây là mức giảm kỷ lục trong lịch sử biến động của kim loại quý này.
2	+ Hiện giá vàng thế giới quy đổi theo tỷ giá Vietcombank (chưa bao gồm thuế, phí) vào khoảng 160,4 triệu đồng/lượng, thấp hơn vàng trong nước gần 20 triệu đồng/lượng.

references/2018.naacl.vu/source/VnCoreNLP_Architecture.pdf CHANGED Viewed

Binary files a/references/2018.naacl.vu/source/VnCoreNLP_Architecture.pdf and b/references/2018.naacl.vu/source/VnCoreNLP_Architecture.pdf differ

scripts/test_trainers.py ADDED Viewed

	@@ -0,0 +1,203 @@

+# /// script
+# requires-python = ">=3.9"
+# dependencies = [
+#     "python-crfsuite>=0.9.11",
+#     "datasets>=2.0.0",
+#     "underthesea-core @ file:///home/claude-user/projects/workspace_underthesea/underthesea-core-dev/extensions/underthesea_core/target/wheels/underthesea_core-1.0.7-cp312-cp312-manylinux_2_34_x86_64.whl",
+# ]
+# ///
+"""
+Test script to compare python-crfsuite and underthesea-core trainers
+on a tiny dataset to validate correctness.
+"""
+import time
+def create_tiny_dataset():
+    """Create a tiny dataset for testing."""
+    # 3 simple sentences
+    data = [
+        # Sentence 1: "Tôi yêu Việt Nam"
+        (["Tôi", "yêu", "Việt_Nam"], ["PRON", "VERB", "PROPN"]),
+        # Sentence 2: "Hà Nội đẹp"
+        (["Hà_Nội", "đẹp"], ["PROPN", "ADJ"]),
+        # Sentence 3: "Tôi ở Hà Nội"
+        (["Tôi", "ở", "Hà_Nội"], ["PRON", "VERB", "PROPN"]),
+    ]
+    return data
+def create_medium_dataset(num_sentences=100):
+    """Create a medium dataset from UDD-1 for testing."""
+    from datasets import load_dataset
+    dataset = load_dataset("undertheseanlp/UDD-1")
+    train_data = dataset["train"]
+    data = []
+    for i, item in enumerate(train_data):
+        if i >= num_sentences:
+            break
+        tokens = item["tokens"]
+        tags = item["upos"]  # Already strings
+        if tokens and tags and len(tokens) == len(tags):
+            data.append((tokens, tags))
+    return data
+def extract_features(tokens, position):
+    """Simple feature extraction."""
+    features = {}
+    token = tokens[position]
+    features["word"] = token
+    features["lower"] = token.lower()
+    if position > 0:
+        features["prev"] = tokens[position - 1]
+    else:
+        features["prev"] = "__BOS__"
+    if position < len(tokens) - 1:
+        features["next"] = tokens[position + 1]
+    else:
+        features["next"] = "__EOS__"
+    return features
+def sentence_to_features(tokens):
+    return [
+        [f"{k}={v}" for k, v in extract_features(tokens, i).items()]
+        for i in range(len(tokens))
+    ]
+def test_python_crfsuite(data, max_iter=10):
+    """Test with python-crfsuite."""
+    import pycrfsuite
+    X_train = [sentence_to_features(tokens) for tokens, _ in data]
+    y_train = [tags for _, tags in data]
+    print("\n=== Python-CRFsuite ===")
+    print(f"Training data: {len(data)} sentences")
+    trainer = pycrfsuite.Trainer(verbose=True)
+    for xseq, yseq in zip(X_train, y_train):
+        trainer.append(xseq, yseq)
+    trainer.set_params({
+        "c1": 0.1,
+        "c2": 0.01,
+        "max_iterations": max_iter,
+        "feature.possible_transitions": True,
+    })
+    start = time.time()
+    trainer.train("/tmp/test_pycrfsuite.model")
+    elapsed = time.time() - start
+    print(f"Training time: {elapsed:.4f}s")
+    # Test prediction accuracy
+    tagger = pycrfsuite.Tagger()
+    tagger.open("/tmp/test_pycrfsuite.model")
+    correct = 0
+    total = 0
+    for tokens, gold in data:
+        features = sentence_to_features(tokens)
+        pred = tagger.tag(features)
+        for p, g in zip(pred, gold):
+            if p == g:
+                correct += 1
+            total += 1
+    print(f"Accuracy: {correct}/{total} = {correct/total:.4f}")
+    return tagger
+def test_underthesea_core(data, max_iter=10):
+    """Test with underthesea-core."""
+    try:
+        from underthesea_core import CRFTrainer, CRFModel, CRFTagger
+    except ImportError:
+        try:
+            from underthesea_core.underthesea_core import CRFTrainer, CRFModel, CRFTagger
+        except ImportError:
+            print("\n=== Underthesea-core ===")
+            print("ERROR: CRFTrainer not available")
+            return None
+    X_train = [sentence_to_features(tokens) for tokens, _ in data]
+    y_train = [tags for _, tags in data]
+    print("\n=== Underthesea-core ===")
+    print(f"Training data: {len(data)} sentences")
+    # Same iterations as CRFsuite for fair speed comparison
+    trainer = CRFTrainer(
+        loss_function="lbfgs",
+        l1_penalty=0.1,
+        l2_penalty=0.01,
+        max_iterations=max_iter,
+        verbose=1,  # Show sparse feature count
+    )
+    start = time.time()
+    model = trainer.train(X_train, y_train)
+    elapsed = time.time() - start
+    print(f"Training time: {elapsed:.4f}s")
+    # Save and load for testing
+    model.save("/tmp/test_underthesea.crf")
+    model = CRFModel.load("/tmp/test_underthesea.crf")
+    tagger = CRFTagger.from_model(model)
+    correct = 0
+    total = 0
+    for tokens, gold in data:
+        features = sentence_to_features(tokens)
+        pred = tagger.tag(features)
+        for p, g in zip(pred, gold):
+            if p == g:
+                correct += 1
+            total += 1
+    print(f"Accuracy: {correct}/{total} = {correct/total:.4f}")
+    return tagger
+def main():
+    import sys
+    num_sentences = 100
+    if len(sys.argv) > 1:
+        num_sentences = int(sys.argv[1])
+    print("=" * 60)
+    print(f"Comparing CRF Trainers on {num_sentences} sentences")
+    print("=" * 60)
+    if num_sentences <= 3:
+        data = create_tiny_dataset()
+    else:
+        data = create_medium_dataset(num_sentences)
+    total_tokens = sum(len(tokens) for tokens, _ in data)
+    print(f"Total tokens: {total_tokens}")
+    max_iter = 100
+    # Test both
+    test_python_crfsuite(data, max_iter)
+    test_underthesea_core(data, max_iter)
+    print("\n" + "=" * 60)
+if __name__ == "__main__":
+    main()