rain1024 committed on
Commit
f2c46df
·
1 Parent(s): 9a803fe

Add examples and test trainer script

Browse files
examples.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Trên thế giới, giá vàng đang được giao dịch ở mức 5.068 USD/ounce, mất thêm khoảng 280 đồng/USD so với phiên sáng. Nếu tính trong một phiên, giá vàng mất tổng cộng gần 500 USD/ounce (tương đương mức giảm khoảng 15 triệu đồng). Đây là mức giảm kỷ lục trong lịch sử biến động của kim loại quý này.
2
+ Hiện giá vàng thế giới quy đổi theo tỷ giá Vietcombank (chưa bao gồm thuế, phí) vào khoảng 160,4 triệu đồng/lượng, thấp hơn vàng trong nước gần 20 triệu đồng/lượng.
references/2018.naacl.vu/source/VnCoreNLP_Architecture.pdf CHANGED
Binary files a/references/2018.naacl.vu/source/VnCoreNLP_Architecture.pdf and b/references/2018.naacl.vu/source/VnCoreNLP_Architecture.pdf differ
 
scripts/test_trainers.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.9"
3
+ # dependencies = [
4
+ # "python-crfsuite>=0.9.11",
5
+ # "datasets>=2.0.0",
6
+ # "underthesea-core @ file:///home/claude-user/projects/workspace_underthesea/underthesea-core-dev/extensions/underthesea_core/target/wheels/underthesea_core-1.0.7-cp312-cp312-manylinux_2_34_x86_64.whl",
7
+ # ]
8
+ # ///
9
+ """
10
+ Test script to compare python-crfsuite and underthesea-core trainers
11
+ on a tiny dataset to validate correctness.
12
+ """
13
+
14
+ import time
15
+
16
+
17
def create_tiny_dataset():
    """Return three hand-labelled (tokens, tags) sentence pairs for smoke tests."""
    return [
        # "Tôi yêu Việt Nam"
        (["Tôi", "yêu", "Việt_Nam"], ["PRON", "VERB", "PROPN"]),
        # "Hà Nội đẹp"
        (["Hà_Nội", "đẹp"], ["PROPN", "ADJ"]),
        # "Tôi ở Hà Nội"
        (["Tôi", "ở", "Hà_Nội"], ["PRON", "VERB", "PROPN"]),
    ]
29
+
30
+
31
def create_medium_dataset(num_sentences=100):
    """Load up to *num_sentences* (tokens, upos-tags) pairs from UDD-1's train split."""
    from datasets import load_dataset

    train_split = load_dataset("undertheseanlp/UDD-1")["train"]

    pairs = []
    for index, example in enumerate(train_split):
        if index >= num_sentences:
            break
        tokens = example["tokens"]
        tags = example["upos"]  # Already strings
        # Keep only well-formed sentences where tokens and tags line up.
        if tokens and tags and len(tokens) == len(tags):
            pairs.append((tokens, tags))

    return pairs
48
+
49
+
50
def extract_features(tokens, position):
    """Build a minimal feature dict (word, lowercase form, neighbours) for one token."""
    token = tokens[position]
    # Sentence-boundary sentinels stand in for missing neighbours.
    prev_token = tokens[position - 1] if position > 0 else "__BOS__"
    next_token = tokens[position + 1] if position < len(tokens) - 1 else "__EOS__"
    return {
        "word": token,
        "lower": token.lower(),
        "prev": prev_token,
        "next": next_token,
    }
68
+
69
+
70
def sentence_to_features(tokens):
    """Encode each token of the sentence as a list of "key=value" feature strings."""
    encoded = []
    for position in range(len(tokens)):
        feats = extract_features(tokens, position)
        encoded.append([f"{key}={value}" for key, value in feats.items()])
    return encoded
75
+
76
+
77
def test_python_crfsuite(data, max_iter=10):
    """Train and evaluate a python-crfsuite model on *data*.

    Args:
        data: list of (tokens, tags) sentence pairs.
        max_iter: L-BFGS iteration cap (kept identical across trainers so
            wall-clock times are directly comparable).

    Returns:
        The opened pycrfsuite.Tagger so callers can reuse the trained model.
    """
    import pycrfsuite

    X_train = [sentence_to_features(tokens) for tokens, _ in data]
    y_train = [tags for _, tags in data]

    print("\n=== Python-CRFsuite ===")
    print(f"Training data: {len(data)} sentences")

    trainer = pycrfsuite.Trainer(verbose=True)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        "c1": 0.1,   # L1 regularization
        "c2": 0.01,  # L2 regularization
        "max_iterations": max_iter,
        "feature.possible_transitions": True,
    })

    start = time.time()
    trainer.train("/tmp/test_pycrfsuite.model")
    elapsed = time.time() - start
    print(f"Training time: {elapsed:.4f}s")

    # Evaluate on the training data itself — this only checks the trainer
    # learns at all, not generalization.
    tagger = pycrfsuite.Tagger()
    tagger.open("/tmp/test_pycrfsuite.model")

    correct = 0
    total = 0
    for tokens, gold in data:
        features = sentence_to_features(tokens)
        pred = tagger.tag(features)
        for p, g in zip(pred, gold):
            if p == g:
                correct += 1
            total += 1

    # Fix: guard the division — the original raised ZeroDivisionError
    # when data was empty.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {correct}/{total} = {accuracy:.4f}")

    return tagger
120
+
121
+
122
def test_underthesea_core(data, max_iter=10):
    """Train and evaluate an underthesea-core CRF model on *data*.

    Uses the same hyper-parameters and iteration cap as test_python_crfsuite
    so timings and accuracies are directly comparable.

    Args:
        data: list of (tokens, tags) sentence pairs.
        max_iter: maximum L-BFGS iterations.

    Returns:
        The loaded CRFTagger, or None when the native extension is unavailable.
    """
    try:
        from underthesea_core import CRFTrainer, CRFModel, CRFTagger
    except ImportError:
        try:
            # Some wheels expose the classes one module level deeper.
            from underthesea_core.underthesea_core import CRFTrainer, CRFModel, CRFTagger
        except ImportError:
            print("\n=== Underthesea-core ===")
            print("ERROR: CRFTrainer not available")
            return None

    X_train = [sentence_to_features(tokens) for tokens, _ in data]
    y_train = [tags for _, tags in data]

    print("\n=== Underthesea-core ===")
    print(f"Training data: {len(data)} sentences")

    # Same iterations as CRFsuite for fair speed comparison
    trainer = CRFTrainer(
        loss_function="lbfgs",
        l1_penalty=0.1,
        l2_penalty=0.01,
        max_iterations=max_iter,
        verbose=1,  # Show sparse feature count
    )

    start = time.time()
    model = trainer.train(X_train, y_train)
    elapsed = time.time() - start
    print(f"Training time: {elapsed:.4f}s")

    # Round-trip through disk so save/load paths are exercised too.
    model.save("/tmp/test_underthesea.crf")
    model = CRFModel.load("/tmp/test_underthesea.crf")
    tagger = CRFTagger.from_model(model)

    correct = 0
    total = 0
    for tokens, gold in data:
        features = sentence_to_features(tokens)
        pred = tagger.tag(features)
        for p, g in zip(pred, gold):
            if p == g:
                correct += 1
            total += 1

    # Fix: guard the division — the original raised ZeroDivisionError
    # when data was empty.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {correct}/{total} = {accuracy:.4f}")

    return tagger
172
+
173
+
174
def main():
    """Run both CRF trainers on the same dataset and print a comparison."""
    import sys

    # Optional CLI argument: number of training sentences (default 100).
    num_sentences = int(sys.argv[1]) if len(sys.argv) > 1 else 100

    banner = "=" * 60
    print(banner)
    print(f"Comparing CRF Trainers on {num_sentences} sentences")
    print(banner)

    # Tiny hand-written corpus for <=3 sentences, otherwise pull from UDD-1.
    if num_sentences <= 3:
        data = create_tiny_dataset()
    else:
        data = create_medium_dataset(num_sentences)

    total_tokens = sum(len(tokens) for tokens, _ in data)
    print(f"Total tokens: {total_tokens}")

    max_iter = 100

    # Run both trainers on the identical dataset.
    test_python_crfsuite(data, max_iter)
    test_underthesea_core(data, max_iter)

    print("\n" + banner)
200
+
201
+
202
# Script entry point: run the trainer comparison when executed directly.
if __name__ == "__main__":
    main()