rain1024 committed on
Commit
d734595
·
unverified ·
1 Parent(s): 8314544

Initial model upload: Vietnamese POS Tagger

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. README.md +107 -0
  3. handler.py +151 -0
  4. pos_tagger.crfsuite +3 -0
  5. requirements.txt +1 -0
.gitattributes CHANGED
@@ -33,5 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
36
  *.mco filter=lfs diff=lfs merge=lfs -text
37
  *.jar filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ pos_tagger.crfsuite filter=lfs diff=lfs merge=lfs -text
37
  *.mco filter=lfs diff=lfs merge=lfs -text
38
  *.jar filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - vi
4
+ license: apache-2.0
5
+ tags:
6
+ - pos
7
+ - part-of-speech
8
+ - vietnamese
9
+ - crf
10
+ - nlp
11
+ - token-classification
12
+ datasets:
13
+ - undertheseanlp/UDD-v0.1
14
+ library_name: python-crfsuite
15
+ pipeline_tag: token-classification
16
+ ---
17
+
18
+ # Vietnamese POS Tagger (TRE-1)
19
+
20
+ A Conditional Random Field (CRF) based Part-of-Speech tagger for Vietnamese, trained on the Universal Dependencies Dataset (UDD-v0.1).
21
+
22
+ ## Model Description
23
+
24
+ This model uses a Conditional Random Field (CRF) with handcrafted features inspired by the underthesea NLP library. On the held-out UDD-v0.1 test set it reaches roughly 94% accuracy (see Performance below).
25
+
26
+ ### Features
27
+
28
+ - **Architecture**: CRF (python-crfsuite)
29
+ - **Language**: Vietnamese
30
+ - **Tagset**: Universal POS tags (UPOS)
31
+ - **Training Data**: undertheseanlp/UDD-v0.1
32
+
33
+ ### Feature Templates
34
+
35
+ The model uses the following feature templates:
36
+ - Current token features: word form, lowercase, prefix/suffix (2-3 chars), character type checks
37
+ - Context features: previous and next 1-2 tokens
38
+ - Bigram features: adjacent token combinations
39
+ - Dictionary features: in-vocabulary checks
40
+
41
+ ## Usage
42
+
43
+ ### Using the Inference API
44
+
45
+ ```python
46
+ import requests
47
+
48
+ API_URL = "https://api-inference.huggingface.co/models/undertheseanlp/tre-1"
49
+ headers = {"Authorization": "Bearer YOUR_TOKEN"}
50
+
51
+ def query(payload):
52
+ response = requests.post(API_URL, headers=headers, json=payload)
53
+ return response.json()
54
+
55
+ output = query({"inputs": "Tôi yêu Việt Nam"})
56
+ print(output)
57
+ # [{"token": "Tôi", "tag": "PRON"}, {"token": "yêu", "tag": "VERB"}, ...]
58
+ ```
59
+
60
+ ### Local Usage
61
+
62
+ ```python
63
+ import pycrfsuite
64
+ from handler import EndpointHandler
65
+
66
+ handler = EndpointHandler(path="./")
67
+ result = handler({"inputs": "Tôi yêu Việt Nam"})
68
+ print(result)
69
+ ```
70
+
71
+ ## Training
72
+
73
+ The model was trained using:
74
+ - L1 regularization (c1): 1.0
75
+ - L2 regularization (c2): 1e-3
76
+ - Max iterations: 100
77
+
78
+ ## Performance
79
+
80
+ Evaluated on a held-out test set from UDD-v0.1:
81
+ - Accuracy: ~94%
82
+ - F1 (macro): ~90%
83
+ - F1 (weighted): ~94%
84
+
85
+ ## Limitations
86
+
87
+ - Requires pre-tokenized input (whitespace-separated tokens)
88
+ - Performance may vary on out-of-domain text
89
+ - Does not handle Vietnamese word segmentation
90
+
91
+ ## Citation
92
+
93
+ If you use this model, please cite:
94
+
95
+ ```bibtex
96
+ @misc{tre1-pos-tagger,
97
+ author = {undertheseanlp},
98
+ title = {Vietnamese POS Tagger TRE-1},
99
+ year = {2025},
100
+ publisher = {Hugging Face},
101
+ url = {https://huggingface.co/undertheseanlp/tre-1}
102
+ }
103
+ ```
104
+
105
+ ## License
106
+
107
+ Apache 2.0
handler.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom handler for Vietnamese POS Tagger inference on Hugging Face.
3
+ """
4
+
5
+ import re
6
+ import pycrfsuite
7
+ from typing import Dict, List, Any
8
+
9
+
10
class PythonCRFFeaturizer:
    """
    Python implementation of CRFFeaturizer compatible with underthesea_core API.

    A feature template is a string of the form ``T[i]``, ``T[i].attr`` or
    ``T[i,j,...]`` / ``T[i,j,...].attr`` where each index is an offset
    relative to the token currently being featurized. Strings that do not
    match this shape are silently skipped by ``extract_features``.
    """

    # Compiled once at class creation instead of on every token/template pair.
    _TEMPLATE_PATTERN = re.compile(r'T\[([^\]]+)\](?:\.(\w+))?')

    # Placeholder values returned for positions outside the sentence.
    _BOUNDARY_MARKERS = ('__BOS__', '__EOS__')

    def __init__(self, feature_templates, dictionary=None):
        """
        Args:
            feature_templates: iterable of template strings (see class docs).
            dictionary: optional container used for ``is_in_dict`` membership
                tests; a falsy value is replaced by an empty set.
        """
        self.feature_templates = feature_templates
        self.dictionary = dictionary or set()
        # template string -> result of _parse_template. Parsing depends only
        # on the template text, so it is done once per distinct template
        # rather than once per token.
        self._parsed_templates = {}

    def _parse_template(self, template):
        """
        Split a template string into its parts.

        Returns:
            ``(indices, attribute, template)`` where ``indices`` is a list of
            int offsets and ``attribute`` is the text after the dot (or
            ``None``). Returns ``(None, None, None)`` when the string does
            not start with a ``T[...]`` pattern.

        Raises:
            ValueError: if an entry between the brackets is not an integer.
        """
        match = self._TEMPLATE_PATTERN.match(template)
        if not match:
            return None, None, None
        indices = [int(part.strip()) for part in match.group(1).split(',')]
        return indices, match.group(2), template

    def _get_token_value(self, tokens, position, index):
        """Return the token at ``position + index`` or a boundary marker."""
        actual_pos = position + index
        if actual_pos < 0:
            return '__BOS__'
        if actual_pos >= len(tokens):
            return '__EOS__'
        return tokens[actual_pos]

    @staticmethod
    def _affix_length(attribute):
        """
        Return the length encoded in a ``prefixN`` / ``suffixN`` attribute.

        A bare ``prefix`` / ``suffix`` means 2. Returns ``None`` when the
        text after the six-character keyword is not an integer.
        """
        tail = attribute[6:]
        if not tail:
            return 2
        try:
            return int(tail)
        except ValueError:
            return None

    def _apply_attribute(self, value, attribute):
        """
        Transform a token value according to a template attribute.

        Boundary markers are returned untouched. Boolean-style attributes
        are rendered as the strings ``'True'`` / ``'False'``. Any attribute
        that is not recognised leaves the value unchanged.
        """
        if value in self._BOUNDARY_MARKERS or attribute is None:
            return value
        if attribute == 'lower':
            return value.lower()
        if attribute == 'upper':
            return value.upper()
        if attribute == 'istitle':
            return str(value.istitle())
        if attribute == 'isupper':
            return str(value.isupper())
        if attribute == 'islower':
            return str(value.islower())
        if attribute == 'isdigit':
            return str(value.isdigit())
        if attribute == 'isalpha':
            return str(value.isalpha())
        if attribute == 'is_in_dict':
            return str(value in self.dictionary)
        if attribute.startswith(('prefix', 'suffix')):
            n = self._affix_length(attribute)
            # A non-numeric tail (e.g. "prefixes") is treated like any other
            # unknown attribute instead of raising ValueError from int().
            if n is None or len(value) < n:
                return value
            return value[:n] if attribute.startswith('prefix') else value[-n:]
        return value

    def extract_features(self, tokens, position):
        """
        Build the feature dict for the token at ``position``.

        Args:
            tokens: the sentence as a sequence of token strings.
            position: index of the token to featurize.

        Returns:
            Dict mapping each usable template string to its string value.
            Single-index templates go through ``_apply_attribute``.
            Multi-index templates join the raw token values with ``'|'``;
            with the ``is_in_dict`` attribute they are instead joined with a
            space and looked up in the dictionary. Any other attribute on a
            multi-index template is ignored.
        """
        features = {}
        for template in self.feature_templates:
            parsed = self._parsed_templates.get(template)
            if parsed is None:
                parsed = self._parse_template(template)
                self._parsed_templates[template] = parsed
            indices, attribute, template_str = parsed
            if indices is None:
                continue
            if len(indices) == 1:
                value = self._get_token_value(tokens, position, indices[0])
                features[template_str] = self._apply_attribute(value, attribute)
                continue
            values = [self._get_token_value(tokens, position, idx) for idx in indices]
            if attribute == 'is_in_dict':
                features[template_str] = str(' '.join(values) in self.dictionary)
            else:
                features[template_str] = '|'.join(values)
        return features
85
+
86
+
87
class EndpointHandler:
    """
    Inference entry point for the Vietnamese POS tagger.

    Loads the CRF model file ``pos_tagger.crfsuite`` from ``path`` and tags
    whitespace-separated tokens using the feature templates listed in
    ``__init__``.
    """

    def __init__(self, path: str = ""):
        """
        Args:
            path: directory containing ``pos_tagger.crfsuite``.
        """
        import os

        # Feature templates
        self.feature_templates = [
            "T[0]", "T[0].lower", "T[0].istitle", "T[0].isupper",
            "T[0].isdigit", "T[0].isalpha", "T[0].prefix2", "T[0].prefix3",
            "T[0].suffix2", "T[0].suffix3", "T[-1]", "T[-1].lower",
            "T[-1].istitle", "T[-1].isupper", "T[-2]", "T[-2].lower",
            "T[1]", "T[1].lower", "T[1].istitle", "T[1].isupper",
            "T[2]", "T[2].lower", "T[-1,0]", "T[0,1]",
            "T[0].is_in_dict", "T[-1,0].is_in_dict", "T[0,1].is_in_dict",
        ]

        # No dictionary is passed, so the featurizer uses an empty set and
        # every *.is_in_dict feature evaluates to "False".
        # NOTE(review): confirm this matches how the model was trained.
        self.featurizer = PythonCRFFeaturizer(self.feature_templates)

        # Load CRF model
        model_path = os.path.join(path, "pos_tagger.crfsuite")
        self.tagger = pycrfsuite.Tagger()
        self.tagger.open(model_path)

    def _tokenize(self, text: str) -> List[str]:
        """Simple whitespace tokenization."""
        return text.strip().split()

    def _extract_features(self, tokens: List[str]) -> List[List[str]]:
        """
        Extract features for all tokens in a sentence.

        Each token yields a list of ``"<template>=<value>"`` strings built
        from the featurizer's feature dict.
        """
        return [
            [f"{name}={value}" for name, value in
             self.featurizer.extract_features(tokens, index).items()]
            for index in range(len(tokens))
        ]

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.

        Args:
            data: Dict with "inputs" key containing text or list of texts.
                The "text" key is used as a fallback when "inputs" is absent.

        Returns:
            For a single text (a string, or a list holding exactly one
            string): a list of ``{"token": ..., "tag": ...}`` dicts.
            For a list of several texts: one such list per text, in order.
            Texts with no tokens produce an empty list. An empty list of
            texts produces an empty list.
        """
        inputs = data.get("inputs", data.get("text", ""))

        # Handle single string or list
        if isinstance(inputs, str):
            inputs = [inputs]

        results = []
        for text in inputs:
            tokens = self._tokenize(text)
            if not tokens:
                results.append([])
                continue

            tags = self.tagger.tag(self._extract_features(tokens))
            results.append(
                [{"token": token, "tag": tag} for token, tag in zip(tokens, tags)]
            )

        # An empty batch previously fell through to results[0] and raised
        # IndexError; report "nothing to tag" instead.
        if not results:
            return []

        # Kept for backward compatibility: a lone result is returned
        # un-nested, even when it arrived wrapped in a one-element list.
        return results if len(results) > 1 else results[0]
pos_tagger.crfsuite ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd377052b800765329f6fa849e1c2d7507b19dd6acd3c8d8600f9a026afab447
3
+ size 458700
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-crfsuite>=0.9.11