Initial model upload: Vietnamese POS Tagger

Browse files

Files changed (5) hide show

.gitattributes +1 -0
README.md +107 -0
handler.py +151 -0
pos_tagger.crfsuite +3 -0
requirements.txt +1 -0

.gitattributes CHANGED Viewed

@@ -33,5 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.mco filter=lfs diff=lfs merge=lfs -text
 *.jar filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+pos_tagger.crfsuite filter=lfs diff=lfs merge=lfs -text
 *.mco filter=lfs diff=lfs merge=lfs -text
 *.jar filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,107 @@

+---
+language:
+- vi
+license: apache-2.0
+tags:
+- pos
+- part-of-speech
+- vietnamese
+- crf
+- nlp
+- token-classification
+datasets:
+- undertheseanlp/UDD-v0.1
+library_name: python-crfsuite
+pipeline_tag: token-classification
+---
+# Vietnamese POS Tagger (TRE-1)
+A Conditional Random Field (CRF) based Part-of-Speech tagger for Vietnamese, trained on the Universal Dependencies Dataset (UDD-v0.1).
+## Model Description
+This model is a Conditional Random Field (CRF) sequence tagger that uses handcrafted features inspired by the underthesea NLP library. On a held-out UDD-v0.1 test set it reaches roughly 94% accuracy (see Performance below).
+### Features
+- **Architecture**: CRF (python-crfsuite)
+- **Language**: Vietnamese
+- **Tagset**: Universal POS tags (UPOS)
+- **Training Data**: undertheseanlp/UDD-v0.1
+### Feature Templates
+The model uses the following feature templates:
+- Current token features: word form, lowercase, prefix/suffix (2-3 chars), character type checks
+- Context features: previous and next 1-2 tokens
+- Bigram features: adjacent token combinations
+- Dictionary features: in-vocabulary checks
+## Usage
+### Using the Inference API
+```python
+import requests
+API_URL = "https://api-inference.huggingface.co/models/undertheseanlp/tre-1"
+headers = {"Authorization": "Bearer YOUR_TOKEN"}
+def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()
+output = query({"inputs": "Tôi yêu Việt Nam"})
+print(output)
+# [{"token": "Tôi", "tag": "PRON"}, {"token": "yêu", "tag": "VERB"}, ...]
+```
+### Local Usage
+```python
+import pycrfsuite
+from handler import EndpointHandler
+handler = EndpointHandler(path="./")
+result = handler({"inputs": "Tôi yêu Việt Nam"})
+print(result)
+```
+## Training
+The model was trained using:
+- L1 regularization (c1): 1.0
+- L2 regularization (c2): 1e-3
+- Max iterations: 100
+## Performance
+Evaluated on a held-out test set from UDD-v0.1:
+- Accuracy: ~94%
+- F1 (macro): ~90%
+- F1 (weighted): ~94%
+## Limitations
+- Requires pre-tokenized input: the handler splits text on whitespace only, so every token (including punctuation) must already be separated by spaces
+- Performance may vary on out-of-domain text
+- Does not handle Vietnamese word segmentation
+## Citation
+If you use this model, please cite:
+```bibtex
+@misc{tre1-pos-tagger,
+  author = {undertheseanlp},
+  title = {Vietnamese POS Tagger TRE-1},
+  year = {2025},
+  publisher = {Hugging Face},
+  url = {https://huggingface.co/undertheseanlp/tre-1}
+}
+```
+## License
+Apache 2.0

handler.py ADDED Viewed

	@@ -0,0 +1,151 @@

+"""
+Custom handler for Vietnamese POS Tagger inference on Hugging Face.
+"""
+import re
+import pycrfsuite
+from typing import Dict, List, Any
class PythonCRFFeaturizer:
    """Turn a token sequence into per-position CRF feature dictionaries.

    Pure-Python featurizer intended to be compatible with the
    underthesea_core CRFFeaturizer API.

    A feature template is a string of the form ``T[i]``, ``T[i].attr`` or
    ``T[i,j,...]`` / ``T[i,j,...].attr`` where each index is an offset
    relative to the current position.  Templates that do not match this
    shape are silently skipped by :meth:`extract_features`.
    """

    # Compiled once for the class instead of on every parse call.
    _TEMPLATE_PATTERN = re.compile(r'T\[([^\]]+)\](?:\.(\w+))?')
    # Placeholders returned for offsets that fall outside the sentence.
    _BOUNDARY_MARKERS = ('__BOS__', '__EOS__')
    # ``str`` methods whose result is used as the feature value directly.
    _TRANSFORMS = frozenset(('lower', 'upper'))
    # ``str`` predicates whose boolean result is stringified.
    _PREDICATES = frozenset(
        ('istitle', 'isupper', 'islower', 'isdigit', 'isalpha')
    )

    def __init__(self, feature_templates, dictionary=None):
        """Store the templates and the optional lookup dictionary.

        Args:
            feature_templates: Iterable of template strings.
            dictionary: Container supporting ``in``; used by the
                ``is_in_dict`` attribute.  Defaults to an empty set.
        """
        self.feature_templates = feature_templates
        self.dictionary = dictionary or set()
        # template string -> parsed triple; filled lazily so templates added
        # to ``feature_templates`` after construction are still honoured.
        self._parsed_templates = {}

    def _parse_template(self, template):
        """Split a template into ``(indices, attribute, template)``.

        Returns ``(None, None, None)`` when the template does not start with
        ``T[...]``.  A non-integer index raises ``ValueError``.
        """
        match = self._TEMPLATE_PATTERN.match(template)
        if not match:
            return None, None, None
        indices = [int(part.strip()) for part in match.group(1).split(',')]
        return indices, match.group(2), template

    def _get_token_value(self, tokens, position, index):
        """Return the token at ``position + index`` or a boundary marker."""
        actual_pos = position + index
        if actual_pos < 0:
            return '__BOS__'
        if actual_pos >= len(tokens):
            return '__EOS__'
        return tokens[actual_pos]

    def _apply_attribute(self, value, attribute):
        """Apply a template attribute to a single token value.

        Boundary markers and unknown attributes pass through unchanged.
        ``prefixN`` / ``suffixN`` take the first / last ``N`` characters
        (``N`` defaults to 2 when omitted).
        """
        if value in self._BOUNDARY_MARKERS or attribute is None:
            return value
        if attribute in self._TRANSFORMS:
            return getattr(value, attribute)()
        if attribute in self._PREDICATES:
            return str(getattr(value, attribute)())
        if attribute == 'is_in_dict':
            return str(value in self.dictionary)
        if attribute.startswith(('prefix', 'suffix')):
            # Both keywords are six characters long; the rest is the length.
            n = int(attribute[6:]) if len(attribute) > 6 else 2
            # Slicing already yields the whole string when it is shorter
            # than n, so no explicit length guard is needed.
            return value[:n] if attribute.startswith('prefix') else value[-n:]
        return value

    def extract_features(self, tokens, position):
        """Build the feature dict for the token at ``position``.

        Args:
            tokens: Sequence of token strings for one sentence.
            position: Index of the token being described.

        Returns:
            Dict mapping each valid template string to its string value.
            Multi-index templates join the raw tokens with ``'|'``, except
            for ``is_in_dict`` which looks up the space-joined tokens in the
            dictionary.
        """
        features = {}
        for template in self.feature_templates:
            # Parse each distinct template once rather than once per token.
            parsed = self._parsed_templates.get(template)
            if parsed is None:
                parsed = self._parse_template(template)
                self._parsed_templates[template] = parsed
            indices, attribute, key = parsed
            if indices is None:
                continue

            if len(indices) == 1:
                raw = self._get_token_value(tokens, position, indices[0])
                features[key] = self._apply_attribute(raw, attribute)
                continue

            window = [
                self._get_token_value(tokens, position, offset)
                for offset in indices
            ]
            if attribute == 'is_in_dict':
                features[key] = str(' '.join(window) in self.dictionary)
            else:
                # Any other attribute is ignored for multi-token templates.
                features[key] = '|'.join(window)
        return features
class EndpointHandler:
    """Inference entry point: whitespace-tokenize text and tag it with a CRF.

    The handler loads ``pos_tagger.crfsuite`` from the given directory and
    tags each input sentence using the feature templates defined below.
    """

    def __init__(self, path: str = ""):
        """Build the featurizer and open the CRF model found under ``path``.

        Args:
            path: Directory containing ``pos_tagger.crfsuite``.
        """
        import os

        # Feature templates; presumably these must match the ones used when
        # the model file was trained — verify before editing.
        self.feature_templates = [
            "T[0]", "T[0].lower", "T[0].istitle", "T[0].isupper",
            "T[0].isdigit", "T[0].isalpha", "T[0].prefix2", "T[0].prefix3",
            "T[0].suffix2", "T[0].suffix3", "T[-1]", "T[-1].lower",
            "T[-1].istitle", "T[-1].isupper", "T[-2]", "T[-2].lower",
            "T[1]", "T[1].lower", "T[1].istitle", "T[1].isupper",
            "T[2]", "T[2].lower", "T[-1,0]", "T[0,1]",
            "T[0].is_in_dict", "T[-1,0].is_in_dict", "T[0,1].is_in_dict",
        ]
        # NOTE(review): no dictionary is passed here, so the *.is_in_dict
        # features can never be "True" at inference time. Confirm this
        # matches how the model was trained.
        self.featurizer = PythonCRFFeaturizer(self.feature_templates)

        # Load CRF model
        model_path = os.path.join(path, "pos_tagger.crfsuite")
        self.tagger = pycrfsuite.Tagger()
        self.tagger.open(model_path)

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` on runs of whitespace; no other segmentation."""
        return text.strip().split()

    def _extract_features(self, tokens: List[str]) -> List[List[str]]:
        """Return one ``"template=value"`` feature list per token."""
        extract = self.featurizer.extract_features
        return [
            [f"{name}={value}" for name, value in extract(tokens, index).items()]
            for index in range(len(tokens))
        ]

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Handle an inference request.

        Args:
            data: Request payload.  The text is read from ``"inputs"``
                (falling back to ``"text"``) and may be a single string or
                a list of strings.

        Returns:
            For a single string, or a list holding exactly one string: a
            list of ``{"token": ..., "tag": ...}`` dicts.  For a list of
            several strings: a list of such lists, one per input.  Blank
            text yields an empty list, and an empty input list yields
            ``[]``.
        """
        inputs = data.get("inputs", data.get("text", ""))
        # Handle single string or list
        if isinstance(inputs, str):
            inputs = [inputs]

        results = []
        for text in inputs:
            tokens = self._tokenize(text)
            if not tokens:
                # Nothing to tag; skip the tagger call entirely.
                results.append([])
                continue
            tags = self.tagger.tag(self._extract_features(tokens))
            results.append(
                [{"token": token, "tag": tag} for token, tag in zip(tokens, tags)]
            )

        if not results:
            # An empty input list previously crashed on results[0].
            return []
        # A lone result is unwrapped so single-text callers get a flat list.
        return results if len(results) > 1 else results[0]

pos_tagger.crfsuite ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd377052b800765329f6fa849e1c2d7507b19dd6acd3c8d8600f9a026afab447
+size 458700

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ python-crfsuite>=0.9.11