# tre-1

# File size: 7,199 Bytes

"""
Custom handler for Vietnamese POS Tagger inference on Hugging Face.

Supports two model formats:
- CRFsuite format (.crfsuite) - loaded with pycrfsuite
- underthesea-core format (.crf) - loaded with underthesea_core
"""

import os
import re
from typing import Dict, List, Any

# Try importing both taggers
try:
    import pycrfsuite
    HAS_PYCRFSUITE = True
except ImportError:
    HAS_PYCRFSUITE = False

try:
    from underthesea_core import CRFModel, CRFTagger
    HAS_UNDERTHESEA_CORE = True
except ImportError:
    try:
        from underthesea_core.underthesea_core import CRFModel, CRFTagger
        HAS_UNDERTHESEA_CORE = True
    except ImportError:
        HAS_UNDERTHESEA_CORE = False


class PythonCRFFeaturizer:
    """
    Python implementation of CRFFeaturizer compatible with underthesea_core API.

    A feature template is a string of the form ``T[i]`` or ``T[i,j,...]``,
    optionally followed by ``.attribute``. The indices are offsets relative
    to the current token position; the attribute selects a transformation
    applied to the token text (``lower``, ``istitle``, ``prefix3``, ...).
    """

    # Compiled once for the class instead of being looked up on every call.
    _TEMPLATE_RE = re.compile(r'T\[([^\]]+)\](?:\.(\w+))?')

    def __init__(self, feature_templates, dictionary=None):
        """
        Args:
            feature_templates: Iterable of template strings.
            dictionary: Optional container used for ``is_in_dict`` lookups.
                A falsy value (``None``, empty container) becomes an empty set.
        """
        self.feature_templates = feature_templates
        self.dictionary = dictionary or set()
        # Memo of template string -> parsed (indices, attribute, template).
        # Templates are invariant across tokens, so parsing them once avoids
        # a regex match and int conversion per template per token.
        self._parsed_templates = {}

    def _parse_template(self, template):
        """
        Split a template string into its offsets and attribute.

        Returns:
            ``(indices, attribute, template)`` where ``indices`` is a list of
            ints and ``attribute`` is a string or ``None``; or
            ``(None, None, None)`` when the string does not start with a
            ``T[...]`` pattern.

        Raises:
            ValueError: If a bracketed index is not an integer.
        """
        match = self._TEMPLATE_RE.match(template)
        if not match:
            return None, None, None
        indices_str = match.group(1)
        attribute = match.group(2)
        indices = [int(i.strip()) for i in indices_str.split(',')]
        return indices, attribute, template

    def _get_token_value(self, tokens, position, index):
        """Return the token at ``position + index``, or a boundary marker when out of range."""
        actual_pos = position + index
        if actual_pos < 0:
            return '__BOS__'
        if actual_pos >= len(tokens):
            return '__EOS__'
        return tokens[actual_pos]

    def _apply_attribute(self, value, attribute):
        """
        Transform a token according to ``attribute`` and return a string.

        Boundary markers are returned untouched regardless of the attribute.
        Unknown attributes leave the value unchanged.
        """
        if value in ('__BOS__', '__EOS__'):
            return value
        if attribute is None:
            return value
        if attribute == 'lower':
            return value.lower()
        if attribute == 'upper':
            return value.upper()
        if attribute == 'istitle':
            return str(value.istitle())
        if attribute == 'isupper':
            return str(value.isupper())
        if attribute == 'islower':
            return str(value.islower())
        if attribute == 'isdigit':
            return str(value.isdigit())
        if attribute == 'isalpha':
            return str(value.isalpha())
        if attribute == 'is_in_dict':
            return str(value in self.dictionary)
        if attribute.startswith('prefix'):
            # "prefix" alone defaults to length 2; "prefixN" uses N.
            n = int(attribute[6:]) if len(attribute) > 6 else 2
            # Slicing already returns the whole string when it is shorter than n.
            return value[:n]
        if attribute.startswith('suffix'):
            n = int(attribute[6:]) if len(attribute) > 6 else 2
            # value[-n:] yields the whole string both when n == 0 and when
            # the string is shorter than n, matching the explicit length check.
            return value[-n:]
        return value

    def extract_features(self, tokens, position):
        """
        Build the feature dict for the token at ``position``.

        Args:
            tokens: Sequence of token strings for one sentence.
            position: Index of the token to featurize.

        Returns:
            Dict mapping each parseable template string to its string value.
            Single-index templates apply the attribute to one token.
            Multi-index templates join the tokens with ``|``, except for
            ``is_in_dict``, which joins them with a space and tests membership
            in the dictionary. Templates that do not parse are skipped.
        """
        features = {}
        parsed_cache = self._parsed_templates
        for template in self.feature_templates:
            parsed = parsed_cache.get(template)
            if parsed is None:
                parsed = parsed_cache[template] = self._parse_template(template)
            indices, attribute, template_str = parsed
            if indices is None:
                continue
            if len(indices) == 1:
                value = self._get_token_value(tokens, position, indices[0])
                features[template_str] = self._apply_attribute(value, attribute)
                continue
            values = [self._get_token_value(tokens, position, idx) for idx in indices]
            if attribute == 'is_in_dict':
                features[template_str] = str(' '.join(values) in self.dictionary)
            else:
                features[template_str] = '|'.join(values)
        return features


class EndpointHandler:
    """
    Inference handler that tags whitespace-separated tokens with a CRF model.

    On construction it locates a model file under ``path`` and opens it with
    the matching backend (pycrfsuite for ``.crfsuite``, underthesea-core for
    ``.crf``). Calling the instance tags one text or a batch of texts.
    """

    def __init__(self, path: str = ""):
        """
        Locate and load the CRF model.

        Args:
            path: Directory that contains the model file.

        Raises:
            FileNotFoundError: If none of the candidate model files exists.
            ImportError: If the backend required by the found model is not
                installed.
        """
        # Feature templates
        self.feature_templates = [
            "T[0]", "T[0].lower", "T[0].istitle", "T[0].isupper",
            "T[0].isdigit", "T[0].isalpha", "T[0].prefix2", "T[0].prefix3",
            "T[0].suffix2", "T[0].suffix3", "T[-1]", "T[-1].lower",
            "T[-1].istitle", "T[-1].isupper", "T[-2]", "T[-2].lower",
            "T[1]", "T[1].lower", "T[1].istitle", "T[1].isupper",
            "T[2]", "T[2].lower", "T[-1,0]", "T[0,1]",
            "T[0].is_in_dict", "T[-1,0].is_in_dict", "T[0,1].is_in_dict",
        ]

        # NOTE: no dictionary is passed, so the featurizer falls back to an
        # empty set and every ``is_in_dict`` feature evaluates to "False".
        self.featurizer = PythonCRFFeaturizer(self.feature_templates)

        # Load CRF model - check multiple possible locations and formats
        # Priority: .crfsuite (pycrfsuite) > .crf (underthesea-core)
        model_candidates = [
            (os.path.join(path, "model.crfsuite"), "pycrfsuite"),
            (os.path.join(path, "pos_tagger.crfsuite"), "pycrfsuite"),
            (os.path.join(path, "model.crf"), "underthesea-core"),
        ]

        # First existing candidate wins; (None, None) when nothing is found.
        model_path, model_format = next(
            ((candidate, fmt) for candidate, fmt in model_candidates
             if os.path.exists(candidate)),
            (None, None),
        )

        if model_path is None:
            raise FileNotFoundError(
                f"No model found. Checked: {[c for c, _ in model_candidates]}"
            )

        # Load model based on format
        self.model_format = model_format
        if model_format == "pycrfsuite":
            if not HAS_PYCRFSUITE:
                raise ImportError("pycrfsuite not installed. Install with: pip install python-crfsuite")
            self.tagger = pycrfsuite.Tagger()
            self.tagger.open(model_path)
        elif model_format == "underthesea-core":
            if not HAS_UNDERTHESEA_CORE:
                raise ImportError("underthesea-core not installed")
            model = CRFModel.load(model_path)
            self.tagger = CRFTagger.from_model(model)

    def _tokenize(self, text: str) -> List[str]:
        """Simple whitespace tokenization."""
        return text.strip().split()

    def _extract_features(self, tokens: List[str]) -> List[List[str]]:
        """
        Extract features for all tokens in a sentence.

        Returns:
            One list per token, each holding ``"template=value"`` strings in
            the order the featurizer produced them.
        """
        return [
            [f"{k}={v}" for k, v in self.featurizer.extract_features(tokens, i).items()]
            for i in range(len(tokens))
        ]

    def __call__(self, data: Dict[str, Any]) -> List[Any]:
        """
        Handle inference requests.

        Args:
            data: Dict with "inputs" key (falling back to "text") containing
                a text or a list of texts.

        Returns:
            For a single string, or a list holding exactly one text: a list
            of ``{"token": ..., "tag": ...}`` dicts. For a list of two or
            more texts: a list of such lists, one per text. For an empty
            list: ``[]``. A text with no tokens yields an empty list.
        """
        inputs = data.get("inputs", data.get("text", ""))

        # Handle single string or list
        if isinstance(inputs, str):
            inputs = [inputs]

        results = []
        for text in inputs:
            tokens = self._tokenize(text)
            if not tokens:
                # Nothing to tag; skip the tagger call entirely.
                results.append([])
                continue

            tags = self.tagger.tag(self._extract_features(tokens))
            results.append(
                [{"token": token, "tag": tag} for token, tag in zip(tokens, tags)]
            )

        # A lone result is unwrapped (kept for backward compatibility with
        # existing callers). An empty batch previously hit results[0] and
        # raised IndexError; it now returns the empty list.
        return results[0] if len(results) == 1 else results