Token Classification · GLiNER · PyTorch · multilingual
alfonsovelp committed · verified
Commit 45bd82d · 1 Parent(s): 6477b30

Update handler.py

Files changed (1)
  1. handler.py +48 -120
handler.py CHANGED
@@ -1,120 +1,48 @@
- import re
-
-
- class TokenSplitterBase():
-     def __init__(self):
-         pass
-
-     def __call__(self, text) -> (str, int, int):
-         pass
-
-
- class WhitespaceTokenSplitter(TokenSplitterBase):
-     def __init__(self):
-         self.whitespace_pattern = re.compile(r'\w+(?:[-_]\w+)*|\S')
-
-     def __call__(self, text):
-         for match in self.whitespace_pattern.finditer(text):
-             yield match.group(), match.start(), match.end()
-
-
- class SpaCyTokenSplitter(TokenSplitterBase):
-     def __init__(self, lang=None):
-         try:
-             import spacy  # noqa
-         except ModuleNotFoundError as error:
-             raise error.__class__(
-                 "Please install spacy with: `pip install spacy`"
-             )
-         if lang is None:
-             lang = 'en'  # Default to English if no language is specified
-         self.nlp = spacy.blank(lang)
-
-     def __call__(self, text):
-         doc = self.nlp(text)
-         for token in doc:
-             yield token.text, token.idx, token.idx + len(token.text)
-
-
- class MecabKoTokenSplitter(TokenSplitterBase):
-     def __init__(self):
-         try:
-             import mecab  # noqa
-         except ModuleNotFoundError as error:
-             raise error.__class__(
-                 "Please install python-mecab-ko with: `pip install python-mecab-ko`"
-             )
-         self.tagger = mecab.MeCab()
-
-     def __call__(self, text):
-         tokens = self.tagger.morphs(text)
-
-         last_idx = 0
-         for morph in tokens:
-             start_idx = text.find(morph, last_idx)
-             end_idx = start_idx + len(morph)
-             last_idx = end_idx
-             yield morph, start_idx, end_idx
-
- class JiebaTokenSplitter(TokenSplitterBase):
-     def __init__(self):
-         try:
-             import jieba  # noqa
-         except ModuleNotFoundError as error:
-             raise error.__class__(
-                 "Please install jieba with: `pip install jieba`"
-             )
-         self.tagger = jieba
-
-     def __call__(self, text):
-         tokens = self.tagger.cut(text)
-         last_idx = 0
-         for token in tokens:
-             start_idx = text.find(token, last_idx)
-             end_idx = start_idx + len(token)
-             last_idx = end_idx
-             yield token, start_idx, end_idx
-
- class HanLPTokenSplitter(TokenSplitterBase):
-     def __init__(self, model_name="FINE_ELECTRA_SMALL_ZH"):
-         try:
-             import hanlp  # noqa
-             import hanlp.pretrained
-         except ModuleNotFoundError as error:
-             raise error.__class__(
-                 "Please install hanlp with: `pip install hanlp`"
-             )
-
-         models = hanlp.pretrained.tok.ALL
-         if model_name not in models:
-             raise ValueError(f"HanLP: {model_name} is not available, choose between {models.keys()}")
-         url = models[model_name]
-         self.tagger = hanlp.load(url)
-
-     def __call__(self, text):
-         tokens = self.tagger(text)
-         last_idx = 0
-         for token in tokens:
-             start_idx = text.find(token, last_idx)
-             end_idx = start_idx + len(token)
-             last_idx = end_idx
-             yield token, start_idx, end_idx
-
- class WordsSplitter(TokenSplitterBase):
-     def __init__(self, splitter_type='whitespace'):
-         if splitter_type == 'whitespace':
-             self.splitter = WhitespaceTokenSplitter()
-         elif splitter_type == 'spacy':
-             self.splitter = SpaCyTokenSplitter()
-         elif splitter_type == 'mecab':
-             self.splitter = MecabKoTokenSplitter()
-         elif splitter_type == 'jieba':
-             self.splitter = JiebaTokenSplitter()
-         elif splitter_type == 'hanlp':
-             self.splitter = HanLPTokenSplitter()
-         else:
-             raise ValueError(f"{splitter_type} is not implemented, choose between 'whitespace', 'spacy', 'jieba', 'hanlp' and 'mecab'")
-
-     def __call__(self, text):
-         for token in self.splitter(text):
-             yield token
 
+ from transformers import AutoTokenizer
+ from gliner import GLiNER
+ from huggingface_inference_toolkit.base import BaseHandler
+
+ class EndpointHandler(BaseHandler):
+     def __init__(self, path=""):
+         self.model = GLiNER.from_pretrained(path)
+         self.tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
+         self.initialized = True
+
+     def __call__(self, data):
+         """
+         Args:
+             data: Dictionary with:
+                 - text (str): Input text
+                 - labels (str): Comma-separated labels
+                 - threshold (float, optional): Confidence threshold
+                 - nested_ner (bool, optional): Enable nested NER
+         Returns:
+             Dictionary with predicted entities
+         """
+         # Get inputs
+         text = data.pop("inputs", data.get("text", ""))
+         labels = [label.strip() for label in data.get("labels", "").split(",")]
+         threshold = float(data.get("threshold", 0.3))
+         nested_ner = bool(data.get("nested_ner", True))
+
+         # Run prediction
+         entities = self.model.predict_entities(
+             text,
+             labels,
+             flat_ner=not nested_ner,
+             threshold=threshold
+         )
+
+         # Format output
+         return {
+             "entities": [
+                 {
+                     "entity": entity["label"],
+                     "word": entity["text"],
+                     "start": entity["start"],
+                     "end": entity["end"],
+                     "score": entity["score"]
+                 }
+                 for entity in entities
+             ]
+         }
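
For a quick sanity check of the updated handler, a local smoke test along these lines should exercise it end to end. This is a sketch rather than part of the commit: the model path, example text, and labels are placeholder assumptions, and it assumes the script runs from the repository root where handler.py and the GLiNER weights live.

# Hypothetical smoke test for the new EndpointHandler (not part of the commit).
from handler import EndpointHandler

# Placeholder path: the directory containing this repository's GLiNER weights.
handler = EndpointHandler(path=".")

payload = {
    "inputs": "Marie Curie won the Nobel Prize in Physics in 1903.",
    "labels": "person, award, date",  # comma-separated string, as documented above
    "threshold": 0.3,
    "nested_ner": False,
}

result = handler(payload)
# Each predicted entity carries the label, surface form, character offsets, and score.
for entity in result["entities"]:
    print(entity["entity"], entity["word"], entity["start"], entity["end"], entity["score"])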