Commit 2ba7df1
Parent(s): 1de09fd
Updates

Files changed:
- Nested/utils/data.py +49 -1
- app.py +55 -2
- requirements.txt +2 -1
Nested/utils/data.py
CHANGED

@@ -1,3 +1,5 @@
+from collections import Counter
+
 class Vocab:
     def __init__(self, counter, specials=[]) -> None:
         self.itos = list(counter.keys()) + specials
@@ -11,4 +13,50 @@ class Vocab:
         return self.stoi
 
     def __len__(self):
-        return len(self.itos)
+        return len(self.itos)
+
+
+class Token:
+    def __init__(self, text=None, pred_tag=None, gold_tag=None):
+        """
+        Token object to hold token attributes
+        :param text: str
+        :param pred_tag: str
+        :param gold_tag: str
+        """
+        self.text = text
+        self.gold_tag = gold_tag
+        self.pred_tag = pred_tag
+        self.subwords = None
+    @property
+    def subwords(self):
+        return self._subwords
+    @subwords.setter
+    def subwords(self, value):
+        self._subwords = value
+    def __str__(self):
+        """
+        Token text representation
+        :return: str
+        """
+        gold_tags = "|".join(self.gold_tag) if self.gold_tag else ""
+        if self.pred_tag:
+            pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
+        else:
+            pred_tags = ""
+        if self.gold_tag:
+            r = f"{self.text}\t{gold_tags}\t{pred_tags}"
+        else:
+            r = f"{self.text}\t{pred_tags}"
+        return r
+
+
+def text2segments(text):
+    """
+    Convert text to a dataset and index the tokens
+    """
+    dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
+    tokens = [token.text for segment in dataset for token in segment]
+    # Generate vocabs for the tokens
+    segment_vocab = Vocab(Counter(tokens), specials=["UNK"])
+    return dataset, segment_vocab
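For orientation, here is a minimal sketch of how the new helpers compose (assuming Nested/utils is importable as a package; the sentence is the same demo input app.py uses):

    from Nested.utils.data import text2segments

    # One segment of Token objects, each defaulting to the "O" gold tag,
    # plus a Vocab built over the distinct tokens with an "UNK" special.
    dataset, segment_vocab = text2segments("ذهب احمد الى السوق")  # "Ahmed went to the market"

    for token in dataset[0]:
        print(token)              # Token.__str__ -> "<text>\tO\t"

    print(len(segment_vocab))     # Vocab.__len__ -> distinct tokens + 1 for "UNK"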
app.py
CHANGED

@@ -20,7 +20,7 @@ checkpoint_path = hf_hub_download(
 
 # Load model
 with open("Nested/utils/tag_vocab.pkl", "rb") as f:
-
+    label_vocab = pickle.load(f)
 
 # model = torch.load(checkpoint_path, map_location="cpu")
 model = BertSeqTagger(

@@ -72,4 +72,59 @@ def load_model_from_checkpoint(model, checkpoint, strict=True):
 
 ckpt = torch.load(checkpoint_path, map_location="cpu")
 model = load_model_from_checkpoint(model, ckpt, strict=False)
-model.eval()
+# model.eval()
+
+def predict_ner(sentence: str, model, id2label: dict, device="cpu"):
+    model.to(device)
+    model.eval()
+
+    words = sentence.split()
+
+    tokenizer = getattr(model, "tokenizer", None)
+    if tokenizer is None:
+        raise ValueError("Model has no tokenizer. Use AutoTokenizer and attach it or pass it explicitly.")
+
+    enc = tokenizer(
+        words,
+        is_split_into_words=True,
+        return_tensors="pt",
+        truncation=True,
+        padding=False,
+    )
+    # Capture the word ids while enc is still a BatchEncoding: they map each
+    # sub-token back to its source word and respect the truncation above.
+    word_ids = enc.word_ids()
+    enc = {k: v.to(device) for k, v in enc.items()}
+
+    with torch.no_grad():
+        try:
+            out = model(**enc)
+            logits = out.logits if hasattr(out, "logits") else out
+        except TypeError:
+            if not hasattr(model, "transformer") or not hasattr(model, "classification_head"):
+                raise
+            h = model.transformer(**enc).last_hidden_state
+            h = model.dropout(h) if hasattr(model, "dropout") else h
+            logits = model.classification_head(h)
+
+    pred_ids = logits.argmax(dim=-1).squeeze(0).tolist()
+
+    # Keep the prediction of the first sub-token of each word.
+    word_labels = []
+    used = set()
+    for tok_i, w_i in enumerate(word_ids):
+        if w_i is None:
+            continue
+        if w_i in used:
+            continue
+        used.add(w_i)
+        word_labels.append((words[w_i], id2label[pred_ids[tok_i]]))
+
+    return word_labels
+
+
+sentence = "ذهب احمد الى السوق"  # "Ahmed went to the market"
+id2label = {i: s for i, s in enumerate(label_vocab.itos)}
+pairs = predict_ner(sentence, model, id2label, device="cpu")
+print(pairs)
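predict_ner assumes the tokenizer hangs off the model, and that it is a fast tokenizer, since word_ids() needs one. A minimal sketch of wiring that up; the checkpoint name below is an assumption for illustration, not necessarily what BertSeqTagger was built from:

    from transformers import AutoTokenizer

    # predict_ner does getattr(model, "tokenizer", None), so attach one explicitly.
    # "aubmindlab/bert-base-arabertv2" is a stand-in Arabic BERT checkpoint.
    model.tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

    pairs = predict_ner("ذهب احمد الى السوق", model, id2label, device="cpu")
    print(pairs)  # [(word, label), ...] with labels drawn from label_vocab.itos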
requirements.txt
CHANGED

@@ -3,4 +3,5 @@ fastapi
 uvicorn
 numpy
 huggingface_hub
-transformers
+transformers
+collections
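One caveat in this last change: collections is part of the Python standard library, not a PyPI distribution, so pip will fail to resolve that line when the Space builds, and it can simply be dropped. The Counter import added to Nested/utils/data.py needs no third-party dependency:

    # No install required: Counter ships with CPython.
    from collections import Counter

    print(Counter("ababc"))  # Counter({'a': 2, 'b': 2, 'c': 1})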
|