TymaaHammouda committed on
Commit
2ba7df1
·
1 Parent(s): 1de09fd
Files changed (3) hide show
  1. Nested/utils/data.py +49 -1
  2. app.py +55 -2
  3. requirements.txt +2 -1
Nested/utils/data.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  class Vocab:
2
  def _init_(self, counter, specials=[]) -> None:
3
  self.itos = list(counter.keys()) + specials
@@ -11,4 +13,50 @@ class Vocab:
11
  return self.stoi
12
 
13
  def _len_(self):
14
- return len(self.itos)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+
3
  class Vocab:
4
  def _init_(self, counter, specials=[]) -> None:
5
  self.itos = list(counter.keys()) + specials
 
13
  return self.stoi
14
 
15
  def _len_(self):
16
+ return len(self.itos)
17
+
18
+
19
class Token:
    """
    Container for a single token and its gold / predicted tags.
    """

    def __init__(self, text=None, pred_tag=None, gold_tag=None):
        """
        Token object to hold token attributes
        :param text: str, surface form of the token
        :param pred_tag: list of dicts, each carrying the predicted tag
                         under the "tag" key (or None when not predicted yet)
        :param gold_tag: list of str gold tags (or None when unavailable)
        """
        self.text = text
        self.gold_tag = gold_tag
        self.pred_tag = pred_tag
        # Goes through the property setter below; stored on ``_subwords``.
        self.subwords = None

    @property
    def subwords(self):
        """Return the value last assigned to ``subwords`` (None initially)."""
        return self._subwords

    @subwords.setter
    def subwords(self, value):
        self._subwords = value

    def __str__(self):
        """
        Token text representation
        :return: str, "text<TAB>gold<TAB>pred" when gold tags are present,
                 otherwise "text<TAB>pred"; multiple tags are joined by "|"
        """
        if self.pred_tag:
            pred_tags = "|".join(pred_tag["tag"] for pred_tag in self.pred_tag)
        else:
            pred_tags = ""

        # Join the gold tags only when they exist: gold_tag defaults to None
        # and joining None would raise TypeError.
        if self.gold_tag:
            gold_tags = "|".join(self.gold_tag)
            return f"{self.text}\t{gold_tags}\t{pred_tags}"
        return f"{self.text}\t{pred_tags}"
52
+
53
+
54
def text2segments(text):
    """
    Convert text to a single-segment dataset and index its tokens.

    The text is split on whitespace; every piece becomes a Token whose gold
    tag is ["O"]. A Vocab is then built from the token counts, with "UNK"
    passed as the only special entry.

    :param text: str
    :return: tuple (dataset, segment_vocab) where dataset is a list holding
             one list of Token objects
    """
    segment = [Token(text=word, gold_tag=["O"]) for word in text.split()]
    dataset = [segment]

    # Generate vocabs for the tokens
    token_counts = Counter(token.text for token in segment)
    segment_vocab = Vocab(token_counts, specials=["UNK"])
    return dataset, segment_vocab
app.py CHANGED
@@ -20,7 +20,7 @@ checkpoint_path = hf_hub_download(
20
 
21
  # Load model
22
  with open("Nested/utils/tag_vocab.pkl", "rb") as f:
23
- id2label = pickle.load(f)
24
 
25
  # model = torch.load(checkpoint_path, map_location="cpu")
26
  model = BertSeqTagger(
@@ -72,4 +72,57 @@ def load_model_from_checkpoint(model, checkpoint, strict=True):
72
 
73
  ckpt = torch.load(checkpoint_path, map_location="cpu")
74
  model = load_model_from_checkpoint(model, ckpt, strict=False)
75
- model.eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  # Load model
22
  with open("Nested/utils/tag_vocab.pkl", "rb") as f:
23
+ label_vocab = pickle.load(f)
24
 
25
  # model = torch.load(checkpoint_path, map_location="cpu")
26
  model = BertSeqTagger(
 
72
 
73
  ckpt = torch.load(checkpoint_path, map_location="cpu")
74
  model = load_model_from_checkpoint(model, ckpt, strict=False)
75
+ # model.eval()
76
+
77
def predict_ner(sentence: str, model, id2label: dict, device="cpu"):
    """
    Tag every whitespace-separated word of ``sentence`` with a label.

    The model is moved to ``device`` and switched to eval mode, the words are
    tokenized with ``model.tokenizer``, and the label predicted for the first
    sub-token of each word is reported for that word.

    :param sentence: str, raw text; split on whitespace into words
    :param model: object exposing ``to``, ``eval`` and a ``tokenizer``
                  attribute; it is called as ``model(**encoding)`` and, if
                  that raises TypeError, through ``model.transformer`` /
                  ``model.classification_head`` (with optional ``dropout``)
    :param id2label: dict mapping predicted class ids to label strings
    :param device: device passed to ``.to()`` for the model and the inputs
    :return: list of (word, label) tuples in sentence order; words cut off by
             tokenizer truncation are not included
    :raises ValueError: if the model has no ``tokenizer`` attribute
    """
    model.to(device)
    model.eval()

    tokenizer = getattr(model, "tokenizer", None)
    if tokenizer is None:
        raise ValueError("Model has no tokenizer. Use AutoTokenizer and attach it or pass it explicitly.")

    words = sentence.split()
    if not words:
        # Nothing to tag; avoid handing an empty word list to the tokenizer.
        return []

    encoding = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding=False,
    )
    # Take the word alignment from the SAME (truncated) encoding that is fed
    # to the model, so it always has one entry per predicted position.
    # Re-tokenizing without truncation could yield more positions than
    # predictions and fail with IndexError on long inputs.
    word_ids = encoding.word_ids(batch_index=0)
    inputs = {name: tensor.to(device) for name, tensor in encoding.items()}

    with torch.no_grad():
        try:
            out = model(**inputs)
        except TypeError:
            # The model's forward() does not accept the tokenizer's keyword
            # arguments; fall back to running its sub-modules directly.
            if not hasattr(model, "transformer") or not hasattr(model, "classification_head"):
                raise
            hidden = model.transformer(**inputs).last_hidden_state
            if hasattr(model, "dropout"):
                hidden = model.dropout(hidden)
            logits = model.classification_head(hidden)
        else:
            logits = out.logits if hasattr(out, "logits") else out

    # Single sentence: take the first (only) row of the batch.
    pred_ids = logits.argmax(dim=-1)[0].tolist()

    word_labels = []
    seen = set()
    for tok_i, w_i in enumerate(word_ids):
        # Skip positions without a word id and all but the first sub-token
        # of each word.
        if w_i is None or w_i in seen:
            continue
        seen.add(w_i)
        word_labels.append((words[w_i], id2label[pred_ids[tok_i]]))

    return word_labels
123
+
124
+
125
# Run one example sentence through the tagger and print the
# (word, label) pairs it returns.
sentence = "ذهب احمد الى السوق"
# Map each position in the label vocabulary's ``itos`` list to its label.
id2label = dict(enumerate(label_vocab.itos))
pairs = predict_ner(sentence, model, id2label, device="cpu")
print(pairs)
requirements.txt CHANGED
@@ -3,4 +3,5 @@ fastapi
3
  uvicorn
4
  numpy
5
  huggingface_hub
6
- transformers
 
 
3
  uvicorn
4
  numpy
5
  huggingface_hub
6
+ transformers
7
+ # collections is part of the Python standard library; it must not be listed as a pip requirement