TymaaHammouda committed
Commit fac8a97 · 1 Parent(s): e028bfd
Files changed (1):
  1. app.py +20 -22
app.py CHANGED
@@ -3,7 +3,7 @@ import torch
 import pickle
 from huggingface_hub import hf_hub_download
 from Nested.nn.BertSeqTagger import BertSeqTagger
-
+from transformers import AutoTokenizer
 app = FastAPI()
 print("Version 2...")
 
@@ -13,6 +13,9 @@ print("Version 2...")
 #     filename="tag_vocab.pkl"
 # )
 
+pretrained_path = "aubmindlab/bert-base-arabertv2"  # change if different in your training
+tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
+
 checkpoint_path = hf_hub_download(
     repo_id="SinaLab/Nested",
     filename="checkpoints/checkpoint_2.pt"
@@ -74,22 +77,18 @@ ckpt = torch.load(checkpoint_path, map_location="cpu")
 model = load_model_from_checkpoint(model, ckpt, strict=False)
 # model.eval()
 
-def predict_ner(sentence: str, model, id2label: dict, device="cpu"):
+def predict_ner(sentence: str, model, tokenizer, id2label: dict, device="cpu", max_length=128):
     model.to(device)
     model.eval()
 
     words = sentence.split()
 
-    tokenizer = getattr(model, "tokenizer", None)
-    if tokenizer is None:
-        raise ValueError("Model has no tokenizer. Use AutoTokenizer and attach it or pass it explicitly.")
-
     enc = tokenizer(
         words,
         is_split_into_words=True,
         return_tensors="pt",
         truncation=True,
-        padding=False
+        max_length=max_length
     )
     enc = {k: v.to(device) for k, v in enc.items()}
 
@@ -98,28 +97,26 @@ def predict_ner(sentence: str, model, id2label: dict, device="cpu"):
         out = model(**enc)
         logits = out.logits if hasattr(out, "logits") else out
     except TypeError:
-        if not hasattr(model, "transformer") or not hasattr(model, "classification_head"):
-            raise
+        # fallback for the custom BertSeqTagger-like model
         h = model.transformer(**enc).last_hidden_state
-        h = model.dropout(h) if hasattr(model, "dropout") else h
+        if hasattr(model, "dropout"):
+            h = model.dropout(h)
         logits = model.classification_head(h)
 
-    pred_ids = logits.argmax(dim=-1).squeeze(0).tolist()
+    pred_ids = logits.argmax(dim=-1)[0].tolist()
 
-    word_ids = enc.get("input_ids").new_tensor([0])  # placeholder to keep structure
-    word_ids = tokenizer(words, is_split_into_words=True).word_ids()
+    word_ids = tokenizer(words, is_split_into_words=True, truncation=True, max_length=max_length).word_ids()
 
-    word_labels = []
-    used = set()
+    # first subtoken per word -> label
+    results = []
+    seen = set()
     for tok_i, w_i in enumerate(word_ids):
-        if w_i is None:
-            continue
-        if w_i in used:
+        if w_i is None or w_i in seen:
             continue
-        used.add(w_i)
-        word_labels.append((words[w_i], id2label[pred_ids[tok_i]]))
+        seen.add(w_i)
+        results.append((words[w_i], id2label[pred_ids[tok_i]]))
 
-    return word_labels
+    return results
 
 def find_label_vocab(vocabs):
     for i, v in enumerate(vocabs):
@@ -137,5 +134,6 @@ id2label = {i: s for i, s in enumerate(label_vocab.itos)}
 
 sentence = "ذهب احمد الى السوق"
 # id2label = {i: s for i, s in enumerate(label_vocab.itos)}
-pairs = predict_ner(sentence, model, label_vocab, device="cpu")
+# pairs = predict_ner(sentence, model, label_vocab, device="cpu")
+pairs = predict_ner(sentence, model, tokenizer, id2label, device="cpu")
 print(pairs)
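
The core of this change is the word-to-subtoken alignment: BERT tokenizers split words into subtokens, and the loop keeps only the prediction for each word's first subtoken. Below is a minimal standalone sketch of that alignment, assuming the aubmindlab/bert-base-arabertv2 tokenizer named in the diff; the dummy pred_ids and id2label stand in for real model output and the label vocabulary.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

words = "ذهب احمد الى السوق".split()
enc = tokenizer(words, is_split_into_words=True, truncation=True, max_length=128)
word_ids = enc.word_ids()  # None for [CLS]/[SEP], else the index of the source word

pred_ids = [0] * len(word_ids)  # dummy: pretend every token got label id 0
id2label = {0: "O"}             # dummy vocab; app.py builds this from label_vocab.itos

# first subtoken per word -> label, mirroring the loop in predict_ner
results, seen = [], set()
for tok_i, w_i in enumerate(word_ids):
    if w_i is None or w_i in seen:  # skip special tokens and later subtokens
        continue
    seen.add(w_i)
    results.append((words[w_i], id2label[pred_ids[tok_i]]))

print(results)  # one (word, label) pair per whitespace-separated word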
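
Note that the commit instantiates app = FastAPI() but none of the shown hunks register a route. One way predict_ner could be exposed is sketched below; the /predict path, the NERRequest schema, and the response shape are assumptions for illustration, not part of this commit.

from pydantic import BaseModel

class NERRequest(BaseModel):
    sentence: str

# `app`, `model`, `tokenizer`, `id2label`, and `predict_ner` are the
# objects defined in app.py above; only this route is hypothetical.
@app.post("/predict")
def predict(req: NERRequest):
    pairs = predict_ner(req.sentence, model, tokenizer, id2label, device="cpu")
    return {"entities": [{"word": w, "label": lbl} for w, lbl in pairs]}

Served with uvicorn (e.g. uvicorn app:app), a POST to /predict would then return the (word, label) pairs as JSON.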