auhide
/

bert-bg-ner

@@ -10,4 +10,110 @@ metrics:
 ---
 # BERT Bulgarian Named Entity Recognition
-Fine-tuned on a Bulgarian subset of [wikiann](https://huggingface.co/datasets/wikiann).

 ---
 # BERT Bulgarian Named Entity Recognition
+Fine-tuned on a Bulgarian subset of [wikiann](https://huggingface.co/datasets/wikiann).
+## Usage
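+First, import `torch`, `List` (from `typing`), and `AutoTokenizer` and `AutoModelForTokenClassification` (from `transformers`). Then define the helper functions below; they are needed because the model uses a subword tokenizer, so its tokens have to be merged back into whole words: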
+```python
+def predict(
+    text: str,
+    model: torch.nn.Module,
+    tokenizer: AutoTokenizer,
+    labels_tags={
+        0: "O",
+        1: "B-PER", 2: "I-PER",
+        3: "B-ORG", 4: "I-ORG",
+        5: "B-LOC", 6: "I-LOC"
+    }):
+    tokens_data = tokenizer(text)
+    tokens = tokenizer.convert_ids_to_tokens(tokens_data["input_ids"])
+    words = subwords_to_words(tokens)
+    input_ids = torch.LongTensor(tokens_data["input_ids"]).unsqueeze(0)
+    attention_mask = torch.LongTensor(tokens_data["attention_mask"]).unsqueeze(0)
+    out = model(input_ids, attention_mask=attention_mask).logits
+    out = out.argmax(-1).squeeze(0).tolist()
+    prediction = [labels_tags[idx] if idx in labels_tags else idx for idx in out]
+    return merge_words_and_predictions(words, prediction)
+def subwords_to_words(tokens: List[str]) -> List[str]:
+    out_tokens = []
+    curr_token = ""
+    tags = []
+    for token in tokens:
+        if token == "[SEP]":
+            curr_token = curr_token.replace("▁", "")
+            out_tokens.append(curr_token)
+            out_tokens.append("[SEP]")
+            break
+        if "▁" in token and curr_token == "":
+            curr_token += token
+        elif "▁" in token and curr_token != "":
+            curr_token = curr_token.replace("▁", "")
+            out_tokens.append(curr_token)
+            curr_token = ""
+            curr_token += token
+        elif "▁" not in token:
+            curr_token += token
+    return out_tokens
+def merge_words_and_predictions(words, entities):
+    result = []
+    curr_word = []
+    for i, (word, entity) in enumerate(zip(words[1:], entities[1:])):
+        if "B-" in entity:
+            if curr_word:
+                curr_word = " ".join(curr_word)
+                result.append({
+                    "word": curr_word,
+                    "entity": entities[i][2:]
+                })
+                curr_word = [word]
+            else:
+                curr_word.append(word)
+        if "I-" in entity:
+            curr_word.append(word)
+        if "O" == entity:
+            if curr_word:
+                curr_word = " ".join(curr_word)
+                result.append({
+                    "word": curr_word,
+                    "entity": entities[i][2:]
+                })
+            curr_word = []
+    return result
+```
+Then, you can initialize the `AutoTokenizer` and `AutoModelForTokenClassification` objects:
+```python
+# Identifier of the fine-tuned checkpoint to load.
+MODEL_ID = "auhide/bert-bg-ner"
+# Load the tokenizer and the token-classification model for that identifier.
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
+```
+Finally, call the `predict()` function defined above like this:
+```python
+# Sample sentence: "Baruch Spinoza was born in Amsterdam".
+text = "Барух Спиноза е роден в Амстердам"
+print(f"Input: {text}")
+# Run the full pipeline and print the recognised entities.
+print("NERs:", predict(text, model=model, tokenizer=tokenizer))
+```
+```sh
+Input: Барух Спиноза е роден в Амстердам
+NERs: [{'word': 'Барух Спиноза', 'entity': 'PER'}, {'word': 'Амстердам', 'entity': 'LOC'}]
+```