auhide committed on
Commit adf689b · 1 Parent(s): 811dc07

Update README.md

Files changed (1): README.md (+107 -1)
README.md CHANGED
---

# BERT Bulgarian Named Entity Recognition
Fine-tuned on a Bulgarian subset of [wikiann](https://huggingface.co/datasets/wikiann).

## Usage
First, you'll need to define the helper functions below, since the model uses a subword (SentencePiece-style) tokenizer whose pieces have to be merged back into words:
```python
from typing import List

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification


def predict(
    text: str,
    model: torch.nn.Module,
    tokenizer: AutoTokenizer,
    labels_tags={
        0: "O",
        1: "B-PER", 2: "I-PER",
        3: "B-ORG", 4: "I-ORG",
        5: "B-LOC", 6: "I-LOC"
    }
):
    # Tokenize the text and map the subword IDs back to token strings.
    tokens_data = tokenizer(text)
    tokens = tokenizer.convert_ids_to_tokens(tokens_data["input_ids"])
    words = subwords_to_words(tokens)

    input_ids = torch.LongTensor(tokens_data["input_ids"]).unsqueeze(0)
    attention_mask = torch.LongTensor(tokens_data["attention_mask"]).unsqueeze(0)

    # Run the model and pick the highest-scoring label for each subword.
    with torch.no_grad():
        out = model(input_ids, attention_mask=attention_mask).logits
    out = out.argmax(-1).squeeze(0).tolist()

    prediction = [labels_tags[idx] if idx in labels_tags else idx for idx in out]

    return merge_words_and_predictions(words, prediction)


def subwords_to_words(tokens: List[str]) -> List[str]:
    """Merge SentencePiece subwords (marked with a leading '▁') into words."""
    out_tokens = []
    curr_token = ""

    for token in tokens:
        if token == "[SEP]":
            # Flush the last word and stop at the end-of-sequence token.
            out_tokens.append(curr_token.replace("▁", ""))
            out_tokens.append("[SEP]")
            break

        if "▁" in token and curr_token == "":
            # First piece of a new word.
            curr_token += token
        elif "▁" in token and curr_token != "":
            # A new word starts, so flush the previous one.
            out_tokens.append(curr_token.replace("▁", ""))
            curr_token = token
        elif "▁" not in token:
            # Continuation piece of the current word.
            curr_token += token

    return out_tokens


def merge_words_and_predictions(words: List[str], entities: List[str]) -> List[dict]:
    """Group words into entities based on their B-/I-/O tags."""
    result = []
    curr_word = []

    # Skip index 0, which corresponds to the [CLS] token; because of that
    # offset, entities[i] is the tag of the *previous* word in the loop.
    for i, (word, entity) in enumerate(zip(words[1:], entities[1:])):
        if "B-" in entity:
            if curr_word:
                # A new entity starts, so flush the previous one.
                result.append({
                    "word": " ".join(curr_word),
                    "entity": entities[i][2:]
                })
            curr_word = [word]

        if "I-" in entity:
            curr_word.append(word)

        if "O" == entity:
            if curr_word:
                result.append({
                    "word": " ".join(curr_word),
                    "entity": entities[i][2:]
                })
            curr_word = []

    return result
```
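
To see what the merging does, here is a small illustrative run of `subwords_to_words`; the token sequence below is hypothetical, since the actual subword splits depend on the tokenizer's vocabulary:
```python
# Hypothetical SentencePiece-style tokens; real splits depend on the vocabulary.
tokens = ["[CLS]", "▁Бар", "ух", "▁Спин", "оза", "[SEP]"]
print(subwords_to_words(tokens))
# ['[CLS]', 'Барух', 'Спиноза', '[SEP]']
```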

Then, initialize the `AutoTokenizer` and `AutoModelForTokenClassification` objects:
```python
MODEL_ID = "auhide/bert-bg-ner"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
```
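
Since the model is only used for inference here, it's also good practice to switch it to evaluation mode (the `predict()` function above already disables gradient tracking with `torch.no_grad()`):
```python
# Put the model in evaluation mode, disabling dropout during inference.
model.eval()
```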

Finally, call the `predict()` function from above like this:
```python
# "Барух Спиноза е роден в Амстердам" means "Baruch Spinoza was born in Amsterdam".
text = "Барух Спиноза е роден в Амстердам"
print(f"Input: {text}")
print("NERs:", predict(text, model=model, tokenizer=tokenizer))
```
```sh
Input: Барух Спиноза е роден в Амстердам
NERs: [{'word': 'Барух Спиноза', 'entity': 'PER'}, {'word': 'Амстердам', 'entity': 'LOC'}]
```
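
As an alternative to the manual helpers above, the `transformers` token-classification pipeline can merge subwords for you. A minimal sketch, assuming the model's config defines the id-to-label mapping (its output format differs from `predict()`):
```python
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model=MODEL_ID,
    tokenizer=MODEL_ID,
    aggregation_strategy="simple",  # merge subword pieces into word-level entities
)
print(ner("Барух Спиноза е роден в Амстердам"))
```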