Update README.md
README.md CHANGED
@@ -15,10 +15,10 @@ Fine-tuned on a Bulgarian subset of [wikiann](https://huggingface.co/datasets/wikiann).
 ## Usage
 Import the libraries:
 ```python
-from typing import List
+from typing import List, Dict
 
 import torch
-from transformers import AutoModelForTokenClassification, AutoTokenizer
+from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
 ```
 
 Firstly, you'll have to define these methods, since we are using a subword Tokenizer:
@@ -32,7 +32,7 @@ def predict(
         1: "B-PER", 2: "I-PER",
         3: "B-ORG", 4: "I-ORG",
         5: "B-LOC", 6: "I-LOC"
-    }):
+    }) -> List[Dict[str, str]]:
     tokens_data = tokenizer(text)
     tokens = tokenizer.convert_ids_to_tokens(tokens_data["input_ids"])
     words = subwords_to_words(tokens)
@@ -75,7 +75,7 @@ def subwords_to_words(tokens: List[str]) -> List[str]:
     return out_tokens
 
 
-def merge_words_and_predictions(words, entities):
+def merge_words_and_predictions(words: List[str], entities: List[str]) -> List[Dict[str, str]]:
     result = []
     curr_word = []
 
@@ -85,7 +85,7 @@ def merge_words_and_predictions(words, entities):
             curr_word = " ".join(curr_word)
             result.append({
                 "word": curr_word,
-                "
+                "entity_group": entities[i][2:]
             })
             curr_word = [word]
         else:
@@ -99,7 +99,7 @@ def merge_words_and_predictions(words, entities):
         curr_word = " ".join(curr_word)
         result.append({
             "word": curr_word,
-            "
+            "entity_group": entities[i][2:]
        })
 
         curr_word = []
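The `entity_group` value added in this commit comes from slicing off the two-character IOB prefix: `entities[i][2:]` turns `"B-PER"` into `"PER"`. Below is a minimal, self-contained sketch of that grouping idea; the helper function and the sample words/tags are illustrative, not the README's exact code:

```python
from typing import Dict, List

def strip_bio_prefix(tag: str) -> str:
    # "B-PER" -> "PER", "I-LOC" -> "LOC": drop the two-character IOB prefix
    return tag[2:]

# Illustrative tokens and predicted IOB tags (not real model output)
words = ["Иван", "Иванов", "живее", "в", "Пловдив"]
tags = ["B-PER", "I-PER", "O", "O", "B-LOC"]

merged: List[Dict[str, str]] = []
current: List[str] = []
group = ""
for word, tag in zip(words, tags):
    if tag.startswith("B-"):
        # Flush any open entity, then start a new one
        if current:
            merged.append({"word": " ".join(current), "entity_group": group})
        current, group = [word], strip_bio_prefix(tag)
    elif tag.startswith("I-") and current:
        current.append(word)  # continuation of the current entity
    else:
        # An "O" tag closes any open entity
        if current:
            merged.append({"word": " ".join(current), "entity_group": group})
        current = []
if current:
    merged.append({"word": " ".join(current), "entity_group": group})

print(merged)
# [{'word': 'Иван Иванов', 'entity_group': 'PER'},
#  {'word': 'Пловдив', 'entity_group': 'LOC'}]
```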
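The commit also adds `pipeline` to the imports. For comparison, transformers' built-in token-classification pipeline does the same subword merging and B-/I- prefix stripping out of the box; a sketch, assuming a hypothetical checkpoint id (substitute the actual model repo):

```python
from transformers import pipeline

# Hypothetical checkpoint id -- replace with the actual model repo on the Hub
MODEL_ID = "<user>/bert-base-bg-ner"

# aggregation_strategy="simple" merges subword pieces into whole words and
# strips IOB prefixes, mirroring predict() + merge_words_and_predictions()
ner = pipeline("token-classification", model=MODEL_ID, aggregation_strategy="simple")

print(ner("Барак Обама посети София."))
# e.g. [{'entity_group': 'PER', 'word': 'Барак Обама', ...},
#       {'entity_group': 'LOC', 'word': 'София', ...}]
```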