saif
committed on
Commit
·
ba83523
1
Parent(s):
e235524
Update model by training for 25 epochs and two more datasets i.e. mit restaurant and mit movie trivia.
Browse files- README.md +131 -39
- added_tokens.json +1 -1
- config.json +4 -2
- pytorch_model.bin +2 -2
- tokenizer.json +0 -0
- tokenizer_config.json +1 -1
README.md
CHANGED
|
@@ -17,59 +17,151 @@ The FSNER model was proposed in [Example-Based Named Entity Recognition](https:/
|
|
| 17 |
## Model Training Details
|
| 18 |
-----
|
| 19 |
|
| 20 |
-
| identifier | epochs
|
| 21 |
-
| ----------
|
| 22 |
-
| [sayef/fsner-bert-base-uncased](https://huggingface.co/sayef/fsner-bert-base-uncased) |
|
| 23 |
-
|
| 24 |
|
| 25 |
## Installation and Example Usage
|
| 26 |
------
|
| 27 |
|
| 28 |
-
|
| 29 |
|
|
|
|
| 30 |
|
| 31 |
-
|
| 32 |
-
from fsner import FSNERModel, FSNERTokenizerUtils
|
| 33 |
|
| 34 |
-
model
|
| 35 |
|
| 36 |
-
|
| 37 |
|
| 38 |
-
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
'I would like to order a computer from eBay.',
|
| 43 |
-
]
|
| 44 |
|
| 45 |
-
|
| 46 |
-
# wrap entities around with [E] and [/E] in the examples
|
| 47 |
-
|
| 48 |
-
supports = [
|
| 49 |
-
[
|
| 50 |
-
'Horizontal flow wrapper [E] Pack 403 [/E] features the new retrofit-kit „paper-ON-form“',
|
| 51 |
-
'[E] Paloma Pick-and-Place-Roboter [/E] arranges the bakery products for the downstream tray-forming equipment',
|
| 52 |
-
'Finally, the new [E] Kliklok ACE [/E] carton former forms cartons and trays without the use of glue',
|
| 53 |
-
'We set up our pilot plant with the right [E] FibreForm® [/E] configuration to make prototypes for your marketing tests and package validation',
|
| 54 |
-
'The [E] CAR-T5 [/E] is a reliable, purely mechanically driven cartoning machine for versatile application fields'
|
| 55 |
-
],
|
| 56 |
-
[
|
| 57 |
-
"[E] Walmart [/E] is a leading e-commerce company",
|
| 58 |
-
"I recently ordered a book from [E] Amazon [/E]",
|
| 59 |
-
"I ordered this from [E] ShopClues [/E]",
|
| 60 |
-
"Fridge can be ordered in [E] Amazon [/E]",
|
| 61 |
-
"[E] Flipkart [/E] started it's journey from zero"
|
| 62 |
-
]
|
| 63 |
-
]
|
| 64 |
|
| 65 |
-
device = 'cpu'
|
| 66 |
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
-
|
| 71 |
|
| 72 |
-
output = tokenizer.extract_entity_from_scores(query, W_query, start_prob, end_prob, thresh=0.50)
|
| 73 |
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
```
|
|
|
|
| 17 |
## Model Training Details
|
| 18 |
-----
|
| 19 |
|
| 20 |
+
| identifier | epochs | datasets |
|
| 21 |
+
| ---------- |:------:|:-----------------------------------------------------------------------------------------------:|
|
| 22 |
+
| [sayef/fsner-bert-base-uncased](https://huggingface.co/sayef/fsner-bert-base-uncased) | 25 | ontonotes5, conll2003, wnut2017, mit_movie_trivia, mit_restaurant and fin (Alvarado et al.). |
|
|
|
|
| 23 |
|
| 24 |
## Installation and Example Usage
|
| 25 |
------
|
| 26 |
|
| 27 |
+
You can use the FSNER model in 3 ways:
|
| 28 |
|
| 29 |
+
1. Install directly from PyPI: `pip install fsner` and import the model as shown in the code example below
|
| 30 |
|
| 31 |
+
or
|
|
|
|
| 32 |
|
| 33 |
+
2. Install from source: `python setup.py install` and import the model as shown in the code example below
|
| 34 |
|
| 35 |
+
or
|
| 36 |
|
| 37 |
+
3. Clone the [repo](https://github.com/sayef/fsner), add the absolute path of the `fsner/src` directory to your PYTHONPATH, and import the model as shown in the code example below
|
| 38 |
|
| 39 |
+
```python
|
| 40 |
+
import json
|
|
|
|
|
|
|
| 41 |
|
| 42 |
+
from fsner import FSNERModel, FSNERTokenizerUtils, pretty_embed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
|
|
|
| 44 |
|
| 45 |
+
query_texts = [
|
| 46 |
+
"Does Luke's serve lunch?",
|
| 47 |
+
"Chang does not speak Taiwanese very well.",
|
| 48 |
+
"I like Berlin."
|
| 49 |
+
]
|
| 50 |
+
|
| 51 |
+
# Each list in supports holds the examples of one entity type
|
| 52 |
+
# Wrap entities around with [E] and [/E] in the examples.
|
| 53 |
+
# Each sentence should have only one pair of [E] ... [/E]
|
| 54 |
+
|
| 55 |
+
support_texts = {
|
| 56 |
+
"Restaurant": [
|
| 57 |
+
"What time does [E] Subway [/E] open for breakfast?",
|
| 58 |
+
"Is there a [E] China Garden [/E] restaurant in newark?",
|
| 59 |
+
"Does [E] Le Cirque [/E] have valet parking?",
|
| 60 |
+
"Is there a [E] McDonalds [/E] on main street?",
|
| 61 |
+
"Does [E] Mike's Diner [/E] offer huge portions and outdoor dining?"
|
| 62 |
+
],
|
| 63 |
+
"Language": [
|
| 64 |
+
"Although I understood no [E] French [/E] in those days , I was prepared to spend the whole day with Chien - chien .",
|
| 65 |
+
"like what the hell 's that called in [E] English [/E] ? I have to register to be here like since I 'm a foreigner .",
|
| 66 |
+
"So , I 'm also working on an [E] English [/E] degree because that 's my real interest .",
|
| 67 |
+
"Al - Jazeera TV station , established in November 1996 in Qatar , is an [E] Arabic - language [/E] news TV station broadcasting global news and reports nonstop around the clock .",
|
| 68 |
+
"They think it 's far better for their children to be here improving their [E] English [/E] than sitting at home in front of a TV . \"",
|
| 69 |
+
"The only solution seemed to be to have her learn [E] French [/E] .",
|
| 70 |
+
"I have to read sixty pages of [E] Russian [/E] today ."
|
| 71 |
+
]
|
| 72 |
+
}
|
| 73 |
|
| 74 |
+
device = 'cpu'
|
| 75 |
|
|
|
|
| 76 |
|
| 77 |
+
tokenizer = FSNERTokenizerUtils("checkpoints/model")
|
| 78 |
+
queries = tokenizer.tokenize(query_texts).to(device)
|
| 79 |
+
supports = tokenizer.tokenize(list(support_texts.values())).to(device)
|
| 80 |
+
|
| 81 |
+
model = FSNERModel("checkpoints/model")
|
| 82 |
+
model.to(device)
|
| 83 |
+
|
| 84 |
+
p_starts, p_ends = model.predict(queries, supports)
|
| 85 |
+
|
| 86 |
+
# One can prepare supports once and reuse multiple times with different queries
|
| 87 |
+
# ------------------------------------------------------------------------------
|
| 88 |
+
# start_token_embeddings, end_token_embeddings = model.prepare_supports(supports)
|
| 89 |
+
# p_starts, p_ends = model.predict(queries, start_token_embeddings=start_token_embeddings,
|
| 90 |
+
# end_token_embeddings=end_token_embeddings)
|
| 91 |
+
|
| 92 |
+
output = tokenizer.extract_entity_from_scores(query_texts, queries, p_starts, p_ends,
|
| 93 |
+
entity_keys=list(support_texts.keys()), thresh=0.50)
|
| 94 |
+
|
| 95 |
+
print(json.dumps(output, indent=2))
|
| 96 |
+
|
| 97 |
+
# install displacy for pretty embed
|
| 98 |
+
pretty_embed(query_texts, output, list(support_texts.keys()))
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
<!DOCTYPE html>
|
| 103 |
+
<html lang="en">
|
| 104 |
+
<head>
|
| 105 |
+
<title>displaCy</title>
|
| 106 |
+
</head>
|
| 107 |
+
<body style="font-size: 16px; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; padding: 4rem 2rem; direction: ltr">
|
| 108 |
+
<figure style="margin-bottom: 6rem">
|
| 109 |
+
<div class="entities" style="line-height: 2.5; direction: ltr">
|
| 110 |
+
|
| 111 |
+
<div class="entities" style="line-height: 2.5; direction: ltr">Does
|
| 112 |
+
<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
| 113 |
+
Luke's
|
| 114 |
+
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">Restaurant</span>
|
| 115 |
+
</mark>
|
| 116 |
+
serve lunch?</div>
|
| 117 |
+
<div class="entities" style="line-height: 2.5; direction: ltr">Chang does not speak
|
| 118 |
+
<mark class="entity" style="background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
| 119 |
+
Taiwanese
|
| 120 |
+
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">Language</span>
|
| 121 |
+
</mark>
|
| 122 |
+
very well.</div>
|
| 123 |
+
<div class="entities" style="line-height: 2.5; direction: ltr">I like Berlin.</div>
|
| 124 |
+
</div>
|
| 125 |
+
</figure>
|
| 126 |
+
</body>
|
| 127 |
+
</html>
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
## Datasets preparation
|
| 131 |
+
|
| 132 |
+
1. The dataset needs to be converted into the following format. For example, a dataset file `train.json` would look like this:
|
| 133 |
+
|
| 134 |
+
```json
|
| 135 |
+
{
|
| 136 |
+
"CARDINAL_NUMBER": [
|
| 137 |
+
"Washington , cloudy , [E] 2 [/E] to 6 degrees .",
|
| 138 |
+
"New Dehli , sunny , [E] 6 [/E] to 19 degrees .",
|
| 139 |
+
"Well this is number [E] two [/E] .",
|
| 140 |
+
"....."
|
| 141 |
+
],
|
| 142 |
+
"LANGUAGE": [
|
| 143 |
+
"They do n't have the Quicken [E] Dutch [/E] version ?",
|
| 144 |
+
"they learned a lot of [E] German [/E] .",
|
| 145 |
+
"and then [E] Dutch [/E] it 's Mifrau",
|
| 146 |
+
"...."
|
| 147 |
+
],
|
| 148 |
+
"MONEY": [
|
| 149 |
+
"Per capita personal income ranged from $ [E] 11,116 [/E] in Mississippi to $ 23,059 in Connecticut ... .",
|
| 150 |
+
"The trade surplus was [E] 582 million US dollars [/E] .",
|
| 151 |
+
"It settled with a loss of 4.95 cents at $ [E] 1.3210 [/E] a pound .",
|
| 152 |
+
"...."
|
| 153 |
+
]
|
| 154 |
+
}
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
2. Converted ontonotes5 dataset can be found here:
|
| 158 |
+
1. [train](https://gist.githubusercontent.com/sayef/46deaf7e6c6e1410b430ddc8aff9c557/raw/ea7ae2ae933bfc9c0daac1aa52a9dc093d5b36f4/ontonotes5.train.json)
|
| 159 |
+
2. [dev](https://gist.githubusercontent.com/sayef/46deaf7e6c6e1410b430ddc8aff9c557/raw/ea7ae2ae933bfc9c0daac1aa52a9dc093d5b36f4/ontonotes5.dev.json)
|
| 160 |
+
|
| 161 |
+
3. Then you can use the `examples/train.py` script to train/evaluate your FSNER model.
|
| 162 |
+
|
| 163 |
+
```bash
|
| 164 |
+
python train.py --pretrained-model bert-base-uncased --mode train --train-data train.json --val-data val.json \
|
| 165 |
+
--train-batch-size 6 --val-batch-size 6 --n-examples-per-entity 10 --neg-example-batch-ratio 1/3 --max-epochs 25 --device gpu \
|
| 166 |
+
--gpus -1 --strategy ddp
|
| 167 |
```
|
added_tokens.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"[
|
|
|
|
| 1 |
+
{"[E]": 30522, "[/E]": 30523}
|
config.json
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"_name_or_path": "
|
| 3 |
"architectures": [
|
| 4 |
"BertModel"
|
| 5 |
],
|
| 6 |
"attention_probs_dropout_prob": 0.1,
|
|
|
|
| 7 |
"gradient_checkpointing": false,
|
| 8 |
"hidden_act": "gelu",
|
| 9 |
"hidden_dropout_prob": 0.1,
|
|
@@ -17,7 +18,8 @@
|
|
| 17 |
"num_hidden_layers": 12,
|
| 18 |
"pad_token_id": 0,
|
| 19 |
"position_embedding_type": "absolute",
|
| 20 |
-
"
|
|
|
|
| 21 |
"type_vocab_size": 2,
|
| 22 |
"use_cache": true,
|
| 23 |
"vocab_size": 30524
|
|
|
|
| 1 |
{
|
| 2 |
+
"_name_or_path": "checkpoints/model4",
|
| 3 |
"architectures": [
|
| 4 |
"BertModel"
|
| 5 |
],
|
| 6 |
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
"gradient_checkpointing": false,
|
| 9 |
"hidden_act": "gelu",
|
| 10 |
"hidden_dropout_prob": 0.1,
|
|
|
|
| 18 |
"num_hidden_layers": 12,
|
| 19 |
"pad_token_id": 0,
|
| 20 |
"position_embedding_type": "absolute",
|
| 21 |
+
"torch_dtype": "float32",
|
| 22 |
+
"transformers_version": "4.17.0",
|
| 23 |
"type_vocab_size": 2,
|
| 24 |
"use_cache": true,
|
| 25 |
"vocab_size": 30524
|
pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2a2401a91d2bf80826341c52a0c1f8b6814f36c1b7852d4c93482a13041260f
|
| 3 |
+
size 438017329
|
tokenizer.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "
|
|
|
|
| 1 |
+
{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "checkpoints/model4", "tokenizer_class": "BertTokenizer"}
|