lynn-twinkl
committed on
Commit
·
7f30bf8
1
Parent(s):
19fcede
removed this as it's no longer used
Browse files- ner-training/begin-training.zsh +0 -14
- ner-training/config.cfg +0 -145
- ner-training/convert_to_spacy.py +0 -37
- ner-training/debug_labeled_data.py +0 -89
- ner-training/merge_json.py +0 -25
- ner-training/predict.py +0 -18
- ner-training/prepare_data.py +0 -39
- ner-training/readme.md +0 -36
- ner-training/remove_non_context_labels.py +0 -19
- ner-training/split_data.py +0 -24
- ner-training/testing-model.ipynb +0 -0
- ner-training/transformer.cfg +0 -147
ner-training/begin-training.zsh
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
#!/bin/zsh
|
| 2 |
-
|
| 3 |
-
source ner_venv/bin/activate
|
| 4 |
-
|
| 5 |
-
train_spacy_file=$1
|
| 6 |
-
dev_spacy_file=$2
|
| 7 |
-
model_outdir=$3
|
| 8 |
-
|
| 9 |
-
python3 -m spacy train transformer.cfg \
|
| 10 |
-
--paths.train "$train_spacy_file" \
|
| 11 |
-
--paths.dev "$dev_spacy_file" \
|
| 12 |
-
--gpu-id 0 \
|
| 13 |
-
--output "$model_outdir"
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner-training/config.cfg
DELETED
|
@@ -1,145 +0,0 @@
|
|
| 1 |
-
[paths]
|
| 2 |
-
train = null
|
| 3 |
-
dev = null
|
| 4 |
-
vectors = "en_core_web_lg"
|
| 5 |
-
init_tok2vec = null
|
| 6 |
-
|
| 7 |
-
[system]
|
| 8 |
-
gpu_allocator = null
|
| 9 |
-
seed = 0
|
| 10 |
-
|
| 11 |
-
[nlp]
|
| 12 |
-
lang = "en"
|
| 13 |
-
pipeline = ["tok2vec","ner"]
|
| 14 |
-
batch_size = 1000
|
| 15 |
-
disabled = []
|
| 16 |
-
before_creation = null
|
| 17 |
-
after_creation = null
|
| 18 |
-
after_pipeline_creation = null
|
| 19 |
-
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
| 20 |
-
vectors = {"@vectors":"spacy.Vectors.v1"}
|
| 21 |
-
|
| 22 |
-
[components]
|
| 23 |
-
|
| 24 |
-
[components.ner]
|
| 25 |
-
factory = "ner"
|
| 26 |
-
incorrect_spans_key = null
|
| 27 |
-
moves = null
|
| 28 |
-
scorer = {"@scorers":"spacy.ner_scorer.v1"}
|
| 29 |
-
update_with_oracle_cut_size = 100
|
| 30 |
-
|
| 31 |
-
[components.ner.model]
|
| 32 |
-
@architectures = "spacy.TransitionBasedParser.v2"
|
| 33 |
-
state_type = "ner"
|
| 34 |
-
extra_state_tokens = false
|
| 35 |
-
hidden_width = 64
|
| 36 |
-
maxout_pieces = 2
|
| 37 |
-
use_upper = true
|
| 38 |
-
nO = null
|
| 39 |
-
|
| 40 |
-
[components.ner.model.tok2vec]
|
| 41 |
-
@architectures = "spacy.Tok2VecListener.v1"
|
| 42 |
-
width = ${components.tok2vec.model.encode.width}
|
| 43 |
-
upstream = "*"
|
| 44 |
-
|
| 45 |
-
[components.tok2vec]
|
| 46 |
-
factory = "tok2vec"
|
| 47 |
-
|
| 48 |
-
[components.tok2vec.model]
|
| 49 |
-
@architectures = "spacy.Tok2Vec.v2"
|
| 50 |
-
|
| 51 |
-
[components.tok2vec.model.embed]
|
| 52 |
-
@architectures = "spacy.MultiHashEmbed.v2"
|
| 53 |
-
width = ${components.tok2vec.model.encode.width}
|
| 54 |
-
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
|
| 55 |
-
rows = [5000,1000,2500,2500]
|
| 56 |
-
include_static_vectors = true
|
| 57 |
-
|
| 58 |
-
[components.tok2vec.model.encode]
|
| 59 |
-
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
| 60 |
-
width = 256
|
| 61 |
-
depth = 8
|
| 62 |
-
window_size = 1
|
| 63 |
-
maxout_pieces = 3
|
| 64 |
-
|
| 65 |
-
[corpora]
|
| 66 |
-
|
| 67 |
-
[corpora.dev]
|
| 68 |
-
@readers = "spacy.Corpus.v1"
|
| 69 |
-
path = ${paths.dev}
|
| 70 |
-
max_length = 0
|
| 71 |
-
gold_preproc = false
|
| 72 |
-
limit = 0
|
| 73 |
-
augmenter = null
|
| 74 |
-
|
| 75 |
-
[corpora.train]
|
| 76 |
-
@readers = "spacy.Corpus.v1"
|
| 77 |
-
path = ${paths.train}
|
| 78 |
-
max_length = 0
|
| 79 |
-
gold_preproc = false
|
| 80 |
-
limit = 0
|
| 81 |
-
augmenter = null
|
| 82 |
-
|
| 83 |
-
[training]
|
| 84 |
-
dev_corpus = "corpora.dev"
|
| 85 |
-
train_corpus = "corpora.train"
|
| 86 |
-
seed = ${system.seed}
|
| 87 |
-
gpu_allocator = ${system.gpu_allocator}
|
| 88 |
-
dropout = 0.1
|
| 89 |
-
accumulate_gradient = 1
|
| 90 |
-
patience = 1600
|
| 91 |
-
max_epochs = 0
|
| 92 |
-
max_steps = 20000
|
| 93 |
-
eval_frequency = 200
|
| 94 |
-
frozen_components = []
|
| 95 |
-
annotating_components = []
|
| 96 |
-
before_to_disk = null
|
| 97 |
-
before_update = null
|
| 98 |
-
|
| 99 |
-
[training.batcher]
|
| 100 |
-
@batchers = "spacy.batch_by_words.v1"
|
| 101 |
-
discard_oversize = false
|
| 102 |
-
tolerance = 0.2
|
| 103 |
-
get_length = null
|
| 104 |
-
|
| 105 |
-
[training.batcher.size]
|
| 106 |
-
@schedules = "compounding.v1"
|
| 107 |
-
start = 100
|
| 108 |
-
stop = 1000
|
| 109 |
-
compound = 1.001
|
| 110 |
-
t = 0.0
|
| 111 |
-
|
| 112 |
-
[training.logger]
|
| 113 |
-
@loggers = "spacy.ConsoleLogger.v1"
|
| 114 |
-
progress_bar = false
|
| 115 |
-
|
| 116 |
-
[training.optimizer]
|
| 117 |
-
@optimizers = "Adam.v1"
|
| 118 |
-
beta1 = 0.9
|
| 119 |
-
beta2 = 0.999
|
| 120 |
-
L2_is_weight_decay = true
|
| 121 |
-
L2 = 0.01
|
| 122 |
-
grad_clip = 1.0
|
| 123 |
-
use_averages = false
|
| 124 |
-
eps = 0.00000001
|
| 125 |
-
learn_rate = 0.001
|
| 126 |
-
|
| 127 |
-
[training.score_weights]
|
| 128 |
-
ents_f = 1.0
|
| 129 |
-
ents_p = 0.0
|
| 130 |
-
ents_r = 0.0
|
| 131 |
-
ents_per_type = null
|
| 132 |
-
|
| 133 |
-
[pretraining]
|
| 134 |
-
|
| 135 |
-
[initialize]
|
| 136 |
-
vectors = ${paths.vectors}
|
| 137 |
-
init_tok2vec = ${paths.init_tok2vec}
|
| 138 |
-
vocab_data = null
|
| 139 |
-
lookups = null
|
| 140 |
-
before_init = null
|
| 141 |
-
after_init = null
|
| 142 |
-
|
| 143 |
-
[initialize.components]
|
| 144 |
-
|
| 145 |
-
[initialize.tokenizer]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner-training/convert_to_spacy.py
DELETED
|
@@ -1,37 +0,0 @@
|
|
| 1 |
-
import spacy
|
| 2 |
-
from spacy.tokens import DocBin
|
| 3 |
-
from prepare_data import load_data
|
| 4 |
-
import sys
|
| 5 |
-
|
| 6 |
-
json_file_path = sys.argv[1]
|
| 7 |
-
outpath = sys.argv[2]
|
| 8 |
-
|
| 9 |
-
def create_spacy_binary(json_path, output_path, nlp):
|
| 10 |
-
"""
|
| 11 |
-
Convert raw training examples into spaCy DocBin.
|
| 12 |
-
"""
|
| 13 |
-
db = DocBin()
|
| 14 |
-
|
| 15 |
-
# Load your custom data
|
| 16 |
-
training_data = load_data(json_path)
|
| 17 |
-
|
| 18 |
-
for text, ann in training_data:
|
| 19 |
-
doc = nlp.make_doc(text)
|
| 20 |
-
ents = []
|
| 21 |
-
for start, end, label in ann["entities"]:
|
| 22 |
-
span = doc.char_span(start, end, label=label, alignment_mode="contract")
|
| 23 |
-
if span is None:
|
| 24 |
-
# If spaCy can't align the tokenization, skip or handle carefully
|
| 25 |
-
print(f"Skipping misaligned entity: '{text[start:end]}'")
|
| 26 |
-
else:
|
| 27 |
-
ents.append(span)
|
| 28 |
-
doc.ents = ents
|
| 29 |
-
db.add(doc)
|
| 30 |
-
|
| 31 |
-
db.to_disk(output_path)
|
| 32 |
-
print(f"Created spaCy binary file: {output_path}")
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
if __name__ == "__main__":
|
| 36 |
-
nlp = spacy.blank("en") # blank English pipeline
|
| 37 |
-
create_spacy_binary(json_file_path, outpath, nlp)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner-training/debug_labeled_data.py
DELETED
|
@@ -1,89 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
import json
|
| 3 |
-
import re
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
json_file_path = sys.argv[1]
|
| 7 |
-
text_key = sys.argv[2]
|
| 8 |
-
out_path = sys.argv[3]
|
| 9 |
-
|
| 10 |
-
# ------------ FUNCTION ------------
|
| 11 |
-
|
| 12 |
-
def trim_and_fix_offsets(raw_data, context_key=text_key):
|
| 13 |
-
"""
|
| 14 |
-
Attempt to fix leading/trailing whitespace in spans and recalc offsets.
|
| 15 |
-
Then do a local substring search to fix minor misalignments.
|
| 16 |
-
"""
|
| 17 |
-
fixed_data = []
|
| 18 |
-
for i, record in enumerate(raw_data):
|
| 19 |
-
text = record[context_key]
|
| 20 |
-
new_labels = []
|
| 21 |
-
for ann in record["label"]:
|
| 22 |
-
label = ann["labels"][0]
|
| 23 |
-
old_start, old_end = ann["start"], ann["end"]
|
| 24 |
-
original_substring = text[old_start:old_end]
|
| 25 |
-
trimmed_substring = original_substring.strip()
|
| 26 |
-
|
| 27 |
-
# 1) Trim leading/trailing whitespace offsets
|
| 28 |
-
# Move start forward while it points to space
|
| 29 |
-
start = old_start
|
| 30 |
-
while start < old_end and text[start].isspace():
|
| 31 |
-
start += 1
|
| 32 |
-
# Move end backward while it points to space
|
| 33 |
-
end = old_end
|
| 34 |
-
while end > start and text[end - 1].isspace():
|
| 35 |
-
end -= 1
|
| 36 |
-
|
| 37 |
-
# After naive trimming, see if the substring still matches
|
| 38 |
-
new_substring = text[start:end]
|
| 39 |
-
if new_substring == trimmed_substring:
|
| 40 |
-
# Great, we can trust these offsets directly
|
| 41 |
-
pass
|
| 42 |
-
else:
|
| 43 |
-
# Possibly there's hidden Unicode or the original offset was off.
|
| 44 |
-
# We'll do a local substring search around `old_start`.
|
| 45 |
-
# We'll search for `trimmed_substring` in a window of +/- 30 chars.
|
| 46 |
-
window_size = 30
|
| 47 |
-
|
| 48 |
-
# Define a safe search window in the text
|
| 49 |
-
search_start = max(0, old_start - window_size)
|
| 50 |
-
search_end = min(len(text), old_end + window_size)
|
| 51 |
-
window_text = text[search_start:search_end]
|
| 52 |
-
|
| 53 |
-
# Try to find the first occurrence of trimmed_substring in that window
|
| 54 |
-
local_pos = window_text.find(trimmed_substring)
|
| 55 |
-
if local_pos != -1:
|
| 56 |
-
# Recalc absolute offset
|
| 57 |
-
start = search_start + local_pos
|
| 58 |
-
end = start + len(trimmed_substring)
|
| 59 |
-
new_substring = text[start:end]
|
| 60 |
-
else:
|
| 61 |
-
# We failed to find it in the local region
|
| 62 |
-
print(f"[Record {i}] Can't find '{trimmed_substring}' near offset {old_start}-{old_end}")
|
| 63 |
-
# We'll leave this annotation as-is or skip it
|
| 64 |
-
start, end = old_start, old_end
|
| 65 |
-
new_substring = original_substring
|
| 66 |
-
|
| 67 |
-
new_labels.append({
|
| 68 |
-
"start": start,
|
| 69 |
-
"end": end,
|
| 70 |
-
"text": new_substring,
|
| 71 |
-
"labels": [label]
|
| 72 |
-
})
|
| 73 |
-
|
| 74 |
-
# Update the record with the new label data
|
| 75 |
-
new_record = dict(record)
|
| 76 |
-
new_record["label"] = new_labels
|
| 77 |
-
fixed_data.append(new_record)
|
| 78 |
-
|
| 79 |
-
return fixed_data
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
# ----------------- USAGE ----------------
|
| 83 |
-
with open(json_file_path, "r", encoding="utf-8") as f:
|
| 84 |
-
raw_data = json.load(f)
|
| 85 |
-
|
| 86 |
-
fixed_data = trim_and_fix_offsets(raw_data, context_key=text_key)
|
| 87 |
-
|
| 88 |
-
with open(out_path, "w", encoding="utf-8") as out:
|
| 89 |
-
json.dump(fixed_data, out, indent=2, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner-training/merge_json.py
DELETED
|
@@ -1,25 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import sys
|
| 3 |
-
|
| 4 |
-
original_json_path = sys.argv[1]
|
| 5 |
-
additional_json_path = sys.argv[2]
|
| 6 |
-
|
| 7 |
-
with open(additional_json_path, 'r') as source_file:
|
| 8 |
-
source_data = json.load(source_file)
|
| 9 |
-
|
| 10 |
-
# Load data from target.json
|
| 11 |
-
with open(original_json_path, 'r') as target_file:
|
| 12 |
-
target_data = json.load(target_file)
|
| 13 |
-
|
| 14 |
-
# Ensure both source_data and target_data are lists
|
| 15 |
-
if isinstance(source_data, list) and isinstance(target_data, list):
|
| 16 |
-
# Append records from source_data to target_data
|
| 17 |
-
target_data.extend(source_data)
|
| 18 |
-
else:
|
| 19 |
-
print("The JSON data must be a list of records in both files.")
|
| 20 |
-
|
| 21 |
-
# Write updated data back to target.json
|
| 22 |
-
with open(original_json_path, 'w') as target_file:
|
| 23 |
-
json.dump(target_data, target_file, indent=4)
|
| 24 |
-
|
| 25 |
-
print("Records have been appended successfully.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner-training/predict.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
| 1 |
-
import spacy
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import sys
|
| 4 |
-
|
| 5 |
-
csv_path = sys.argv[1]
|
| 6 |
-
custom_model_path = sys.argv[2]
|
| 7 |
-
|
| 8 |
-
df = pd.read_csv(csv_path)
|
| 9 |
-
texts = df['Additional Info'].to_list()
|
| 10 |
-
|
| 11 |
-
trained_nlp = spacy.load(custom_model_path)
|
| 12 |
-
|
| 13 |
-
for text in texts:
|
| 14 |
-
doc = trained_nlp(text)
|
| 15 |
-
print(f"TEXT: {text}")
|
| 16 |
-
print()
|
| 17 |
-
print("ENTITIES:", [(ent.text, ent.label_) for ent in doc.ents])
|
| 18 |
-
print('-'*60)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner-training/prepare_data.py
DELETED
|
@@ -1,39 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import sys
|
| 3 |
-
|
| 4 |
-
raw_data = sys.argv[1]
|
| 5 |
-
|
| 6 |
-
def load_data(json_path):
|
| 7 |
-
"""
|
| 8 |
-
Load your custom JSON with 'additional_info' and 'label' fields.
|
| 9 |
-
Returns a list of (text, {"entities": [(start, end, label), ...]}) tuples.
|
| 10 |
-
"""
|
| 11 |
-
with open(json_path, 'r', encoding='utf-8') as f:
|
| 12 |
-
data = json.load(f)
|
| 13 |
-
|
| 14 |
-
# If your JSON is a list of records
|
| 15 |
-
# If it's a single record, wrap it in [data] or handle accordingly
|
| 16 |
-
if not isinstance(data, list):
|
| 17 |
-
data = [data]
|
| 18 |
-
|
| 19 |
-
training_data = []
|
| 20 |
-
|
| 21 |
-
for record in data:
|
| 22 |
-
text = record["additional_info"]
|
| 23 |
-
spans = []
|
| 24 |
-
for annotation in record["label"]:
|
| 25 |
-
# Each annotation can have multiple "labels", but typically there's just one
|
| 26 |
-
label = annotation["labels"][0]
|
| 27 |
-
start = annotation["start"]
|
| 28 |
-
end = annotation["end"]
|
| 29 |
-
spans.append((start, end, label))
|
| 30 |
-
# Append in spaCy's format
|
| 31 |
-
training_data.append((text, {"entities": spans}))
|
| 32 |
-
|
| 33 |
-
return training_data
|
| 34 |
-
|
| 35 |
-
if __name__ == "__main__":
|
| 36 |
-
# Example usage
|
| 37 |
-
TRAIN_DATA = load_data(raw_data)
|
| 38 |
-
|
| 39 |
-
print(TRAIN_DATA[:2])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner-training/readme.md
DELETED
|
@@ -1,36 +0,0 @@
|
|
| 1 |
-
# Appropriate Usage for NER Training
|
| 2 |
-
|
| 3 |
-
## Cleaning and Debugging Training Data
|
| 4 |
-
|
| 5 |
-
We first need to debug our raw labeled data from Label Studio. Sometimes, labeled spans have leading or trailing whitespace or punctuation, which spaCy _really_ doesn't like, so we need to remove it.
|
| 6 |
-
|
| 7 |
-
`python3 debug_labeled_data.py raw_labeled_data_path text_key_to_debug output_json_path`
|
| 8 |
-
|
| 9 |
-
This will create a new debugged JSON file at the specified output path. **Use this file for the next step.**
|
| 10 |
-
|
| 11 |
-
## Preparing Data For Training
|
| 12 |
-
|
| 13 |
-
Now, we need to convert this raw labeled data into spaCy's binary format. Before doing so, however, we must split the data into training and dev sets for testing.
|
| 14 |
-
|
| 15 |
-
1. `python3 split_data.py debugged_json_path`
|
| 16 |
-
|
| 17 |
-
This will create `train.json` and `dev.json` files in the current working directory.
|
| 18 |
-
|
| 19 |
-
2. Move these files into the `training_data` directory: `mv *.json training_data/`
|
| 20 |
-
|
| 21 |
-
3. Convert both sets into spaCy's binary format:
|
| 22 |
-
|
| 23 |
-
`python3 convert_to_spacy.py training_data/train.json training_data/train.spacy`
|
| 24 |
-
`python3 convert_to_spacy.py training_data/dev.json training_data/dev.spacy`
|
| 25 |
-
|
| 26 |
-
## Training
|
| 27 |
-
|
| 28 |
-
To start training the model from the CLI, run the following command:
|
| 29 |
-
|
| 30 |
-
```
|
| 31 |
-
python -m spacy train transformer.cfg \
|
| 32 |
-
--paths.train training_data/train.spacy \
|
| 33 |
-
--paths.dev training_data/dev.spacy \
|
| 34 |
-
--gpu-id 0 \
|
| 35 |
-
--output ./roberta_model
|
| 36 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner-training/remove_non_context_labels.py
DELETED
|
@@ -1,19 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import sys
|
| 3 |
-
|
| 4 |
-
file_to_filter = sys.argv[1]
|
| 5 |
-
tag_to_keep = sys.argv[2]
|
| 6 |
-
outpath = sys.argv[3]
|
| 7 |
-
|
| 8 |
-
with open(file_to_filter, 'r') as input_file:
|
| 9 |
-
dataset = json.load(input_file)
|
| 10 |
-
|
| 11 |
-
def filter_context_labels(dataset):
|
| 12 |
-
for item in dataset:
|
| 13 |
-
item['label'] = [l for l in item['label'] if tag_to_keep in l['labels']]
|
| 14 |
-
return dataset
|
| 15 |
-
|
| 16 |
-
filtered_data = filter_context_labels(dataset)
|
| 17 |
-
|
| 18 |
-
with open(outpath, 'w') as output_file:
|
| 19 |
-
json.dump(filtered_data, output_file, indent=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner-training/split_data.py
DELETED
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import random
|
| 3 |
-
import sys
|
| 4 |
-
|
| 5 |
-
json_file_path = sys.argv[1]
|
| 6 |
-
|
| 7 |
-
# Load your full dataset (make sure it's a list of records)
|
| 8 |
-
with open(json_file_path, "r", encoding="utf-8") as f:
|
| 9 |
-
data = json.load(f)
|
| 10 |
-
|
| 11 |
-
# Shuffle and then split the data 80/20
|
| 12 |
-
random.shuffle(data)
|
| 13 |
-
split_index = int(len(data) * 0.8)
|
| 14 |
-
train_data = data[:split_index]
|
| 15 |
-
dev_data = data[split_index:]
|
| 16 |
-
|
| 17 |
-
# Save the train and dev JSON files
|
| 18 |
-
with open("train.json", "w", encoding="utf-8") as f:
|
| 19 |
-
json.dump(train_data, f, indent=2)
|
| 20 |
-
|
| 21 |
-
with open("dev.json", "w", encoding="utf-8") as f:
|
| 22 |
-
json.dump(dev_data, f, indent=2)
|
| 23 |
-
|
| 24 |
-
print(f"Train examples: {len(train_data)}, Dev examples: {len(dev_data)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner-training/testing-model.ipynb
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ner-training/transformer.cfg
DELETED
|
@@ -1,147 +0,0 @@
|
|
| 1 |
-
[paths]
|
| 2 |
-
train = null
|
| 3 |
-
dev = null
|
| 4 |
-
vectors = null
|
| 5 |
-
init_tok2vec = null
|
| 6 |
-
|
| 7 |
-
[system]
|
| 8 |
-
gpu_allocator = "pytorch"
|
| 9 |
-
seed = 0
|
| 10 |
-
|
| 11 |
-
[nlp]
|
| 12 |
-
lang = "en"
|
| 13 |
-
pipeline = ["transformer","ner"]
|
| 14 |
-
batch_size = 128
|
| 15 |
-
disabled = []
|
| 16 |
-
before_creation = null
|
| 17 |
-
after_creation = null
|
| 18 |
-
after_pipeline_creation = null
|
| 19 |
-
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
| 20 |
-
vectors = {"@vectors":"spacy.Vectors.v1"}
|
| 21 |
-
|
| 22 |
-
[components]
|
| 23 |
-
|
| 24 |
-
[components.ner]
|
| 25 |
-
factory = "ner"
|
| 26 |
-
incorrect_spans_key = null
|
| 27 |
-
moves = null
|
| 28 |
-
scorer = {"@scorers":"spacy.ner_scorer.v1"}
|
| 29 |
-
update_with_oracle_cut_size = 100
|
| 30 |
-
|
| 31 |
-
[components.ner.model]
|
| 32 |
-
@architectures = "spacy.TransitionBasedParser.v2"
|
| 33 |
-
state_type = "ner"
|
| 34 |
-
extra_state_tokens = false
|
| 35 |
-
hidden_width = 64
|
| 36 |
-
maxout_pieces = 2
|
| 37 |
-
use_upper = false
|
| 38 |
-
nO = null
|
| 39 |
-
|
| 40 |
-
[components.ner.model.tok2vec]
|
| 41 |
-
@architectures = "spacy-transformers.TransformerListener.v1"
|
| 42 |
-
grad_factor = 1.0
|
| 43 |
-
pooling = {"@layers":"reduce_mean.v1"}
|
| 44 |
-
upstream = "*"
|
| 45 |
-
|
| 46 |
-
[components.transformer]
|
| 47 |
-
factory = "transformer"
|
| 48 |
-
max_batch_items = 4096
|
| 49 |
-
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
|
| 50 |
-
|
| 51 |
-
[components.transformer.model]
|
| 52 |
-
@architectures = "spacy-transformers.TransformerModel.v3"
|
| 53 |
-
name = "roberta-base"
|
| 54 |
-
mixed_precision = false
|
| 55 |
-
|
| 56 |
-
[components.transformer.model.get_spans]
|
| 57 |
-
@span_getters = "spacy-transformers.strided_spans.v1"
|
| 58 |
-
window = 128
|
| 59 |
-
stride = 96
|
| 60 |
-
|
| 61 |
-
[components.transformer.model.grad_scaler_config]
|
| 62 |
-
|
| 63 |
-
[components.transformer.model.tokenizer_config]
|
| 64 |
-
use_fast = true
|
| 65 |
-
|
| 66 |
-
[components.transformer.model.transformer_config]
|
| 67 |
-
|
| 68 |
-
[corpora]
|
| 69 |
-
|
| 70 |
-
[corpora.dev]
|
| 71 |
-
@readers = "spacy.Corpus.v1"
|
| 72 |
-
path = ${paths.dev}
|
| 73 |
-
max_length = 0
|
| 74 |
-
gold_preproc = false
|
| 75 |
-
limit = 0
|
| 76 |
-
augmenter = null
|
| 77 |
-
|
| 78 |
-
[corpora.train]
|
| 79 |
-
@readers = "spacy.Corpus.v1"
|
| 80 |
-
path = ${paths.train}
|
| 81 |
-
max_length = 0
|
| 82 |
-
gold_preproc = false
|
| 83 |
-
limit = 0
|
| 84 |
-
augmenter = null
|
| 85 |
-
|
| 86 |
-
[training]
|
| 87 |
-
accumulate_gradient = 3
|
| 88 |
-
dev_corpus = "corpora.dev"
|
| 89 |
-
train_corpus = "corpora.train"
|
| 90 |
-
seed = ${system.seed}
|
| 91 |
-
gpu_allocator = ${system.gpu_allocator}
|
| 92 |
-
dropout = 0.1
|
| 93 |
-
patience = 1600
|
| 94 |
-
max_epochs = 0
|
| 95 |
-
max_steps = 20000
|
| 96 |
-
eval_frequency = 200
|
| 97 |
-
frozen_components = []
|
| 98 |
-
annotating_components = []
|
| 99 |
-
before_to_disk = null
|
| 100 |
-
before_update = null
|
| 101 |
-
|
| 102 |
-
[training.batcher]
|
| 103 |
-
@batchers = "spacy.batch_by_padded.v1"
|
| 104 |
-
discard_oversize = true
|
| 105 |
-
size = 2000
|
| 106 |
-
buffer = 256
|
| 107 |
-
get_length = null
|
| 108 |
-
|
| 109 |
-
[training.logger]
|
| 110 |
-
@loggers = "spacy.ConsoleLogger.v1"
|
| 111 |
-
progress_bar = false
|
| 112 |
-
|
| 113 |
-
[training.optimizer]
|
| 114 |
-
@optimizers = "Adam.v1"
|
| 115 |
-
beta1 = 0.9
|
| 116 |
-
beta2 = 0.999
|
| 117 |
-
L2_is_weight_decay = true
|
| 118 |
-
L2 = 0.01
|
| 119 |
-
grad_clip = 1.0
|
| 120 |
-
use_averages = false
|
| 121 |
-
eps = 0.00000001
|
| 122 |
-
|
| 123 |
-
[training.optimizer.learn_rate]
|
| 124 |
-
@schedules = "warmup_linear.v1"
|
| 125 |
-
warmup_steps = 250
|
| 126 |
-
total_steps = 20000
|
| 127 |
-
initial_rate = 0.00005
|
| 128 |
-
|
| 129 |
-
[training.score_weights]
|
| 130 |
-
ents_f = 1.0
|
| 131 |
-
ents_p = 0.0
|
| 132 |
-
ents_r = 0.0
|
| 133 |
-
ents_per_type = null
|
| 134 |
-
|
| 135 |
-
[pretraining]
|
| 136 |
-
|
| 137 |
-
[initialize]
|
| 138 |
-
vectors = ${paths.vectors}
|
| 139 |
-
init_tok2vec = ${paths.init_tok2vec}
|
| 140 |
-
vocab_data = null
|
| 141 |
-
lookups = null
|
| 142 |
-
before_init = null
|
| 143 |
-
after_init = null
|
| 144 |
-
|
| 145 |
-
[initialize.components]
|
| 146 |
-
|
| 147 |
-
[initialize.tokenizer]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|