lynn-twinkl committed on
Commit
7f30bf8
·
1 Parent(s): 19fcede

removed this as it's no longer used

Browse files
ner-training/begin-training.zsh DELETED
@@ -1,14 +0,0 @@
1
- #!/bin/zsh
2
-
3
- source ner_venv/bin/activate
4
-
5
- train_spacy_file=$1
6
- dev_spacy_file=$2
7
- model_outdir=$3
8
-
9
- python3 -m spacy train transformer.cfg \
10
- --paths.train "$train_spacy_file" \
11
- --paths.dev "$dev_spacy_file" \
12
- --gpu-id 0 \
13
- --output "$model_outdir"
14
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner-training/config.cfg DELETED
@@ -1,145 +0,0 @@
1
- [paths]
2
- train = null
3
- dev = null
4
- vectors = "en_core_web_lg"
5
- init_tok2vec = null
6
-
7
- [system]
8
- gpu_allocator = null
9
- seed = 0
10
-
11
- [nlp]
12
- lang = "en"
13
- pipeline = ["tok2vec","ner"]
14
- batch_size = 1000
15
- disabled = []
16
- before_creation = null
17
- after_creation = null
18
- after_pipeline_creation = null
19
- tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20
- vectors = {"@vectors":"spacy.Vectors.v1"}
21
-
22
- [components]
23
-
24
- [components.ner]
25
- factory = "ner"
26
- incorrect_spans_key = null
27
- moves = null
28
- scorer = {"@scorers":"spacy.ner_scorer.v1"}
29
- update_with_oracle_cut_size = 100
30
-
31
- [components.ner.model]
32
- @architectures = "spacy.TransitionBasedParser.v2"
33
- state_type = "ner"
34
- extra_state_tokens = false
35
- hidden_width = 64
36
- maxout_pieces = 2
37
- use_upper = true
38
- nO = null
39
-
40
- [components.ner.model.tok2vec]
41
- @architectures = "spacy.Tok2VecListener.v1"
42
- width = ${components.tok2vec.model.encode.width}
43
- upstream = "*"
44
-
45
- [components.tok2vec]
46
- factory = "tok2vec"
47
-
48
- [components.tok2vec.model]
49
- @architectures = "spacy.Tok2Vec.v2"
50
-
51
- [components.tok2vec.model.embed]
52
- @architectures = "spacy.MultiHashEmbed.v2"
53
- width = ${components.tok2vec.model.encode.width}
54
- attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
55
- rows = [5000,1000,2500,2500]
56
- include_static_vectors = true
57
-
58
- [components.tok2vec.model.encode]
59
- @architectures = "spacy.MaxoutWindowEncoder.v2"
60
- width = 256
61
- depth = 8
62
- window_size = 1
63
- maxout_pieces = 3
64
-
65
- [corpora]
66
-
67
- [corpora.dev]
68
- @readers = "spacy.Corpus.v1"
69
- path = ${paths.dev}
70
- max_length = 0
71
- gold_preproc = false
72
- limit = 0
73
- augmenter = null
74
-
75
- [corpora.train]
76
- @readers = "spacy.Corpus.v1"
77
- path = ${paths.train}
78
- max_length = 0
79
- gold_preproc = false
80
- limit = 0
81
- augmenter = null
82
-
83
- [training]
84
- dev_corpus = "corpora.dev"
85
- train_corpus = "corpora.train"
86
- seed = ${system.seed}
87
- gpu_allocator = ${system.gpu_allocator}
88
- dropout = 0.1
89
- accumulate_gradient = 1
90
- patience = 1600
91
- max_epochs = 0
92
- max_steps = 20000
93
- eval_frequency = 200
94
- frozen_components = []
95
- annotating_components = []
96
- before_to_disk = null
97
- before_update = null
98
-
99
- [training.batcher]
100
- @batchers = "spacy.batch_by_words.v1"
101
- discard_oversize = false
102
- tolerance = 0.2
103
- get_length = null
104
-
105
- [training.batcher.size]
106
- @schedules = "compounding.v1"
107
- start = 100
108
- stop = 1000
109
- compound = 1.001
110
- t = 0.0
111
-
112
- [training.logger]
113
- @loggers = "spacy.ConsoleLogger.v1"
114
- progress_bar = false
115
-
116
- [training.optimizer]
117
- @optimizers = "Adam.v1"
118
- beta1 = 0.9
119
- beta2 = 0.999
120
- L2_is_weight_decay = true
121
- L2 = 0.01
122
- grad_clip = 1.0
123
- use_averages = false
124
- eps = 0.00000001
125
- learn_rate = 0.001
126
-
127
- [training.score_weights]
128
- ents_f = 1.0
129
- ents_p = 0.0
130
- ents_r = 0.0
131
- ents_per_type = null
132
-
133
- [pretraining]
134
-
135
- [initialize]
136
- vectors = ${paths.vectors}
137
- init_tok2vec = ${paths.init_tok2vec}
138
- vocab_data = null
139
- lookups = null
140
- before_init = null
141
- after_init = null
142
-
143
- [initialize.components]
144
-
145
- [initialize.tokenizer]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner-training/convert_to_spacy.py DELETED
@@ -1,37 +0,0 @@
1
- import spacy
2
- from spacy.tokens import DocBin
3
- from prepare_data import load_data
4
- import sys
5
-
6
- json_file_path = sys.argv[1]
7
- outpath = sys.argv[2]
8
-
9
- def create_spacy_binary(json_path, output_path, nlp):
10
- """
11
- Convert raw training examples into spaCy DocBin.
12
- """
13
- db = DocBin()
14
-
15
- # Load your custom data
16
- training_data = load_data(json_path)
17
-
18
- for text, ann in training_data:
19
- doc = nlp.make_doc(text)
20
- ents = []
21
- for start, end, label in ann["entities"]:
22
- span = doc.char_span(start, end, label=label, alignment_mode="contract")
23
- if span is None:
24
- # If spaCy can't align the tokenization, skip or handle carefully
25
- print(f"Skipping misaligned entity: '{text[start:end]}'")
26
- else:
27
- ents.append(span)
28
- doc.ents = ents
29
- db.add(doc)
30
-
31
- db.to_disk(output_path)
32
- print(f"Created spaCy binary file: {output_path}")
33
-
34
-
35
- if __name__ == "__main__":
36
- nlp = spacy.blank("en") # blank English pipeline
37
- create_spacy_binary(json_file_path, outpath, nlp)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner-training/debug_labeled_data.py DELETED
@@ -1,89 +0,0 @@
1
- import sys
2
- import json
3
- import re
4
-
5
-
6
- json_file_path = sys.argv[1]
7
- text_key = sys.argv[2]
8
- out_path = sys.argv[3]
9
-
10
- # ------------ FUNCTION ------------
11
-
12
- def trim_and_fix_offsets(raw_data, context_key=text_key):
13
- """
14
- Attempt to fix leading/trailing whitespace in spans and recalc offsets.
15
- Then do a local substring search to fix minor misalignments.
16
- """
17
- fixed_data = []
18
- for i, record in enumerate(raw_data):
19
- text = record[context_key]
20
- new_labels = []
21
- for ann in record["label"]:
22
- label = ann["labels"][0]
23
- old_start, old_end = ann["start"], ann["end"]
24
- original_substring = text[old_start:old_end]
25
- trimmed_substring = original_substring.strip()
26
-
27
- # 1) Trim leading/trailing whitespace offsets
28
- # Move start forward while it points to space
29
- start = old_start
30
- while start < old_end and text[start].isspace():
31
- start += 1
32
- # Move end backward while it points to space
33
- end = old_end
34
- while end > start and text[end - 1].isspace():
35
- end -= 1
36
-
37
- # After naive trimming, see if the substring still matches
38
- new_substring = text[start:end]
39
- if new_substring == trimmed_substring:
40
- # Great, we can trust these offsets directly
41
- pass
42
- else:
43
- # Possibly there's hidden Unicode or the original offset was off.
44
- # We'll do a local substring search around `old_start`.
45
- # We'll search for `trimmed_substring` in a window of +/- 30 chars.
46
- window_size = 30
47
-
48
- # Define a safe search window in the text
49
- search_start = max(0, old_start - window_size)
50
- search_end = min(len(text), old_end + window_size)
51
- window_text = text[search_start:search_end]
52
-
53
- # Try to find the first occurrence of trimmed_substring in that window
54
- local_pos = window_text.find(trimmed_substring)
55
- if local_pos != -1:
56
- # Recalc absolute offset
57
- start = search_start + local_pos
58
- end = start + len(trimmed_substring)
59
- new_substring = text[start:end]
60
- else:
61
- # We failed to find it in the local region
62
- print(f"[Record {i}] Can't find '{trimmed_substring}' near offset {old_start}-{old_end}")
63
- # We'll leave this annotation as-is or skip it
64
- start, end = old_start, old_end
65
- new_substring = original_substring
66
-
67
- new_labels.append({
68
- "start": start,
69
- "end": end,
70
- "text": new_substring,
71
- "labels": [label]
72
- })
73
-
74
- # Update the record with the new label data
75
- new_record = dict(record)
76
- new_record["label"] = new_labels
77
- fixed_data.append(new_record)
78
-
79
- return fixed_data
80
-
81
-
82
- # ----------------- USAGE ----------------
83
- with open(json_file_path, "r", encoding="utf-8") as f:
84
- raw_data = json.load(f)
85
-
86
- fixed_data = trim_and_fix_offsets(raw_data, context_key=text_key)
87
-
88
- with open(out_path, "w", encoding="utf-8") as out:
89
- json.dump(fixed_data, out, indent=2, ensure_ascii=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner-training/merge_json.py DELETED
@@ -1,25 +0,0 @@
1
- import json
2
- import sys
3
-
4
- original_json_path = sys.argv[1]
5
- additional_json_path = sys.argv[2]
6
-
7
- with open(additional_json_path, 'r') as source_file:
8
- source_data = json.load(source_file)
9
-
10
- # Load data from target.json
11
- with open(original_json_path, 'r') as target_file:
12
- target_data = json.load(target_file)
13
-
14
- # Ensure both source_data and target_data are lists
15
- if isinstance(source_data, list) and isinstance(target_data, list):
16
- # Append records from source_data to target_data
17
- target_data.extend(source_data)
18
- else:
19
- print("The JSON data must be a list of records in both files.")
20
-
21
- # Write updated data back to target.json
22
- with open(original_json_path, 'w') as target_file:
23
- json.dump(target_data, target_file, indent=4)
24
-
25
- print("Records have been appended successfully.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner-training/predict.py DELETED
@@ -1,18 +0,0 @@
1
- import spacy
2
- import pandas as pd
3
- import sys
4
-
5
- csv_path = sys.argv[1]
6
- custom_model_path = sys.argv[2]
7
-
8
- df = pd.read_csv(csv_path)
9
- texts = df['Additional Info'].to_list()
10
-
11
- trained_nlp = spacy.load(custom_model_path)
12
-
13
- for text in texts:
14
- doc = trained_nlp(text)
15
- print(f"TEXT: {text}")
16
- print()
17
- print("ENTITIES:", [(ent.text, ent.label_) for ent in doc.ents])
18
- print('-'*60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner-training/prepare_data.py DELETED
@@ -1,39 +0,0 @@
1
- import json
2
- import sys
3
-
4
- raw_data = sys.argv[1]
5
-
6
- def load_data(json_path):
7
- """
8
- Load your custom JSON with 'additional_info' and 'label' fields.
9
- Returns a list of (text, {"entities": [(start, end, label), ...]}) tuples.
10
- """
11
- with open(json_path, 'r', encoding='utf-8') as f:
12
- data = json.load(f)
13
-
14
- # If your JSON is a list of records
15
- # If it's a single record, wrap it in [data] or handle accordingly
16
- if not isinstance(data, list):
17
- data = [data]
18
-
19
- training_data = []
20
-
21
- for record in data:
22
- text = record["additional_info"]
23
- spans = []
24
- for annotation in record["label"]:
25
- # Each annotation can have multiple "labels", but typically there's just one
26
- label = annotation["labels"][0]
27
- start = annotation["start"]
28
- end = annotation["end"]
29
- spans.append((start, end, label))
30
- # Append in spaCy's format
31
- training_data.append((text, {"entities": spans}))
32
-
33
- return training_data
34
-
35
- if __name__ == "__main__":
36
- # Example usage
37
- TRAIN_DATA = load_data(raw_data)
38
-
39
- print(TRAIN_DATA[:2])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner-training/readme.md DELETED
@@ -1,36 +0,0 @@
1
- # Appropriate Usage for NER Training
2
-
3
- ## Cleaning and Debugging Training Data
4
-
5
- We first need to debug our raw labeled data from Label Studio. Sometimes labeled spans include leading or trailing whitespace or punctuation, which spaCy _really_ doesn't like, so we need to trim them.
6
-
7
- `python3 debug_labeled_data.py raw_labeled_data_path text_key_to_debug outdir`
8
-
9
- This will create a new debugged json file in the specified directory. **Use this file for the next step.**
10
-
11
- ## Preparing Data For Training
12
-
13
- Now, we need to convert this raw labeled data into spaCy's binary format. Before doing so, however, we must split the data into training and dev sets for testing.
14
-
15
- 1. `python3 split_data.py debugged_json_path`
16
-
17
- This will create `train.json` and `dev.json` files in the current working directory.
18
-
19
- 2. Move these files into the training_data dir: `mv *.json training_data/`
20
-
21
- 3. Convert both sets into Spacy's binary format:
22
-
23
- `python3 convert_to_spacy.py training_data/train.json training_data/train.spacy`
24
- `python3 convert_to_spacy.py training_data/dev.json training_data/dev.spacy`
25
-
26
- ## Training
27
-
28
- To start training the data from the CLI, we simply run the following command:
29
-
30
- ```
31
- python -m spacy train transformer.cfg \
32
- --paths.train training_data/train.spacy \
33
- --paths.dev training_data/dev.spacy \
34
- --gpu-id 0 \
35
- --output ./roberta_model
36
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner-training/remove_non_context_labels.py DELETED
@@ -1,19 +0,0 @@
1
- import json
2
- import sys
3
-
4
- file_to_filter = sys.argv[1]
5
- tag_to_keep = sys.argv[2]
6
- outpath = sys.argv[3]
7
-
8
- with open(file_to_filter, 'r') as input_file:
9
- dataset = json.load(input_file)
10
-
11
- def filter_context_labels(dataset):
12
- for item in dataset:
13
- item['label'] = [l for l in item['label'] if tag_to_keep in l['labels']]
14
- return dataset
15
-
16
- filtered_data = filter_context_labels(dataset)
17
-
18
- with open(outpath, 'w') as output_file:
19
- json.dump(filtered_data, output_file, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner-training/split_data.py DELETED
@@ -1,24 +0,0 @@
1
- import json
2
- import random
3
- import sys
4
-
5
- json_file_path = sys.argv[1]
6
-
7
- # Load your full dataset (make sure it's a list of records)
8
- with open(json_file_path, "r", encoding="utf-8") as f:
9
- data = json.load(f)
10
-
11
- # Shuffle and then split the data 80/20
12
- random.shuffle(data)
13
- split_index = int(len(data) * 0.8)
14
- train_data = data[:split_index]
15
- dev_data = data[split_index:]
16
-
17
- # Save the train and dev JSON files
18
- with open("train.json", "w", encoding="utf-8") as f:
19
- json.dump(train_data, f, indent=2)
20
-
21
- with open("dev.json", "w", encoding="utf-8") as f:
22
- json.dump(dev_data, f, indent=2)
23
-
24
- print(f"Train examples: {len(train_data)}, Dev examples: {len(dev_data)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ner-training/testing-model.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
ner-training/transformer.cfg DELETED
@@ -1,147 +0,0 @@
1
- [paths]
2
- train = null
3
- dev = null
4
- vectors = null
5
- init_tok2vec = null
6
-
7
- [system]
8
- gpu_allocator = "pytorch"
9
- seed = 0
10
-
11
- [nlp]
12
- lang = "en"
13
- pipeline = ["transformer","ner"]
14
- batch_size = 128
15
- disabled = []
16
- before_creation = null
17
- after_creation = null
18
- after_pipeline_creation = null
19
- tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20
- vectors = {"@vectors":"spacy.Vectors.v1"}
21
-
22
- [components]
23
-
24
- [components.ner]
25
- factory = "ner"
26
- incorrect_spans_key = null
27
- moves = null
28
- scorer = {"@scorers":"spacy.ner_scorer.v1"}
29
- update_with_oracle_cut_size = 100
30
-
31
- [components.ner.model]
32
- @architectures = "spacy.TransitionBasedParser.v2"
33
- state_type = "ner"
34
- extra_state_tokens = false
35
- hidden_width = 64
36
- maxout_pieces = 2
37
- use_upper = false
38
- nO = null
39
-
40
- [components.ner.model.tok2vec]
41
- @architectures = "spacy-transformers.TransformerListener.v1"
42
- grad_factor = 1.0
43
- pooling = {"@layers":"reduce_mean.v1"}
44
- upstream = "*"
45
-
46
- [components.transformer]
47
- factory = "transformer"
48
- max_batch_items = 4096
49
- set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
50
-
51
- [components.transformer.model]
52
- @architectures = "spacy-transformers.TransformerModel.v3"
53
- name = "roberta-base"
54
- mixed_precision = false
55
-
56
- [components.transformer.model.get_spans]
57
- @span_getters = "spacy-transformers.strided_spans.v1"
58
- window = 128
59
- stride = 96
60
-
61
- [components.transformer.model.grad_scaler_config]
62
-
63
- [components.transformer.model.tokenizer_config]
64
- use_fast = true
65
-
66
- [components.transformer.model.transformer_config]
67
-
68
- [corpora]
69
-
70
- [corpora.dev]
71
- @readers = "spacy.Corpus.v1"
72
- path = ${paths.dev}
73
- max_length = 0
74
- gold_preproc = false
75
- limit = 0
76
- augmenter = null
77
-
78
- [corpora.train]
79
- @readers = "spacy.Corpus.v1"
80
- path = ${paths.train}
81
- max_length = 0
82
- gold_preproc = false
83
- limit = 0
84
- augmenter = null
85
-
86
- [training]
87
- accumulate_gradient = 3
88
- dev_corpus = "corpora.dev"
89
- train_corpus = "corpora.train"
90
- seed = ${system.seed}
91
- gpu_allocator = ${system.gpu_allocator}
92
- dropout = 0.1
93
- patience = 1600
94
- max_epochs = 0
95
- max_steps = 20000
96
- eval_frequency = 200
97
- frozen_components = []
98
- annotating_components = []
99
- before_to_disk = null
100
- before_update = null
101
-
102
- [training.batcher]
103
- @batchers = "spacy.batch_by_padded.v1"
104
- discard_oversize = true
105
- size = 2000
106
- buffer = 256
107
- get_length = null
108
-
109
- [training.logger]
110
- @loggers = "spacy.ConsoleLogger.v1"
111
- progress_bar = false
112
-
113
- [training.optimizer]
114
- @optimizers = "Adam.v1"
115
- beta1 = 0.9
116
- beta2 = 0.999
117
- L2_is_weight_decay = true
118
- L2 = 0.01
119
- grad_clip = 1.0
120
- use_averages = false
121
- eps = 0.00000001
122
-
123
- [training.optimizer.learn_rate]
124
- @schedules = "warmup_linear.v1"
125
- warmup_steps = 250
126
- total_steps = 20000
127
- initial_rate = 0.00005
128
-
129
- [training.score_weights]
130
- ents_f = 1.0
131
- ents_p = 0.0
132
- ents_r = 0.0
133
- ents_per_type = null
134
-
135
- [pretraining]
136
-
137
- [initialize]
138
- vectors = ${paths.vectors}
139
- init_tok2vec = ${paths.init_tok2vec}
140
- vocab_data = null
141
- lookups = null
142
- before_init = null
143
- after_init = null
144
-
145
- [initialize.components]
146
-
147
- [initialize.tokenizer]