Spaces:

TwinklData
/

Community_Collections_App

Sleeping

App Files Files Community

lynn-twinkl committed on May 9, 2025

Commit

7f30bf8

1 Parent(s): 19fcede

removed this as it's no longer used

Browse files

Files changed (12) hide show

ner-training/begin-training.zsh +0 -14
ner-training/config.cfg +0 -145
ner-training/convert_to_spacy.py +0 -37
ner-training/debug_labeled_data.py +0 -89
ner-training/merge_json.py +0 -25
ner-training/predict.py +0 -18
ner-training/prepare_data.py +0 -39
ner-training/readme.md +0 -36
ner-training/remove_non_context_labels.py +0 -19
ner-training/split_data.py +0 -24
ner-training/testing-model.ipynb +0 -0
ner-training/transformer.cfg +0 -147

ner-training/begin-training.zsh DELETED Viewed

@@ -1,14 +0,0 @@
-#!/bin/zsh
-source ner_venv/bin/activate
-train_spacy_file=$1
-dev_spacy_file=$2
-model_outdir=$3
-python3 -m spacy train transformer.cfg \
---paths.train "$train_spacy_file" \
---paths.dev "$dev_spacy_file"  \
---gpu-id 0 \
---output "$model_outdir"

ner-training/config.cfg DELETED Viewed

@@ -1,145 +0,0 @@
-[paths]
-train = null
-dev = null
-vectors = "en_core_web_lg"
-init_tok2vec = null
-[system]
-gpu_allocator = null
-seed = 0
-[nlp]
-lang = "en"
-pipeline = ["tok2vec","ner"]
-batch_size = 1000
-disabled = []
-before_creation = null
-after_creation = null
-after_pipeline_creation = null
-tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
-vectors = {"@vectors":"spacy.Vectors.v1"}
-[components]
-[components.ner]
-factory = "ner"
-incorrect_spans_key = null
-moves = null
-scorer = {"@scorers":"spacy.ner_scorer.v1"}
-update_with_oracle_cut_size = 100
-[components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "ner"
-extra_state_tokens = false
-hidden_width = 64
-maxout_pieces = 2
-use_upper = true
-nO = null
-[components.ner.model.tok2vec]
-@architectures = "spacy.Tok2VecListener.v1"
-width = ${components.tok2vec.model.encode.width}
-upstream = "*"
-[components.tok2vec]
-factory = "tok2vec"
-[components.tok2vec.model]
-@architectures = "spacy.Tok2Vec.v2"
-[components.tok2vec.model.embed]
-@architectures = "spacy.MultiHashEmbed.v2"
-width = ${components.tok2vec.model.encode.width}
-attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
-rows = [5000,1000,2500,2500]
-include_static_vectors = true
-[components.tok2vec.model.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v2"
-width = 256
-depth = 8
-window_size = 1
-maxout_pieces = 3
-[corpora]
-[corpora.dev]
-@readers = "spacy.Corpus.v1"
-path = ${paths.dev}
-max_length = 0
-gold_preproc = false
-limit = 0
-augmenter = null
-[corpora.train]
-@readers = "spacy.Corpus.v1"
-path = ${paths.train}
-max_length = 0
-gold_preproc = false
-limit = 0
-augmenter = null
-[training]
-dev_corpus = "corpora.dev"
-train_corpus = "corpora.train"
-seed = ${system.seed}
-gpu_allocator = ${system.gpu_allocator}
-dropout = 0.1
-accumulate_gradient = 1
-patience = 1600
-max_epochs = 0
-max_steps = 20000
-eval_frequency = 200
-frozen_components = []
-annotating_components = []
-before_to_disk = null
-before_update = null
-[training.batcher]
-@batchers = "spacy.batch_by_words.v1"
-discard_oversize = false
-tolerance = 0.2
-get_length = null
-[training.batcher.size]
-@schedules = "compounding.v1"
-start = 100
-stop = 1000
-compound = 1.001
-t = 0.0
-[training.logger]
-@loggers = "spacy.ConsoleLogger.v1"
-progress_bar = false
-[training.optimizer]
-@optimizers = "Adam.v1"
-beta1 = 0.9
-beta2 = 0.999
-L2_is_weight_decay = true
-L2 = 0.01
-grad_clip = 1.0
-use_averages = false
-eps = 0.00000001
-learn_rate = 0.001
-[training.score_weights]
-ents_f = 1.0
-ents_p = 0.0
-ents_r = 0.0
-ents_per_type = null
-[pretraining]
-[initialize]
-vectors = ${paths.vectors}
-init_tok2vec = ${paths.init_tok2vec}
-vocab_data = null
-lookups = null
-before_init = null
-after_init = null
-[initialize.components]
-[initialize.tokenizer]

ner-training/convert_to_spacy.py DELETED Viewed

@@ -1,37 +0,0 @@
-import spacy
-from spacy.tokens import DocBin
-from prepare_data import load_data
-import sys
-json_file_path = sys.argv[1]
-outpath = sys.argv[2]
-def create_spacy_binary(json_path, output_path, nlp):
-    """
-    Convert raw training examples into spaCy DocBin.
-    """
-    db = DocBin()
-    # Load your custom data
-    training_data = load_data(json_path)
-    for text, ann in training_data:
-        doc = nlp.make_doc(text)
-        ents = []
-        for start, end, label in ann["entities"]:
-            span = doc.char_span(start, end, label=label, alignment_mode="contract")
-            if span is None:
-                # If spaCy can't align the tokenization, skip or handle carefully
-                print(f"Skipping misaligned entity: '{text[start:end]}'")
-            else:
-                ents.append(span)
-        doc.ents = ents
-        db.add(doc)
-    db.to_disk(output_path)
-    print(f"Created spaCy binary file: {output_path}")
-if __name__ == "__main__":
-    nlp = spacy.blank("en")  # blank English pipeline
-    create_spacy_binary(json_file_path, outpath, nlp)

ner-training/debug_labeled_data.py DELETED Viewed

@@ -1,89 +0,0 @@
-import sys
-import json
-import re
-json_file_path = sys.argv[1]
-text_key = sys.argv[2]
-out_path = sys.argv[3]
-# ------------ FUNCTION ------------
-def trim_and_fix_offsets(raw_data, context_key=text_key):
-    """
-    Attempt to fix leading/trailing whitespace in spans and recalc offsets.
-    Then do a local substring search to fix minor misalignments.
-    """
-    fixed_data = []
-    for i, record in enumerate(raw_data):
-        text = record[context_key]
-        new_labels = []
-        for ann in record["label"]:
-            label = ann["labels"][0]
-            old_start, old_end = ann["start"], ann["end"]
-            original_substring = text[old_start:old_end]
-            trimmed_substring = original_substring.strip()
-            # 1) Trim leading/trailing whitespace offsets
-            # Move start forward while it points to space
-            start = old_start
-            while start < old_end and text[start].isspace():
-                start += 1
-            # Move end backward while it points to space
-            end = old_end
-            while end > start and text[end - 1].isspace():
-                end -= 1
-            # After naive trimming, see if the substring still matches
-            new_substring = text[start:end]
-            if new_substring == trimmed_substring:
-                # Great, we can trust these offsets directly
-                pass
-            else:
-                # Possibly there's hidden Unicode or the original offset was off.
-                # We'll do a local substring search around `old_start`.
-                # We'll search for `trimmed_substring` in a window of +/- 30 chars.
-                window_size = 30
-                # Define a safe search window in the text
-                search_start = max(0, old_start - window_size)
-                search_end = min(len(text), old_end + window_size)
-                window_text = text[search_start:search_end]
-                # Try to find the first occurrence of trimmed_substring in that window
-                local_pos = window_text.find(trimmed_substring)
-                if local_pos != -1:
-                    # Recalc absolute offset
-                    start = search_start + local_pos
-                    end = start + len(trimmed_substring)
-                    new_substring = text[start:end]
-                else:
-                    # We failed to find it in the local region
-                    print(f"[Record {i}] Can't find '{trimmed_substring}' near offset {old_start}-{old_end}")
-                    # We'll leave this annotation as-is or skip it
-                    start, end = old_start, old_end
-                    new_substring = original_substring
-            new_labels.append({
-                "start": start,
-                "end": end,
-                "text": new_substring,
-                "labels": [label]
-            })
-        # Update the record with the new label data
-        new_record = dict(record)
-        new_record["label"] = new_labels
-        fixed_data.append(new_record)
-    return fixed_data
-# ----------------- USAGE ----------------
-with open(json_file_path, "r", encoding="utf-8") as f:
-    raw_data = json.load(f)
-fixed_data = trim_and_fix_offsets(raw_data, context_key=text_key)
-with open(out_path, "w", encoding="utf-8") as out:
-    json.dump(fixed_data, out, indent=2, ensure_ascii=False)

ner-training/merge_json.py DELETED Viewed

@@ -1,25 +0,0 @@
-import json
-import sys
-original_json_path = sys.argv[1]
-additional_json_path = sys.argv[2]
-with open(additional_json_path, 'r') as source_file:
-    source_data = json.load(source_file)
-# Load data from target.json
-with open(original_json_path, 'r') as target_file:
-    target_data = json.load(target_file)
-# Ensure both source_data and target_data are lists
-if isinstance(source_data, list) and isinstance(target_data, list):
-    # Append records from source_data to target_data
-    target_data.extend(source_data)
-else:
-    print("The JSON data must be a list of records in both files.")
-# Write updated data back to target.json
-with open(original_json_path, 'w') as target_file:
-    json.dump(target_data, target_file, indent=4)
-print("Records have been appended successfully.")

ner-training/predict.py DELETED Viewed

@@ -1,18 +0,0 @@
-import spacy
-import pandas as pd
-import sys
-csv_path = sys.argv[1]
-custom_model_path = sys.argv[2]
-df = pd.read_csv(csv_path)
-texts = df['Additional Info'].to_list()
-trained_nlp = spacy.load(custom_model_path)
-for text in texts:
-    doc = trained_nlp(text)
-    print(f"TEXT: {text}")
-    print()
-    print("ENTITIES:", [(ent.text, ent.label_) for ent in doc.ents])
-    print('-'*60)

ner-training/prepare_data.py DELETED Viewed

@@ -1,39 +0,0 @@
-import json
-import sys
-raw_data = sys.argv[1]
-def load_data(json_path):
-    """
-    Load your custom JSON with 'additional_info' and 'label' fields.
-    Returns a list of (text, {"entities": [(start, end, label), ...]}) tuples.
-    """
-    with open(json_path, 'r', encoding='utf-8') as f:
-        data = json.load(f)
-    # If your JSON is a list of records
-    # If it's a single record, wrap it in [data] or handle accordingly
-    if not isinstance(data, list):
-        data = [data]
-    training_data = []
-    for record in data:
-        text = record["additional_info"]
-        spans = []
-        for annotation in record["label"]:
-            # Each annotation can have multiple "labels", but typically there's just one
-            label = annotation["labels"][0]
-            start = annotation["start"]
-            end = annotation["end"]
-            spans.append((start, end, label))
-        # Append in spaCy's format
-        training_data.append((text, {"entities": spans}))
-    return training_data
-if __name__ == "__main__":
-    # Example usage
-    TRAIN_DATA = load_data(raw_data)
-    print(TRAIN_DATA[:2])

ner-training/readme.md DELETED Viewed

@@ -1,36 +0,0 @@
-# Appropriate Usage for NER Training
-## Cleaning and Debugging Training Data
-We first need to debug our raw labeled data from Label Studio. Sometimes, labeled data has trailing whitespaces or punctuation, which Spacy _really_ doesn't like. So we need to remove it.
-`python3 debug_labeled_data.py raw_labeled_data_path text_key_to_debug out_path`
-This will write a new, debugged JSON file to the specified output path. **Use this file for the next step.**
-## Preparing Data For Training
-Now, we need to convert this raw labeled data into Spacy's binary format. Before doing so, however, we must make sure to split the data into training and dev sets for testing.
-1. `python3 split_data.py debugged_json_path`
-This will create `train.json` and `dev.json` files in the current working directory.
-2. Move these files into the training_data dir:  `mv *.json training_data/`
-3. Convert both sets into Spacy's binary format:
-`python3 convert_to_spacy.py training_data/train.json training_data/train.spacy`
-`python3 convert_to_spacy.py training_data/dev.json training_data/dev.spacy`
-## Training
-To start training the data from the CLI, we simply run the following command:
-```
-python -m spacy train transformer.cfg \
---paths.train training_data/train.spacy \
---paths.dev training_data/dev.spacy \
---gpu-id 0 \
---output ./roberta_model
-```

ner-training/remove_non_context_labels.py DELETED Viewed

@@ -1,19 +0,0 @@
-import json
-import sys
-file_to_filter = sys.argv[1]
-tag_to_keep = sys.argv[2]
-outpath = sys.argv[3]
-with open(file_to_filter, 'r') as input_file:
-    dataset = json.load(input_file)
-def filter_context_labels(dataset):
-    for item in dataset:
-        item['label'] = [l for l in item['label'] if tag_to_keep in l['labels']]
-    return dataset
-filtered_data = filter_context_labels(dataset)
-with open(outpath, 'w') as output_file:
-    json.dump(filtered_data, output_file, indent=2)

ner-training/split_data.py DELETED Viewed

@@ -1,24 +0,0 @@
-import json
-import random
-import sys
-json_file_path = sys.argv[1]
-# Load your full dataset (make sure it's a list of records)
-with open(json_file_path, "r", encoding="utf-8") as f:
-    data = json.load(f)
-# Shuffle and then split the data 80/20
-random.shuffle(data)
-split_index = int(len(data) * 0.8)
-train_data = data[:split_index]
-dev_data = data[split_index:]
-# Save the train and dev JSON files
-with open("train.json", "w", encoding="utf-8") as f:
-    json.dump(train_data, f, indent=2)
-with open("dev.json", "w", encoding="utf-8") as f:
-    json.dump(dev_data, f, indent=2)
-print(f"Train examples: {len(train_data)}, Dev examples: {len(dev_data)}")

ner-training/testing-model.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff

ner-training/transformer.cfg DELETED Viewed

@@ -1,147 +0,0 @@
-[paths]
-train = null
-dev = null
-vectors = null
-init_tok2vec = null
-[system]
-gpu_allocator = "pytorch"
-seed = 0
-[nlp]
-lang = "en"
-pipeline = ["transformer","ner"]
-batch_size = 128
-disabled = []
-before_creation = null
-after_creation = null
-after_pipeline_creation = null
-tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
-vectors = {"@vectors":"spacy.Vectors.v1"}
-[components]
-[components.ner]
-factory = "ner"
-incorrect_spans_key = null
-moves = null
-scorer = {"@scorers":"spacy.ner_scorer.v1"}
-update_with_oracle_cut_size = 100
-[components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "ner"
-extra_state_tokens = false
-hidden_width = 64
-maxout_pieces = 2
-use_upper = false
-nO = null
-[components.ner.model.tok2vec]
-@architectures = "spacy-transformers.TransformerListener.v1"
-grad_factor = 1.0
-pooling = {"@layers":"reduce_mean.v1"}
-upstream = "*"
-[components.transformer]
-factory = "transformer"
-max_batch_items = 4096
-set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
-[components.transformer.model]
-@architectures = "spacy-transformers.TransformerModel.v3"
-name = "roberta-base"
-mixed_precision = false
-[components.transformer.model.get_spans]
-@span_getters = "spacy-transformers.strided_spans.v1"
-window = 128
-stride = 96
-[components.transformer.model.grad_scaler_config]
-[components.transformer.model.tokenizer_config]
-use_fast = true
-[components.transformer.model.transformer_config]
-[corpora]
-[corpora.dev]
-@readers = "spacy.Corpus.v1"
-path = ${paths.dev}
-max_length = 0
-gold_preproc = false
-limit = 0
-augmenter = null
-[corpora.train]
-@readers = "spacy.Corpus.v1"
-path = ${paths.train}
-max_length = 0
-gold_preproc = false
-limit = 0
-augmenter = null
-[training]
-accumulate_gradient = 3
-dev_corpus = "corpora.dev"
-train_corpus = "corpora.train"
-seed = ${system.seed}
-gpu_allocator = ${system.gpu_allocator}
-dropout = 0.1
-patience = 1600
-max_epochs = 0
-max_steps = 20000
-eval_frequency = 200
-frozen_components = []
-annotating_components = []
-before_to_disk = null
-before_update = null
-[training.batcher]
-@batchers = "spacy.batch_by_padded.v1"
-discard_oversize = true
-size = 2000
-buffer = 256
-get_length = null
-[training.logger]
-@loggers = "spacy.ConsoleLogger.v1"
-progress_bar = false
-[training.optimizer]
-@optimizers = "Adam.v1"
-beta1 = 0.9
-beta2 = 0.999
-L2_is_weight_decay = true
-L2 = 0.01
-grad_clip = 1.0
-use_averages = false
-eps = 0.00000001
-[training.optimizer.learn_rate]
-@schedules = "warmup_linear.v1"
-warmup_steps = 250
-total_steps = 20000
-initial_rate = 0.00005
-[training.score_weights]
-ents_f = 1.0
-ents_p = 0.0
-ents_r = 0.0
-ents_per_type = null
-[pretraining]
-[initialize]
-vectors = ${paths.vectors}
-init_tok2vec = ${paths.init_tok2vec}
-vocab_data = null
-lookups = null
-before_init = null
-after_init = null
-[initialize.components]
-[initialize.tokenizer]