# spaCy / transformer NER training and evaluation utilities.
| import spacy | |
| from spacy.util import minibatch, compounding | |
| from spacy.scorer import Scorer | |
| from src.model_utils import * | |
| import random | |
| from tqdm import tqdm | |
def train_transformer(config: dict, train_data: list, components: list, iter: int,
                      batch_size=None, entities: list = None, eval_data: list = None):
    """
    Finetune a transformer model or resume training from a fine-tuned model.

    Parameters:
        config: dict, configuration parameters; reads config['dir'] (path of a
            saved model to resume from, or None) and config['config']
            (transformer pipe configuration)
        train_data: list, containing training data
        components: list, pipeline components to be trained
        iter: int, number of iterations to train
        batch_size: batch size (or spaCy size schedule) used for training;
            defaults to a fresh compounding(4.0, 32.0, 1.001) schedule per call
        entities: list of entity labels to be trained on for NER
        eval_data: list, containing evaluation data; when empty/None, scoring
            falls back to the training data
    Returns:
        nlp: the trained spaCy pipeline
        all_losses: list of per-component losses at every iteration
    """
    # A compounding(...) schedule as a *default argument* is created once at
    # import time and shared (and consumed) across calls; build a fresh one
    # per call instead.
    if batch_size is None:
        batch_size = compounding(4.0, 32.0, 1.001)
    if config['dir'] is not None:
        # Resume training from an already fine-tuned model on disk.
        nlp = spacy.load(config['dir'])
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank("en")  # empty English pipeline
        nlp.add_pipe("transformer", config=config['config'])
        for component in components:
            nlp.add_pipe(component)
            task = nlp.get_pipe(component)
            if ('ner' in components) and (entities is not None):
                for label in entities:
                    task.add_label(label)
        nlp.initialize()  # XXX don't forget this step!
        optimizer = nlp.create_optimizer()
    all_losses = []
    for itn in tqdm(range(1, iter + 1)):
        print("Starting iteration " + str(itn))
        # Shuffle the raw data, then rebuild the training docs so the shuffle
        # actually affects batching (previously the docs were built once
        # before the loop, so shuffling train_data had no effect).
        random.shuffle(train_data)
        train_data_doc = make_training_doc(nlp, train_data)
        losses = {}
        batches = minibatch(train_data_doc, size=batch_size)
        for batch in batches:
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
        # Score on eval data when available, otherwise on the training data.
        scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
        print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".
              format(itn, str(losses), scores['ents_r'], scores['ents_p'], scores['ents_f']))
        all_losses.append([losses[component] for component in components])
    return nlp, all_losses
def train_spacy(model, train_data: list, components: list, iter: int,
                batch_size=None, entities: list = None, eval_data: list = None):
    """
    Finetune a spaCy model or resume training from a fine-tuned model.

    Parameters:
        model: str, name (or path) of the spaCy model to load; must not be None
        train_data: list, containing training data
        components: list, pipeline components to be trained
        iter: int, number of iterations to train
        batch_size: batch size (or spaCy size schedule) used for training;
            defaults to a fresh compounding(4.0, 32.0, 1.001) schedule per call
        entities: list of entity labels to be trained on for NER
        eval_data: list, containing evaluation data; when empty/None, scoring
            falls back to the training data
    Returns:
        nlp: the trained spaCy model
        all_losses: list of per-component losses at every iteration
    Raises:
        ValueError: if model is None (the original code fell through to a
            NameError on unbound nlp/optimizer in that case)
    """
    if model is None:
        raise ValueError("train_spacy requires a model name or path to load")
    # A compounding(...) schedule as a *default argument* is created once at
    # import time and shared (and consumed) across calls; build a fresh one
    # per call instead.
    if batch_size is None:
        batch_size = compounding(4.0, 32.0, 1.001)
    # get model and optimizer
    nlp, optimizer = load_model(model)  # load existing spaCy model / blank model
    # create the built-in pipeline components and add them to the pipeline;
    # nlp.create_pipe works for built-ins that are registered with spaCy
    for component in components:
        if component not in nlp.pipe_names:
            ner = nlp.create_pipe(component)
            nlp.add_pipe(component, last=True)
        else:
            ner = nlp.get_pipe(component)
        # add labels if component is NER
        if (component == 'ner') and (entities is not None):
            for ent in entities:
                ner.add_label(ent)
            print(f'Entities in the model are: {nlp.get_pipe("ner").labels}')
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in components]
    all_losses = []
    with nlp.disable_pipes(*other_pipes):  # only train the listed components
        for itn in tqdm(range(1, iter + 1)):
            print("Starting iteration " + str(itn))
            # Shuffle the raw data, then rebuild the training docs so the
            # shuffle actually affects batching (previously the docs were
            # built once up front, so shuffling train_data had no effect).
            random.shuffle(train_data)
            train_data_doc = make_training_doc(nlp, train_data)
            losses = {}
            batches = minibatch(train_data_doc, size=batch_size)
            for batch in batches:
                nlp.update(list(batch),
                           losses=losses,
                           drop=0.1,
                           sgd=optimizer)
            # Score on eval data when available, otherwise on training data.
            scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
            print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".
                  format(itn, str(losses), scores['ents_r'], scores['ents_p'], scores['ents_f']))
            all_losses.append([losses[component] for component in components])
    return nlp, all_losses
def eval_spacy(model, data):
    """
    Perform evaluation and scoring.

    Parameters:
        model: either a spaCy model or spaCy transformer pipeline
        data: evaluation data; either spaCy-format (text, annotation-dict)
            pairs, or rows whose .values() unpack to (text, entity-list)
    Returns:
        scores: dict with scores of the model, or None when scoring fails
            with an unexpected error (the failure is printed, not raised)
    """
    scorer = Scorer()
    examples = []
    try:
        # accept spaCy-format json data: (text, annotation-dict) pairs
        for input_, annot in data:
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, annot)
            example.predicted = model(str(example.text))
            examples.append(example)
        scores = scorer.score(examples)
        return scores
    except TypeError:
        # accept alternative format json data: each row's values unpack to
        # (text, entity-list). Reset the list first so any examples built
        # before the TypeError are not double-counted by this fallback pass.
        examples = []
        for row in data:
            input_, annot = row.values()
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, {'entities': annot})
            example.predicted = model(str(example.text))
            examples.append(example)
        scores = scorer.score(examples)
        return scores
    except Exception as e:
        # Best-effort scoring: report the failure and return None rather
        # than raising (callers treat a missing score dict as "no scores").
        print(e)
        return None