import types
import warnings
from typing import List, Optional, Tuple, Union

import numpy as np

from ..models.bert.tokenization_bert import BasicTokenizer
from ..utils import (
    ExplicitEnum,
    add_end_docstrings,
    is_tf_available,
    is_torch_available,
)
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, ChunkPipeline, Dataset


if is_tf_available():
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
class TokenClassificationArgumentHandler(ArgumentHandler):
    """
    Handles arguments for token classification.
    """

    def __call__(self, inputs: Union[str, List[str]], **kwargs):
        if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0:
            inputs = list(inputs)
            batch_size = len(inputs)
        elif isinstance(inputs, str):
            inputs = [inputs]
            batch_size = 1
        elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType):
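            # Datasets and generators are streamed through as-is; their batch size is not known here,
            # so the offset_mapping check below is skipped for them.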
            return inputs, None
        else:
            raise ValueError("At least one input is required.")

        offset_mapping = kwargs.get("offset_mapping")
        if offset_mapping:
            if isinstance(offset_mapping, list) and isinstance(offset_mapping[0], tuple):
                offset_mapping = [offset_mapping]
            if len(offset_mapping) != batch_size:
                raise ValueError("offset_mapping should have the same batch size as the input")
        return inputs, offset_mapping


class AggregationStrategy(ExplicitEnum):
    """All the valid aggregation strategies for TokenClassificationPipeline"""

    NONE = "none"
    SIMPLE = "simple"
    FIRST = "first"
    AVERAGE = "average"
    MAX = "max"


@add_end_docstrings(
    PIPELINE_INIT_ARGS,
    r"""
        ignore_labels (`List[str]`, defaults to `["O"]`):
            A list of labels to ignore.
        grouped_entities (`bool`, *optional*, defaults to `False`):
            DEPRECATED, use `aggregation_strategy` instead. Whether or not to group the tokens corresponding to the
            same entity together in the predictions.
        stride (`int`, *optional*):
            If stride is provided, the pipeline is applied on all the text. The text is split into chunks of size
            model_max_length. Works only with fast tokenizers and `aggregation_strategy` different from `NONE`. The
            value of this argument defines the number of overlapping tokens between chunks. In other words, the model
            will shift forward by `tokenizer.model_max_length - stride` tokens each step.
        aggregation_strategy (`str`, *optional*, defaults to `"none"`):
            The strategy to fuse (or not) tokens based on the model prediction.

                - "none" : Will not do any aggregation and simply return the raw results from the model.
                - "simple" : Will attempt to group entities following the default schema. (A, B-TAG), (B, I-TAG), (C,
                  I-TAG), (D, B-TAG2), (E, B-TAG2) will end up being [{"word": "ABC", "entity": "TAG"}, {"word": "D",
                  "entity": "TAG2"}, {"word": "E", "entity": "TAG2"}]. Notice that two consecutive B tags will end up
                  as different entities. On word based languages, we might end up splitting words undesirably: imagine
                  Microsoft being tagged as [{"word": "Micro", "entity": "ENTERPRISE"}, {"word": "soft", "entity":
                  "NAME"}]. Look at FIRST, MAX and AVERAGE for ways to mitigate that and disambiguate words (on
                  languages that support that meaning, which is basically tokens separated by a space). These
                  mitigations will only work on real words, "New york" might still be tagged with two different
                  entities.
                - "first" : (works only on word based models) Will use the `SIMPLE` strategy except that words cannot
                  end up with different tags. Words will simply use the tag of the first token of the word when there
                  is ambiguity.
                - "average" : (works only on word based models) Will use the `SIMPLE` strategy except that words
                  cannot end up with different tags. Scores will be averaged first across tokens, and then the label
                  with the maximum score is applied.
                - "max" : (works only on word based models) Will use the `SIMPLE` strategy except that words cannot
                  end up with different tags. The word entity will simply be the token with the maximum score.
    """,
)
class TokenClassificationPipeline(ChunkPipeline):
    """
    Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition
    examples](../task_summary#named-entity-recognition) for more information.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> token_classifier = pipeline(model="Jean-Baptiste/camembert-ner", aggregation_strategy="simple")
    >>> sentence = "Je m'appelle jean-baptiste et je vis à montréal"
    >>> tokens = token_classifier(sentence)
    >>> tokens
    [{'entity_group': 'PER', 'score': 0.9931, 'word': 'jean-baptiste', 'start': 12, 'end': 26}, {'entity_group': 'LOC', 'score': 0.998, 'word': 'montréal', 'start': 38, 'end': 47}]

    >>> token = tokens[0]
    >>> # Start and end provide an easy way to highlight words in the original text.
    >>> sentence[token["start"] : token["end"]]
    ' jean-baptiste'

    >>> # Some models use the same idea to do part of speech.
    >>> syntaxer = pipeline(model="vblagoje/bert-english-uncased-finetuned-pos", aggregation_strategy="simple")
    >>> syntaxer("My name is Sarah and I live in London")
    [{'entity_group': 'PRON', 'score': 0.999, 'word': 'my', 'start': 0, 'end': 2}, {'entity_group': 'NOUN', 'score': 0.997, 'word': 'name', 'start': 3, 'end': 7}, {'entity_group': 'AUX', 'score': 0.994, 'word': 'is', 'start': 8, 'end': 10}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'sarah', 'start': 11, 'end': 16}, {'entity_group': 'CCONJ', 'score': 0.999, 'word': 'and', 'start': 17, 'end': 20}, {'entity_group': 'PRON', 'score': 0.999, 'word': 'i', 'start': 21, 'end': 22}, {'entity_group': 'VERB', 'score': 0.998, 'word': 'live', 'start': 23, 'end': 27}, {'entity_group': 'ADP', 'score': 0.999, 'word': 'in', 'start': 28, 'end': 30}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'london', 'start': 31, 'end': 37}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This token recognition pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous).

    The models that this pipeline can use are models that have been fine-tuned on a token classification task. See
    the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=token-classification).
    """

    default_input_names = "sequences"

    def __init__(self, args_parser=TokenClassificationArgumentHandler(), *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.check_model_type(
            TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
            if self.framework == "tf"
            else MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
        )

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self._args_parser = args_parser

    def _sanitize_parameters(
        self,
        ignore_labels=None,
        grouped_entities: Optional[bool] = None,
        ignore_subwords: Optional[bool] = None,
        aggregation_strategy: Optional[AggregationStrategy] = None,
        offset_mapping: Optional[List[Tuple[int, int]]] = None,
        stride: Optional[int] = None,
    ):
        preprocess_params = {}
        if offset_mapping is not None:
            preprocess_params["offset_mapping"] = offset_mapping

        postprocess_params = {}
        if grouped_entities is not None or ignore_subwords is not None:
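            # Map the deprecated `grouped_entities`/`ignore_subwords` flags onto an equivalent strategy.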
            if grouped_entities and ignore_subwords:
                aggregation_strategy = AggregationStrategy.FIRST
            elif grouped_entities and not ignore_subwords:
                aggregation_strategy = AggregationStrategy.SIMPLE
            else:
                aggregation_strategy = AggregationStrategy.NONE

            if grouped_entities is not None:
                warnings.warn(
                    "`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to"
                    f' `aggregation_strategy="{aggregation_strategy}"` instead.'
                )
            if ignore_subwords is not None:
                warnings.warn(
                    "`ignore_subwords` is deprecated and will be removed in version v5.0.0, defaulted to"
                    f' `aggregation_strategy="{aggregation_strategy}"` instead.'
                )

        if aggregation_strategy is not None:
            if isinstance(aggregation_strategy, str):
                aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()]
            if (
                aggregation_strategy
                in {AggregationStrategy.FIRST, AggregationStrategy.MAX, AggregationStrategy.AVERAGE}
                and not self.tokenizer.is_fast
            ):
                raise ValueError(
                    "Slow tokenizers cannot handle subwords. Please set the `aggregation_strategy` option"
                    ' to `"simple"` or use a fast tokenizer.'
                )
            postprocess_params["aggregation_strategy"] = aggregation_strategy
        if ignore_labels is not None:
            postprocess_params["ignore_labels"] = ignore_labels
        if stride is not None:
            if stride >= self.tokenizer.model_max_length:
                raise ValueError(
                    "`stride` must be less than `tokenizer.model_max_length` (or even lower if the tokenizer adds special tokens)"
                )
            if aggregation_strategy == AggregationStrategy.NONE:
                raise ValueError(
                    "`stride` was provided to process all the text but `aggregation_strategy="
                    f'"{aggregation_strategy}"`, please select another one instead.'
                )
            else:
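                # Overlapping chunks come from the fast tokenizer: `return_overflowing_tokens` splits the
                # text into windows of at most `model_max_length` tokens that overlap by `stride` tokens.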
                if self.tokenizer.is_fast:
                    tokenizer_params = {
                        "return_overflowing_tokens": True,
                        "padding": True,
                        "stride": stride,
                    }
                    preprocess_params["tokenizer_params"] = tokenizer_params
                else:
                    raise ValueError(
                        "`stride` was provided to process all the text but you're using a slow tokenizer."
                        " Please use a fast tokenizer."
                    )
        return preprocess_params, {}, postprocess_params

    def __call__(self, inputs: Union[str, List[str]], **kwargs):
        """
        Classify each token of the text(s) given as inputs.

        Args:
            inputs (`str` or `List[str]`):
                One or several texts (or one list of texts) for token classification.

        Return:
            A list or a list of lists of `dict`: Each result comes as a list of dictionaries (one for each token in
            the corresponding input, or each entity if this pipeline was instantiated with an aggregation_strategy)
            with the following keys:

            - **word** (`str`) -- The token/word classified. This is obtained by decoding the selected tokens. If you
              want to have the exact string in the original sentence, use `start` and `end`.
            - **score** (`float`) -- The corresponding probability for `entity`.
            - **entity** (`str`) -- The entity predicted for that token/word (it is named *entity_group* when
              *aggregation_strategy* is not `"none"`).
            - **index** (`int`, only present when `aggregation_strategy="none"`) -- The index of the corresponding
              token in the sentence.
            - **start** (`int`, *optional*) -- The index of the start of the corresponding entity in the sentence.
              Only exists if the offsets are available within the tokenizer.
            - **end** (`int`, *optional*) -- The index of the end of the corresponding entity in the sentence. Only
              exists if the offsets are available within the tokenizer.
        """

        _inputs, offset_mapping = self._args_parser(inputs, **kwargs)
        if offset_mapping:
            kwargs["offset_mapping"] = offset_mapping

        return super().__call__(inputs, **kwargs)

    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
        inputs = self.tokenizer(
            sentence,
            return_tensors=self.framework,
            truncation=truncation,
            return_special_tokens_mask=True,
            return_offsets_mapping=self.tokenizer.is_fast,
            **tokenizer_params,
        )
        inputs.pop("overflow_to_sample_mapping", None)
        num_chunks = len(inputs["input_ids"])

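        # Yield one set of model inputs per chunk; the raw sentence is only attached to the first
        # chunk so that postprocess can recover character offsets from it.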
        for i in range(num_chunks):
            if self.framework == "tf":
                model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
            else:
                model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
            if offset_mapping is not None:
                model_inputs["offset_mapping"] = offset_mapping
            model_inputs["sentence"] = sentence if i == 0 else None
            model_inputs["is_last"] = i == num_chunks - 1

            yield model_inputs

    def _forward(self, model_inputs):
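        # Forward pass: pop the bookkeeping fields added in preprocess before calling the model.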
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")
        is_last = model_inputs.pop("is_last")
        if self.framework == "tf":
            logits = self.model(**model_inputs)[0]
        else:
            output = self.model(**model_inputs)
            logits = output["logits"] if isinstance(output, dict) else output[0]

        return {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            "is_last": is_last,
            **model_inputs,
        }

    def postprocess(self, all_outputs, aggregation_strategy=AggregationStrategy.NONE, ignore_labels=None):
        if ignore_labels is None:
            ignore_labels = ["O"]
        all_entities = []
        for model_outputs in all_outputs:
            logits = model_outputs["logits"][0].numpy()
            sentence = all_outputs[0]["sentence"]
            input_ids = model_outputs["input_ids"][0]
            offset_mapping = (
                model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None
            )
            special_tokens_mask = model_outputs["special_tokens_mask"][0].numpy()

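            # Numerically stable softmax over the label dimension.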
            maxes = np.max(logits, axis=-1, keepdims=True)
            shifted_exp = np.exp(logits - maxes)
            scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)

            if self.framework == "tf":
                input_ids = input_ids.numpy()
                offset_mapping = offset_mapping.numpy() if offset_mapping is not None else None

            pre_entities = self.gather_pre_entities(
                sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
            )
            grouped_entities = self.aggregate(pre_entities, aggregation_strategy)
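            # Filter out anything whose label is in `ignore_labels` (by default the "O" tag).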
            entities = [
                entity
                for entity in grouped_entities
                if entity.get("entity", None) not in ignore_labels
                and entity.get("entity_group", None) not in ignore_labels
            ]
            all_entities.extend(entities)
        num_chunks = len(all_outputs)
        if num_chunks > 1:
            all_entities = self.aggregate_overlapping_entities(all_entities)
        return all_entities

    def aggregate_overlapping_entities(self, entities):
        if len(entities) == 0:
            return entities
        entities = sorted(entities, key=lambda x: x["start"])
        aggregated_entities = []
        previous_entity = entities[0]
        for entity in entities:
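            # Chunks produced with `stride` overlap, so the same span can be predicted twice:
            # keep the longer entity, breaking ties by score.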
            if previous_entity["start"] <= entity["start"] < previous_entity["end"]:
                current_length = entity["end"] - entity["start"]
                previous_length = previous_entity["end"] - previous_entity["start"]
                if current_length > previous_length:
                    previous_entity = entity
                elif current_length == previous_length and entity["score"] > previous_entity["score"]:
                    previous_entity = entity
            else:
                aggregated_entities.append(previous_entity)
                previous_entity = entity
        aggregated_entities.append(previous_entity)
        return aggregated_entities

    def gather_pre_entities(
        self,
        sentence: str,
        input_ids: np.ndarray,
        scores: np.ndarray,
        offset_mapping: Optional[List[Tuple[int, int]]],
        special_tokens_mask: np.ndarray,
        aggregation_strategy: AggregationStrategy,
    ) -> List[dict]:
        """Fuse various numpy arrays into dicts with all the information needed for aggregation"""
        pre_entities = []
        for idx, token_scores in enumerate(scores):
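            # Skip special tokens such as [CLS], [SEP] and padding.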
            if special_tokens_mask[idx]:
                continue

            word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
            if offset_mapping is not None:
                start_ind, end_ind = offset_mapping[idx]
                if not isinstance(start_ind, int):
                    if self.framework == "pt":
                        start_ind = start_ind.item()
                        end_ind = end_ind.item()
                word_ref = sentence[start_ind:end_ind]
                if getattr(self.tokenizer, "_tokenizer", None) and getattr(
                    self.tokenizer._tokenizer.model, "continuing_subword_prefix", None
                ):
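                    # Word-aware tokenizer (WordPiece/BPE with a continuing subword prefix): a decoded
                    # subword token differs in length from the raw text span it covers.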
                    is_subword = len(word) != len(word_ref)
                else:
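                    # Fallback heuristic for tokenizers without subword information: treat the token as
                    # a subword when no space precedes it in the text. This can misfire on punctuation.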
                    if aggregation_strategy in {
                        AggregationStrategy.FIRST,
                        AggregationStrategy.AVERAGE,
                        AggregationStrategy.MAX,
                    }:
                        warnings.warn(
                            "Tokenizer does not support real words, using fallback heuristic",
                            UserWarning,
                        )
                    is_subword = start_ind > 0 and " " not in sentence[start_ind - 1 : start_ind + 1]

                if int(input_ids[idx]) == self.tokenizer.unk_token_id:
                    word = word_ref
                    is_subword = False
            else:
                start_ind = None
                end_ind = None
                is_subword = False

            pre_entity = {
                "word": word,
                "scores": token_scores,
                "start": start_ind,
                "end": end_ind,
                "index": idx,
                "is_subword": is_subword,
            }
            pre_entities.append(pre_entity)
        return pre_entities

    def aggregate(self, pre_entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
        if aggregation_strategy in {AggregationStrategy.NONE, AggregationStrategy.SIMPLE}:
            entities = []
            for pre_entity in pre_entities:
                entity_idx = pre_entity["scores"].argmax()
                score = pre_entity["scores"][entity_idx]
                entity = {
                    "entity": self.model.config.id2label[entity_idx],
                    "score": score,
                    "index": pre_entity["index"],
                    "word": pre_entity["word"],
                    "start": pre_entity["start"],
                    "end": pre_entity["end"],
                }
                entities.append(entity)
        else:
            entities = self.aggregate_words(pre_entities, aggregation_strategy)

        if aggregation_strategy == AggregationStrategy.NONE:
            return entities
        return self.group_entities(entities)
    def aggregate_word(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> dict:
        word = self.tokenizer.convert_tokens_to_string([entity["word"] for entity in entities])
        if aggregation_strategy == AggregationStrategy.FIRST:
            scores = entities[0]["scores"]
            idx = scores.argmax()
            score = scores[idx]
            entity = self.model.config.id2label[idx]
        elif aggregation_strategy == AggregationStrategy.MAX:
            max_entity = max(entities, key=lambda entity: entity["scores"].max())
            scores = max_entity["scores"]
            idx = scores.argmax()
            score = scores[idx]
            entity = self.model.config.id2label[idx]
        elif aggregation_strategy == AggregationStrategy.AVERAGE:
            scores = np.stack([entity["scores"] for entity in entities])
            average_scores = np.nanmean(scores, axis=0)
            entity_idx = average_scores.argmax()
            entity = self.model.config.id2label[entity_idx]
            score = average_scores[entity_idx]
        else:
            raise ValueError("Invalid aggregation_strategy")
        new_entity = {
            "entity": entity,
            "score": score,
            "word": word,
            "start": entities[0]["start"],
            "end": entities[-1]["end"],
        }
        return new_entity

    def aggregate_words(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
        """
        Override tokens from a given word that disagree to force agreement on word boundaries.

        Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft|
        company| B-ENT I-ENT
        """
        if aggregation_strategy in {
            AggregationStrategy.NONE,
            AggregationStrategy.SIMPLE,
        }:
            raise ValueError("NONE and SIMPLE strategies are invalid for word aggregation")

        word_entities = []
        word_group = None
        for entity in entities:
            if word_group is None:
                word_group = [entity]
            elif entity["is_subword"]:
                word_group.append(entity)
            else:
                word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
                word_group = [entity]
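        # Flush the last accumulated word group.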
        if word_group is not None:
            word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
        return word_entities

    def group_sub_entities(self, entities: List[dict]) -> dict:
        """
        Group together the adjacent tokens with the same entity predicted.

        Args:
            entities (`dict`): The entities predicted by the pipeline.
        """
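        # Use the tag of the first entity in the group, stripping any B-/I- prefix.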
        entity = entities[0]["entity"].split("-")[-1]
        scores = np.nanmean([entity["score"] for entity in entities])
        tokens = [entity["word"] for entity in entities]

        entity_group = {
            "entity_group": entity,
            "score": np.mean(scores),
            "word": self.tokenizer.convert_tokens_to_string(tokens),
            "start": entities[0]["start"],
            "end": entities[-1]["end"],
        }
        return entity_group

    def get_tag(self, entity_name: str) -> Tuple[str, str]:
        if entity_name.startswith("B-"):
            bi = "B"
            tag = entity_name[2:]
        elif entity_name.startswith("I-"):
            bi = "I"
            tag = entity_name[2:]
        else:
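            # Not in B-/I- format: default to I- so the tag can continue a previous entity of the same type.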
            bi = "I"
            tag = entity_name
        return bi, tag

    def group_entities(self, entities: List[dict]) -> List[dict]:
        """
        Find and group together the adjacent tokens with the same entity predicted.

        Args:
            entities (`dict`): The entities predicted by the pipeline.
        """

        entity_groups = []
        entity_group_disagg = []

        for entity in entities:
            if not entity_group_disagg:
                entity_group_disagg.append(entity)
                continue

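            # If the current token carries the same tag as the previous one and is not a new "B-" entity,
            # extend the current group; otherwise close the group and start a new one.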
            bi, tag = self.get_tag(entity["entity"])
            last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"])

            if tag == last_tag and bi != "B":
                entity_group_disagg.append(entity)
            else:
                entity_groups.append(self.group_sub_entities(entity_group_disagg))
                entity_group_disagg = [entity]
        if entity_group_disagg:
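            # Flush the final group.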
            entity_groups.append(self.group_sub_entities(entity_group_disagg))

        return entity_groups


NerPipeline = TokenClassificationPipeline