Upload transformer_re_text_classification2.py
transformer_re_text_classification2.py
ADDED
@@ -0,0 +1,574 @@
"""
workflow:
    Document
        -> (InputEncoding, TargetEncoding) -> TaskEncoding -> TaskBatchEncoding
            -> ModelBatchEncoding -> ModelBatchOutput
        -> TaskOutput
    -> Document
"""

import logging
from typing import Any, Dict, Iterator, List, Optional, Sequence, Set, Tuple, TypedDict, Union

import numpy as np
import torch
from pytorch_ie.annotations import BinaryRelation, LabeledSpan, MultiLabeledBinaryRelation, Span
from pytorch_ie.core import TaskEncoding, TaskModule
from pytorch_ie.documents import TextDocument
from pytorch_ie.models import (
    TransformerTextClassificationModelBatchOutput,
    TransformerTextClassificationModelStepBatchEncoding,
)
from pytorch_ie.utils.span import get_token_slice, is_contained_in
from pytorch_ie.utils.window import get_window_around_slice
from transformers import AutoTokenizer
from transformers.file_utils import PaddingStrategy
from transformers.tokenization_utils_base import BatchEncoding, TruncationStrategy
from typing_extensions import TypeAlias

TransformerReTextClassificationInputEncoding2: TypeAlias = Dict[str, Any]
TransformerReTextClassificationTargetEncoding2: TypeAlias = Sequence[int]

TransformerReTextClassificationTaskEncoding2: TypeAlias = TaskEncoding[
    TextDocument,
    TransformerReTextClassificationInputEncoding2,
    TransformerReTextClassificationTargetEncoding2,
]


class TransformerReTextClassificationTaskOutput2(TypedDict, total=False):
    labels: Sequence[str]
    probabilities: Sequence[float]


_TransformerReTextClassificationTaskModule2: TypeAlias = TaskModule[
    # _InputEncoding, _TargetEncoding, _TaskBatchEncoding, _ModelBatchOutput, _TaskOutput
    TextDocument,
    TransformerReTextClassificationInputEncoding2,
    TransformerReTextClassificationTargetEncoding2,
    TransformerTextClassificationModelStepBatchEncoding,
    TransformerTextClassificationModelBatchOutput,
    TransformerReTextClassificationTaskOutput2,
]


HEAD = "head"
TAIL = "tail"
START = "start"
END = "end"


logger = logging.getLogger(__name__)


class RelationArgument:
    def __init__(
        self,
        entity: LabeledSpan,
        role: str,
        offsets: Tuple[int, int],
        add_type_to_marker: bool,
    ) -> None:
        self.entity = entity
        self.role = role
        assert self.role in (HEAD, TAIL)
        self.offsets = offsets
        self.add_type_to_marker = add_type_to_marker

    @property
    def is_head(self) -> bool:
        return self.role == HEAD

    @property
    def is_tail(self) -> bool:
        return self.role == TAIL

    @property
    def as_start_marker(self) -> str:
        return self._get_marker(is_start=True)

    @property
    def as_end_marker(self) -> str:
        return self._get_marker(is_start=False)

    def _get_marker(self, is_start: bool = True) -> str:
        return f"[{'' if is_start else '/'}{'H' if self.is_head else 'T'}" + (
            f":{self.entity.label}]" if self.add_type_to_marker else "]"
        )

    @property
    def as_append_marker(self) -> str:
        return f"[{'H' if self.is_head else 'T'}={self.entity.label}]"
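
    # Illustration (not part of the original file): for a head argument whose entity
    # has label "PER", the properties above produce
    #   as_start_marker  -> "[H]"   (or "[H:PER]" if add_type_to_marker=True)
    #   as_end_marker    -> "[/H]"  (or "[/H:PER]")
    #   as_append_marker -> "[H=PER]"
    # Tail arguments use "T" in place of "H".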


def _enumerate_entity_pairs(
    entities: Sequence[Span],
    partition: Optional[Span] = None,
    relations: Optional[Sequence[BinaryRelation]] = None,
):
    """Given a list of `entities`, iterate over all valid pairs of entities, including
    inverted pairs.

    If a `partition` is provided, restrict pairs to those contained in it. If `relations`
    are given, yield only pairs for which a predefined relation exists (e.g. for relation
    classification on the train/val/test splits of supervised datasets).
    """
    existing_head_tail = {(relation.head, relation.tail) for relation in relations or []}
    for head in entities:
        if partition is not None and not is_contained_in(
            (head.start, head.end), (partition.start, partition.end)
        ):
            continue

        for tail in entities:
            if partition is not None and not is_contained_in(
                (tail.start, tail.end), (partition.start, partition.end)
            ):
                continue

            if head == tail:
                continue

            if relations is not None and (head, tail) not in existing_head_tail:
                continue

            yield head, tail
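
# A minimal illustration (not part of the original file): for entities [e1, e2, e3] and no
# `relations` filter, the generator yields the six ordered pairs
#   (e1, e2), (e1, e3), (e2, e1), (e2, e3), (e3, e1), (e3, e2).
# With relations=[BinaryRelation(head=e1, tail=e2, label="works_for")], only (e1, e2) is
# yielded, since all other pairs have no predefined relation.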


@TaskModule.register()
class TransformerRETextClassificationTaskModule2(_TransformerReTextClassificationTaskModule2):
    """Marker-based relation extraction. This taskmodule prepares the input token ids in such
    a way that special marker tokens are inserted before and after the candidate head and tail
    entities. The modified token ids can then be passed directly into a transformer-based text
    classification model.

    parameters:

        partition_annotation: str, optional. If specified, LabeledSpan annotations with this
            name are expected to define partitions of the document that will be processed
            individually, e.g. sentences or sections of the document text.
        none_label: str, defaults to "no_relation". The relation label that indicates
            dummy/negative relations. Predicted relations with that label will not be added
            to the document(s).
        max_window: int, optional. If specified, use only the tokens in a window of at most
            this many tokens around the center of the head and tail entities, and pass only
            that window into the transformer.
    """
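
    # A hypothetical construction sketch (the model name and windowing value are assumptions,
    # not part of the original file):
    #
    #   taskmodule = TransformerRETextClassificationTaskModule2(
    #       tokenizer_name_or_path="bert-base-cased",
    #       entity_annotation="entities",
    #       relation_annotation="relations",
    #       max_window=128,
    #   )
    #   taskmodule.prepare(train_documents)  # collects label_to_id and entity_labels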

    PREPARED_ATTRIBUTES = ["label_to_id", "entity_labels"]

    def __init__(
        self,
        tokenizer_name_or_path: str,
        entity_annotation: str = "entities",
        relation_annotation: str = "relations",
        partition_annotation: Optional[str] = None,
        none_label: str = "no_relation",
        padding: Union[bool, str, PaddingStrategy] = True,
        truncation: Union[bool, str, TruncationStrategy] = True,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        multi_label: bool = False,
        label_to_id: Optional[Dict[str, int]] = None,
        add_type_to_marker: bool = False,
        single_argument_pair: bool = True,
        append_markers: bool = False,
        entity_labels: Optional[List[str]] = None,
        max_window: Optional[int] = None,
        log_first_n_examples: Optional[int] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.save_hyperparameters()

        self.entity_annotation = entity_annotation
        self.relation_annotation = relation_annotation
        self.padding = padding
        self.truncation = truncation
        self.label_to_id = label_to_id or {}
        self.id_to_label = {v: k for k, v in self.label_to_id.items()}
        self.max_length = max_length
        self.pad_to_multiple_of = pad_to_multiple_of
        self.multi_label = multi_label
        self.add_type_to_marker = add_type_to_marker
        self.single_argument_pair = single_argument_pair
        self.append_markers = append_markers
        self.entity_labels = entity_labels
        self.partition_annotation = partition_annotation
        self.none_label = none_label
        self.max_window = max_window
        self.log_first_n_examples = log_first_n_examples

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

        self.argument_markers = None

        self._logged_examples_counter = 0

    def _prepare(self, documents: Sequence[TextDocument]) -> None:
        entity_labels: Set[str] = set()
        relation_labels: Set[str] = set()
        for document in documents:
            entities: Sequence[LabeledSpan] = document[self.entity_annotation]
            relations: Sequence[BinaryRelation] = document[self.relation_annotation]

            for entity in entities:
                entity_labels.add(entity.label)

            for relation in relations:
                relation_labels.add(relation.label)

        if self.none_label in relation_labels:
            relation_labels.remove(self.none_label)

        self.label_to_id = {label: i + 1 for i, label in enumerate(sorted(relation_labels))}
        self.label_to_id[self.none_label] = 0

        self.entity_labels = sorted(entity_labels)
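
        # For example (illustrative, assuming relation labels {"per:employee_of",
        # "org:founded_by"} were collected):
        #   label_to_id == {"no_relation": 0, "org:founded_by": 1, "per:employee_of": 2}
        # The none_label always maps to id 0; real labels get ids 1..N in sorted order.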

    def _post_prepare(self):
        self.argument_markers = self._initialize_argument_markers()
        self.tokenizer.add_tokens(self.argument_markers, special_tokens=True)

        self.argument_markers_to_id = {
            marker: self.tokenizer.vocab[marker] for marker in self.argument_markers
        }
        self.sep_token_id = self.tokenizer.vocab[self.tokenizer.sep_token]

        self.id_to_label = {v: k for k, v in self.label_to_id.items()}

    def _initialize_argument_markers(self) -> List[str]:
        argument_markers: Set[str] = set()
        for arg_type in [HEAD, TAIL]:
            for arg_pos in [START, END]:
                is_head = arg_type == HEAD
                is_start = arg_pos == START
                argument_markers.add(f"[{'' if is_start else '/'}{'H' if is_head else 'T'}]")
                if self.add_type_to_marker:
                    for entity_type in self.entity_labels:  # type: ignore
                        argument_markers.add(
                            f"[{'' if is_start else '/'}{'H' if is_head else 'T'}"
                            f"{':' + entity_type if self.add_type_to_marker else ''}]"
                        )
                if self.append_markers:
                    for entity_type in self.entity_labels:  # type: ignore
                        argument_markers.add(f"[{'H' if is_head else 'T'}={entity_type}]")

        return sorted(list(argument_markers))

    def _encode_text(
        self,
        document: TextDocument,
        partition: Optional[Span] = None,
        add_special_tokens: bool = True,
    ) -> BatchEncoding:
        text = (
            document.text[partition.start : partition.end]
            if partition is not None
            else document.text
        )
        encoding = self.tokenizer(
            text,
            padding=False,
            truncation=self.truncation,
            max_length=self.max_length,
            is_split_into_words=False,
            return_offsets_mapping=False,
            add_special_tokens=add_special_tokens,
        )
        return encoding

    def encode_input(
        self,
        document: TextDocument,
        is_training: bool = False,
    ) -> Optional[
        Union[
            TransformerReTextClassificationTaskEncoding2,
            Sequence[TransformerReTextClassificationTaskEncoding2],
        ]
    ]:

        assert (
            self.argument_markers is not None
        ), "No argument markers available, was `prepare` already called?"

        entities: Sequence[Span] = document[self.entity_annotation]

        relations: Sequence[BinaryRelation] = document[self.relation_annotation]

        partitions: Sequence[Optional[Span]]
        if self.partition_annotation is not None:
            partitions = document[self.partition_annotation]
        else:
            # use a single dummy partition
            partitions = [None]

        task_encodings: List[TransformerReTextClassificationTaskEncoding2] = []
        for partition_idx, partition in enumerate(partitions):
            partition_offset = 0 if partition is None else partition.start
            add_special_tokens = self.max_window is None
            encoding = self._encode_text(
                document=document, partition=partition, add_special_tokens=add_special_tokens
            )

            for head, tail in _enumerate_entity_pairs(
                entities=entities,
                partition=partition,
                relations=relations,
            ):
                head_token_slice = get_token_slice(
                    character_slice=(head.start, head.end),
                    char_to_token_mapper=encoding.char_to_token,
                    character_offset=partition_offset,
                )
                tail_token_slice = get_token_slice(
                    character_slice=(tail.start, tail.end),
                    char_to_token_mapper=encoding.char_to_token,
                    character_offset=partition_offset,
                )
                # this happens if the head/tail start/end does not match a token start/end
                if head_token_slice is None or tail_token_slice is None:
                    # if statistics is not None:
                    #     statistics["entity_token_alignment_error"][
                    #         relation_mapping.get((head, tail), "TO_PREDICT")
                    #     ] += 1
                    logger.warning(
                        f"Skipping invalid example {document.id}, cannot get token slice(s)"
                    )
                    continue

                input_ids = encoding["input_ids"]
                # not sure if this is the correct way to get the tokens corresponding to the input_ids
                tokens = encoding.encodings[0].tokens

                # windowing
                if self.max_window is not None:
                    head_start, head_end = head_token_slice
                    tail_start, tail_end = tail_token_slice
                    # The actual number of tokens will be lower than max_window because we add
                    # the 4 marker tokens (before/after the head and tail) and the default
                    # special tokens (e.g. CLS and SEP).
                    num_added_special_tokens = len(
                        self.tokenizer.build_inputs_with_special_tokens([])
                    )
                    max_tokens = self.max_window - 4 - num_added_special_tokens
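
                    # Worked example (a sketch, not from the original file): with
                    # max_window=100 and a BERT-style tokenizer, for which
                    # build_inputs_with_special_tokens([]) yields [CLS] and [SEP]
                    # (2 tokens), max_tokens = 100 - 4 - 2 = 94 tokens remain for the
                    # window around the head and tail entities.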
                    # the slice from the beginning of the first entity to the end of the
                    # second is required
                    slice_required = (min(head_start, tail_start), max(head_end, tail_end))
                    window_slice = get_window_around_slice(
                        slice=slice_required,
                        max_window_size=max_tokens,
                        available_input_length=len(input_ids),
                    )
                    # this happens if slice_required does not fit into max_tokens
                    if window_slice is None:
                        # if statistics is not None:
                        #     statistics["out_of_token_window"][
                        #         relation_mapping.get((head, tail), "TO_PREDICT")
                        #     ] += 1
                        continue

                    window_start, window_end = window_slice
                    input_ids = input_ids[window_start:window_end]

                    head_token_slice = head_start - window_start, head_end - window_start
                    tail_token_slice = tail_start - window_start, tail_end - window_start

                # maybe expand to n-ary relations?
                head_arg = RelationArgument(head, HEAD, head_token_slice, self.add_type_to_marker)
                tail_arg = RelationArgument(tail, TAIL, tail_token_slice, self.add_type_to_marker)
                arg_list = [head_arg, tail_arg]

                if head_token_slice[0] < tail_token_slice[0]:
                    assert (
                        head_token_slice[1] <= tail_token_slice[0]
                    ), f"the head and tail entities are not allowed to overlap in {document.id}"
                else:
                    assert (
                        tail_token_slice[1] <= head_token_slice[0]
                    ), f"the head and tail entities are not allowed to overlap in {document.id}"
                    # expand to n-ary relations?
                    arg_list.reverse()

                first_arg_start_id = self.argument_markers_to_id[arg_list[0].as_start_marker]
                first_arg_end_id = self.argument_markers_to_id[arg_list[0].as_end_marker]
                second_arg_start_id = self.argument_markers_to_id[arg_list[1].as_start_marker]
                second_arg_end_id = self.argument_markers_to_id[arg_list[1].as_end_marker]

                new_input_ids = (
                    input_ids[: arg_list[0].offsets[0]]
                    + [first_arg_start_id]
                    + input_ids[arg_list[0].offsets[0] : arg_list[0].offsets[1]]
                    + [first_arg_end_id]
                    + input_ids[arg_list[0].offsets[1] : arg_list[1].offsets[0]]
                    + [second_arg_start_id]
                    + input_ids[arg_list[1].offsets[0] : arg_list[1].offsets[1]]
                    + [second_arg_end_id]
                    + input_ids[arg_list[1].offsets[1] :]
                )
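
                # Resulting layout (illustration, not from the original file), with the
                # head before the tail and add_type_to_marker=False:
                #   ... [H] head tokens [/H] ... [T] tail tokens [/T] ...
                # i.e. each argument is wrapped by its start and end marker, in textual order.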

                if self.append_markers:
                    new_input_ids.extend(
                        [
                            self.argument_markers_to_id[head_arg.as_append_marker],
                            self.sep_token_id,
                            self.argument_markers_to_id[tail_arg.as_append_marker],
                            self.sep_token_id,
                        ]
                    )

                # when windowing is used, we have to add the special tokens manually
                if not add_special_tokens:
                    new_input_ids = self.tokenizer.build_inputs_with_special_tokens(
                        token_ids_0=new_input_ids
                    )

                # lots of logging from here on
                log_this_example = (
                    self.log_first_n_examples is not None
                    # "<" instead of "<=", otherwise n + 1 examples would be logged
                    and self._logged_examples_counter < self.log_first_n_examples
                )
                if log_this_example:
                    self._log_example(document, arg_list, new_input_ids, relations, tokens)

                task_encodings.append(
                    TaskEncoding(
                        document=document,
                        inputs={"input_ids": new_input_ids},
                        metadata={
                            HEAD: head,
                            TAIL: tail,
                        },
                    )
                )

        return task_encodings

    def _log_example(
        self,
        document: TextDocument,
        arg_list: List[RelationArgument],
        input_ids: List[int],
        relations: Sequence[BinaryRelation],
        tokens: List[str],
    ):

        first_arg_start = arg_list[0].as_start_marker
        first_arg_end = arg_list[0].as_end_marker
        second_arg_start = arg_list[1].as_start_marker
        second_arg_end = arg_list[1].as_end_marker
        new_tokens = (
            tokens[: arg_list[0].offsets[0]]
            + [first_arg_start]
            + tokens[arg_list[0].offsets[0] : arg_list[0].offsets[1]]
            + [first_arg_end]
            + tokens[arg_list[0].offsets[1] : arg_list[1].offsets[0]]
            + [second_arg_start]
            + tokens[arg_list[1].offsets[0] : arg_list[1].offsets[1]]
            + [second_arg_end]
            + tokens[arg_list[1].offsets[1] :]
        )

        head_idx = 0 if arg_list[0].role == HEAD else 1
        tail_idx = 0 if arg_list[0].role == TAIL else 1

        if self.append_markers:
            head_marker = arg_list[head_idx].as_append_marker
            tail_marker = arg_list[tail_idx].as_append_marker
            new_tokens.extend(
                [head_marker, self.tokenizer.sep_token, tail_marker, self.tokenizer.sep_token]
            )
        logger.info("*** Example ***")
        logger.info("doc id: %s", document.id)
        logger.info("tokens: %s", " ".join([str(x) for x in new_tokens]))
        logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
        rel_labels = [relation.label for relation in relations]
        rel_label_ids = [self.label_to_id[label] for label in rel_labels]
        logger.info("Expected labels: %s (ids = %s)", rel_labels, rel_label_ids)

        self._logged_examples_counter += 1

    def encode_target(
        self,
        task_encoding: TransformerReTextClassificationTaskEncoding2,
    ) -> TransformerReTextClassificationTargetEncoding2:
        metadata = task_encoding.metadata
        document = task_encoding.document

        relations: Sequence[BinaryRelation] = document[self.relation_annotation]

        head_tail_to_labels = {
            (relation.head, relation.tail): [relation.label] for relation in relations
        }

        labels = head_tail_to_labels.get((metadata[HEAD], metadata[TAIL]), [self.none_label])
        target = [self.label_to_id[label] for label in labels]

        return target
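
    # For instance (illustrative, label names are assumptions): if the document contains
    # BinaryRelation(head=metadata[HEAD], tail=metadata[TAIL], label="org:founded_by"),
    # the target is [label_to_id["org:founded_by"]]; for an unannotated candidate pair it
    # is [label_to_id["no_relation"]], i.e. [0].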

    def unbatch_output(
        self, model_output: TransformerTextClassificationModelBatchOutput
    ) -> Sequence[TransformerReTextClassificationTaskOutput2]:
        logits = model_output["logits"]

        output_label_probs = logits.sigmoid() if self.multi_label else logits.softmax(dim=-1)
        output_label_probs = output_label_probs.detach().cpu().numpy()

        unbatched_output = []
        if self.multi_label:
            raise NotImplementedError
        else:
            label_ids = np.argmax(output_label_probs, axis=-1)
            for batch_idx, label_id in enumerate(label_ids):
                label = self.id_to_label[label_id]
                prob = float(output_label_probs[batch_idx, label_id])
                result: TransformerReTextClassificationTaskOutput2 = {
                    "labels": [label],
                    "probabilities": [prob],
                }
                unbatched_output.append(result)

        return unbatched_output
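
    # Shape sketch (illustrative): for logits of shape (batch_size, num_labels), softmax
    # followed by argmax yields one (label, probability) pair per batch element, e.g.
    #   [{"labels": ["org:founded_by"], "probabilities": [0.93]}, ...]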

    def create_annotations_from_output(
        self,
        task_encoding: TransformerReTextClassificationTaskEncoding2,
        task_output: TransformerReTextClassificationTaskOutput2,
    ) -> Iterator[Tuple[str, Union[BinaryRelation, MultiLabeledBinaryRelation]]]:
        labels = task_output["labels"]
        probabilities = task_output["probabilities"]
        if labels != [self.none_label]:
            yield (
                self.relation_annotation,
                BinaryRelation(
                    head=task_encoding.metadata[HEAD],
                    tail=task_encoding.metadata[TAIL],
                    label=labels[0],
                    score=probabilities[0],
                ),
            )

    def collate(
        self, task_encodings: Sequence[TransformerReTextClassificationTaskEncoding2]
    ) -> TransformerTextClassificationModelStepBatchEncoding:
        input_features = [task_encoding.inputs for task_encoding in task_encodings]

        inputs: Dict[str, torch.Tensor] = self.tokenizer.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        if not task_encodings[0].has_targets:
            return inputs, None

        target_list: List[TransformerReTextClassificationTargetEncoding2] = [
            task_encoding.targets for task_encoding in task_encodings
        ]
        targets = torch.tensor(target_list, dtype=torch.int64)

        if not self.multi_label:
            targets = targets.flatten()

        return inputs, targets
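
# End-to-end sketch (hypothetical; the exact encode/decode call signatures are assumptions,
# not part of this file). After `prepare`, documents flow through the workflow from the
# module docstring:
#
#   task_encodings = taskmodule.encode(documents, encode_target=True)  # Document -> TaskEncoding
#   batch = taskmodule.collate(task_encodings[:8])                     # -> ModelBatchEncoding
#   # model(batch) -> ModelBatchOutput, then:
#   task_outputs = taskmodule.unbatch_output(model_output)             # -> TaskOutput
#   taskmodule.decode(task_encodings, task_outputs)                    # TaskOutput -> Document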