Spaces:

spitzc32
/

bilstm_api

Runtime error

File size: 4,347 Bytes

24d0437

from typing import Dict, List, Optional
from flair.data import _PartOfSentence, DataPoint, Label

class Token(_PartOfSentence):
    """
    This class represents one word in a tokenized sentence. Each token may have any number of tags. It may also point
    to its head in a dependency tree.

    A token stores its surface text, its character offsets, an optional head id and, per tag type,
    a list of labels (``tags_proba_dist``). It may be created without a sentence.

    :param text: Single text (token) from the sequence
    :param head_id: id handed to ``sentence.get_token`` by :meth:`get_head`; ``None`` if unset
    :param whitespace_after: whitespace indicator, stored unchanged (default ``1``)
    :param start_position: character offset at which this token starts
    :param sentence: the sentence this token belongs to, or ``None`` for a free-standing token
    """

    def __init__(
        self,
        text: str,
        head_id: Optional[int] = None,
        whitespace_after: int = 1,
        start_position: int = 0,
        sentence=None,
    ):
        super().__init__(sentence=sentence)

        self.form: str = text
        # Stays None until something outside this class assigns an index.
        self._internal_index: Optional[int] = None
        self.head_id: Optional[int] = head_id
        self.whitespace_after: int = whitespace_after

        # End offset is derived from the text length, i.e. it is exclusive.
        self.start_pos = start_position
        self.end_pos = start_position + len(text)

        self._embeddings: Dict = {}
        # Maps a tag type to the list of labels registered for it.
        self.tags_proba_dist: Dict[str, List[Label]] = {}

    @property
    def idx(self) -> int:
        """
        Return the internal index of this token.

        :raises ValueError: if no integer index has been assigned yet
        """
        if isinstance(self._internal_index, int):
            return self._internal_index
        # Same exception type as before, but with enough context to debug it.
        raise ValueError(
            f"Token {self.form!r} has no integer index assigned "
            f"(_internal_index={self._internal_index!r})"
        )

    @property
    def text(self) -> str:
        """Return the surface text of the token."""
        return self.form

    @property
    def unlabeled_identifier(self) -> str:
        """
        Return an identifier of the form ``Token[<idx - 1>]: "<text>"``.

        :raises ValueError: propagated from :attr:`idx` when no index is assigned
        """
        # NOTE(review): idx is shown minus one, presumably because idx is 1-based - confirm.
        return f'Token[{self.idx-1}]: "{self.text}"'

    def add_tags_proba_dist(self, tag_type: str, tags: List[Label]) -> None:
        """Store ``tags`` as the label list for ``tag_type``, replacing any previous entry."""
        self.tags_proba_dist[tag_type] = tags

    def get_tags_proba_dist(self, tag_type: str) -> List[Label]:
        """Return the label list stored for ``tag_type``, or an empty list if there is none."""
        return self.tags_proba_dist.get(tag_type, [])

    def get_head(self):
        """
        Look up the head of this token via ``sentence.get_token(head_id)``.

        :return: the result of ``get_token``, or ``None`` when the token has no sentence
        """
        if self.sentence is None:
            # A token may exist without a sentence; there is nothing to look up in that case.
            return None
        return self.sentence.get_token(self.head_id)

    @property
    def start_position(self) -> int:
        """Return the character offset at which the token starts."""
        return self.start_pos

    @property
    def end_position(self) -> int:
        """Return the start offset plus the length of the token text."""
        return self.end_pos

    @property
    def embedding(self):
        """Return the result of ``self.get_embedding()``."""
        return self.get_embedding()

    def __repr__(self):
        return self.__str__()

    def add_label(self, typename: str, value: str, score: float = 1.0) -> None:
        """
        The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
        With a sentence, the parent implementation is used; otherwise ``DataPoint.add_label``
        is called directly on the token.
        """
        # NOTE(review): this is a truthiness test, so a sentence object that evaluates
        # falsy also takes the DataPoint branch - confirm this is intended.
        if self.sentence:
            super().add_label(typename=typename, value=value, score=score)
        else:
            DataPoint.add_label(self, typename=typename, value=value, score=score)

    def set_label(self, typename: str, value: str, score: float = 1.0) -> None:
        """
        The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
        With a sentence, the parent implementation is used; otherwise ``DataPoint.set_label``
        is called directly on the token.
        """
        # NOTE(review): same truthiness caveat as in add_label.
        if self.sentence:
            super().set_label(typename=typename, value=value, score=score)
        else:
            DataPoint.set_label(self, typename=typename, value=value, score=score)


class Span(_PartOfSentence):
    """
    A sequence of tokens handled as one unit, e.g. several tokens that together form a phrase.

    The span is attached to the sentence of its first token.

    :param tokens: the tokens that make up the span; the first one is read on construction,
        so the list must not be empty
    """

    def __init__(self, tokens: List[Token]):
        first_token = tokens[0]
        super().__init__(first_token.sentence)
        self.tokens = tokens
        super()._init_labels()

    @property
    def start_position(self) -> int:
        """Return the start position of the first token."""
        first_token = self.tokens[0]
        return first_token.start_position

    @property
    def end_position(self) -> int:
        """Return the end position of the last token."""
        last_token = self.tokens[-1]
        return last_token.end_position

    @property
    def text(self) -> str:
        """Return the token texts joined by single spaces."""
        return " ".join(token.text for token in self.tokens)

    @property
    def unlabeled_identifier(self) -> str:
        """Return an identifier of the form ``Span[<first idx - 1>:<last idx>]: "<text>"``."""
        start = self.tokens[0].idx - 1
        stop = self.tokens[-1].idx
        return f'Span[{start}:{stop}]: "{self.text}"'

    def __repr__(self):
        return self.__str__()

    def __getitem__(self, idx: int) -> Token:
        """Return the token at position ``idx`` within the span."""
        return self.tokens[idx]

    def __iter__(self):
        """Iterate over the tokens of the span in order."""
        return iter(self.tokens)

    def __len__(self) -> int:
        """Return the number of tokens in the span."""
        return len(self.tokens)

    @property
    def embedding(self):
        """No span-level embedding is implemented here; the property always yields ``None``."""
        return None