File size: 4,347 Bytes
24d0437
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from typing import Dict, List, Optional
from flair.data import _PartOfSentence, DataPoint, Label

class Token(_PartOfSentence):
    """
    This class represents one word in a tokenized sentence. Each token may have any number of tags. It may also point
    to its head in a dependency tree.

    :param text: Single text(Token) from the sequence
    :param head_id: the location of the text (For Document)
    :param whitespace_after: if token has whitespace
    :param start_position: what character number in document does this token start?
    :param sentence: If token belongs to sentence, indicate here which var it belongs to
    """

    def __init__(
        self,
        text: str,
        head_id: int = None,
        whitespace_after: int = 1,
        start_position: int = 0,
        sentence=None,
    ):
        super().__init__(sentence=sentence)

        self.form: str = text
        self._internal_index: Optional[int] = None
        self.head_id: Optional[int] = head_id
        self.whitespace_after: int = whitespace_after

        self.start_pos = start_position
        self.end_pos = start_position + len(text)

        self._embeddings: Dict = {}
        self.tags_proba_dist: Dict[str, List[Label]] = {}

    @property
    def idx(self) -> int:
        if isinstance(self._internal_index, int):
            return self._internal_index
        else:
            raise ValueError

    @property
    def text(self):
        return self.form

    @property
    def unlabeled_identifier(self) -> str:
        return f'Token[{self.idx-1}]: "{self.text}"'

    def add_tags_proba_dist(self, tag_type: str, tags: List[Label]):
        self.tags_proba_dist[tag_type] = tags

    def get_tags_proba_dist(self, tag_type: str) -> List[Label]:
        if tag_type in self.tags_proba_dist:
            return self.tags_proba_dist[tag_type]
        return []

    def get_head(self):
        return self.sentence.get_token(self.head_id)

    @property
    def start_position(self) -> int:
        return self.start_pos

    @property
    def end_position(self) -> int:
        return self.end_pos

    @property
    def embedding(self):
        return self.get_embedding()

    def __repr__(self):
        return self.__str__()

    def add_label(self, typename: str, value: str, score: float = 1.0):
        """
        The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
        Therefore, labels get added only to the Sentence if it exists
        """
        if self.sentence:
            super().add_label(typename=typename, value=value, score=score)
        else:
            DataPoint.add_label(self, typename=typename, value=value, score=score)

    def set_label(self, typename: str, value: str, score: float = 1.0):
        """
        The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
        Therefore, labels get set only to the Sentence if it exists
        """
        if self.sentence:
            super().set_label(typename=typename, value=value, score=score)
        else:
            DataPoint.set_label(self, typename=typename, value=value, score=score)


class Span(_PartOfSentence):
    """
    This class represents one textual span consisting of Tokens. It may be used for the instance that the 
    tokens form in a nested nature, meaning the tokens combined together forms a long phrase.

    :param tokens: List of tokens in the span
    """

    def __init__(self, tokens: List[Token]):
        super().__init__(tokens[0].sentence)
        self.tokens = tokens
        super()._init_labels()

    @property
    def start_position(self) -> int:
        return self.tokens[0].start_position

    @property
    def end_position(self) -> int:
        return self.tokens[-1].end_position

    @property
    def text(self) -> str:
        return " ".join([t.text for t in self.tokens])

    @property
    def unlabeled_identifier(self) -> str:
        return f'Span[{self.tokens[0].idx -1}:{self.tokens[-1].idx}]: "{self.text}"'

    def __repr__(self):
        return self.__str__()

    def __getitem__(self, idx: int) -> Token:
        return self.tokens[idx]

    def __iter__(self):
        return iter(self.tokens)

    def __len__(self) -> int:
        return len(self.tokens)

    @property
    def embedding(self):
        pass