bilstm_api / part /data.py
spitzc32
Added initial structure of the model
24d0437
from typing import Dict, List, Optional
from flair.data import _PartOfSentence, DataPoint, Label
class Token(_PartOfSentence):
    """
    This class represents one word in a tokenized sentence. Each token may have any number of tags. It may also point
    to its head in a dependency tree.

    :param text: Surface form of the token (exposed through the ``text`` property)
    :param head_id: Id of this token's head; ``get_head`` resolves it via the sentence's ``get_token``
    :param whitespace_after: Whitespace marker stored unchanged on the token
        (presumably the amount of whitespace following the token -- confirm with callers)
    :param start_position: what character number in document does this token start?
    :param sentence: Sentence this token belongs to, or None for a free-standing token
    """

    def __init__(
        self,
        text: str,
        head_id: Optional[int] = None,
        whitespace_after: int = 1,
        start_position: int = 0,
        sentence=None,
    ):
        super().__init__(sentence=sentence)

        self.form: str = text
        # Stays None until something outside this class assigns an index.
        self._internal_index: Optional[int] = None
        self.head_id: Optional[int] = head_id
        self.whitespace_after: int = whitespace_after

        # The end offset is derived from the start offset and the text length.
        self.start_pos = start_position
        self.end_pos = start_position + len(text)

        self._embeddings: Dict = {}
        # Maps a tag type to the list of candidate labels stored for it.
        self.tags_proba_dist: Dict[str, List[Label]] = {}

    @property
    def idx(self) -> int:
        """
        Index assigned to this token.

        :raises ValueError: if no integer index has been assigned yet
        """
        if not isinstance(self._internal_index, int):
            raise ValueError(
                f"Token index is not set to an int (got {self._internal_index!r})"
            )
        return self._internal_index

    @property
    def text(self):
        """Surface form given at construction time."""
        return self.form

    @property
    def unlabeled_identifier(self) -> str:
        """Identifier string built from ``idx - 1`` and the token text."""
        return f'Token[{self.idx-1}]: "{self.text}"'

    def add_tags_proba_dist(self, tag_type: str, tags: List[Label]):
        """Store ``tags`` as the label list for ``tag_type``, replacing any previous entry."""
        self.tags_proba_dist[tag_type] = tags

    def get_tags_proba_dist(self, tag_type: str) -> List[Label]:
        """Return the label list stored for ``tag_type``, or an empty list if none was stored."""
        if tag_type in self.tags_proba_dist:
            return self.tags_proba_dist[tag_type]
        return []

    def get_head(self):
        """
        Look up this token's head through the owning sentence.

        :return: whatever the sentence's ``get_token(head_id)`` returns, or None when the
            token is not attached to a sentence
        """
        # A Token may be created without a Sentence; there is nothing to look up then.
        if self.sentence is None:
            return None
        return self.sentence.get_token(self.head_id)

    @property
    def start_position(self) -> int:
        """Start offset passed to the constructor."""
        return self.start_pos

    @property
    def end_position(self) -> int:
        """Start offset plus the length of the token text."""
        return self.end_pos

    @property
    def embedding(self):
        """Shortcut for ``get_embedding()``."""
        return self.get_embedding()

    def __repr__(self):
        return self.__str__()

    def add_label(self, typename: str, value: str, score: float = 1.0):
        """
        The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
        Therefore, labels get added only to the Sentence if it exists

        With a sentence the parent class implementation is used; without one,
        ``DataPoint.add_label`` is called directly on this token.
        """
        if self.sentence:
            super().add_label(typename=typename, value=value, score=score)
        else:
            DataPoint.add_label(self, typename=typename, value=value, score=score)

    def set_label(self, typename: str, value: str, score: float = 1.0):
        """
        The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
        Therefore, labels get set only to the Sentence if it exists

        With a sentence the parent class implementation is used; without one,
        ``DataPoint.set_label`` is called directly on this token.
        """
        if self.sentence:
            super().set_label(typename=typename, value=value, score=score)
        else:
            DataPoint.set_label(self, typename=typename, value=value, score=score)
class Span(_PartOfSentence):
    """
    A textual span made up of Tokens. Useful when tokens are nested, i.e. when
    several tokens taken together form one longer phrase.

    :param tokens: List of tokens in the span; the first token's sentence is
        handed to the parent constructor
    """

    def __init__(self, tokens: List[Token]):
        owning_sentence = tokens[0].sentence
        super().__init__(owning_sentence)
        self.tokens = tokens
        super()._init_labels()

    @property
    def start_position(self) -> int:
        """Start position of the first token in the span."""
        first_token = self.tokens[0]
        return first_token.start_position

    @property
    def end_position(self) -> int:
        """End position of the last token in the span."""
        last_token = self.tokens[-1]
        return last_token.end_position

    @property
    def text(self) -> str:
        """Token texts joined by single spaces."""
        return " ".join(token.text for token in self.tokens)

    @property
    def unlabeled_identifier(self) -> str:
        """Identifier built from the first token's ``idx - 1``, the last token's ``idx`` and the span text."""
        begin = self.tokens[0].idx - 1
        end = self.tokens[-1].idx
        return f'Span[{begin}:{end}]: "{self.text}"'

    def __repr__(self):
        return self.__str__()

    def __getitem__(self, idx: int) -> Token:
        """Index into the underlying token list."""
        return self.tokens[idx]

    def __iter__(self):
        """Iterate over the tokens of the span in order."""
        return iter(self.tokens)

    def __len__(self) -> int:
        """Number of tokens in the span."""
        return len(self.tokens)

    @property
    def embedding(self):
        """
        Placeholder: no embedding is computed here, the property yields None.

        NOTE(review): Token.embedding delegates to get_embedding(); confirm
        whether returning None here is intended.
        """
        return None