|
|
from .instance import NER_DependencyInstance |
|
|
from .instance import Sentence |
|
|
from .prepare_data import ROOT, END, MAX_CHAR_LENGTH |
|
|
|
|
|
class Reader(object):
    """Sequential reader for tab-separated, blank-line-delimited sentence files.

    Every non-blank line describes one token and is split on tabs. Columns are
    read by position: index 1 is the word form, 2 the POS tag, 3 the NER tag,
    4 the head index (parsed with ``int``), 5 the arc label and, when present,
    6 an automatic label. Column 0 is never read.
    """

    def __init__(self, file_path, alphabets):
        """Open ``file_path`` for reading and remember the alphabets.

        Args:
            file_path: Path of the file to read. The handle stays open until
                ``close`` is called.
            alphabets: Mapping from alphabet name to an object exposing
                ``get_index(token)``. ``getNext`` looks up the keys
                ``'char_alphabet'``, ``'word_alphabet'``, ``'pos_alphabet'``,
                ``'ner_alphabet'`` and ``'arc_alphabet'``, plus
                ``'auto_label_alphabet'`` for rows that have a seventh column.
        """
        self.__source_file = open(file_path, 'r')
        self.alphabets = alphabets

    def close(self):
        """Close the underlying file handle."""
        self.__source_file.close()

    def _append_symbol(self, tokens_dict, ids_dict, symbol):
        """Append ``symbol`` and its index to the sequence of every alphabet.

        Alphabets whose name starts with ``'char'`` hold one list per token,
        so the symbol is wrapped in a single-element list for them.
        """
        for alphabet_name, alphabet in self.alphabets.items():
            symbol_id = alphabet.get_index(symbol)
            if alphabet_name.startswith('char'):
                tokens_dict[alphabet_name].append([symbol, ])
                ids_dict[alphabet_name].append([symbol_id, ])
            else:
                tokens_dict[alphabet_name].append(symbol)
                ids_dict[alphabet_name].append(symbol_id)

    def getNext(self, lower_case=False, symbolic_root=False, symbolic_end=False):
        """Read the next sentence from the file.

        Args:
            lower_case: If true, lower-case each word form before it is split
                into characters and looked up.
            symbolic_root: If true, put a ``ROOT`` entry (head ``0``) in front
                of the sentence in every alphabet.
            symbolic_end: If true, put an ``END`` entry (head ``0``) after the
                sentence in every alphabet.

        Returns:
            ``None`` once the end of the file is reached; otherwise a
            ``NER_DependencyInstance`` built from a ``Sentence`` (words, word
            ids, per-word characters, per-word character ids), the
            per-alphabet token lists, the per-alphabet id lists and the heads.

        Raises:
            IndexError: If a row has fewer than six tab-separated columns.
            ValueError: If the head column is not an integer.
            KeyError: If a required alphabet is missing from ``alphabets``.
        """
        line = self.__source_file.readline()
        # Skip blank separator lines; readline() returns '' only at end of file.
        while line and not line.strip():
            line = self.__source_file.readline()
        if not line:
            return None

        # ``line`` is non-blank here, so at least one row is always collected.
        rows = []
        while line.strip():
            rows.append(line.strip().split('\t'))
            line = self.__source_file.readline()

        heads = []
        tokens_dict = {name: [] for name in self.alphabets}
        ids_dict = {name: [] for name in self.alphabets}

        if symbolic_root:
            self._append_symbol(tokens_dict, ids_dict, ROOT)
            heads.append(0)

        for tokens in rows:
            word = tokens[1].lower() if lower_case else tokens[1]

            # Look up every character before truncating, so the alphabet sees
            # the full word even when only a prefix of it is kept.
            char_alphabet = self.alphabets['char_alphabet']
            chars = list(word)
            char_ids = [char_alphabet.get_index(char) for char in chars]
            if len(chars) > MAX_CHAR_LENGTH:
                chars = chars[:MAX_CHAR_LENGTH]
                char_ids = char_ids[:MAX_CHAR_LENGTH]
            tokens_dict['char_alphabet'].append(chars)
            ids_dict['char_alphabet'].append(char_ids)

            pos = tokens[2]
            ner = tokens[3]
            head = int(tokens[4])
            arc_tag = tokens[5]

            fields = []
            if len(tokens) > 6:
                # Optional seventh column.
                fields.append(('auto_label_alphabet', tokens[6]))
            fields.extend([
                ('word_alphabet', word),
                ('pos_alphabet', pos),
                ('ner_alphabet', ner),
                ('arc_alphabet', arc_tag),
            ])
            for alphabet_name, value in fields:
                tokens_dict[alphabet_name].append(value)
                ids_dict[alphabet_name].append(
                    self.alphabets[alphabet_name].get_index(value))
            heads.append(head)

        if symbolic_end:
            # Append END after the sentence tokens, exactly as ROOT is put in
            # front of them; assigning a fresh ``[END]`` list here would throw
            # away every token collected above for the non-char alphabets.
            self._append_symbol(tokens_dict, ids_dict, END)
            heads.append(0)

        sentence = Sentence(tokens_dict['word_alphabet'], ids_dict['word_alphabet'],
                            tokens_dict['char_alphabet'], ids_dict['char_alphabet'])
        return NER_DependencyInstance(sentence, tokens_dict, ids_dict, heads)