# -*- coding: utf-8 -*-

from collections import namedtuple
from collections.abc import Iterable

from parser.utils.field import Field

# Record type naming the ten columns of a token row; every column defaults
# to None so a row can be built from any subset of them.
CoNLL = namedtuple(typename='CoNLL',
                   field_names=['ID', 'FORM', 'LEMMA', 'CPOS', 'POS',
                                'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL'],
                   defaults=[None]*10)


class Sentence(object):
    """Column-wise container for one sentence.

    ``fields`` and ``values`` are paired positionally. Each value is stored
    as an attribute named after its field's ``name``. An entry of ``fields``
    that is itself iterable is treated as a group of fields sharing a single
    column: the same value is stored under every name in the group.
    """

    def __init__(self, fields, values):
        for field, value in zip(fields, values):
            if isinstance(field, Iterable):
                # Several fields read the same column.
                for sub_field in field:
                    setattr(self, sub_field.name, value)
            else:
                setattr(self, field.name, value)
        self.fields = fields

    @property
    def values(self):
        """Yield one column per entry of ``self.fields``, in field order."""
        for field in self.fields:
            if isinstance(field, Iterable):
                # All members of a group hold the same column; read the first.
                yield getattr(self, field[0].name)
            else:
                yield getattr(self, field.name)

    def __len__(self):
        # Length of the first column; 0 when there are no columns at all
        # (avoids leaking StopIteration out of __len__).
        return len(next(iter(self.values), ()))

    def __repr__(self):
        # Transpose columns back into rows: one tab-separated line per row.
        return '\n'.join('\t'.join(map(str, line))
                         for line in zip(*self.values)) + '\n'


class Corpus(object):
    """A list of ``Sentence`` objects plus the fields describing their columns.

    Attribute access is forwarded to the sentences: reading ``corpus.x``
    gives an iterator over ``sentence.x`` for each sentence, and assigning
    ``corpus.x = seq`` sets ``sentence.x = seq[i]`` on the i-th sentence.
    Only ``fields`` and ``sentences`` are stored on the corpus itself.
    """

    def __init__(self, fields, sentences):
        super().__init__()

        self.fields = fields
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __repr__(self):
        return '\n'.join(str(sentence) for sentence in self.sentences)

    def __getitem__(self, index):
        return self.sentences[index]

    def __getattr__(self, name):
        """Return an iterator over ``getattr(sentence, name)`` per sentence.

        Only called when normal attribute lookup fails.

        Raises:
            AttributeError: if ``name`` is ``fields``/``sentences`` (i.e. the
                corpus is not initialised yet), if the corpus is empty, or
                if the first sentence has no such attribute.
        """
        # These two live in the instance __dict__; getting here means they
        # are not set yet. Bail out instead of recursing via self.sentences.
        if name in ('fields', 'sentences'):
            raise AttributeError(name)
        sentences = self.sentences
        if not sentences or not hasattr(sentences[0], name):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}")
        # This must stay a plain method: a ``yield`` here would turn it into
        # a generator function, and the AttributeError above would then never
        # be raised at lookup time. The generator expression keeps the result
        # lazy.
        return (getattr(sentence, name) for sentence in sentences)

    def __setattr__(self, name, value):
        if name in ('fields', 'sentences'):
            self.__dict__[name] = value
        else:
            # Distribute an indexable ``value`` over the sentences.
            for i, sentence in enumerate(self.sentences):
                setattr(sentence, name, value[i])

    @classmethod
    def load(cls, path, fields):
        """Build a corpus from a tab-separated file at ``path``.

        Sentences are runs of non-blank lines separated by blank lines; each
        line is stripped and split on tabs, and the rows of a sentence are
        transposed into columns. ``None`` entries in ``fields`` are replaced
        by ``Field(str(i))``, where ``i`` is the entry's position.

        Returns:
            A new instance of ``cls`` holding the resolved fields and the
            parsed sentences.
        """
        fields = [field if field is not None else Field(str(i))
                  for i, field in enumerate(fields)]
        with open(path, 'r') as f:
            lines = [line.strip() for line in f]
        # Sentinel blank line so a final sentence that is not followed by a
        # blank line in the file is still flushed.
        lines.append('')

        start, sentences = 0, []
        for i, line in enumerate(lines):
            if line:
                continue
            # Skip empty blocks produced by consecutive blank lines.
            if i > start:
                rows = [row.split('\t') for row in lines[start:i]]
                values = list(zip(*rows))
                sentences.append(Sentence(fields, values))
            start = i + 1

        return cls(fields, sentences)

    def save(self, path):
        """Write the corpus to ``path`` in the textual form given by ``repr``."""
        with open(path, 'w') as f:
            f.write(f"{self}\n")