Spaces:
Runtime error
Runtime error
| from typing import List | |
| import spacy | |
| from util.process_data import Token, Sample, SampleList | |
class Tokenizer():
    """Tokenize sample texts with a spaCy pipeline.

    The pipeline is loaded once at construction time and reused for every
    call to :meth:`run`.
    """

    def __init__(self, spacy_model: str):
        """Load the spaCy pipeline used for tokenization.

        Args:
            spacy_model: Name or path handed to ``spacy.load``.
        """
        self.__spacy_model = spacy.load(spacy_model)

    def run(self, sample_list: SampleList):
        """Tokenize every sample in ``sample_list.samples`` in place.

        Each sample's ``tokens`` attribute is overwritten with a list of
        ``Token`` objects; nothing is returned.
        """
        self.__tokenize(sample_list.samples, self.__spacy_model)

    def __tokenize(self, samples: List[Sample], spacy_model):
        """Assign ``sample.tokens`` for each sample using ``spacy_model``.

        Args:
            samples: Samples whose ``text`` is tokenized and whose ``tokens``
                attribute is overwritten.
            spacy_model: Object providing ``pipe(texts)``, yielding one
                tokenized document per input text.
        """
        # Non-breaking spaces are replaced by plain spaces before
        # tokenization. The replacement is one character for one character,
        # so the offsets computed below still index into the original text.
        texts = [sample.text.replace('\xa0', ' ') for sample in samples]
        doc_pipe = spacy_model.pipe(texts)

        for sample, doc in zip(samples, doc_pipe):
            tokens = [
                Token(
                    text=tok.text,
                    start=tok.idx,
                    end=tok.idx + len(tok.text),
                )
                for tok in doc
            ]
            # Drop trailing tokens that contain a newline or a space. The
            # emptiness check prevents an IndexError when the text yields no
            # tokens at all or consists solely of such whitespace tokens.
            while tokens and ('\n' in tokens[-1].text or ' ' in tokens[-1].text):
                tokens.pop()
            sample.tokens = tokens