def read_tokenized_file(tokenized_file):
    """
    Load whitespace-tokenized sentences from a text file, one sentence per line

    Blank (or whitespace-only) lines are skipped.  Within each token,
    underscores are converted to spaces, which is useful for languages
    such as VI where the syllables of a word are joined with _.  A token
    made up of nothing but underscores is kept exactly as written.

    Returns a pair (docs, ids):
      docs is a list of sentences, each sentence a list of word strings
      ids is a list of None, one entry per sentence
    """
    def restore_spaces(token):
        # a token such as "_" or "___" is literal text rather than a
        # joined multi-syllable word, so it must not be blanked out
        if token.strip("_") == "":
            return token
        return token.replace("_", " ")

    docs = []
    with open(tokenized_file, encoding='utf-8') as fin:
        for raw_line in fin:
            sentence = raw_line.strip()
            if not sentence:
                continue
            docs.append([restore_spaces(token) for token in sentence.split()])

    ids = [None for _ in docs]
    return docs, ids