import os
import logging
from stanza.models.common import utils
from stanza.models.constituency.utils import retag_tags
from stanza.models.constituency.trainer import Trainer
from stanza.models.constituency.tree_reader import read_trees
from stanza.utils.get_tqdm import get_tqdm
# Module-level logger, obtained under the shared name 'stanza'.
logger = logging.getLogger('stanza')
# Progress-bar object supplied by stanza's get_tqdm() helper
# NOTE(review): presumably picks a tqdm flavor suited to the runtime environment -- confirm in stanza.utils.get_tqdm
tqdm = get_tqdm()
def read_tokenized_file(tokenized_file):
    """
    Load pre-tokenized sentences from a utf-8 text file, one sentence per line

    Lines which are empty after stripping whitespace are skipped.  Each
    remaining line is split on whitespace into words.  Underscores inside
    a word are turned into spaces (for languages such as VI), except that
    a word consisting of nothing but underscores is kept unchanged.

    Returns (docs, ids): docs is a list of sentences, each a list of words,
    and ids is a list holding one None per sentence.
    """
    def restore_spaces(token):
        # a token built only from underscores is literal text, not a joiner
        if token.strip("_") == "":
            return token
        return token.replace("_", " ")

    docs = []
    with open(tokenized_file, encoding='utf-8') as fin:
        for raw_line in fin:
            sentence = raw_line.strip()
            if not sentence:
                continue
            docs.append([restore_spaces(token) for token in sentence.split()])

    # this file format carries no sentence ids, so each sentence gets None
    ids = [None for _ in docs]
    return docs, ids
def read_xml_tree_file(tree_file):
"""
Read sentences from a file of the format unique to VLSP test sets
in particular, it should be multiple blocks of
(tree ...)
"""
with open(tree_file, encoding='utf-8') as fin:
lines = fin.readlines()
lines = [x.strip() for x in lines]
lines = [x for x in lines if x]
docs = []
ids = []
tree_id = None
tree_text = []
for line in lines:
if line.startswith(" 1:
tree_id = tree_id[1]
if tree_id.endswith(">"):
tree_id = tree_id[:-1]
tree_id = int(tree_id)
else:
tree_id = None
elif line.startswith("