Albin Thörn Cleland
Clean initial commit with LFS
19b8775
"""
Utilities for the processing of constituency treebanks
"""
import os
import shutil
from stanza.models.constituency import parse_tree
# Shard names used as filename suffixes.  Order matters: write_dataset
# zips this tuple against its datasets, so the first dataset is written
# as "train", the second as "dev", and the third as "test".
SHARDS = ("train", "dev", "test")
def copy_dev_test(base_path, input_dataset, output_dataset):
    """
    Copy the dev and test .mrg files of one dataset to another name

    Both the source and destination files live directly in base_path.
    <input_dataset>_dev.mrg is copied to <output_dataset>_dev.mrg, and
    then <input_dataset>_test.mrg to <output_dataset>_test.mrg.
    shutil.copy2 is used for the copies.
    """
    # dev is copied before test, one shard at a time
    for template in ("%s_dev.mrg", "%s_test.mrg"):
        source = os.path.join(base_path, template % input_dataset)
        destination = os.path.join(base_path, template % output_dataset)
        shutil.copy2(source, destination)
def write_dataset(datasets, output_dir, dataset_name):
    """
    Write each dataset to its own <dataset_name>_<shard>.mrg file

    datasets is paired positionally with SHARDS, so the expected order
    is train, dev, test.  zip stops at the shorter of the two, so any
    datasets beyond the number of shards are silently not written.

    Each dataset needs to support len() for the progress message, and
    is handed to parse_tree.Tree.write_treebank for the actual output.
    """
    for trees, shard_name in zip(datasets, SHARDS):
        shard_file = "%s_%s.mrg" % (dataset_name, shard_name)
        target_path = os.path.join(output_dir, shard_file)
        num_trees = len(trees)
        print("Writing {} trees to {}".format(num_trees, target_path))
        parse_tree.Tree.write_treebank(trees, target_path)
def split_treebank(treebank, train_size, dev_size):
    """
    Split a treebank deterministically

    train_size and dev_size are fractions of the whole treebank.
    The treebank is sliced in its existing order, with no shuffling:
    the leading train_size fraction is train, the following dev_size
    fraction is dev, and everything left over is test.

    The slice boundaries are truncated to integers with int().

    Returns a (train, dev, test) tuple of slices of the treebank.
    """
    num_trees = len(treebank)
    # boundaries are computed from the cumulative fractions
    train_boundary = int(num_trees * train_size)
    dev_boundary = int(num_trees * (train_size + dev_size))

    train = treebank[:train_boundary]
    dev = treebank[train_boundary:dev_boundary]
    test = treebank[dev_boundary:]
    return train, dev, test