"""
Very simple test of the mwt counting functionality in tokenization/data.py
TODO: could add a bunch more simple tests, including tests of reading
the data from a temp file, for example
"""
import pytest
import tempfile

import numpy as np

import stanza
from stanza import Pipeline
from stanza.tests import *
from stanza.models.tokenization.data import DataLoader, NUMERIC_RE

pytestmark = [pytest.mark.travis, pytest.mark.pipeline]
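# TEST_WORKING_DIR and TEST_MODELS_DIR used below are provided by the
# wildcard import from stanza.tests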

def write_tokenizer_input(test_dir, raw_text, labels):
    """
    Writes raw_text and labels to randomly named files in test_dir

    Note that the tempfiles are not set to automatically clean up.
    This will not be a problem if you put them in a tempdir.
    """
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', dir=test_dir, delete=False) as fout:
        txt_file = fout.name
        fout.write(raw_text)
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', dir=test_dir, delete=False) as fout:
        label_file = fout.name
        fout.write(labels)
    return txt_file, label_file

# A single slice of the German tokenization data with no MWT in it
NO_MWT_TEXT = "Sehr gute Beratung, schnelle Behebung der Probleme"
NO_MWT_LABELS = "00010000100000000110000000010000000010001000000002"
# A single slice of the German tokenization data with an MWT in it
MWT_TEXT = " Die Kosten sind definitiv auch im Rahmen."
MWT_LABELS = "000100000010000100000000010000100300000012"
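
# The label digits follow the tokenizer's labeling scheme (tokenization/data.py
# has the authoritative mapping); roughly: 0 = no boundary after this character,
# 1 = end of token, 2 = end of sentence, and 3 = end of a token which is a
# multi-word token (MWT), which is what has_mwt() looks for.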

FAKE_PROPERTIES = {
    'lang': 'de',
    'feat_funcs': ("space_before", "capitalized"),
    'max_seqlen': 300,
    'use_dictionary': False,
}
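
# FAKE_PROPERTIES stands in for the tokenizer's full argument dict; presumably
# just enough (language, featurizers, max sequence length, no dictionary) for
# DataLoader to read and label the slices above.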

def test_has_mwt():
    """
    One dataset has no MWT, the other does
    """
    with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir:
        txt_file, label_file = write_tokenizer_input(test_dir, NO_MWT_TEXT, NO_MWT_LABELS)
        data = DataLoader(args=FAKE_PROPERTIES, input_files={'txt': txt_file, 'label': label_file})
        assert not data.has_mwt()

        txt_file, label_file = write_tokenizer_input(test_dir, MWT_TEXT, MWT_LABELS)
        data = DataLoader(args=FAKE_PROPERTIES, input_files={'txt': txt_file, 'label': label_file})
        assert data.has_mwt()

@pytest.fixture(scope="module")
def tokenizer():
    pipeline = Pipeline("en", dir=TEST_MODELS_DIR, download_method=None, processors="tokenize")
    return pipeline.processors['tokenize']


@pytest.fixture(scope="module")
def zhtok():
    pipeline = Pipeline("zh-hans", dir=TEST_MODELS_DIR, download_method=None, processors="tokenize")
    return pipeline.processors['tokenize']

EXPECTED_TWO_NL_RAW = [[('T', 0), ('h', 0), ('i', 0), ('s', 0), (' ', 0), ('i', 0), ('s', 0), (' ', 0), ('a', 0), (' ', 0), ('t', 0), ('e', 0), ('s', 0), ('t', 0)], [('f', 0), ('o', 0), ('o', 0)]]
# in this test, the single newline after "test" becomes a space labeled 0
EXPECTED_ONE_NL_RAW = [[('T', 0), ('h', 0), ('i', 0), ('s', 0), (' ', 0), ('i', 0), ('s', 0), (' ', 0), ('a', 0), (' ', 0), ('t', 0), ('e', 0), ('s', 0), ('t', 0), (' ', 0), ('f', 0), ('o', 0), ('o', 0)]]
EXPECTED_SKIP_NL_RAW = [[('T', 0), ('h', 0), ('i', 0), ('s', 0), (' ', 0), ('i', 0), ('s', 0), (' ', 0), ('a', 0), (' ', 0), ('t', 0), ('e', 0), ('s', 0), ('t', 0), ('f', 0), ('o', 0), ('o', 0)]]
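
# Note the shape of the expected data: one list of (character, label) pairs per
# paragraph; since these raw-text cases supply no gold labels, every label is 0.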

def test_convert_units_raw_text(tokenizer):
    """
    Tests converting a couple small segments to units
    """
    raw_text = "This is a test\n\nfoo"
    batches = DataLoader(tokenizer.config, input_text=raw_text, vocab=tokenizer.vocab, evaluation=True, dictionary=tokenizer.trainer.dictionary)
    assert batches.data == EXPECTED_TWO_NL_RAW

    raw_text = "This is a test\nfoo"
    batches = DataLoader(tokenizer.config, input_text=raw_text, vocab=tokenizer.vocab, evaluation=True, dictionary=tokenizer.trainer.dictionary)
    assert batches.data == EXPECTED_ONE_NL_RAW

    skip_newline_config = dict(tokenizer.config)
    skip_newline_config['skip_newline'] = True
    batches = DataLoader(skip_newline_config, input_text=raw_text, vocab=tokenizer.vocab, evaluation=True, dictionary=tokenizer.trainer.dictionary)
    assert batches.data == EXPECTED_SKIP_NL_RAW

EXPECTED_TWO_NL_FILE = [[('T', 0), ('h', 0), ('i', 0), ('s', 0), (' ', 0), ('i', 0), ('s', 0), (' ', 0), ('a', 0), (' ', 0), ('t', 0), ('e', 0), ('s', 0), ('t', 0), ('.', 1)], [('f', 0), ('o', 0), ('o', 0)]]
EXPECTED_TWO_NL_FILE_LABELS = [np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=np.int32),
                               np.array([0, 0, 0], dtype=np.int32)]

# in this test, the single newline after "test" becomes a space labeled 0
EXPECTED_ONE_NL_FILE = [[('T', 0), ('h', 0), ('i', 0), ('s', 0), (' ', 0), ('i', 0), ('s', 0), (' ', 0), ('a', 0), (' ', 0), ('t', 0), ('e', 0), ('s', 0), ('t', 0), ('.', 1), (' ', 0), ('f', 0), ('o', 0), ('o', 0)]]
EXPECTED_ONE_NL_FILE_LABELS = [np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype=np.int32)]
EXPECTED_SKIP_NL_FILE = [[('T', 0), ('h', 0), ('i', 0), ('s', 0), (' ', 0), ('i', 0), ('s', 0), (' ', 0), ('a', 0), (' ', 0), ('t', 0), ('e', 0), ('s', 0), ('t', 0), ('.', 1), ('f', 0), ('o', 0), ('o', 0)]]
EXPECTED_SKIP_NL_FILE_LABELS = [np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=np.int32)]
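
# With a label file supplied, batches.labels() should yield one int32 numpy
# array per paragraph, parallel to the (character, label) pairs above.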

def check_labels(labels, expected_labels):
    assert len(labels) == len(expected_labels)
    for label, expected in zip(labels, expected_labels):
        assert np.array_equiv(label, expected)

def test_convert_units_file(tokenizer):
    """
    Tests reading some text from a file and converting that to units
    """
    with tempfile.TemporaryDirectory(dir=TEST_WORKING_DIR) as test_dir:
        # two nl test case, read from file
        labels = "00000000000000000001\n\n000\n\n"
        raw_text = "This is a test.\n\nfoo\n\n"
        txt_file, label_file = write_tokenizer_input(test_dir, raw_text, labels)
        batches = DataLoader(tokenizer.config, input_files={'txt': txt_file, 'label': label_file}, vocab=tokenizer.vocab, evaluation=True, dictionary=tokenizer.trainer.dictionary)
        assert batches.data == EXPECTED_TWO_NL_FILE
        check_labels(batches.labels(), EXPECTED_TWO_NL_FILE_LABELS)

        # one nl test case, read from file
        labels = "000000000000000000010000\n\n"
        raw_text = "This is a test.\nfoo\n\n"
        txt_file, label_file = write_tokenizer_input(test_dir, raw_text, labels)
        batches = DataLoader(tokenizer.config, input_files={'txt': txt_file, 'label': label_file}, vocab=tokenizer.vocab, evaluation=True, dictionary=tokenizer.trainer.dictionary)
        assert batches.data == EXPECTED_ONE_NL_FILE
        check_labels(batches.labels(), EXPECTED_ONE_NL_FILE_LABELS)

        # skip_newline test case, read from file
        skip_newline_config = dict(tokenizer.config)
        skip_newline_config['skip_newline'] = True
        labels = "000000000000000000010000\n\n"
        raw_text = "This is a test.\nfoo\n\n"
        txt_file, label_file = write_tokenizer_input(test_dir, raw_text, labels)
        batches = DataLoader(skip_newline_config, input_files={'txt': txt_file, 'label': label_file}, vocab=tokenizer.vocab, evaluation=True, dictionary=tokenizer.trainer.dictionary)
        assert batches.data == EXPECTED_SKIP_NL_FILE
        check_labels(batches.labels(), EXPECTED_SKIP_NL_FILE_LABELS)

def test_dictionary(zhtok):
    """
    Tests some features of the zh tokenizer dictionary

    The expectation is that the Chinese tokenizer will be serialized with a
    dictionary (if it ever gets serialized without one, this test will warn us!)
    """
    assert zhtok.trainer.lexicon is not None
    assert zhtok.trainer.dictionary is not None
    # teacher
    assert "老师" in zhtok.trainer.lexicon
    # egg-white-stuff, e.g. protein
    assert "蛋白质" in zhtok.trainer.lexicon
    # egg-white
    assert "蛋白" in zhtok.trainer.dictionary['prefixes']
    # egg
    assert "蛋" in zhtok.trainer.dictionary['prefixes']
    # white-stuff
    assert "白质" in zhtok.trainer.dictionary['suffixes']
    # stuff
    assert "质" in zhtok.trainer.dictionary['suffixes']
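
# For reference, a minimal sketch of how such affix sets could be derived from
# a lexicon entry (a hypothetical helper, not necessarily stanza's actual
# implementation):
#
#     def split_affixes(word):
#         cuts = range(1, len(word))
#         return {word[:i] for i in cuts}, {word[i:] for i in cuts}
#
# split_affixes("蛋白质") gives prefixes {"蛋", "蛋白"} and suffixes
# {"白质", "质"}, matching the assertions above.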

def test_dictionary_feats(zhtok):
    """
    Test the results of running a sentence through the dictionary featurizer
    """
    # "I want to eat protein"
    raw_text = "我想吃蛋白质"
    batches = DataLoader(zhtok.config, input_text=raw_text, vocab=zhtok.vocab, evaluation=True, dictionary=zhtok.trainer.dictionary)
    data = batches.data
    assert len(data) == 1
    assert len(data[0]) == 6

    expected_features = [
        # in our example, the 2-grams made by the one-character words at the
        # start don't form any prefixes or suffixes
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0],
    ]
    for i, expected in enumerate(expected_features):
        dict_features = batches.extract_dict_feat(data[0], i)
        assert dict_features == expected

def test_numeric_re():
    """
    Test the "is numeric" function

    This function is entirely based on an RE in data.py
    """
    # the last one is a Thai digit
    matches = ["57", "135245345", "12535.", "852358.458345", "435345...345345", "111,,,111,,,111,,,111", "5318008", "5", "๕"]

    # note that we might want to consider .4 a numeric token after all
    # however, changing that means retraining all the models
    # the really long one only works if NUMERIC_RE avoids catastrophic backtracking
    not_matches = [".4", "54353a", "5453 35345", "aaa143234", "a,a,a,a", "sh'reyan", "asdaf786876asdfasdf", "",
                   "11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111a"]
    for x in matches:
        assert NUMERIC_RE.match(x) is not None
    for x in not_matches:
        assert NUMERIC_RE.match(x) is None