""" A few tests of the utils module for the sentiment datasets """ import os import pytest import stanza from stanza.models.classifiers import data from stanza.models.classifiers.data import SentimentDatum from stanza.models.classifiers.utils import WVType from stanza.utils.datasets.sentiment import process_utils from stanza.tests import TEST_MODELS_DIR from stanza.tests.classifiers.test_data import train_file, dev_file, test_file def test_write_list(tmp_path, train_file): """ Test that writing a single list of items to an output file works """ train_set = data.read_dataset(train_file, WVType.OTHER, 1) dataset_file = tmp_path / "foo.json" process_utils.write_list(dataset_file, train_set) train_copy = data.read_dataset(dataset_file, WVType.OTHER, 1) assert train_copy == train_set def test_write_dataset(tmp_path, train_file, dev_file, test_file): """ Test that writing all three parts of a dataset works """ dataset = [data.read_dataset(filename, WVType.OTHER, 1) for filename in (train_file, dev_file, test_file)] process_utils.write_dataset(dataset, tmp_path, "en_test") expected_files = ['en_test.train.json', 'en_test.dev.json', 'en_test.test.json'] dataset_files = os.listdir(tmp_path) assert sorted(dataset_files) == sorted(expected_files) for filename, expected in zip(expected_files, dataset): written = data.read_dataset(tmp_path / filename, WVType.OTHER, 1) assert written == expected def test_read_snippets(tmp_path): """ Test the basic operation of the read_snippets function """ filename = tmp_path / "foo.csv" with open(filename, "w", encoding="utf-8") as fout: fout.write("FOO\tThis is a test\thappy\n") fout.write("FOO\tThis is a second sentence\tsad\n") nlp = stanza.Pipeline("en", dir=TEST_MODELS_DIR, processors="tokenize", download_method=None) mapping = {"happy": 0, "sad": 1} snippets = process_utils.read_snippets(filename, 2, 1, "en", mapping, nlp=nlp) assert len(snippets) == 2 assert snippets == [SentimentDatum(sentiment=0, text=['This', 'is', 'a', 'test']), SentimentDatum(sentiment=1, text=['This', 'is', 'a', 'second', 'sentence'])] def test_read_snippets_two_columns(tmp_path): """ Test what happens when multiple columns are combined for the sentiment value """ filename = tmp_path / "foo.csv" with open(filename, "w", encoding="utf-8") as fout: fout.write("FOO\tThis is a test\thappy\tfoo\n") fout.write("FOO\tThis is a second sentence\tsad\tbar\n") fout.write("FOO\tThis is a third sentence\tsad\tfoo\n") nlp = stanza.Pipeline("en", dir=TEST_MODELS_DIR, processors="tokenize", download_method=None) mapping = {("happy", "foo"): 0, ("sad", "bar"): 1, ("sad", "foo"): 2} snippets = process_utils.read_snippets(filename, (2,3), 1, "en", mapping, nlp=nlp) assert len(snippets) == 3 assert snippets == [SentimentDatum(sentiment=0, text=['This', 'is', 'a', 'test']), SentimentDatum(sentiment=1, text=['This', 'is', 'a', 'second', 'sentence']), SentimentDatum(sentiment=2, text=['This', 'is', 'a', 'third', 'sentence'])]