Spaces:
Sleeping
Sleeping
File size: 4,851 Bytes
f6d689c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import json
import os
import tempfile
import unittest
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from src.data.dataset import (
EmotionDataset,
EmotionExample,
SummarizationDataset,
SummarizationExample,
TopicDataset,
TopicExample,
load_emotion_jsonl,
load_summarization_jsonl,
load_topic_jsonl,
)
class TestDatasets(unittest.TestCase):
    """Tests for the dataset classes imported from ``src.data.dataset``."""

    def test_summarization_dataset(self):
        """The dataset reports its length and returns the stored examples by index."""
        first = SummarizationExample(source="Source 1", summary="Summary 1")
        second = SummarizationExample(source="Source 2", summary="Summary 2")
        samples = [first, second]

        dataset = SummarizationDataset(samples)

        self.assertEqual(len(dataset), 2)
        for index, expected in enumerate(samples):
            self.assertEqual(dataset[index], expected)

    def test_emotion_dataset_auto_binarizer(self):
        """With no binarizer supplied, the dataset exposes a ``MultiLabelBinarizer``."""
        samples = [
            EmotionExample(text="Text 1", emotions=["joy", "love"]),
            EmotionExample(text="Text 2", emotions=["sadness"]),
        ]

        dataset = EmotionDataset(samples)

        self.assertEqual(len(dataset), 2)
        self.assertEqual(dataset[0], samples[0])
        self.assertTrue(hasattr(dataset, "binarizer"))
        self.assertIsInstance(dataset.binarizer, MultiLabelBinarizer)
        # Labels taken from the examples must show up in the class list.
        for label in ("joy", "sadness"):
            self.assertIn(label, dataset.emotion_classes)

    def test_emotion_dataset_provided_binarizer(self):
        """A pre-fitted binarizer passed in is kept, along with its classes."""
        samples = [EmotionExample(text="Text 1", emotions=["joy"])]
        # Fitted on more labels than the examples contain, so the class list
        # can only match if it comes from this binarizer.
        fitted = MultiLabelBinarizer().fit([["joy", "sadness"]])

        dataset = EmotionDataset(samples, binarizer=fitted)

        self.assertEqual(dataset.binarizer, fitted)
        self.assertEqual(set(dataset.emotion_classes), {"joy", "sadness"})

    def test_topic_dataset_auto_encoder(self):
        """With no encoder supplied, the dataset exposes a ``LabelEncoder``."""
        samples = [
            TopicExample(text="Text 1", topic="sports"),
            TopicExample(text="Text 2", topic="politics"),
        ]

        dataset = TopicDataset(samples)

        self.assertEqual(len(dataset), 2)
        self.assertEqual(dataset[0], samples[0])
        self.assertTrue(hasattr(dataset, "encoder"))
        self.assertIsInstance(dataset.encoder, LabelEncoder)
        self.assertIn("sports", dataset.topic_classes)

    def test_topic_dataset_provided_encoder(self):
        """A pre-fitted encoder passed in is kept, along with its classes."""
        samples = [TopicExample(text="Text 1", topic="sports")]
        # Fitted on an extra topic ("tech") that no example uses.
        fitted = LabelEncoder().fit(["sports", "tech"])

        dataset = TopicDataset(samples, encoder=fitted)

        self.assertEqual(dataset.encoder, fitted)
        self.assertEqual(set(dataset.topic_classes), {"sports", "tech"})
class TestDataLoading(unittest.TestCase):
    """Tests for the ``load_*_jsonl`` functions, run against temporary files."""

    def setUp(self):
        """Create a temporary directory and the path of the data file inside it."""
        self.temp_dir = tempfile.TemporaryDirectory()
        self.jsonl_path = os.path.join(self.temp_dir.name, "data.jsonl")

    def tearDown(self):
        """Remove the temporary directory and any file a test wrote into it."""
        self.temp_dir.cleanup()

    def _write_jsonl(self, records):
        """Write ``records`` to ``self.jsonl_path``, one JSON document per line.

        Args:
            records: Iterable of JSON-serializable objects.
        """
        # Explicit encoding keeps the fixture identical on every platform
        # instead of depending on the locale's default.
        with open(self.jsonl_path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(record) + "\n" for record in records)

    def test_load_summarization_jsonl(self):
        """Each JSONL line becomes an example with ``source`` and ``summary``."""
        self._write_jsonl(
            [
                {"source": "S1", "summary": "Sum1"},
                {"source": "S2", "summary": "Sum2"},
            ]
        )

        examples = load_summarization_jsonl(self.jsonl_path)

        self.assertEqual(len(examples), 2)
        self.assertEqual(examples[0].source, "S1")
        self.assertEqual(examples[0].summary, "Sum1")

    def test_load_emotion_jsonl(self):
        """Each JSONL line becomes an example with ``text`` and ``emotions``."""
        self._write_jsonl(
            [
                {"text": "T1", "emotions": ["e1"]},
                {"text": "T2", "emotions": ["e2", "e3"]},
            ]
        )

        examples = load_emotion_jsonl(self.jsonl_path)

        self.assertEqual(len(examples), 2)
        self.assertEqual(examples[0].text, "T1")
        self.assertEqual(examples[0].emotions, ["e1"])

    def test_load_topic_jsonl(self):
        """Each JSONL line becomes an example with ``text`` and ``topic``."""
        self._write_jsonl(
            [
                {"text": "T1", "topic": "top1"},
                {"text": "T2", "topic": "top2"},
            ]
        )

        examples = load_topic_jsonl(self.jsonl_path)

        self.assertEqual(len(examples), 2)
        self.assertEqual(examples[0].text, "T1")
        self.assertEqual(examples[0].topic, "top1")

    def test_load_json_array(self):
        """The summarization loader is also given a file holding one JSON array."""
        data = [
            {"source": "S1", "summary": "Sum1"},
            {"source": "S2", "summary": "Sum2"},
        ]
        # Deliberately NOT line-delimited: the whole list is dumped as a
        # single JSON document.
        with open(self.jsonl_path, "w", encoding="utf-8") as f:
            json.dump(data, f)

        examples = load_summarization_jsonl(self.jsonl_path)

        self.assertEqual(len(examples), 2)
        self.assertEqual(examples[0].source, "S1")
# Run the test cases in this module when the file is executed as a script;
# importing the module does not start the runner.
if __name__ == "__main__":
    unittest.main()
|