SummaryGenerator / tests /test_data_cleaning.py
Adive01's picture
Upload tests/test_data_cleaning.py with huggingface_hub
9237ef1 verified
raw
history blame contribute delete
830 Bytes
import pytest
from datasets import Dataset
from mlplo.data_cleaning import is_valid_example, deduplicate_split
def test_is_valid_example():
    """Exercise ``is_valid_example`` with one accepted and one rejected record.

    Both calls share the same column names ("text" / "summary") and the same
    thresholds; only the number of "A " repetitions in the document changes.
    """
    thresholds = {
        "min_document_words": 10,
        "max_document_words": 100,
        "min_summary_words": 5,
    }
    summary_text = "B " * 10

    # 50 repetitions in the document: expected to be accepted.
    in_range = {"text": "A " * 50, "summary": summary_text}
    accepted = is_valid_example(in_range, "text", "summary", **thresholds)
    assert accepted

    # Too short document: 5 repetitions against min_document_words=10.
    too_short = {"text": "A " * 5, "summary": summary_text}
    rejected = is_valid_example(too_short, "text", "summary", **thresholds)
    assert not rejected
def test_deduplicate_split():
    """Check ``deduplicate_split`` on a four-row dataset with one repeated text.

    The call is unpacked as ``(dataset, removed_count)``. The test expects one
    row to be reported as removed and the remaining texts to be A, B, C.
    """
    rows = {
        "text": ["A", "B", "A", "C"],
        "summary": ["1", "2", "3", "4"],
    }
    source = Dataset.from_dict(rows)

    # Deduplicate on the "text" column; "A" appears twice in the input.
    unique_ds, dropped = deduplicate_split(source, "text")

    assert dropped == 1
    assert len(unique_ds) == 3
    assert unique_ds["text"] == ["A", "B", "C"]