Spaces:
Running
Running
| import pytest | |
| from datasets import Dataset | |
| from mlplo.data_cleaning import is_valid_example, deduplicate_split | |
def test_is_valid_example():
    """Check is_valid_example on one accepted and one rejected example.

    Both calls use the same column names ("text" / "summary") and the same
    word-count thresholds; only the length of the document text differs.
    """
    # Thresholds shared by both checks below.
    limits = {
        "min_document_words": 10,
        "max_document_words": 100,
        "min_summary_words": 5,
    }
    reference_summary = "B " * 10

    # Document of 50 whitespace-separated tokens: expected to be accepted.
    acceptable = {"text": "A " * 50, "summary": reference_summary}
    assert is_valid_example(acceptable, "text", "summary", **limits)

    # Too short document (5 tokens against min_document_words=10).
    too_short = {"text": "A " * 5, "summary": reference_summary}
    assert not is_valid_example(too_short, "text", "summary", **limits)
def test_deduplicate_split():
    """Check that deduplicate_split removes the repeated "text" row.

    The call is expected to return a pair: the deduplicated dataset and the
    number of rows that were removed.
    """
    # "A" appears twice in the "text" column (first and third rows).
    source = Dataset.from_dict(
        {
            "text": ["A", "B", "A", "C"],
            "summary": ["1", "2", "3", "4"],
        }
    )

    unique_rows, dropped_count = deduplicate_split(source, "text")

    assert dropped_count == 1
    assert len(unique_rows) == 3
    # Remaining texts are asserted in first-occurrence order of the input.
    assert unique_rows["text"] == ["A", "B", "C"]