File size: 1,240 Bytes
198ccb0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
"""Tests for text processing utilities."""
import pytest
from utils.text_processing import normalise_text, create_vocab
def test_normalise_text():
    """Check normalise_text against a table of (input, expected) pairs."""
    expectations = [
        # Basic normalization: output is expected lowercased, without punctuation.
        ("Привет, мир!", "привет мир"),
        ("Hello, World!", "hello world"),
        ("Test123", "test123"),
        # Special characters are expected to be removed.
        ("Text@#$%^&*()", "text"),
        # An empty string is expected to stay empty.
        ("", ""),
        # Surrounding whitespace is expected to be trimmed.
        (" test ", "test"),
    ]
    # Cases run in order and stop at the first mismatch, like sequential asserts.
    for raw, expected in expectations:
        assert normalise_text(raw) == expected
def test_create_vocab():
    """Check special tokens, word coverage and the size cap of create_vocab."""
    corpus = "word1 word2 word1 word3"
    full_vocab = create_vocab(corpus, vocab_size=10)

    # Special tokens must be present and mapped to fixed indices.
    special_tokens = {"#PAD#": 0, "#UNKN#": 1}
    for token in special_tokens:
        assert token in full_vocab
    for token, index in special_tokens.items():
        assert full_vocab[token] == index

    # Every distinct word of the corpus fits when the limit is generous.
    for word in ("word1", "word2", "word3"):
        assert word in full_vocab

    # With vocab_size=2 the result may hold at most 2 special tokens + 2 words.
    # NOTE(review): only an upper bound is checked; confirm whether exactly 4
    # entries are intended.
    capped_vocab = create_vocab(corpus, vocab_size=2)
    max_entries = len(special_tokens) + 2
    assert len(capped_vocab) <= max_entries
|