"""Tests for text processing utilities.""" import pytest from utils.text_processing import normalise_text, create_vocab def test_normalise_text(): """Test text normalization.""" # Test basic normalization assert normalise_text("Привет, мир!") == "привет мир" assert normalise_text("Hello, World!") == "hello world" assert normalise_text("Test123") == "test123" # Test with special characters assert normalise_text("Text@#$%^&*()") == "text" # Test empty string assert normalise_text("") == "" # Test whitespace handling assert normalise_text(" test ") == "test" def test_create_vocab(): """Test vocabulary creation.""" text = "word1 word2 word1 word3" vocab = create_vocab(text, vocab_size=10) # Check special tokens assert "#PAD#" in vocab assert "#UNKN#" in vocab assert vocab["#PAD#"] == 0 assert vocab["#UNKN#"] == 1 # Check words are included assert "word1" in vocab assert "word2" in vocab assert "word3" in vocab # Check vocab size limit vocab_limited = create_vocab(text, vocab_size=2) # Should have 2 special tokens + 2 words = 4 total assert len(vocab_limited) <= 4