Spaces:

solarevat
/

multilabel-news-classifier

Sleeping

multilabel-news-classifier / tests /test_text_processing.py

Solareva Taisia

chore(release): initial public snapshot

198ccb0 about 2 months ago

1.24 kB

	"""Tests for text processing utilities."""

	import pytest
	from utils.text_processing import normalise_text, create_vocab


	def test_normalise_text():
	    """Check normalise_text against a table of (raw input, expected output) pairs."""
	    expectations = [
	        # Basic normalization: lowercased, punctuation removed (Cyrillic and Latin).
	        ("Привет, мир!", "привет мир"),
	        ("Hello, World!", "hello world"),
	        # Digits are expected to survive normalization.
	        ("Test123", "test123"),
	        # A run of special characters is expected to be stripped.
	        ("Text@#$%^&*()", "text"),
	        # Empty input is expected to stay empty.
	        ("", ""),
	        # Surrounding whitespace is expected to be trimmed.
	        (" test ", "test"),
	    ]

	    for raw, expected in expectations:
	        assert normalise_text(raw) == expected


	def test_create_vocab():
	    """Check create_vocab for special tokens, word coverage, and the size cap."""
	    corpus = "word1 word2 word1 word3"
	    full_vocab = create_vocab(corpus, vocab_size=10)

	    # The padding and unknown markers must be present, at indices 0 and 1.
	    special_tokens = ("#PAD#", "#UNKN#")
	    assert all(token in full_vocab for token in special_tokens)
	    assert [full_vocab[token] for token in special_tokens] == [0, 1]

	    # With a generous size budget every distinct word must be kept.
	    for word in ("word1", "word2", "word3"):
	        assert word in full_vocab

	    # With vocab_size=2 the result may hold at most
	    # 2 special tokens + 2 words = 4 entries.
	    capped_vocab = create_vocab(corpus, vocab_size=2)
	    max_entries = 4
	    assert len(capped_vocab) <= max_entries