| | """
|
| | Unit Tests for Text Preprocessing Module.
|
| |
|
| | Tests text cleaning, normalization, and utility functions.
|
| | """
|
| |
|
| | import pytest
|
| |
|
| | from app.utils.preprocessing import (
|
| | clean_text,
|
| | normalize_text,
|
| | convert_devanagari_digits,
|
| | truncate_text,
|
| | remove_urls,
|
| | extract_numbers,
|
| | mask_sensitive_data,
|
| | )
|
| |
|
| |
|
class TestCleanText:
    """Tests for clean_text: empty input, whitespace handling, control chars, Unicode."""

    def test_empty_string(self):
        """Test empty string returns empty string."""
        assert clean_text("") == ""

    def test_none_returns_empty(self):
        """Test None returns empty string."""
        assert clean_text(None) == ""

    def test_removes_extra_whitespace(self):
        """Test runs of interior spaces are collapsed to a single space."""
        # The input has to contain repeated spaces; with single spaces the
        # assertion would also pass for a clean_text that changes nothing.
        text = "Hello    world   here"
        result = clean_text(text)
        assert result == "Hello world here"

    def test_removes_leading_trailing_whitespace(self):
        """Test leading/trailing whitespace is stripped."""
        text = "   Hello world   "
        result = clean_text(text)
        assert result == "Hello world"

    def test_removes_control_characters(self):
        """Test control characters are removed while the visible words remain."""
        text = "Hello\x00\x07world"
        result = clean_text(text)
        assert "\x00" not in result
        assert "\x07" not in result
        # Only substring checks: whether the two words end up joined or
        # separated is not pinned by this test.
        assert "Hello" in result
        assert "world" in result

    def test_preserves_normal_text(self):
        """Test already-clean text is returned unchanged."""
        text = "Hello, how are you?"
        result = clean_text(text)
        assert result == text

    def test_normalizes_newlines_and_tabs(self):
        """Test newlines and tabs are normalized to spaces."""
        text = "Hello\nworld\there"
        result = clean_text(text)
        assert result == "Hello world here"

    def test_handles_unicode(self):
        """Test Devanagari text is returned unchanged."""
        text = "नमस्ते दुनिया"
        result = clean_text(text)
        assert result == text
|
| |
|
| |
|
class TestNormalizeText:
    """Tests for normalize_text: trimming, optional lowercasing, digit conversion."""

    def test_basic_normalization(self):
        """Test surrounding whitespace is removed by basic normalization."""
        text = " Hello world "
        result = normalize_text(text)
        assert result == "Hello world"

    def test_lowercase_option(self):
        """Test lowercase=True lowercases the text."""
        text = "Hello WORLD"
        result = normalize_text(text, lowercase=True)
        assert result == "hello world"

    def test_without_lowercase(self):
        """Test case is preserved by default and with lowercase=False."""
        text = "Hello WORLD"
        # Calling without the keyword is what actually exercises the default;
        # passing lowercase=False alone would not detect a changed default.
        assert normalize_text(text) == "Hello WORLD"
        assert normalize_text(text, lowercase=False) == "Hello WORLD"

    def test_converts_devanagari_digits(self):
        """Test Devanagari digits are converted to Latin digits."""
        text = "Amount: ५०००"
        result = normalize_text(text)
        assert "5000" in result
|
| |
|
| |
|
class TestConvertDevanagariDigits:
    """Checks for convert_devanagari_digits."""

    def test_converts_all_digits(self):
        """All ten Devanagari numerals come back as 0-9."""
        assert convert_devanagari_digits("०१२३४५६७८९") == "0123456789"

    def test_preserves_latin_digits(self):
        """Latin digits pass through untouched."""
        assert convert_devanagari_digits("123456") == "123456"

    def test_mixed_digits(self):
        """A string mixing both digit systems ends up all-Latin."""
        converted = convert_devanagari_digits("Phone: ९८76543२१०")
        assert converted == "Phone: 9876543210"

    def test_preserves_non_digit_text(self):
        """Devanagari letters (no digits) are left as they are."""
        assert convert_devanagari_digits("नमस्ते") == "नमस्ते"

    def test_empty_string(self):
        """Empty input yields empty output."""
        converted = convert_devanagari_digits("")
        assert converted == ""

    def test_phone_number_in_hindi(self):
        """A Devanagari phone number embedded in Hindi text is converted."""
        converted = convert_devanagari_digits("कॉल करें ९८७६५४३२१०")
        assert "9876543210" in converted
|
| |
|
| |
|
class TestTruncateText:
    """Tests for truncate_text: length limit, suffix, and default limit."""

    def test_short_text_unchanged(self):
        """Test text shorter than limit is unchanged."""
        text = "Hello world"
        result = truncate_text(text, max_length=100)
        assert result == text

    def test_long_text_truncated(self):
        """Test text longer than limit is cut to max_length, suffix included."""
        text = "a" * 100
        result = truncate_text(text, max_length=50)
        assert len(result) == 50
        assert result.endswith("...")

    def test_custom_suffix(self):
        """Test custom truncation suffix."""
        text = "a" * 100
        result = truncate_text(text, max_length=50, suffix="[...]")
        assert result.endswith("[...]")

    def test_exact_length(self):
        """Test text at exact length is unchanged."""
        text = "a" * 50
        result = truncate_text(text, max_length=50)
        assert result == text

    def test_default_max_length(self):
        """Test default max_length is 5000."""
        # Exactly at the limit: nothing is cut.
        at_limit = truncate_text("a" * 5000)
        assert len(at_limit) == 5000

        # One character over the limit: only this case distinguishes a
        # default of 5000 from any larger default (or from no truncation).
        over_limit = truncate_text("a" * 5001)
        assert len(over_limit) == 5000
        assert over_limit.endswith("...")
|
| |
|
| |
|
class TestRemoveUrls:
    """Tests for remove_urls: http/https links are dropped, other text is kept."""

    def test_removes_http_url(self):
        """Test HTTP URLs are removed and preceding text is kept."""
        text = "Visit http://example.com for more info"
        result = remove_urls(text)
        assert "http://example.com" not in result
        assert "Visit" in result

    def test_removes_https_url(self):
        """Test HTTPS URLs are removed."""
        text = "Visit https://secure.example.com for more info"
        result = remove_urls(text)
        assert "https://secure.example.com" not in result

    def test_removes_multiple_urls(self):
        """Test multiple URLs are removed."""
        text = "Visit http://one.com and http://two.com"
        result = remove_urls(text)
        assert "http://one.com" not in result
        assert "http://two.com" not in result

    def test_preserves_non_url_text(self):
        """Test non-URL text is preserved."""
        text = "Hello world, no URLs here"
        result = remove_urls(text)
        assert result == text

    def test_removes_complex_url(self):
        """Test complex URLs are removed together with their path and query."""
        text = "Click http://example.com/path/to/page?query=value"
        result = remove_urls(text)
        assert "http://example.com" not in result
        # Checking only the host would pass if the path or query string were
        # left behind, so assert on those pieces too.
        assert "/path/to/page" not in result
        assert "query=value" not in result
|
| |
|
| |
|
class TestExtractNumbers:
    """Checks for extract_numbers."""

    def test_extracts_single_number(self):
        """A lone number in the text is reported."""
        assert "5000" in extract_numbers("Amount is 5000")

    def test_extracts_multiple_numbers(self):
        """Every number in the text is reported."""
        numbers = extract_numbers("Account 123456 and phone 9876543210")
        for expected in ("123456", "9876543210"):
            assert expected in numbers

    def test_handles_devanagari_digits(self):
        """Devanagari numerals are reported in Latin form."""
        assert "5000" in extract_numbers("Amount ५०००")

    def test_no_numbers(self):
        """Text without digits gives an empty list."""
        numbers = extract_numbers("No numbers here")
        assert numbers == []

    def test_mixed_devanagari_and_latin(self):
        """Numbers written in a mix of both digit systems are reported."""
        numbers = extract_numbers("Phone ९८76543२१० account 123")
        for expected in ("9876543210", "123"):
            assert expected in numbers
|
| |
|
| |
|
class TestMaskSensitiveData:
    """Checks for mask_sensitive_data."""

    def test_masks_upi_id(self):
        """A UPI handle is replaced by the UPI placeholder."""
        masked = mask_sensitive_data("Send to scammer@paytm")
        assert "scammer@paytm" not in masked
        assert "[UPI_MASKED]" in masked

    def test_masks_bank_account(self):
        """A long account number is replaced by the account placeholder."""
        masked = mask_sensitive_data("Account: 123456789012345")
        assert "123456789012345" not in masked
        assert "[ACCOUNT_MASKED]" in masked

    def test_masks_phone_number(self):
        """A ten-digit phone number is hidden."""
        masked = mask_sensitive_data("Call 9876543210")
        assert "9876543210" not in masked
        # Either placeholder is accepted for a bare ten-digit number.
        assert any(
            tag in masked for tag in ("[PHONE_MASKED]", "[ACCOUNT_MASKED]")
        )

    def test_masks_phone_with_plus91(self):
        """A phone number with a +91 prefix is hidden."""
        masked = mask_sensitive_data("Call +91 9876543210")
        assert "9876543210" not in masked
        # Either placeholder is accepted here as well.
        assert any(
            tag in masked for tag in ("[PHONE_MASKED]", "[ACCOUNT_MASKED]")
        )

    def test_preserves_non_sensitive_text(self):
        """Text with nothing sensitive comes back unchanged."""
        harmless = "Hello, how are you?"
        assert mask_sensitive_data(harmless) == harmless

    def test_masks_multiple_sensitive_items(self):
        """UPI ID, phone number and account number are all hidden in one pass."""
        masked = mask_sensitive_data(
            "Send to fraud@ybl, call 9876543210, account 123456789012"
        )
        for secret in ("fraud@ybl", "9876543210", "123456789012"):
            assert secret not in masked
|
| |
|
| |
|
class TestPreprocessingEdgeCases:
    """Edge case tests for preprocessing functions."""

    def test_clean_text_with_emojis(self):
        """Test clean_text preserves emojis."""
        text = "Hello 😀 world 🎉"
        result = clean_text(text)
        assert "😀" in result
        assert "🎉" in result

    def test_normalize_very_long_text(self):
        """Test normalize_text returns non-empty output for very long text."""
        text = "word " * 10000
        result = normalize_text(text)
        assert len(result) > 0

    def test_devanagari_mixed_with_special_chars(self):
        """Test Devanagari digits are converted in place among special characters."""
        text = "Amount: ₹५,०००/-"
        result = convert_devanagari_digits(text)
        # Compare the full string: checking only that "5" and "0" appear
        # somewhere would miss a partially converted amount or mangled
        # currency symbol and separators.
        assert result == "Amount: ₹5,000/-"

    def test_url_with_hindi_text(self):
        """Test URL removal keeps the Hindi text that precedes the URL."""
        text = "यहाँ क्लिक करें http://fake.com जीतने के लिए"
        result = remove_urls(text)
        assert "http://fake.com" not in result
        assert "यहाँ क्लिक करें" in result
|
| |
|