# scam/tests/unit/test_preprocessing.py
# Gankit12's picture
# Upload 129 files
# 31f0e50 verified
"""
Unit Tests for Text Preprocessing Module.
Tests text cleaning, normalization, and utility functions.
"""
import pytest
from app.utils.preprocessing import (
clean_text,
normalize_text,
convert_devanagari_digits,
truncate_text,
remove_urls,
extract_numbers,
mask_sensitive_data,
)
class TestCleanText:
    """Tests for clean_text function."""

    def test_empty_string(self):
        """Test empty string returns empty string."""
        assert clean_text("") == ""

    def test_none_returns_empty(self):
        """Test None or falsy value returns empty string."""
        assert clean_text(None) == ""

    def test_removes_extra_whitespace(self):
        """Test runs of repeated spaces are collapsed to a single space."""
        # The input has to contain repeated spaces: with single spaces the
        # assertion would pass even if no collapsing happened at all.
        text = "Hello    world   here"
        result = clean_text(text)
        assert result == "Hello world here"

    def test_removes_leading_trailing_whitespace(self):
        """Test leading/trailing whitespace is stripped."""
        text = " Hello world "
        result = clean_text(text)
        assert result == "Hello world"

    def test_removes_control_characters(self):
        """Test control characters are removed."""
        text = "Hello\x00\x07world"
        result = clean_text(text)
        assert "\x00" not in result
        assert "\x07" not in result
        # The visible words on either side of the control characters must
        # survive the cleanup.
        assert "Hello" in result
        assert "world" in result

    def test_preserves_normal_text(self):
        """Test normal text is preserved."""
        text = "Hello, how are you?"
        result = clean_text(text)
        assert result == text

    def test_normalizes_newlines_and_tabs(self):
        """Test newlines and tabs are normalized to spaces."""
        text = "Hello\nworld\there"
        result = clean_text(text)
        assert result == "Hello world here"

    def test_handles_unicode(self):
        """Test Unicode (Devanagari) text is preserved unchanged."""
        text = "नमस्ते दुनिया"
        result = clean_text(text)
        assert result == text
class TestNormalizeText:
    """Tests for normalize_text function."""

    def test_basic_normalization(self):
        """Test basic text normalization strips surrounding whitespace."""
        text = " Hello world "
        result = normalize_text(text)
        assert result == "Hello world"

    def test_lowercase_option(self):
        """Test lowercase=True lowercases the text."""
        text = "Hello WORLD"
        result = normalize_text(text, lowercase=True)
        assert result == "hello world"

    def test_without_lowercase(self):
        """Test case is preserved with lowercase=False and by default."""
        text = "Hello WORLD"
        assert normalize_text(text, lowercase=False) == "Hello WORLD"
        # Call without the keyword as well, so the default value itself is
        # exercised rather than only the explicit lowercase=False path.
        assert normalize_text(text) == "Hello WORLD"

    def test_converts_devanagari_digits(self):
        """Test Devanagari digits are converted."""
        text = "Amount: ५०००"
        result = normalize_text(text)
        assert "5000" in result
class TestConvertDevanagariDigits:
    """Checks for the convert_devanagari_digits helper."""

    def test_converts_all_digits(self):
        """The ten Devanagari digits are expected to come back as 0-9."""
        assert convert_devanagari_digits("०१२३४५६७८९") == "0123456789"

    def test_preserves_latin_digits(self):
        """Input made only of Latin digits is expected back unchanged."""
        latin = "123456"
        assert convert_devanagari_digits(latin) == "123456"

    def test_mixed_digits(self):
        """A number mixing both digit systems is expected fully in Latin."""
        converted = convert_devanagari_digits("Phone: ९८76543२१०")
        assert converted == "Phone: 9876543210"

    def test_preserves_non_digit_text(self):
        """Devanagari letters without digits are expected back unchanged."""
        greeting = "नमस्ते"
        assert convert_devanagari_digits(greeting) == "नमस्ते"

    def test_empty_string(self):
        """An empty input is expected to yield an empty output."""
        converted = convert_devanagari_digits("")
        assert converted == ""

    def test_phone_number_in_hindi(self):
        """A phone number embedded in a Hindi sentence is converted."""
        sentence = "कॉल करें ९८७६५४३२१०"
        assert "9876543210" in convert_devanagari_digits(sentence)
class TestTruncateText:
    """Tests for truncate_text function."""

    def test_short_text_unchanged(self):
        """Test text shorter than limit is unchanged."""
        text = "Hello world"
        result = truncate_text(text, max_length=100)
        assert result == text

    def test_long_text_truncated(self):
        """Test text longer than limit is truncated."""
        text = "a" * 100
        result = truncate_text(text, max_length=50)
        # The suffix counts towards max_length, so the total is exactly 50.
        assert len(result) == 50
        assert result.endswith("...")

    def test_custom_suffix(self):
        """Test custom truncation suffix."""
        text = "a" * 100
        result = truncate_text(text, max_length=50, suffix="[...]")
        assert result.endswith("[...]")

    def test_exact_length(self):
        """Test text at exact length is unchanged."""
        text = "a" * 50
        result = truncate_text(text, max_length=50)
        assert result == text

    def test_default_max_length(self):
        """Test default max_length is 5000."""
        text = "a" * 5000
        result = truncate_text(text)
        assert len(result) == 5000
        # A 5000-character input alone would pass for any default >= 5000.
        # One character over the limit pins the default to exactly 5000.
        over_limit = truncate_text("a" * 5001)
        assert len(over_limit) == 5000
class TestRemoveUrls:
    """Checks for the remove_urls helper."""

    def test_removes_http_url(self):
        """A plain http:// link must be gone while the leading word stays."""
        stripped = remove_urls("Visit http://example.com for more info")
        assert "http://example.com" not in stripped
        assert "Visit" in stripped

    def test_removes_https_url(self):
        """An https:// link must be gone from the output."""
        stripped = remove_urls("Visit https://secure.example.com for more info")
        assert "https://secure.example.com" not in stripped

    def test_removes_multiple_urls(self):
        """Every link in the input must be gone, not just the first one."""
        stripped = remove_urls("Visit http://one.com and http://two.com")
        for link in ("http://one.com", "http://two.com"):
            assert link not in stripped

    def test_preserves_non_url_text(self):
        """Input without any link is expected back unchanged."""
        plain = "Hello world, no URLs here"
        assert remove_urls(plain) == plain

    def test_removes_complex_url(self):
        """A link carrying a path and query string must be gone."""
        message = "Click http://example.com/path/to/page?query=value"
        assert "http://example.com" not in remove_urls(message)
class TestExtractNumbers:
    """Checks for the extract_numbers helper."""

    def test_extracts_single_number(self):
        """A lone number in the sentence must appear in the result."""
        found = extract_numbers("Amount is 5000")
        assert "5000" in found

    def test_extracts_multiple_numbers(self):
        """Both numbers in the sentence must appear in the result."""
        found = extract_numbers("Account 123456 and phone 9876543210")
        for expected in ("123456", "9876543210"):
            assert expected in found

    def test_handles_devanagari_digits(self):
        """A number written in Devanagari digits is reported in Latin digits."""
        assert "5000" in extract_numbers("Amount ५०००")

    def test_no_numbers(self):
        """Text without digits must produce an empty list."""
        found = extract_numbers("No numbers here")
        assert found == []

    def test_mixed_devanagari_and_latin(self):
        """Numbers mixing both digit systems are reported in Latin digits."""
        found = extract_numbers("Phone ९८76543२१० account 123")
        assert "9876543210" in found
        assert "123" in found
class TestMaskSensitiveData:
    """Checks for the mask_sensitive_data helper."""

    def test_masks_upi_id(self):
        """A UPI handle must be replaced by the UPI placeholder."""
        masked = mask_sensitive_data("Send to scammer@paytm")
        assert "scammer@paytm" not in masked
        assert "[UPI_MASKED]" in masked

    def test_masks_bank_account(self):
        """A long account number must be replaced by the account placeholder."""
        masked = mask_sensitive_data("Account: 123456789012345")
        assert "123456789012345" not in masked
        assert "[ACCOUNT_MASKED]" in masked

    def test_masks_phone_number(self):
        """A bare 10-digit phone number must not survive masking."""
        masked = mask_sensitive_data("Call 9876543210")
        assert "9876543210" not in masked
        # A 10-digit number may be tagged as a phone or as an account;
        # either placeholder is accepted.
        placeholders = ("[PHONE_MASKED]", "[ACCOUNT_MASKED]")
        assert any(tag in masked for tag in placeholders)

    def test_masks_phone_with_plus91(self):
        """A phone number with a +91 prefix must not survive masking."""
        masked = mask_sensitive_data("Call +91 9876543210")
        assert "9876543210" not in masked
        # Either the phone or the account placeholder is accepted here too.
        placeholders = ("[PHONE_MASKED]", "[ACCOUNT_MASKED]")
        assert any(tag in masked for tag in placeholders)

    def test_preserves_non_sensitive_text(self):
        """Text with nothing sensitive in it is expected back unchanged."""
        harmless = "Hello, how are you?"
        assert mask_sensitive_data(harmless) == harmless

    def test_masks_multiple_sensitive_items(self):
        """All sensitive items in one message must be masked together."""
        masked = mask_sensitive_data(
            "Send to fraud@ybl, call 9876543210, account 123456789012"
        )
        for secret in ("fraud@ybl", "9876543210", "123456789012"):
            assert secret not in masked
class TestPreprocessingEdgeCases:
    """Edge case tests for preprocessing functions."""

    def test_clean_text_with_emojis(self):
        """Test clean_text preserves emojis."""
        text = "Hello 😀 world 🎉"
        result = clean_text(text)
        assert "😀" in result
        assert "🎉" in result

    def test_normalize_very_long_text(self):
        """Test normalize handles very long text."""
        text = "word " * 10000
        result = normalize_text(text)
        assert len(result) > 0

    def test_devanagari_mixed_with_special_chars(self):
        """Test Devanagari digits with special characters."""
        text = "Amount: ₹५,०००/-"
        result = convert_devanagari_digits(text)
        # Exact comparison: every Devanagari digit must be converted and the
        # currency sign and punctuation left alone. Checking only that "5"
        # and "0" appear would also pass on a partially converted string.
        assert result == "Amount: ₹5,000/-"

    def test_url_with_hindi_text(self):
        """Test URL removal with surrounding Hindi text."""
        text = "यहाँ क्लिक करें http://fake.com जीतने के लिए"
        result = remove_urls(text)
        assert "http://fake.com" not in result
        assert "यहाँ क्लिक करें" in result