Spaces:
Running
Running
File size: 5,278 Bytes
0304d75 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | """
tests/test_text_preprocessing.py
==================================
Unit tests for utils/text_preprocessing.py — clean_text.
"""
import pytest
from utils.text_preprocessing import clean_text
class TestCleanTextBasic:
    """Degenerate inputs and plain pass-through for ``clean_text``."""

    def test_empty_string_returns_empty(self):
        """The empty string is expected to map to itself."""
        cleaned = clean_text("")
        assert cleaned == ""

    def test_none_returns_empty(self):
        """``None`` is expected to be tolerated and yield an empty string."""
        cleaned = clean_text(None)  # type: ignore[arg-type]
        assert cleaned == ""

    def test_non_string_returns_empty(self):
        """A non-string argument (here an int) is expected to yield ``""``."""
        cleaned = clean_text(123)  # type: ignore[arg-type]
        assert cleaned == ""

    def test_plain_text_unchanged(self):
        """Text with nothing to clean is expected to come back verbatim."""
        sentence = "I am feeling stressed today"
        cleaned = clean_text(sentence)
        assert cleaned == sentence

    def test_whitespace_only_returns_stripped(self):
        """Whitespace-only input is expected to strip down to ``""``."""
        cleaned = clean_text(" ")
        assert cleaned == ""
class TestURLRemoval:
    """URLs are expected to be dropped while the surrounding words stay."""

    def test_http_url_removed(self):
        """Neither the scheme nor the host of an https URL may survive."""
        cleaned = clean_text("check out https://example.com for info")
        for fragment in ("https://", "example.com"):
            assert fragment not in cleaned

    def test_www_url_removed(self):
        """A scheme-less ``www.`` address is expected to be removed too."""
        cleaned = clean_text("visit www.example.com today")
        assert "www.example.com" not in cleaned

    def test_text_around_url_preserved(self):
        """Words on either side of the URL must still be present."""
        cleaned = clean_text("check out https://example.com for info")
        for word in ("check", "info"):
            assert word in cleaned
class TestEmailRemoval:
    """E-mail addresses are expected to be dropped; neighbouring words stay."""

    def test_email_removed(self):
        """The full address must not appear in the cleaned text."""
        cleaned = clean_text("contact me at user@example.com please")
        assert "user@example.com" not in cleaned

    def test_text_around_email_preserved(self):
        """Words before and after the address must still be present."""
        cleaned = clean_text("contact me at user@example.com please")
        for word in ("contact", "please"):
            assert word in cleaned
class TestHTMLStripping:
    """HTML tags are expected to be stripped and HTML entities unescaped."""

    def test_bold_tag_removed(self):
        """Opening and closing ``<b>`` tags go; the wrapped text stays."""
        result = clean_text("<b>hello</b> world")
        assert "<b>" not in result
        assert "</b>" not in result
        assert "hello" in result

    def test_paragraph_tag_removed(self):
        """A ``<p>`` wrapper goes; the wrapped text stays."""
        result = clean_text("<p>stressed today</p>")
        assert "<p>" not in result
        assert "stressed" in result

    def test_html_entity_unescaped(self):
        """A named entity is replaced by the character it stands for."""
        # The input must carry the *escaped* form; a bare "&" would make the
        # two assertions below contradict each other.
        result = clean_text("I &amp; my team are done")
        assert "&amp;" not in result
        assert "&" in result

    def test_numeric_entity_unescaped(self):
        """A numeric character reference does not survive cleaning."""
        # "&#39;" is the numeric reference for an apostrophe.
        result = clean_text("it&#39;s over")
        assert "&#39;" not in result
        assert "it" in result
class TestEmojiNormalization:
    """Emoji are expected to be replaced by a descriptive word."""

    # Code points are spelled as escapes so the literals cannot be damaged
    # when this file is re-encoded or copied through a non-UTF-8 channel.
    # NOTE(review): _HAPPY and _CRYING are assumed code points -- confirm
    # that these are the emoji the emoji map behind clean_text covers.
    _HAPPY = "\U0001F60A"    # smiling face with smiling eyes
    _CRYING = "\U0001F62D"   # loudly crying face
    _ANXIOUS = "\U0001F630"  # anxious face with sweat

    def test_happy_emoji_replaced_with_text(self):
        """The happy emoji disappears and the word "happy" appears."""
        result = clean_text(f"feeling {self._HAPPY} today")
        assert self._HAPPY not in result
        assert "happy" in result.lower()

    def test_crying_emoji_replaced_with_text(self):
        """The crying emoji disappears and the word "crying" appears."""
        result = clean_text(f"I am {self._CRYING} all day")
        assert self._CRYING not in result
        assert "crying" in result.lower()

    def test_anxious_emoji_replaced_with_text(self):
        """The anxious emoji disappears and the word "anxious" appears."""
        result = clean_text(f"so {self._ANXIOUS} about the exam")
        assert self._ANXIOUS not in result
        assert "anxious" in result.lower()

    def test_multiple_emojis_all_replaced(self):
        """Back-to-back emoji are each replaced, not just the first one."""
        result = clean_text(self._HAPPY + self._CRYING)
        assert self._HAPPY not in result
        assert self._CRYING not in result
class TestRepeatedCharNormalization:
    """Long runs of one character are expected to be shortened by default."""

    def test_excessive_repetition_compressed(self):
        """A heavily stretched word must not come back unchanged."""
        stretched = "sooooooo"
        result = clean_text(f"{stretched} tired")
        # Assert on the exact token that was sent in; a separately typed run
        # of a different length could pass even if nothing was compressed.
        assert stretched not in result

    def test_normal_repetition_preserved(self):
        """A doubled letter is ordinary spelling and must be left alone."""
        result = clean_text("noo way")
        assert "noo" in result

    def test_repeated_exclamation_compressed(self):
        """Runs of punctuation are shortened as well as runs of letters."""
        bangs = "!" * 5
        result = clean_text(f"great{bangs}")
        assert bangs not in result
class TestWhitespaceNormalization:
    """Runs of whitespace are expected to collapse; the ends are stripped."""

    def test_multiple_spaces_collapsed(self):
        """No run of two or more spaces may remain between words."""
        # Build the runs programmatically so an editor or formatter that
        # squeezes literal whitespace cannot silently neuter this test.
        gap = " " * 3
        result = clean_text(f"I{gap}am{gap}fine")
        assert " " * 2 not in result

    def test_tabs_collapsed(self):
        """Tab characters must not survive cleaning."""
        result = clean_text("I\tam\tfine")
        assert "\t" not in result

    def test_leading_trailing_stripped(self):
        """Padding at both ends is removed; the inner single space stays."""
        pad = " " * 2
        result = clean_text(f"{pad}hello world{pad}")
        assert result == "hello world"

    def test_newlines_collapsed(self):
        """Consecutive newlines must not survive as a blank line."""
        result = clean_text("line one\n\nline two")
        assert "\n\n" not in result
class TestUnicodeNormalization:
    """Compatibility characters are expected to be normalised away."""

    def test_half_width_chars_normalised(self):
        """A full-width digit must not survive cleaning.

        The method name says "half_width", but the input is the full-width
        digit one (U+FF11); the name is kept so existing test ids stay stable.
        """
        result = clean_text("\uff11 stress")
        assert "\uff11" not in result

    def test_ligature_normalised(self):
        """The single-code-point "fi" ligature (U+FB01) must not survive."""
        # Written as an escape so the literal cannot be corrupted by a
        # re-encoding of this file.
        result = clean_text("\ufb01ne")
        assert "\ufb01" not in result
class TestNormalizeRepeatedFlag:
    """The ``normalize_repeated`` keyword switches run-shortening on and off."""

    def test_flag_off_preserves_repetition(self):
        """With the flag off, the stretched word comes back untouched."""
        stretched = "sooooooo"
        result = clean_text(f"{stretched} tired", normalize_repeated=False)
        # Compare against the exact input token so the expected run length
        # can never drift out of sync with the text that was passed in.
        assert stretched in result

    def test_flag_on_compresses(self):
        """With the flag on, the stretched word must not come back as-is."""
        stretched = "sooooooo"
        result = clean_text(f"{stretched} tired", normalize_repeated=True)
        assert stretched not in result
class TestCombinedCleaning:
    """Several cleaning steps applied to one input at the same time."""

    def test_html_url_emoji_combined(self):
        """HTML, a URL and an emoji in one string are all handled together."""
        anxious = "\U0001F630"  # anxious face with sweat, escaped for safety
        # "\u2014" is an em dash; it only serves as filler between the parts.
        text = f"<p>feeling {anxious} \u2014 see https://example.com</p>"
        result = clean_text(text)
        assert "<p>" not in result
        assert "https://" not in result
        assert anxious not in result
        # Semantic content preserved
        assert "feeling" in result
        assert "anxious" in result
|