File size: 5,278 Bytes
0304d75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""
tests/test_text_preprocessing.py
==================================
Unit tests for utils/text_preprocessing.py — clean_text.
"""

import pytest

from utils.text_preprocessing import clean_text


class TestCleanTextBasic:
    """Baseline expectations for ``clean_text`` on trivial or degenerate input."""

    def test_empty_string_returns_empty(self):
        """An empty string is expected to come back as an empty string."""
        cleaned = clean_text("")
        assert cleaned == ""

    def test_none_returns_empty(self):
        """``None`` is expected to be mapped to an empty string."""
        cleaned = clean_text(None)  # type: ignore[arg-type]
        assert cleaned == ""

    def test_non_string_returns_empty(self):
        """A non-string argument (an int here) is expected to give ``""``."""
        cleaned = clean_text(123)  # type: ignore[arg-type]
        assert cleaned == ""

    def test_plain_text_unchanged(self):
        """A plain sentence is expected to pass through untouched."""
        sentence = "I am feeling stressed today"
        cleaned = clean_text(sentence)
        assert cleaned == sentence

    def test_whitespace_only_returns_stripped(self):
        """Input made only of spaces is expected to collapse to ``""``."""
        cleaned = clean_text("   ")
        assert cleaned == ""


class TestURLRemoval:
    """URL handling: links should disappear while neighbouring words remain."""

    def test_http_url_removed(self):
        """Neither the scheme nor the host of an https link may remain."""
        cleaned = clean_text("check out https://example.com for info")
        for fragment in ("https://", "example.com"):
            assert fragment not in cleaned

    def test_www_url_removed(self):
        """A scheme-less ``www.`` address must not appear in the output."""
        cleaned = clean_text("visit www.example.com today")
        assert "www.example.com" not in cleaned

    def test_text_around_url_preserved(self):
        """Words on either side of the link must still be present."""
        cleaned = clean_text("check out https://example.com for info")
        for word in ("check", "info"):
            assert word in cleaned


class TestEmailRemoval:
    """E-mail handling: addresses should be dropped, surrounding text kept."""

    def test_email_removed(self):
        """The full address must not appear in the cleaned text."""
        cleaned = clean_text("contact me at user@example.com please")
        assert "user@example.com" not in cleaned

    def test_text_around_email_preserved(self):
        """Words before and after the address must still be present."""
        cleaned = clean_text("contact me at user@example.com please")
        for word in ("contact", "please"):
            assert word in cleaned


class TestHTMLStripping:
    """HTML handling: tags are stripped and character references decoded."""

    def test_bold_tag_removed(self):
        """Opening and closing ``<b>`` tags go; the wrapped word stays."""
        cleaned = clean_text("<b>hello</b> world")
        for tag in ("<b>", "</b>"):
            assert tag not in cleaned
        assert "hello" in cleaned

    def test_paragraph_tag_removed(self):
        """The ``<p>`` tag goes; the paragraph text stays."""
        cleaned = clean_text("<p>stressed today</p>")
        assert "<p>" not in cleaned
        assert "stressed" in cleaned

    def test_html_entity_unescaped(self):
        """A named entity is replaced by the character it stands for."""
        cleaned = clean_text("I &amp; my team are done")
        assert "&amp;" not in cleaned
        assert "&" in cleaned

    def test_numeric_entity_unescaped(self):
        """A numeric character reference must not survive verbatim."""
        cleaned = clean_text("it&#39;s over")
        assert "&#39;" not in cleaned
        assert "it" in cleaned


class TestEmojiNormalization:
    """Emoji handling: each emoji should be swapped for a descriptive word."""

    def test_happy_emoji_replaced_with_text(self):
        """The smiling emoji is gone and "happy" appears (any casing)."""
        cleaned = clean_text("feeling 😊 today")
        assert "😊" not in cleaned
        lowered = cleaned.lower()
        assert "happy" in lowered

    def test_crying_emoji_replaced_with_text(self):
        """The sobbing emoji is gone and "crying" appears (any casing)."""
        cleaned = clean_text("I am 😭 all day")
        assert "😭" not in cleaned
        lowered = cleaned.lower()
        assert "crying" in lowered

    def test_anxious_emoji_replaced_with_text(self):
        """The anxious emoji is gone and "anxious" appears (any casing)."""
        cleaned = clean_text("so 😰 about the exam")
        assert "😰" not in cleaned
        lowered = cleaned.lower()
        assert "anxious" in lowered

    def test_multiple_emojis_all_replaced(self):
        """Adjacent emojis with no separator are each removed."""
        cleaned = clean_text("😊😭")
        for emoji in ("😊", "😭"):
            assert emoji not in cleaned


class TestRepeatedCharNormalization:
    """Default behaviour for runs of a repeated character.

    The inputs are built from explicit repeat counts so that the run fed to
    ``clean_text`` and the run searched for in the result cannot drift apart.
    """

    def test_excessive_repetition_compressed(self):
        """A seven-letter run must not survive intact."""
        stretched = "o" * 7
        result = clean_text(f"s{stretched} tired")
        # Search for exactly the run that was passed in; a longer needle
        # would never match and the assertion would pass vacuously.
        assert stretched not in result

    def test_normal_repetition_preserved(self):
        """A doubled letter is left alone."""
        result = clean_text("noo way")
        assert "noo" in result

    def test_repeated_exclamation_compressed(self):
        """A run of five exclamation marks must not survive intact."""
        bangs = "!" * 5
        result = clean_text(f"great{bangs}")
        assert bangs not in result


class TestWhitespaceNormalization:
    """Whitespace handling: runs are collapsed and the ends are trimmed."""

    def test_multiple_spaces_collapsed(self):
        """A run of three spaces must not remain in the output."""
        cleaned = clean_text("I   am   fine")
        triple_space = "   "
        assert triple_space not in cleaned

    def test_tabs_collapsed(self):
        """Tab characters must not remain in the output."""
        cleaned = clean_text("I\tam\tfine")
        assert "\t" not in cleaned

    def test_leading_trailing_stripped(self):
        """Padding on both ends is removed, leaving only the inner text."""
        cleaned = clean_text("  hello world  ")
        assert cleaned == "hello world"

    def test_newlines_collapsed(self):
        """Two consecutive newlines must not remain in the output."""
        cleaned = clean_text("line one\n\nline two")
        blank_line = "\n\n"
        assert blank_line not in cleaned


class TestUnicodeNormalization:
    """Unicode compatibility characters should not survive cleaning.

    Non-ASCII inputs are written as ``\\u`` escapes so the tests keep
    exercising the intended code points even if the file is re-encoded.
    """

    def test_half_width_chars_normalised(self):
        """The full-width digit must not appear in the output."""
        # U+FF11 FULLWIDTH DIGIT ONE; expected to normalise to ASCII "1".
        result = clean_text("\uff11 stress")
        assert "\uff11" not in result

    def test_ligature_normalised(self):
        """The "fi" ligature code point must not appear in the output."""
        # U+FB01 LATIN SMALL LIGATURE FI; expected to become the two
        # letters "fi". The input must contain the ligature itself: plain
        # ASCII "fine" would make this assertion contradict the
        # plain-text-unchanged expectation.
        result = clean_text("\ufb01ne")
        assert "\ufb01" not in result


class TestNormalizeRepeatedFlag:
    """Behaviour of the ``normalize_repeated`` keyword of ``clean_text``.

    Both tests share one input whose repeated run is built from an explicit
    count, so the substring searched for is exactly the run that was passed
    in.
    """

    # Seven consecutive "o" characters embedded in an otherwise plain phrase.
    _RUN = "o" * 7
    _TEXT = f"s{_RUN} tired"

    def test_flag_off_preserves_repetition(self):
        """With the flag off, the original run is still present."""
        result = clean_text(self._TEXT, normalize_repeated=False)
        assert self._RUN in result

    def test_flag_on_compresses(self):
        """With the flag on, the original run no longer appears."""
        result = clean_text(self._TEXT, normalize_repeated=True)
        assert self._RUN not in result


class TestCombinedCleaning:
    """End-to-end check with several kinds of noise in a single input."""

    def test_html_url_emoji_combined(self):
        """HTML tag, URL and emoji are all removed; the meaning is kept."""
        # \u2014 is an em dash, written as an escape so the input keeps the
        # intended character even if the file is re-encoded.
        text = "<p>feeling 😰 \u2014 see https://example.com</p>"
        result = clean_text(text)
        assert "<p>" not in result
        assert "https://" not in result
        assert "😰" not in result
        # Semantic content preserved. The emoji replacement is compared
        # case-insensitively, as in the dedicated emoji tests.
        assert "feeling" in result
        assert "anxious" in result.lower()