File size: 4,562 Bytes
f64b002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
Tests for data normalization and deduplication utilities.
"""

import pytest
from datetime import datetime, timezone

from app.utils import (
    normalize_whitespace,
    strip_html,
    clean_text,
    canonical_title,
    normalize_url,
    generate_dedup_key,
    truncate_text,
    safe_parse_date,
)


class TestNormalizeWhitespace:
    def test_multiple_spaces(self):
        assert normalize_whitespace("hello   world") == "hello world"
    
    def test_tabs_and_newlines(self):
        assert normalize_whitespace("hello\t\nworld") == "hello world"
    
    def test_leading_trailing(self):
        assert normalize_whitespace("  hello world  ") == "hello world"
    
    def test_empty_string(self):
        assert normalize_whitespace("") == ""
    
    def test_none_safe(self):
        assert normalize_whitespace(None) == ""


class TestStripHtml:
    def test_simple_tags(self):
        assert strip_html("<p>Hello</p>") == "Hello"
    
    def test_nested_tags(self):
        assert strip_html("<div><p>Hello <b>World</b></p></div>") == "Hello World"
    
    def test_no_html(self):
        assert strip_html("Plain text") == "Plain text"
    
    def test_empty(self):
        assert strip_html("") == ""


class TestCleanText:
    def test_combined_cleaning(self):
        result = clean_text("<p>Hello    World</p>")
        assert result == "Hello World"
    
    def test_preserves_content(self):
        result = clean_text("Copper prices rose 3.5% today")
        assert "3.5%" in result


class TestCanonicalTitle:
    def test_lowercase(self):
        assert canonical_title("HELLO World") == "hello world"
    
    def test_punctuation_removed(self):
        result = canonical_title("Hello, World! How's it going?")
        assert "," not in result
        assert "!" not in result
        assert "'" not in result
    
    def test_whitespace_normalized(self):
        result = canonical_title("Hello   World")
        assert result == "hello world"


class TestNormalizeUrl:
    def test_removes_tracking_params(self):
        url = "https://example.com/news?utm_source=twitter&id=123"
        result = normalize_url(url)
        assert "utm_source" not in result
        assert "id=123" in result
    
    def test_removes_fragment(self):
        url = "https://example.com/page#section"
        result = normalize_url(url)
        assert "#section" not in result
    
    def test_lowercase_domain(self):
        url = "https://EXAMPLE.COM/Page"
        result = normalize_url(url)
        assert "example.com" in result
    
    def test_empty_url(self):
        assert normalize_url("") == ""


class TestGenerateDedupKey:
    def test_url_based_key(self):
        key1 = generate_dedup_key(url="https://example.com/news/1")
        key2 = generate_dedup_key(url="https://example.com/news/1")
        assert key1 == key2
    
    def test_different_urls_different_keys(self):
        key1 = generate_dedup_key(url="https://example.com/news/1")
        key2 = generate_dedup_key(url="https://example.com/news/2")
        assert key1 != key2
    
    def test_content_based_fallback(self):
        key = generate_dedup_key(
            title="Copper prices rise",
            published_at=datetime(2026, 1, 1, tzinfo=timezone.utc),
            source="Reuters"
        )
        assert len(key) == 32
    
    def test_key_length(self):
        key = generate_dedup_key(url="https://example.com/test")
        assert len(key) == 32


class TestTruncateText:
    def test_short_text_unchanged(self):
        text = "Short text"
        assert truncate_text(text, 100) == text
    
    def test_long_text_truncated(self):
        text = "This is a very long text that should be truncated"
        result = truncate_text(text, 20)
        assert len(result) <= 20
        assert result.endswith("...")
    
    def test_empty_text(self):
        assert truncate_text("", 10) == ""
    
    def test_none_text(self):
        assert truncate_text(None, 10) == ""


class TestSafeParseDate:
    def test_iso_format(self):
        result = safe_parse_date("2026-01-15T10:30:00Z")
        assert result.year == 2026
        assert result.month == 1
        assert result.day == 15
    
    def test_date_only(self):
        result = safe_parse_date("2026-01-15")
        assert result.year == 2026
    
    def test_invalid_returns_none(self):
        result = safe_parse_date("not a date")
        assert result is None
    
    def test_empty_returns_none(self):
        result = safe_parse_date("")
        assert result is None