""" Tests for data normalization and deduplication utilities. """ import pytest from datetime import datetime, timezone from app.utils import ( normalize_whitespace, strip_html, clean_text, canonical_title, normalize_url, generate_dedup_key, truncate_text, safe_parse_date, ) class TestNormalizeWhitespace: def test_multiple_spaces(self): assert normalize_whitespace("hello world") == "hello world" def test_tabs_and_newlines(self): assert normalize_whitespace("hello\t\nworld") == "hello world" def test_leading_trailing(self): assert normalize_whitespace(" hello world ") == "hello world" def test_empty_string(self): assert normalize_whitespace("") == "" def test_none_safe(self): assert normalize_whitespace(None) == "" class TestStripHtml: def test_simple_tags(self): assert strip_html("

Hello

") == "Hello" def test_nested_tags(self): assert strip_html("

Hello World

") == "Hello World" def test_no_html(self): assert strip_html("Plain text") == "Plain text" def test_empty(self): assert strip_html("") == "" class TestCleanText: def test_combined_cleaning(self): result = clean_text("

Hello World

") assert result == "Hello World" def test_preserves_content(self): result = clean_text("Copper prices rose 3.5% today") assert "3.5%" in result class TestCanonicalTitle: def test_lowercase(self): assert canonical_title("HELLO World") == "hello world" def test_punctuation_removed(self): result = canonical_title("Hello, World! How's it going?") assert "," not in result assert "!" not in result assert "'" not in result def test_whitespace_normalized(self): result = canonical_title("Hello World") assert result == "hello world" class TestNormalizeUrl: def test_removes_tracking_params(self): url = "https://example.com/news?utm_source=twitter&id=123" result = normalize_url(url) assert "utm_source" not in result assert "id=123" in result def test_removes_fragment(self): url = "https://example.com/page#section" result = normalize_url(url) assert "#section" not in result def test_lowercase_domain(self): url = "https://EXAMPLE.COM/Page" result = normalize_url(url) assert "example.com" in result def test_empty_url(self): assert normalize_url("") == "" class TestGenerateDedupKey: def test_url_based_key(self): key1 = generate_dedup_key(url="https://example.com/news/1") key2 = generate_dedup_key(url="https://example.com/news/1") assert key1 == key2 def test_different_urls_different_keys(self): key1 = generate_dedup_key(url="https://example.com/news/1") key2 = generate_dedup_key(url="https://example.com/news/2") assert key1 != key2 def test_content_based_fallback(self): key = generate_dedup_key( title="Copper prices rise", published_at=datetime(2026, 1, 1, tzinfo=timezone.utc), source="Reuters" ) assert len(key) == 32 def test_key_length(self): key = generate_dedup_key(url="https://example.com/test") assert len(key) == 32 class TestTruncateText: def test_short_text_unchanged(self): text = "Short text" assert truncate_text(text, 100) == text def test_long_text_truncated(self): text = "This is a very long text that should be truncated" result = truncate_text(text, 20) assert len(result) <= 20 assert result.endswith("...") def test_empty_text(self): assert truncate_text("", 10) == "" def test_none_text(self): assert truncate_text(None, 10) == "" class TestSafeParseDate: def test_iso_format(self): result = safe_parse_date("2026-01-15T10:30:00Z") assert result.year == 2026 assert result.month == 1 assert result.day == 15 def test_date_only(self): result = safe_parse_date("2026-01-15") assert result.year == 2026 def test_invalid_returns_none(self): result = safe_parse_date("not a date") assert result is None def test_empty_returns_none(self): result = safe_parse_date("") assert result is None