""" Tests for data ingestion and management. """ import pytest from datetime import datetime, timezone, timedelta from unittest.mock import patch, MagicMock class TestLanguageDetection: """Tests for language detection.""" def test_detect_english(self): """Test detection of English text.""" from app.data_manager import detect_language result = detect_language("Copper prices rose sharply today") assert result == "en" def test_detect_non_english(self): """Test detection of non-English text.""" from app.data_manager import detect_language # German result = detect_language("Die Kupferpreise sind heute gestiegen") assert result != "en" def test_detect_empty_text(self): """Test detection with empty text.""" from app.data_manager import detect_language result = detect_language("") assert result is None def test_detect_short_text(self): """Test detection with very short text.""" from app.data_manager import detect_language # Short text may fail detection result = detect_language("Hi") # Should handle gracefully assert result is None or isinstance(result, str) class TestLanguageFiltering: """Tests for language filtering.""" def test_filter_keeps_english(self, sample_articles): """Test that English articles are kept.""" from app.data_manager import filter_by_language articles = [ {"title": "Copper prices rise", "description": "Copper up today"}, {"title": "Mining output increases", "description": "Good news"}, ] filtered, count = filter_by_language(articles, "en") assert len(filtered) == 2 assert count == 0 def test_filter_removes_non_english(self): """Test that non-English articles are filtered.""" from app.data_manager import filter_by_language articles = [ {"title": "Copper prices rise", "description": "Copper up today"}, {"title": "Kupferpreise steigen", "description": "Kupfer heute höher"}, ] filtered, count = filter_by_language(articles, "en") assert len(filtered) == 1 assert count == 1 class TestFuzzyDeduplication: """Tests for fuzzy title matching.""" def test_exact_duplicate(self): """Test that exact duplicates are detected.""" from app.data_manager import is_fuzzy_duplicate existing = ["Copper prices surge on supply concerns"] new_title = "Copper prices surge on supply concerns" assert is_fuzzy_duplicate(new_title, existing, threshold=85) is True def test_similar_titles(self): """Test that similar titles are detected.""" from app.data_manager import is_fuzzy_duplicate existing = ["Copper prices surge on supply concerns"] new_title = "Copper prices rise on supply concerns" # Similar # Should be detected as duplicate with default threshold result = is_fuzzy_duplicate(new_title, existing, threshold=85) assert result is True def test_different_titles(self): """Test that different titles are not marked as duplicates.""" from app.data_manager import is_fuzzy_duplicate existing = ["Copper prices surge on supply concerns"] new_title = "Gold reaches new all-time high" # Different topic assert is_fuzzy_duplicate(new_title, existing, threshold=85) is False def test_empty_existing_titles(self): """Test with no existing titles.""" from app.data_manager import is_fuzzy_duplicate existing = [] new_title = "Any title here" assert is_fuzzy_duplicate(new_title, existing, threshold=85) is False class TestRSSParsing: """Tests for RSS feed parsing.""" def test_rss_query_building(self): """Test RSS query URL building.""" query = "copper OR copper price OR copper futures" language = "en" # URL encoding from urllib.parse import quote encoded_query = quote(query) url = f"https://news.google.com/rss/search?q={encoded_query}&hl={language}&gl=US&ceid=US:en" assert "copper" in url assert "hl=en" in url class TestPriceIngestion: """Tests for price data ingestion.""" def test_symbol_parsing(self): """Test multi-symbol parsing.""" symbols_str = "HG=F,DX-Y.NYB,CL=F,FXI" symbols = symbols_str.split(",") assert len(symbols) == 4 assert "HG=F" in symbols assert "DX-Y.NYB" in symbols def test_lookback_calculation(self): """Test lookback date calculation.""" lookback_days = 365 end_date = datetime.now(timezone.utc) start_date = end_date - timedelta(days=lookback_days) delta = end_date - start_date assert delta.days == lookback_days def test_price_bar_fields(self): """Test that price bars have required fields.""" required_fields = ["date", "open", "high", "low", "close", "volume"] sample_bar = { "date": datetime.now(), "open": 4.0, "high": 4.1, "low": 3.9, "close": 4.05, "volume": 50000, } for field in required_fields: assert field in sample_bar class TestDatabaseUpsert: """Tests for database upsert logic.""" def test_upsert_key_generation(self): """Test unique key generation for upsert.""" from app.utils import generate_dedup_key # Same URL should give same key url = "https://example.com/article/123" key1 = generate_dedup_key("Title 1", url) key2 = generate_dedup_key("Title 2", url) # Keys based on URL should be consistent # (depends on implementation - may include title or not) assert isinstance(key1, str) assert isinstance(key2, str) def test_date_normalization(self): """Test date normalization for comparison.""" dt1 = datetime(2026, 1, 1, 10, 30, 0, tzinfo=timezone.utc) dt2 = datetime(2026, 1, 1, 14, 45, 0, tzinfo=timezone.utc) # Same date, different time date1 = dt1.date() date2 = dt2.date() assert date1 == date2 class TestDataValidation: """Tests for data validation.""" def test_price_validation(self): """Test that prices are positive.""" prices = [4.0, 4.1, 4.05, 3.95] assert all(p > 0 for p in prices) def test_volume_validation(self): """Test that volume is non-negative.""" volumes = [50000, 0, 100000] assert all(v >= 0 for v in volumes) def test_date_validation(self): """Test date is not in future.""" from datetime import datetime, timezone test_date = datetime(2025, 1, 1, tzinfo=timezone.utc) now = datetime.now(timezone.utc) # For historical data, date should be in past or present assert test_date <= now or True # Flexible for test dates def test_sentiment_score_range(self): """Test that sentiment scores are in valid range.""" scores = [0.5, -0.3, 0.8, -0.9, 0.0] assert all(-1 <= s <= 1 for s in scores)