Spaces:

ifieryarrows
/

copper-mind

Running

App Files Files Community

copper-mind / tests /test_data_ingestion.py

ifieryarrows

Sync from GitHub

a9fae67 verified 3 months ago

raw

history blame contribute delete

7.66 kB

	"""
	Tests for data ingestion and management.
	"""

	import pytest
	from datetime import datetime, timezone, timedelta
	from unittest.mock import patch, MagicMock


	class TestLanguageDetection:
	    """Checks for ``detect_language`` from ``app.data_manager``."""

	    def test_detect_english(self):
	        """An English sentence is reported as ``"en"``."""
	        from app.data_manager import detect_language

	        assert detect_language("Copper prices rose sharply today") == "en"

	    def test_detect_non_english(self):
	        """A German sentence is not reported as ``"en"``."""
	        from app.data_manager import detect_language

	        german_sentence = "Die Kupferpreise sind heute gestiegen"
	        detected = detect_language(german_sentence)

	        assert detected != "en"

	    def test_detect_empty_text(self):
	        """The empty string yields ``None``."""
	        from app.data_manager import detect_language

	        assert detect_language("") is None

	    def test_detect_short_text(self):
	        """A two-character input is handled without raising."""
	        from app.data_manager import detect_language

	        detected = detect_language("Hi")

	        # Very short text may not be detectable, so only the result's
	        # type is pinned: either a language code string or None.
	        assert isinstance(detected, str) or detected is None


	class TestLanguageFiltering:
	    """Tests for ``filter_by_language`` from ``app.data_manager``.

	    Both tests unpack the call's result as ``(filtered, count)``; the
	    assertions treat ``count`` as the number of articles that were
	    dropped (0 when everything is kept, 1 when one article is removed).
	    """

	    def test_filter_keeps_english(self):
	        """All-English input comes back intact with a count of zero.

	        The articles are built inline, so the test requests no fixture.
	        """
	        from app.data_manager import filter_by_language

	        articles = [
	            {"title": "Copper prices rise", "description": "Copper up today"},
	            {"title": "Mining output increases", "description": "Good news"},
	        ]

	        filtered, count = filter_by_language(articles, "en")

	        assert len(filtered) == 2
	        assert count == 0

	    def test_filter_removes_non_english(self):
	        """A German article is removed when filtering for ``"en"``."""
	        from app.data_manager import filter_by_language

	        articles = [
	            {"title": "Copper prices rise", "description": "Copper up today"},
	            {"title": "Kupferpreise steigen", "description": "Kupfer heute höher"},
	        ]

	        filtered, count = filter_by_language(articles, "en")

	        # One English article survives; the German one is counted as dropped.
	        assert len(filtered) == 1
	        assert count == 1


	class TestFuzzyDeduplication:
	    """Checks for ``is_fuzzy_duplicate`` title matching at threshold 85."""

	    def test_exact_duplicate(self):
	        """A title identical to a known one counts as a duplicate."""
	        from app.data_manager import is_fuzzy_duplicate

	        headline = "Copper prices surge on supply concerns"
	        known_titles = ["Copper prices surge on supply concerns"]

	        assert is_fuzzy_duplicate(headline, known_titles, threshold=85) is True

	    def test_similar_titles(self):
	        """A title differing by one word still counts as a duplicate."""
	        from app.data_manager import is_fuzzy_duplicate

	        known_titles = ["Copper prices surge on supply concerns"]
	        # Same headline with "surge" swapped for "rise"
	        near_match = "Copper prices rise on supply concerns"

	        assert is_fuzzy_duplicate(near_match, known_titles, threshold=85) is True

	    def test_different_titles(self):
	        """A title on an unrelated topic is not a duplicate."""
	        from app.data_manager import is_fuzzy_duplicate

	        known_titles = ["Copper prices surge on supply concerns"]
	        unrelated = "Gold reaches new all-time high"

	        assert is_fuzzy_duplicate(unrelated, known_titles, threshold=85) is False

	    def test_empty_existing_titles(self):
	        """With nothing to compare against, no title is a duplicate."""
	        from app.data_manager import is_fuzzy_duplicate

	        assert is_fuzzy_duplicate("Any title here", [], threshold=85) is False


	class TestRSSParsing:
	    """Checks for building the RSS search URL."""

	    def test_rss_query_building(self):
	        """Assemble a Google News RSS search URL and check its query parts."""
	        from urllib.parse import quote

	        language = "en"
	        search_terms = "copper OR copper price OR copper futures"

	        # Percent-encode the search terms before embedding them in the URL
	        encoded_query = quote(search_terms)
	        url = f"https://news.google.com/rss/search?q={encoded_query}&hl={language}&gl=US&ceid=US:en"

	        for expected_fragment in ("copper", "hl=en"):
	            assert expected_fragment in url


	class TestPriceIngestion:
	    """Tests for price data ingestion."""

	    def test_symbol_parsing(self):
	        """A comma-separated symbol string splits into its four tickers."""
	        symbols_str = "HG=F,DX-Y.NYB,CL=F,FXI"
	        symbols = symbols_str.split(",")

	        assert len(symbols) == 4
	        assert "HG=F" in symbols
	        assert "DX-Y.NYB" in symbols

	    def test_lookback_calculation(self):
	        """Subtracting a 365-day lookback yields a window of exactly 365 days."""
	        lookback_days = 365
	        end_date = datetime.now(timezone.utc)
	        start_date = end_date - timedelta(days=lookback_days)

	        delta = end_date - start_date
	        assert delta.days == lookback_days

	    def test_price_bar_fields(self):
	        """A sample price bar carries every required field."""
	        required_fields = ["date", "open", "high", "low", "close", "volume"]

	        sample_bar = {
	            # Timezone-aware like test_lookback_calculation, so the file
	            # does not mix naive and aware timestamps.
	            "date": datetime.now(timezone.utc),
	            "open": 4.0,
	            "high": 4.1,
	            "low": 3.9,
	            "close": 4.05,
	            "volume": 50000,
	        }

	        # Collect every absent field so a failure reports all of them at once.
	        missing = [field for field in required_fields if field not in sample_bar]
	        assert not missing, f"missing price bar fields: {missing}"


	class TestDatabaseUpsert:
	"""Tests for database upsert logic."""

	def test_upsert_key_generation(self):
	"""Test unique key generation for upsert."""
	from app.utils import generate_dedup_key

	# Same URL should give same key
	url = "https://example.com/article/123"
	key1 = generate_dedup_key("Title 1", url)
	key2 = generate_dedup_key("Title 2", url)

	# Keys based on URL should be consistent
	# (depends on implementation - may include title or not)
	assert isinstance(key1, str)
	assert isinstance(key2, str)

	def test_date_normalization(self):
	"""Test date normalization for comparison."""
	dt1 = datetime(2026, 1, 1, 10, 30, 0, tzinfo=timezone.utc)
	dt2 = datetime(2026, 1, 1, 14, 45, 0, tzinfo=timezone.utc)

	# Same date, different time
	date1 = dt1.date()
	date2 = dt2.date()

	assert date1 == date2


	class TestDataValidation:
	    """Tests for data validation."""

	    def test_price_validation(self):
	        """Every sample price is strictly positive."""
	        prices = [4.0, 4.1, 4.05, 3.95]

	        assert all(p > 0 for p in prices)

	    def test_volume_validation(self):
	        """Every sample volume is non-negative (zero is allowed)."""
	        volumes = [50000, 0, 100000]

	        assert all(v >= 0 for v in volumes)

	    def test_date_validation(self):
	        """A fixed historical date is not later than the current UTC time."""
	        # datetime and timezone come from the module-level import.
	        test_date = datetime(2025, 1, 1, tzinfo=timezone.utc)
	        now = datetime.now(timezone.utc)

	        # Asserted unconditionally so a future date would actually fail;
	        # both operands are timezone-aware, so the comparison is valid.
	        assert test_date <= now

	    def test_sentiment_score_range(self):
	        """Every sample sentiment score lies within [-1, 1]."""
	        scores = [0.5, -0.3, 0.8, -0.9, 0.0]

	        assert all(-1 <= s <= 1 for s in scores)