Spaces:

solarevat
/

multilabel-news-classifier

Sleeping

multilabel-news-classifier / tests / test_analytics.py

Solareva Taisia

chore(release): initial public snapshot

198ccb0 2 months ago

13.7 kB

	"""Tests for advanced analytics."""

	import pytest
	import pandas as pd
	import numpy as np
	from analysis.predictive_intervals import (
	calculate_predictive_interval,
	rank_by_predictive_interval,
	get_top_positive_by_interval,
	get_top_negative_by_interval,
	calculate_intervals_for_dataframe
	)
	from analysis.category_analytics import CategoryAnalytics
	from analysis.thread_analysis import ThreadAnalyzer


	class TestPredictiveIntervals:
	    """Checks for the helpers imported from ``analysis.predictive_intervals``."""

	    @staticmethod
	    def _items(*count_pairs):
	        """Build fresh ``item1``, ``item2``, ... records from (positive, negative) pairs.

	        A new list of new dicts is returned on every call, so each test
	        hands its own objects to the function under test.
	        """
	        return [
	            {"id": f"item{position}", "positive_count": positives, "negative_count": negatives}
	            for position, (positives, negatives) in enumerate(count_pairs, start=1)
	        ]

	    def test_calculate_predictive_interval_high_positive(self):
	        """80 positive vs 20 negative: value lies in [0, 1] and exceeds 0.5."""
	        counts = {"positive_count": 80, "negative_count": 20, "neutral_count": 0}
	        score = calculate_predictive_interval(**counts)

	        assert 0.0 <= score <= 1.0
	        # A mostly positive sample is expected to land in the upper half.
	        assert score > 0.5

	    def test_calculate_predictive_interval_high_negative(self):
	        """20 positive vs 80 negative: value lies in [0, 1] and is below 0.5."""
	        counts = {"positive_count": 20, "negative_count": 80, "neutral_count": 0}
	        score = calculate_predictive_interval(**counts)

	        assert 0.0 <= score <= 1.0
	        # A mostly negative sample is expected to land in the lower half.
	        assert score < 0.5

	    def test_calculate_predictive_interval_small_sample(self):
	        """One positive comment must score lower than one hundred positive comments."""

	        def all_positive(how_many):
	            return calculate_predictive_interval(
	                positive_count=how_many,
	                negative_count=0,
	                neutral_count=0,
	            )

	        single_comment = all_positive(1)
	        hundred_comments = all_positive(100)

	        # Less evidence means more uncertainty, hence the smaller value.
	        assert single_comment < hundred_comments

	    def test_calculate_predictive_interval_confidence_levels(self):
	        """The value must not increase as ``confidence_level`` goes 0.90 -> 0.95 -> 0.99."""
	        at_90, at_95, at_99 = [
	            calculate_predictive_interval(
	                positive_count=80,
	                negative_count=20,
	                confidence_level=level,
	            )
	            for level in (0.90, 0.95, 0.99)
	        ]

	        # Higher confidence is expected to give a lower, more conservative value.
	        assert at_90 >= at_95 >= at_99

	    def test_rank_by_predictive_interval(self):
	        """Ranking keeps all three items, annotates them, and puts the best first."""
	        ordered = rank_by_predictive_interval(
	            self._items((80, 20), (1, 0), (50, 50))
	        )

	        assert len(ordered) == 3
	        head, tail = ordered[0], ordered[-1]
	        assert "predictive_interval" in head
	        assert head["predictive_interval"] >= tail["predictive_interval"]

	    def test_get_top_positive_by_interval(self):
	        """``top_k=2`` returns two items, the first scoring at least as high as the second."""
	        best = get_top_positive_by_interval(
	            self._items((80, 20), (1, 0), (50, 50)),
	            top_k=2,
	        )

	        assert len(best) == 2
	        assert best[0]["predictive_interval"] >= best[1]["predictive_interval"]

	    def test_get_top_negative_by_interval(self):
	        """``top_k=2`` returns two items, the first scoring no higher than the second."""
	        worst = get_top_negative_by_interval(
	            self._items((20, 80), (0, 1), (50, 50)),
	            top_k=2,
	        )

	        assert len(worst) == 2
	        # The most negative item is expected to carry the lowest value.
	        assert worst[0]["predictive_interval"] <= worst[1]["predictive_interval"]

	    def test_calculate_intervals_for_dataframe(self):
	        """A ``predictive_interval`` column is added and row 0 (80/20) beats row 1 (20/80)."""
	        counts_frame = pd.DataFrame({
	            "id": ["item1", "item2"],
	            "positive_count": [80, 20],
	            "negative_count": [20, 80],
	        })

	        scored = calculate_intervals_for_dataframe(counts_frame)

	        assert "predictive_interval" in scored.columns
	        assert len(scored) == 2
	        mostly_positive = scored.loc[0, "predictive_interval"]
	        mostly_negative = scored.loc[1, "predictive_interval"]
	        assert mostly_positive > mostly_negative


	class TestCategoryAnalytics:
	    """Checks for ``CategoryAnalytics`` from ``analysis.category_analytics``."""

	    @pytest.fixture
	    def sample_data(self):
	        """Return three category/text records: two "politics" and one "economy"."""
	        rows = (
	            ("politics", "Отличная новость!"),
	            ("politics", "Ужасная ситуация..."),
	            ("economy", "Нормально"),
	        )
	        return [{"category": category, "text": text} for category, text in rows]

	    def test_category_analytics_initialization(self):
	        """A default-constructed instance exposes a non-None ``analyzer`` attribute."""
	        instance = CategoryAnalytics()

	        assert instance is not None
	        assert instance.analyzer is not None

	    def test_analyze_category_sentiment(self, sample_data):
	        """Per-category stats cover both categories and carry the expected keys."""
	        # NOTE: this calls the real analyzer, so it may be slow; mocking the
	        # analyzer would make the test faster.
	        per_category = CategoryAnalytics().analyze_category_sentiment(sample_data)

	        for category in ("politics", "economy"):
	            assert category in per_category

	        politics = per_category["politics"]
	        for field in ("total_comments", "positive_count", "predictive_interval"):
	            assert field in politics

	    def test_rank_categories_by_sentiment(self, sample_data):
	        """Ranking by ``predictive_interval`` yields at least one annotated entry."""
	        engine = CategoryAnalytics()
	        per_category = engine.analyze_category_sentiment(sample_data)

	        ordered = engine.rank_categories_by_sentiment(
	            per_category,
	            sort_by="predictive_interval",
	        )

	        assert len(ordered) >= 1
	        leader = ordered[0]
	        assert "category" in leader
	        assert "predictive_interval" in leader

	    def test_get_top_positive_categories(self, sample_data):
	        """At most ``top_k`` categories come back, the best one first."""
	        engine = CategoryAnalytics()
	        per_category = engine.analyze_category_sentiment(sample_data)

	        best = engine.get_top_positive_categories(per_category, top_k=5)

	        assert len(best) <= 5
	        if len(best) <= 1:
	            # Nothing to compare when fewer than two categories are returned.
	            return
	        assert best[0]["predictive_interval"] >= best[1]["predictive_interval"]

	    def test_analyze_from_dataframe(self):
	        """DataFrame input produces a DataFrame with category and interval columns."""
	        comments = pd.DataFrame({
	            "category": ["politics", "politics", "economy"],
	            "text": ["Отлично!", "Ужасно!", "Нормально"],
	        })

	        summary = CategoryAnalytics().analyze_from_dataframe(comments)

	        assert isinstance(summary, pd.DataFrame)
	        for column in ("category", "predictive_interval"):
	            assert column in summary.columns


	class TestThreadAnalyzer:
	    """Checks for ``ThreadAnalyzer`` from ``analysis.thread_analysis``."""

	    @pytest.fixture
	    def sample_data(self):
	        """Return three comments: two under news "1" and one under news "2"."""
	        rows = (
	            ("1", "1", "Отлично!"),
	            ("1", "2", "Ужасно!"),
	            ("2", "3", "Нормально"),
	        )
	        return [
	            {"news_id": news_id, "id": comment_id, "text": text}
	            for news_id, comment_id, text in rows
	        ]

	    def test_thread_analyzer_initialization(self):
	        """A default-constructed instance exposes a non-None ``analyzer`` attribute."""
	        instance = ThreadAnalyzer()

	        assert instance is not None
	        assert instance.analyzer is not None

	    def test_calculate_thread_lengths(self, sample_data):
	        """Thread lengths are keyed by news id and count that thread's comments."""
	        lengths = ThreadAnalyzer().calculate_thread_lengths(sample_data)
	        expected = {"1": 2, "2": 1}

	        for news_id in expected:
	            assert news_id in lengths
	        for news_id, comment_count in expected.items():
	            assert lengths[news_id] == comment_count

	    def test_calculate_temperature(self, sample_data):
	        """Every thread gets a temperature within [0, 1]."""
	        by_thread = ThreadAnalyzer().calculate_temperature(sample_data)
	        thread_ids = ("1", "2")

	        for news_id in thread_ids:
	            assert news_id in by_thread
	        for news_id in thread_ids:
	            assert 0.0 <= by_thread[news_id] <= 1.0

	    def test_analyze_correlation(self):
	        """The correlation report has all expected keys with values in valid ranges."""
	        lengths = {"1": 10, "2": 5, "3": 20}
	        heat = {"1": 0.3, "2": 0.5, "3": 0.7}

	        report = ThreadAnalyzer().analyze_correlation(lengths, heat)

	        for field in ("correlation", "p_value", "significant", "sample_size"):
	            assert field in report
	        assert -1.0 <= report["correlation"] <= 1.0
	        assert 0.0 <= report["p_value"] <= 1.0

	    def test_analyze_correlation_insufficient_data(self):
	        """Disjoint thread ids leave fewer than two samples to correlate."""
	        lengths = {"1": 10}
	        heat = {"2": 0.5}  # shares no key with ``lengths``

	        report = ThreadAnalyzer().analyze_correlation(lengths, heat)

	        assert report.get("sample_size", 0) < 2
	        # Either an explicit error entry or a zero correlation is accepted.
	        assert "error" in report or report.get("correlation", 0) == 0.0

	    def test_analyze_from_dataframe(self):
	        """DataFrame input yields a per-thread stats frame plus a correlation dict."""
	        comments = pd.DataFrame({
	            "news_id": ["1", "1", "2"],
	            "text": ["Отлично!", "Ужасно!", "Нормально"],
	        })

	        per_thread, report = ThreadAnalyzer().analyze_from_dataframe(comments)

	        assert isinstance(per_thread, pd.DataFrame)
	        for column in ("news_id", "thread_length", "temperature"):
	            assert column in per_thread.columns
	        assert isinstance(report, dict)
	        assert "correlation" in report


	class TestAnalyticsAPI:
	    """Test suite for analytics API endpoints.

	    The POST endpoint tests tolerate an HTTP 500 response (the original
	    rationale: the model may not be loaded in the test environment).
	    Such a response is reported as a *skip* rather than a pass, so the
	    test report shows when the response body was never checked.
	    """

	    @pytest.fixture
	    def client(self):
	        """Create a ``TestClient`` bound to the application in ``api.main``."""
	        # Imported inside the fixture so an import failure only affects the
	        # tests that request this fixture.
	        from fastapi.testclient import TestClient
	        from api.main import app
	        return TestClient(app)

	    @staticmethod
	    def _post_json_or_skip(client, url, payload):
	        """POST ``payload`` as JSON to ``url`` and return the decoded 200 body.

	        Args:
	            client: Test client exposing ``post(url, json=...)``.
	            url: Endpoint path to call.
	            payload: JSON-serialisable request body.

	        Returns:
	            The decoded JSON body of the response.

	        An HTTP 500 skips the calling test; any status other than 200
	        or 500 fails it, as before.
	        """
	        response = client.post(url, json=payload)

	        if response.status_code == 500:
	            # Previously this counted as a pass with no body assertions run.
	            pytest.skip(f"{url} returned HTTP 500 (model may not be loaded)")

	        assert response.status_code == 200
	        return response.json()

	    def test_predictive_intervals_endpoint(self, client):
	        """Test predictive intervals endpoint."""
	        request_data = {
	            "data": [
	                {"id": "item1", "positive_count": 80, "negative_count": 20, "neutral_count": 0},
	                {"id": "item2", "positive_count": 1, "negative_count": 0, "neutral_count": 0},
	            ],
	            "confidence_level": 0.95,
	        }

	        data = self._post_json_or_skip(
	            client, "/analytics/predictive-intervals", request_data
	        )

	        assert "ranked_data" in data
	        assert "top_positive" in data
	        assert "top_negative" in data

	    def test_category_sentiment_endpoint(self, client):
	        """Test category sentiment endpoint."""
	        request_data = {
	            "data": [
	                {"category": "politics", "text": "Отлично!"},
	                {"category": "politics", "text": "Ужасно!"},
	                {"category": "economy", "text": "Нормально"},
	            ]
	        }

	        data = self._post_json_or_skip(
	            client, "/analytics/category-sentiment", request_data
	        )

	        assert "category_stats" in data
	        assert "top_positive_categories" in data
	        assert "top_negative_categories" in data

	    def test_thread_analysis_endpoint(self, client):
	        """Test thread analysis endpoint."""
	        request_data = {
	            "data": [
	                {"news_id": "1", "text": "Отлично!"},
	                {"news_id": "1", "text": "Ужасно!"},
	                {"news_id": "2", "text": "Нормально"},
	            ]
	        }

	        data = self._post_json_or_skip(
	            client, "/analytics/thread-analysis", request_data
	        )

	        assert "thread_stats" in data
	        assert "correlation" in data
	        assert "correlation" in data["correlation"]

	    def test_analytics_health(self, client):
	        """Test analytics health endpoint."""
	        response = client.get("/analytics/health")

	        # The health endpoint is required to answer 200; no skip applies here.
	        assert response.status_code == 200
	        data = response.json()
	        assert "status" in data