Spaces:

neural-thinker
/

cidadao.ai-models

Sleeping

App Files Files Community

cidadao.ai-models / tests /test_anomaly_detector.py

neural-thinker

feat: initial cidadao.ai-models deployment

b95e73a 4 months ago

raw

history blame

10.3 kB

	"""
	Tests for Anomaly Detection Module

	Comprehensive test suite for the anomaly detector.
	"""

	import pytest
	import asyncio
	from typing import List, Dict, Any

	import sys
	import os
	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

	from src.models.anomaly_detection import AnomalyDetector


	class TestAnomalyDetector:
	    """Synchronous test suite for AnomalyDetector.

	    The detector's ``train`` and ``predict`` are awaited through
	    ``asyncio.run`` so these tests need no async test plugin.
	    """

	    @pytest.fixture
	    def detector(self):
	        """Provide a fresh AnomalyDetector for each test."""
	        return AnomalyDetector()

	    @pytest.fixture
	    def sample_contracts(self):
	        """Provide three contracts, one of which (CT002) is priced 10x its peer."""
	        return [
	            {
	                "id": "CT001",
	                "description": "Aquisição de computadores",
	                "value": 50000.0,
	                "supplier": "Tech Company A",
	                "date": "2024-01-15",
	                "organ": "Ministry of Education"
	            },
	            {
	                "id": "CT002",
	                "description": "Aquisição de computadores",
	                "value": 500000.0,  # Anomaly: 10x higher
	                "supplier": "Tech Company B",
	                "date": "2024-01-20",
	                "organ": "Ministry of Education"
	            },
	            {
	                "id": "CT003",
	                "description": "Serviços de consultoria",
	                "value": 75000.0,
	                "supplier": "Consulting Inc",
	                "date": "2024-02-01",
	                "organ": "Ministry of Health"
	            }
	        ]

	    def test_detector_initialization(self, detector):
	        """Check the default name and value threshold of a new detector."""
	        assert detector is not None
	        assert detector.model_name == "anomaly_detector"
	        assert hasattr(detector, '_thresholds')
	        assert detector._thresholds['value_threshold'] == 1000000

	    def test_detector_training(self, detector, sample_contracts):
	        """Check the summary reported by ``train`` and the trained flag."""
	        result = asyncio.run(detector.train(sample_contracts))

	        assert result['status'] == 'trained'
	        assert result['samples'] == len(sample_contracts)
	        assert result['model'] == 'anomaly_detector'
	        assert detector._is_trained is True

	    def test_anomaly_detection_high_value(self, detector, sample_contracts):
	        """Check that the 10x-priced contract is flagged as ``high_value``."""
	        # Train first
	        asyncio.run(detector.train(sample_contracts))

	        # Run prediction
	        results = asyncio.run(detector.predict(sample_contracts))

	        # Should detect high value anomaly
	        assert len(results) > 0

	        # Find the high value contract
	        high_value_result = next(
	            (r for r in results if r['contract_id'] == 'CT002'),
	            None
	        )

	        assert high_value_result is not None
	        assert high_value_result['is_anomaly'] is True
	        assert high_value_result['anomaly_type'] == 'high_value'
	        assert high_value_result['confidence'] > 0.8

	    def test_anomaly_detection_frequency(self, detector):
	        """Check that a supplier holding 15 of 16 contracts is flagged."""
	        # Create contracts with same supplier
	        contracts = [
	            {
	                "id": f"CT{i:03d}",
	                "description": "Service contract",
	                "value": 50000.0,
	                "supplier": "Same Supplier LLC",  # All same supplier
	                "date": f"2024-01-{i+1:02d}",
	                "organ": "Ministry X"
	            }
	            for i in range(15)  # 15 contracts to same supplier
	        ]

	        # Add one normal contract
	        contracts.append({
	            "id": "CT999",
	            "description": "Different service",
	            "value": 45000.0,
	            "supplier": "Other Company",
	            "date": "2024-02-01",
	            "organ": "Ministry X"
	        })

	        # Train and predict
	        asyncio.run(detector.train(contracts))
	        results = asyncio.run(detector.predict(contracts))

	        # Should detect frequency anomaly
	        frequency_anomalies = [
	            r for r in results
	            if r.get('anomaly_type') == 'suspicious_frequency'
	        ]

	        assert len(frequency_anomalies) > 0
	        assert frequency_anomalies[0]['supplier'] == 'Same Supplier LLC'

	    def test_no_anomalies_normal_data(self, detector):
	        """Check that evenly spread contracts yield at most two anomalies."""
	        # Create normal contracts
	        normal_contracts = [
	            {
	                "id": f"CT{i:03d}",
	                "description": f"Service type {i % 3}",
	                "value": 50000.0 + (i * 1000),  # Small variations
	                "supplier": f"Company {chr(65 + i % 5)}",  # 5 different suppliers
	                "date": f"2024-01-{(i % 28) + 1:02d}",
	                "organ": f"Ministry {i % 3}"
	            }
	            for i in range(20)
	        ]

	        # Train and predict
	        asyncio.run(detector.train(normal_contracts))
	        results = asyncio.run(detector.predict(normal_contracts))

	        # Should have few or no anomalies
	        anomalies = [r for r in results if r.get('is_anomaly', False)]
	        assert len(anomalies) < 3  # At most 2 of 20 contracts, i.e. 10%

	    def test_empty_data_handling(self, detector):
	        """Check that empty input trains with zero samples and predicts ``[]``."""
	        # Train with empty data
	        result = asyncio.run(detector.train([]))
	        assert result['status'] == 'trained'
	        assert result['samples'] == 0

	        # Predict with empty data
	        results = asyncio.run(detector.predict([]))
	        assert results == []

	    def test_invalid_data_handling(self, detector):
	        """Check that malformed contracts are tolerated or rejected clearly.

	        Two outcomes are accepted: ``predict`` returns a list, or the
	        detector raises an exception whose message mentions "invalid" or
	        "error".
	        """
	        invalid_contracts = [
	            {"id": "CT001"},  # Missing required fields
	            {"id": "CT002", "value": "not_a_number"},  # Invalid type
	            None,  # Null entry
	        ]

	        try:
	            asyncio.run(detector.train(invalid_contracts))
	            results = asyncio.run(detector.predict(invalid_contracts))
	        except Exception as e:
	            # The detector's exception type is not known here, so the catch
	            # stays broad; only the message is checked.
	            message = str(e).lower()
	            assert "invalid" in message or "error" in message
	        else:
	            # Kept outside the try body: AssertionError is an Exception, so
	            # a failing type check would otherwise be swallowed by the
	            # handler above and judged by its message instead.
	            assert isinstance(results, list)

	    def test_threshold_configuration(self):
	        """Check that a replaced ``_thresholds`` mapping is stored as given."""
	        # Create detector with custom thresholds
	        custom_detector = AnomalyDetector()
	        custom_detector._thresholds = {
	            "value_threshold": 100000,  # Lower threshold
	            "frequency_threshold": 5,  # Lower frequency
	            "pattern_threshold": 0.9  # Higher pattern threshold
	        }

	        assert custom_detector._thresholds['value_threshold'] == 100000
	        assert custom_detector._thresholds['frequency_threshold'] == 5
	        assert custom_detector._thresholds['pattern_threshold'] == 0.9

	    @pytest.mark.parametrize("num_contracts,expected_performance", [
	        (10, 0.1),  # 10 contracts should process in < 0.1s
	        (100, 0.5),  # 100 contracts should process in < 0.5s
	        (1000, 2.0),  # 1000 contracts should process in < 2s
	    ])
	    def test_performance(self, detector, num_contracts, expected_performance):
	        """Check that prediction finishes within the per-size time budget.

	        Args:
	            detector: Fresh detector supplied by the fixture.
	            num_contracts: Number of synthetic contracts to predict on.
	            expected_performance: Upper bound on prediction time, in seconds.
	        """
	        import time

	        # Generate test data
	        contracts = [
	            {
	                "id": f"CT{i:06d}",
	                "description": f"Contract {i}",
	                "value": 50000.0 + (i * 100),
	                "supplier": f"Company {i % 20}",
	                "date": f"2024-01-{(i % 28) + 1:02d}",
	                "organ": f"Ministry {i % 5}"
	            }
	            for i in range(num_contracts)
	        ]

	        # Training is excluded from the measurement; only predict is timed.
	        asyncio.run(detector.train(contracts[:100]))  # Train on subset

	        # perf_counter is monotonic and high resolution; time.time() follows
	        # the wall clock and can jump if the system clock is adjusted.
	        start_time = time.perf_counter()
	        results = asyncio.run(detector.predict(contracts))
	        elapsed_time = time.perf_counter() - start_time

	        assert elapsed_time < expected_performance
	        assert len(results) <= len(contracts)


	@pytest.mark.asyncio
	class TestAsyncAnomalyDetector:
	    """Coroutine tests that await AnomalyDetector directly."""

	    async def test_concurrent_predictions(self):
	        """Check that five overlapping ``predict`` calls each yield a list."""
	        detector = AnomalyDetector()

	        # Five batches of ten contracts; the value scales with the batch index.
	        batches = []
	        for set_id in range(5):
	            batch = [
	                {
	                    "id": f"SET{set_id}-CT{i:03d}",
	                    "description": f"Contract {i}",
	                    "value": 50000.0 * (set_id + 1),
	                    "supplier": f"Company {i}",
	                    "date": "2024-01-15",
	                    "organ": f"Ministry {set_id}"
	                }
	                for i in range(10)
	            ]
	            batches.append(batch)

	        # Only the first batch is used for training.
	        await detector.train(batches[0])

	        # Create one predict coroutine per batch and await them together.
	        outcomes = await asyncio.gather(
	            *(detector.predict(batch) for batch in batches)
	        )

	        # Every batch must have produced a list result.
	        assert len(outcomes) == 5
	        assert all(isinstance(outcome, list) for outcome in outcomes)

	    async def test_model_state_persistence(self):
	        """Check that ``_is_trained`` stays True across repeated predictions."""
	        detector = AnomalyDetector()

	        # Fifty uniform contracts that differ only in id and supplier.
	        training_batch = [
	            {
	                "id": f"CT{i:03d}",
	                "description": "Initial contract",
	                "value": 100000.0,
	                "supplier": f"Company {i}",
	                "date": "2024-01-01",
	                "organ": "Ministry A"
	            }
	            for i in range(50)
	        ]

	        await detector.train(training_batch)
	        assert detector._is_trained is True

	        # Predict ten times on the first ten contracts, re-checking the
	        # trained flag after every call.
	        for _attempt in range(10):
	            await detector.predict(training_batch[:10])
	            assert detector._is_trained is True