"""Tests for the anomaly detection module.

Comprehensive test suite for the anomaly detector. ``AnomalyDetector.train``
and ``AnomalyDetector.predict`` are awaited directly in the async tests and
driven through ``asyncio.run`` in the synchronous ones.
"""

import asyncio
import os
import sys
import time
from typing import List, Dict, Any

import pytest

# Make the repository root importable so ``src`` resolves when the tests are
# run from inside the tests directory.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

from src.models.anomaly_detection import AnomalyDetector


class TestAnomalyDetector:
    """Test suite for AnomalyDetector."""

    @pytest.fixture
    def detector(self):
        """Create a fresh anomaly detector instance for each test."""
        return AnomalyDetector()

    @pytest.fixture
    def sample_contracts(self):
        """Return three sample contracts, one of which (CT002) is an outlier."""
        return [
            {
                "id": "CT001",
                "description": "Aquisição de computadores",
                "value": 50000.0,
                "supplier": "Tech Company A",
                "date": "2024-01-15",
                "organ": "Ministry of Education",
            },
            {
                "id": "CT002",
                "description": "Aquisição de computadores",
                "value": 500000.0,  # Anomaly: 10x higher
                "supplier": "Tech Company B",
                "date": "2024-01-20",
                "organ": "Ministry of Education",
            },
            {
                "id": "CT003",
                "description": "Serviços de consultoria",
                "value": 75000.0,
                "supplier": "Consulting Inc",
                "date": "2024-02-01",
                "organ": "Ministry of Health",
            },
        ]

    def test_detector_initialization(self, detector):
        """Test detector is properly initialized."""
        assert detector is not None
        assert detector.model_name == "anomaly_detector"
        assert hasattr(detector, '_thresholds')
        assert detector._thresholds['value_threshold'] == 1000000

    def test_detector_training(self, detector, sample_contracts):
        """Test detector training process."""
        # Run training
        result = asyncio.run(detector.train(sample_contracts))

        assert result['status'] == 'trained'
        assert result['samples'] == len(sample_contracts)
        assert result['model'] == 'anomaly_detector'
        assert detector._is_trained is True

    def test_anomaly_detection_high_value(self, detector, sample_contracts):
        """Test detection of high value anomalies."""
        # Train first
        asyncio.run(detector.train(sample_contracts))

        # Run prediction
        results = asyncio.run(detector.predict(sample_contracts))

        # Should detect high value anomaly
        assert len(results) > 0

        # Find the high value contract
        high_value_result = next(
            (r for r in results if r['contract_id'] == 'CT002'),
            None,
        )

        assert high_value_result is not None
        assert high_value_result['is_anomaly'] is True
        assert high_value_result['anomaly_type'] == 'high_value'
        assert high_value_result['confidence'] > 0.8

    def test_anomaly_detection_frequency(self, detector):
        """Test detection of frequency anomalies."""
        # Create contracts with same supplier
        contracts = [
            {
                "id": f"CT{i:03d}",
                "description": "Service contract",
                "value": 50000.0,
                "supplier": "Same Supplier LLC",  # All same supplier
                "date": f"2024-01-{i+1:02d}",
                "organ": "Ministry X",
            }
            for i in range(15)  # 15 contracts to same supplier
        ]

        # Add one normal contract
        contracts.append({
            "id": "CT999",
            "description": "Different service",
            "value": 45000.0,
            "supplier": "Other Company",
            "date": "2024-02-01",
            "organ": "Ministry X",
        })

        # Train and predict
        asyncio.run(detector.train(contracts))
        results = asyncio.run(detector.predict(contracts))

        # Should detect frequency anomaly
        frequency_anomalies = [
            r for r in results
            if r.get('anomaly_type') == 'suspicious_frequency'
        ]

        assert len(frequency_anomalies) > 0
        assert frequency_anomalies[0]['supplier'] == 'Same Supplier LLC'

    def test_no_anomalies_normal_data(self, detector):
        """Test no anomalies detected in normal data."""
        # Create normal contracts
        normal_contracts = [
            {
                "id": f"CT{i:03d}",
                "description": f"Service type {i % 3}",
                "value": 50000.0 + (i * 1000),  # Small variations
                "supplier": f"Company {chr(65 + i % 5)}",  # 5 different suppliers
                "date": f"2024-01-{(i % 28) + 1:02d}",
                "organ": f"Ministry {i % 3}",
            }
            for i in range(20)
        ]

        # Train and predict
        asyncio.run(detector.train(normal_contracts))
        results = asyncio.run(detector.predict(normal_contracts))

        # Should have few or no anomalies
        anomalies = [r for r in results if r.get('is_anomaly', False)]
        assert len(anomalies) < 3  # Less than 15% anomalies

    def test_empty_data_handling(self, detector):
        """Test handling of empty data."""
        # Train with empty data
        result = asyncio.run(detector.train([]))
        assert result['status'] == 'trained'
        assert result['samples'] == 0

        # Predict with empty data
        results = asyncio.run(detector.predict([]))
        assert results == []

    def test_invalid_data_handling(self, detector):
        """Test handling of invalid data.

        Two outcomes are accepted: the detector copes with the malformed
        entries and returns a list, or it raises an exception whose message
        mentions "invalid" or "error".
        """
        invalid_contracts = [
            {"id": "CT001"},  # Missing required fields
            {"id": "CT002", "value": "not_a_number"},  # Invalid type
            None,  # Null entry
        ]

        # Only the detector calls live inside ``try``. The success-path
        # assertion sits in ``else`` so that a failing assertion is reported
        # as such instead of being caught by the ``except`` below and judged
        # by its message text.
        try:
            asyncio.run(detector.train(invalid_contracts))
            results = asyncio.run(detector.predict(invalid_contracts))
        except Exception as e:
            # Should raise meaningful error
            assert "invalid" in str(e).lower() or "error" in str(e).lower()
        else:
            # Should either skip invalid entries or return empty
            assert isinstance(results, list)

    def test_threshold_configuration(self):
        """Test custom threshold configuration."""
        # Create detector with custom thresholds
        custom_detector = AnomalyDetector()
        custom_detector._thresholds = {
            "value_threshold": 100000,  # Lower threshold
            "frequency_threshold": 5,  # Lower frequency
            "pattern_threshold": 0.9,  # Higher pattern threshold
        }

        assert custom_detector._thresholds['value_threshold'] == 100000
        assert custom_detector._thresholds['frequency_threshold'] == 5
        assert custom_detector._thresholds['pattern_threshold'] == 0.9

    @pytest.mark.parametrize("num_contracts,expected_performance", [
        (10, 0.1),  # 10 contracts should process in < 0.1s
        (100, 0.5),  # 100 contracts should process in < 0.5s
        (1000, 2.0),  # 1000 contracts should process in < 2s
    ])
    def test_performance(self, detector, num_contracts, expected_performance):
        """Test performance with different data sizes.

        Only the ``predict`` call is timed; training on (at most) the first
        100 contracts happens before the clock starts.
        """
        # Generate test data
        contracts = [
            {
                "id": f"CT{i:06d}",
                "description": f"Contract {i}",
                "value": 50000.0 + (i * 100),
                "supplier": f"Company {i % 20}",
                "date": f"2024-01-{(i % 28) + 1:02d}",
                "organ": f"Ministry {i % 5}",
            }
            for i in range(num_contracts)
        ]

        asyncio.run(detector.train(contracts[:100]))  # Train on subset

        # Measure prediction time. perf_counter() is a monotonic,
        # high-resolution clock meant for intervals; time.time() follows the
        # wall clock and can jump if the system time is adjusted.
        start_time = time.perf_counter()
        results = asyncio.run(detector.predict(contracts))
        elapsed_time = time.perf_counter() - start_time

        assert elapsed_time < expected_performance
        assert len(results) <= len(contracts)


@pytest.mark.asyncio
class TestAsyncAnomalyDetector:
    """Async test suite for AnomalyDetector."""

    async def test_concurrent_predictions(self):
        """Test concurrent prediction requests."""
        detector = AnomalyDetector()

        # Create multiple contract sets
        contract_sets = [
            [
                {
                    "id": f"SET{set_id}-CT{i:03d}",
                    "description": f"Contract {i}",
                    "value": 50000.0 * (set_id + 1),
                    "supplier": f"Company {i}",
                    "date": "2024-01-15",
                    "organ": f"Ministry {set_id}",
                }
                for i in range(10)
            ]
            for set_id in range(5)
        ]

        # Train detector
        await detector.train(contract_sets[0])

        # Run concurrent predictions
        tasks = [
            detector.predict(contracts)
            for contracts in contract_sets
        ]
        results = await asyncio.gather(*tasks)

        # All should complete successfully
        assert len(results) == 5
        for result in results:
            assert isinstance(result, list)

    async def test_model_state_persistence(self):
        """Test model state is maintained across predictions."""
        detector = AnomalyDetector()

        # Initial training
        train_data = [
            {
                "id": f"CT{i:03d}",
                "description": "Initial contract",
                "value": 100000.0,
                "supplier": f"Company {i}",
                "date": "2024-01-01",
                "organ": "Ministry A",
            }
            for i in range(50)
        ]

        await detector.train(train_data)
        assert detector._is_trained is True

        # Multiple predictions shouldn't affect trained state
        for _ in range(10):
            await detector.predict(train_data[:10])
            assert detector._is_trained is True