File size: 10,344 Bytes
b95e73a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
"""
Tests for Anomaly Detection Module

Comprehensive test suite for anomaly detector.
"""

import pytest
import asyncio
from typing import List, Dict, Any

import sys
import os
# Prepend the directory two levels above this file's directory to sys.path so
# that the `src` package imported below can be resolved without installing the
# project (presumably this is the repository root -- confirm against layout).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

from src.models.anomaly_detection import AnomalyDetector


class TestAnomalyDetector:
    """Synchronous test suite for AnomalyDetector.

    The detector's ``train`` and ``predict`` methods are awaited through
    ``asyncio.run`` so that each test can stay a plain (non-async) function.
    """

    @pytest.fixture
    def detector(self):
        """Return a fresh, untrained AnomalyDetector for each test."""
        return AnomalyDetector()

    @pytest.fixture
    def sample_contracts(self):
        """Return three sample contracts; CT002 carries a 10x higher value."""
        return [
            {
                "id": "CT001",
                "description": "Aquisição de computadores",
                "value": 50000.0,
                "supplier": "Tech Company A",
                "date": "2024-01-15",
                "organ": "Ministry of Education"
            },
            {
                "id": "CT002",
                "description": "Aquisição de computadores",
                "value": 500000.0,  # Anomaly: 10x higher
                "supplier": "Tech Company B",
                "date": "2024-01-20",
                "organ": "Ministry of Education"
            },
            {
                "id": "CT003",
                "description": "Serviços de consultoria",
                "value": 75000.0,
                "supplier": "Consulting Inc",
                "date": "2024-02-01",
                "organ": "Ministry of Health"
            }
        ]

    def test_detector_initialization(self, detector):
        """Check the model name and the default value threshold after construction."""
        assert detector is not None
        assert detector.model_name == "anomaly_detector"
        assert hasattr(detector, '_thresholds')
        assert detector._thresholds['value_threshold'] == 1000000

    def test_detector_training(self, detector, sample_contracts):
        """Check the training summary and that the trained flag is set."""
        # Run training
        result = asyncio.run(detector.train(sample_contracts))

        assert result['status'] == 'trained'
        assert result['samples'] == len(sample_contracts)
        assert result['model'] == 'anomaly_detector'
        assert detector._is_trained is True

    def test_anomaly_detection_high_value(self, detector, sample_contracts):
        """Check that CT002 is reported as a high-value anomaly."""
        # Train first
        asyncio.run(detector.train(sample_contracts))

        # Run prediction
        results = asyncio.run(detector.predict(sample_contracts))

        # Should detect high value anomaly
        assert len(results) > 0

        # Find the high value contract
        high_value_result = next(
            (r for r in results if r['contract_id'] == 'CT002'),
            None
        )

        assert high_value_result is not None
        assert high_value_result['is_anomaly'] is True
        assert high_value_result['anomaly_type'] == 'high_value'
        assert high_value_result['confidence'] > 0.8

    def test_anomaly_detection_frequency(self, detector):
        """Check that many contracts to one supplier are flagged as suspicious."""
        # Create contracts with same supplier
        contracts = [
            {
                "id": f"CT{i:03d}",
                "description": "Service contract",
                "value": 50000.0,
                "supplier": "Same Supplier LLC",  # All same supplier
                "date": f"2024-01-{i+1:02d}",
                "organ": "Ministry X"
            }
            for i in range(15)  # 15 contracts to same supplier
        ]

        # Add one normal contract
        contracts.append({
            "id": "CT999",
            "description": "Different service",
            "value": 45000.0,
            "supplier": "Other Company",
            "date": "2024-02-01",
            "organ": "Ministry X"
        })

        # Train and predict
        asyncio.run(detector.train(contracts))
        results = asyncio.run(detector.predict(contracts))

        # Should detect frequency anomaly
        frequency_anomalies = [
            r for r in results
            if r.get('anomaly_type') == 'suspicious_frequency'
        ]

        assert len(frequency_anomalies) > 0
        assert frequency_anomalies[0]['supplier'] == 'Same Supplier LLC'

    def test_no_anomalies_normal_data(self, detector):
        """Check that evenly distributed contracts produce fewer than 3 anomalies."""
        # Create normal contracts
        normal_contracts = [
            {
                "id": f"CT{i:03d}",
                "description": f"Service type {i % 3}",
                "value": 50000.0 + (i * 1000),  # Small variations
                "supplier": f"Company {chr(65 + i % 5)}",  # 5 different suppliers
                "date": f"2024-01-{(i % 28) + 1:02d}",
                "organ": f"Ministry {i % 3}"
            }
            for i in range(20)
        ]

        # Train and predict
        asyncio.run(detector.train(normal_contracts))
        results = asyncio.run(detector.predict(normal_contracts))

        # Should have few or no anomalies
        anomalies = [r for r in results if r.get('is_anomaly', False)]
        assert len(anomalies) < 3  # Less than 15% anomalies

    def test_empty_data_handling(self, detector):
        """Check that training and predicting on an empty list both succeed."""
        # Train with empty data
        result = asyncio.run(detector.train([]))
        assert result['status'] == 'trained'
        assert result['samples'] == 0

        # Predict with empty data
        results = asyncio.run(detector.predict([]))
        assert results == []

    def test_invalid_data_handling(self, detector):
        """Check that malformed contracts are either tolerated or rejected clearly.

        Two outcomes are accepted: ``predict`` returns a list, or the detector
        raises an exception whose message mentions "invalid" or "error".
        """
        invalid_contracts = [
            {"id": "CT001"},  # Missing required fields
            {"id": "CT002", "value": "not_a_number"},  # Invalid type
            None,  # Null entry
        ]

        # Only the detector calls live inside the ``try``. The success-path
        # assertion sits in ``else`` so that its AssertionError cannot be
        # swallowed by the broad handler below and re-judged by message text.
        try:
            asyncio.run(detector.train(invalid_contracts))
            results = asyncio.run(detector.predict(invalid_contracts))
        except Exception as e:
            # The detector's exception type is not known here, hence the
            # deliberately broad catch; the message must still be meaningful.
            message = str(e).lower()
            assert "invalid" in message or "error" in message
        else:
            # Should either skip invalid entries or return empty
            assert isinstance(results, list)

    def test_threshold_configuration(self):
        """Check that a replaced ``_thresholds`` mapping is stored as given."""
        # Create detector with custom thresholds
        custom_detector = AnomalyDetector()
        custom_detector._thresholds = {
            "value_threshold": 100000,  # Lower threshold
            "frequency_threshold": 5,    # Lower frequency
            "pattern_threshold": 0.9     # Higher pattern threshold
        }

        assert custom_detector._thresholds['value_threshold'] == 100000
        assert custom_detector._thresholds['frequency_threshold'] == 5
        assert custom_detector._thresholds['pattern_threshold'] == 0.9

    @pytest.mark.parametrize("num_contracts,expected_performance", [
        (10, 0.1),      # 10 contracts should process in < 0.1s
        (100, 0.5),     # 100 contracts should process in < 0.5s
        (1000, 2.0),    # 1000 contracts should process in < 2s
    ])
    def test_performance(self, detector, num_contracts, expected_performance):
        """Check that prediction finishes within the time budget for each size.

        Args:
            detector: Fresh AnomalyDetector fixture.
            num_contracts: Number of synthetic contracts to predict on.
            expected_performance: Upper bound, in seconds, for the predict call.
        """
        import time

        # Generate test data
        contracts = [
            {
                "id": f"CT{i:06d}",
                "description": f"Contract {i}",
                "value": 50000.0 + (i * 100),
                "supplier": f"Company {i % 20}",
                "date": f"2024-01-{(i % 28) + 1:02d}",
                "organ": f"Ministry {i % 5}"
            }
            for i in range(num_contracts)
        ]

        # Training is excluded from the measured interval
        asyncio.run(detector.train(contracts[:100]))  # Train on subset

        # perf_counter is monotonic and high-resolution; time.time() follows
        # the wall clock and can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        results = asyncio.run(detector.predict(contracts))
        elapsed_time = time.perf_counter() - start_time

        assert elapsed_time < expected_performance
        assert len(results) <= len(contracts)


@pytest.mark.asyncio
class TestAsyncAnomalyDetector:
    """Coroutine-based tests that await AnomalyDetector directly."""

    async def test_concurrent_predictions(self):
        """Gather several predict() calls at once; each must yield a list."""
        detector = AnomalyDetector()

        # Build five batches of ten contracts; value and organ vary per batch
        batches = []
        for batch_no in range(5):
            batch = []
            for idx in range(10):
                batch.append({
                    "id": f"SET{batch_no}-CT{idx:03d}",
                    "description": f"Contract {idx}",
                    "value": 50000.0 * (batch_no + 1),
                    "supplier": f"Company {idx}",
                    "date": "2024-01-15",
                    "organ": f"Ministry {batch_no}"
                })
            batches.append(batch)

        # Only the first batch is used for training
        await detector.train(batches[0])

        # Create every prediction coroutine up front, then await them together
        pending = []
        for batch in batches:
            pending.append(detector.predict(batch))
        outcomes = await asyncio.gather(*pending)

        # One result per batch, and each result is a list
        assert len(outcomes) == 5
        for outcome in outcomes:
            assert isinstance(outcome, list)

    async def test_model_state_persistence(self):
        """Repeated predict() calls must leave the trained flag set."""
        detector = AnomalyDetector()

        # Fifty uniform contracts that differ only by id and supplier
        training_set = []
        for idx in range(50):
            training_set.append({
                "id": f"CT{idx:03d}",
                "description": "Initial contract",
                "value": 100000.0,
                "supplier": f"Company {idx}",
                "date": "2024-01-01",
                "organ": "Ministry A"
            })

        await detector.train(training_set)
        assert detector._is_trained is True

        # Predict ten times on the first ten contracts, checking the flag each time
        remaining = 10
        while remaining > 0:
            await detector.predict(training_set[:10])
            assert detector._is_trained is True
            remaining -= 1