File size: 3,908 Bytes
38593e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
Unit Tests for Monitoring Module
"""

from pathlib import Path
import tempfile

import numpy as np
import pytest

from turing.monitoring.baseline_manager import (
    BaselineManager,
    extract_baseline_statistics,
)
from turing.monitoring.drift_detector import DriftDetector
from turing.monitoring.synthetic_data_generator import SyntheticDataGenerator


class TestBaselineExtraction:
    """Checks on the dictionary returned by ``extract_baseline_statistics``."""

    @pytest.fixture
    def sample_data(self):
        """Return five short comments together with a 5x5 binary label matrix."""
        comments = [
            "This is a sample comment",
            "Another test comment here",
            "Short text",
            "Longer comment with more information",
            "Medium length comment",
        ]
        # One row of labels per comment above.
        label_rows = [
            [1, 0, 1, 0, 0],
            [0, 1, 0, 1, 0],
            [1, 1, 0, 0, 0],
            [0, 0, 1, 1, 1],
            [1, 0, 0, 0, 1],
        ]
        return comments, np.array(label_rows)

    def test_extract_baseline(self, sample_data):
        """The baseline contains both distributions and echoes its metadata."""
        comments, label_matrix = sample_data
        stats = extract_baseline_statistics(
            X_train=comments,
            y_train=label_matrix,
            language="java",
        )

        for expected_key in ("text_length_distribution", "word_count_distribution"):
            assert expected_key in stats
        assert stats["language"] == "java"
        assert stats["num_samples"] == len(comments)


class TestDriftDetector:
    """Tests for drift detection."""

    @pytest.fixture
    def baseline(self):
        """Return a small hand-written baseline dict of numpy arrays.

        Kept as a shared fixture for tests that need reference statistics;
        tests that do not read it should not request it.
        """
        return {
            "text_length_distribution": np.array([20, 25, 30, 35]),
            "word_count_distribution": np.array([3, 4, 5, 6]),
            "label_counts": np.array([5, 3, 2, 4]),
        }

    def test_detector_init(self):
        """The constructor keeps the p-value threshold it was given."""
        detector = DriftDetector(p_value_threshold=0.05, alert_threshold=0.01)
        assert detector.p_value_threshold == 0.05

    def test_text_length_drift(self):
        """``detect_text_property_drift`` returns a result with the expected keys.

        The detector is fed two text lists directly, so this test does not
        request the ``baseline`` fixture (it was previously requested but
        never read).
        """
        detector = DriftDetector(p_value_threshold=0.05)

        prod_texts = [
            "Very long test comment with lots of additional information",
            "Another extremely long sample text",
            "Yet another quite lengthy comment",
            "More long production text",
        ]

        # Reference texts are the first half of each production text, so the
        # two samples differ clearly in length.
        ref_texts = [text[: len(text) // 2] for text in prod_texts]

        result = detector.detect_text_property_drift(prod_texts, ref_texts)

        assert "drifted" in result
        assert "method" in result


class TestSyntheticDataGenerator:
    """Tests for synthetic data generation."""

    @pytest.fixture
    def sample_data(self):
        """Return four short texts and a matching 1-D array of binary labels."""
        texts = ["This is a sample", "Another test", "Short", "Longer text"]
        labels = np.array([0, 1, 0, 1])
        return texts, labels

    def test_generator_init(self):
        """The generator exposes the seed it was constructed with."""
        gen = SyntheticDataGenerator(seed=42)
        assert gen.seed == 42

    def test_generate_short(self, sample_data):
        """Shortened comments come back in the requested number and are shorter on average."""
        # Only the texts are needed here; the labels half of the fixture is
        # deliberately discarded instead of being bound to an unused name.
        texts, _ = sample_data
        gen = SyntheticDataGenerator(seed=42)

        short = gen.generate_short_comments(texts, ratio=0.5, n_samples=10)

        assert len(short) == 10
        assert np.mean([len(t) for t in short]) < np.mean([len(t) for t in texts])


class TestBaselineManager:
    """Round-trip checks for saving and loading baselines."""

    @pytest.fixture
    def temp_dir(self):
        """Yield a temporary directory as a ``Path``; it is removed after the test."""
        with tempfile.TemporaryDirectory() as scratch:
            yield Path(scratch)

    def test_save_and_load(self, temp_dir):
        """A baseline saved to the local cache can be loaded back by the same keys."""
        store = BaselineManager(
            mlflow_enabled=False,
            local_cache_dir=temp_dir,
        )
        original = {
            "text_length_distribution": [10, 20, 30],
            "label_counts": [5, 3],
            "language": "java",
            "num_samples": 3,
        }

        store.save_baseline(original, "java", "test", "model")
        restored = store.load_baseline("java", "test", "model")

        assert restored["language"] == "java"
        assert restored["num_samples"] == 3


if __name__ == "__main__":
    # pytest.main returns the session's exit code; propagate it so that
    # running this file as a script fails (non-zero status) when a test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))