File size: 12,087 Bytes
225af6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
"""
Directional Tests for Skill Classification Model

These tests verify that specific changes to the input lead to PREDICTABLE changes
in the model's predictions. For example:
- Adding skill-specific keywords should increase confidence in related skills
- Removing domain-specific terms should decrease confidence in those domains
- Adding context about a technology should add related skill predictions

Based on Ribeiro et al. (2020) "Beyond Accuracy: Behavioral Testing of NLP models"
"""
import pytest
import numpy as np


@pytest.mark.directional
class TestDirectional:
    """Test suite for directional expectations of the model.

    Each test perturbs a base commit-message-style sentence with
    skill-specific keywords (programming languages, data structures,
    testing, security, DevOps, ...) and asserts that the perturbed input
    does not drastically *reduce* the number of predicted labels.  The
    thresholds (0.5-0.8 of the base label count) are deliberately lenient
    because the exact label schema of the model under test is not visible
    in this file.

    Fixtures (provided elsewhere, presumably a conftest — TODO confirm):
    - ``predict_with_labels(text)``: returns an iterable of label strings.
    - ``predict_text(text)``: returns a sized collection of predictions.
    """

    def test_adding_language_keyword(self, predict_with_labels):
        """
        Test that adding programming language keywords increases language-related predictions.

        Adding "Java" or "Python" should make language skills more likely.
        """
        # Fix: the previously-requested ``predict_text`` fixture was unused
        # here, so it has been dropped from the signature.
        base = "Fixed bug in authentication system"
        with_java = "Fixed bug in Java authentication system"
        with_python = "Fixed bug in Python authentication system"

        pred_base = set(predict_with_labels(base))
        pred_java = set(predict_with_labels(with_java))
        pred_python = set(predict_with_labels(with_python))

        # Check if language-related labels appear (depends on your label schema)
        # Note: Adjust these checks based on actual labels in your dataset
        print(f"\nBase predictions: {pred_base}")
        print(f"With Java: {pred_java}")
        print(f"With Python: {pred_python}")

        # At minimum, predictions should not become drastically worse
        # It's acceptable if predictions stay the same (model might already predict Language)
        assert len(pred_java) >= len(pred_base) * 0.5, (
            "Adding Java should not drastically reduce predictions"
        )
        assert len(pred_python) >= len(pred_base) * 0.5, (
            "Adding Python should not drastically reduce predictions"
        )

    def test_adding_data_structure_keyword(self, predict_with_labels):
        """
        Test that adding data structure keywords increases data structure predictions.
        """
        base = "Implemented search functionality"
        with_hashmap = "Implemented search functionality using HashMap"
        with_tree = "Implemented search functionality using binary tree"

        pred_base = set(predict_with_labels(base))
        pred_hashmap = set(predict_with_labels(with_hashmap))
        pred_tree = set(predict_with_labels(with_tree))

        print(f"\nBase: {pred_base}")
        print(f"With HashMap: {pred_hashmap}")
        print(f"With Tree: {pred_tree}")

        # Adding data structures should increase related predictions
        # pred_hashmap and pred_tree should have more or different labels than base
        # (0.8 threshold: stricter than the language test, since data-structure
        # keywords are additive detail and should rarely suppress labels)
        assert len(pred_hashmap) >= len(pred_base) * 0.8, (
            "Adding HashMap should not drastically reduce predictions"
        )
        assert len(pred_tree) >= len(pred_base) * 0.8, (
            "Adding tree should not drastically reduce predictions"
        )

    def test_adding_error_handling_context(self, predict_with_labels):
        """
        Test that adding error handling keywords increases error handling predictions.
        """
        base = "Updated user login flow"
        with_exception = "Updated user login flow with exception handling"
        with_try_catch = "Updated user login flow with try-catch blocks"

        pred_base = set(predict_with_labels(base))
        pred_exception = set(predict_with_labels(with_exception))
        pred_try_catch = set(predict_with_labels(with_try_catch))

        print(f"\nBase: {pred_base}")
        print(f"With exception: {pred_exception}")
        print(f"With try-catch: {pred_try_catch}")

        # Error handling keywords should not drastically reduce predictions
        # Check if "Error Handling" is in predictions (likely already there).
        # This is a soft, informational check: it is printed, not asserted,
        # because the label schema is not guaranteed to contain these terms.
        has_error_handling = any("Error" in label or "Exception" in label
                                  for label in pred_exception | pred_try_catch)

        assert len(pred_exception) >= len(pred_base) * 0.5, (
            "Adding error handling context should not drastically reduce predictions"
        )

        # At least one prediction should contain error-related terms
        print(f"Has error handling related labels: {has_error_handling}")

    def test_removing_specific_technology(self, predict_text):
        """
        Test that removing technology-specific keywords reduces related predictions.
        """
        with_tech = "Fixed database connection pooling issue in PostgreSQL"
        without_tech = "Fixed database connection pooling issue"

        pred_with = predict_text(with_tech)
        pred_without = predict_text(without_tech)

        # Predictions should differ when removing specific technology
        # The version with specific tech should generally have same or more predictions
        assert len(pred_with) >= len(pred_without) * 0.7, (
            "Removing technology specifics should not drastically increase predictions"
        )

    def test_adding_api_context(self, predict_with_labels):
        """
        Test that adding API-related keywords increases API/web service predictions.
        """
        base = "Fixed user authentication"
        with_api = "Fixed user authentication REST API endpoint"
        with_graphql = "Fixed user authentication GraphQL endpoint"

        pred_base = set(predict_with_labels(base))
        pred_api = set(predict_with_labels(with_api))
        pred_graphql = set(predict_with_labels(with_graphql))

        print(f"\nBase: {pred_base}")
        print(f"With REST API: {pred_api}")
        print(f"With GraphQL: {pred_graphql}")

        # API keywords should not drastically reduce predictions
        assert len(pred_api) >= len(pred_base) * 0.5, (
            "Adding REST API should not drastically reduce predictions"
        )
        assert len(pred_graphql) >= len(pred_base) * 0.5, (
            "Adding GraphQL should not drastically reduce predictions"
        )

    def test_adding_testing_keywords(self, predict_with_labels):
        """
        Test that adding testing-related keywords increases testing skill predictions.
        """
        base = "Implemented new feature for user management"
        with_tests = "Implemented new feature for user management with unit tests"
        with_integration = "Implemented new feature for user management with integration tests"

        pred_base = set(predict_with_labels(base))
        pred_unit = set(predict_with_labels(with_tests))
        pred_integration = set(predict_with_labels(with_integration))

        print(f"\nBase: {pred_base}")
        print(f"With unit tests: {pred_unit}")
        print(f"With integration tests: {pred_integration}")

        # Testing keywords should not drastically reduce predictions
        # Check if testing-related labels are present (soft check, printed only)
        has_testing = any("Test" in label or "Automated" in label
                          for label in pred_unit | pred_integration)

        assert len(pred_unit) >= len(pred_base) * 0.5, (
            "Adding testing keywords should not drastically reduce predictions"
        )

        print(f"Has testing related labels: {has_testing}")

    def test_adding_performance_keywords(self, predict_with_labels):
        """
        Test that adding performance-related keywords affects predictions.
        """
        base = "Optimized search algorithm"
        with_perf = "Optimized search algorithm for better performance and reduced memory usage"
        with_cache = "Optimized search algorithm with caching"

        pred_base = set(predict_with_labels(base))
        pred_perf = set(predict_with_labels(with_perf))
        pred_cache = set(predict_with_labels(with_cache))

        print(f"\nBase: {pred_base}")
        print(f"With performance: {pred_perf}")
        print(f"With caching: {pred_cache}")

        # Performance keywords should affect predictions
        # More specific descriptions should generally maintain or add labels
        assert len(pred_perf) >= len(pred_base) * 0.7, (
            "Adding performance context should not drastically reduce predictions"
        )

    def test_adding_security_context(self, predict_with_labels):
        """
        Test that adding security keywords increases security-related predictions.
        """
        base = "Updated authentication system"
        with_security = "Updated authentication system with OAuth2 security"
        with_encryption = "Updated authentication system with password encryption"

        pred_base = set(predict_with_labels(base))
        pred_oauth = set(predict_with_labels(with_security))
        pred_encryption = set(predict_with_labels(with_encryption))

        print(f"\nBase: {pred_base}")
        print(f"With OAuth: {pred_oauth}")
        print(f"With encryption: {pred_encryption}")

        # Security keywords should not drastically reduce predictions
        # Authentication is already security-related, so predictions should be stable
        assert len(pred_oauth) >= len(pred_base) * 0.5, (
            "Adding OAuth2 should not drastically reduce predictions"
        )
        assert len(pred_encryption) >= len(pred_base) * 0.5, (
            "Adding encryption should not drastically reduce predictions"
        )

    def test_adding_devops_keywords(self, predict_with_labels):
        """
        Test that adding DevOps keywords increases DevOps-related predictions.
        """
        base = "Deployed new version"
        with_docker = "Deployed new version using Docker containers"
        with_ci = "Deployed new version through CI/CD pipeline"

        pred_base = set(predict_with_labels(base))
        pred_docker = set(predict_with_labels(with_docker))
        pred_ci = set(predict_with_labels(with_ci))

        print(f"\nBase: {pred_base}")
        print(f"With Docker: {pred_docker}")
        print(f"With CI/CD: {pred_ci}")

        # DevOps keywords should not drastically reduce predictions
        # Check if DevOps-related labels are present (soft check, printed only)
        has_devops = any("DevOps" in label or "Operations" in label or "Deployment" in label
                         for label in pred_docker | pred_ci | pred_base)

        assert len(pred_docker) >= len(pred_base) * 0.5, (
            "Adding Docker should not drastically reduce predictions"
        )

        print(f"Has DevOps related labels: {has_devops}")

    def test_increasing_technical_detail(self, predict_text):
        """
        Test that adding more technical detail generally increases or maintains predictions.

        More specific descriptions should not drastically reduce the number of relevant skills.
        """
        vague = "Fixed bug"
        specific = "Fixed null pointer exception in user service layer"
        very_specific = "Fixed null pointer exception in UserService.getUserById() method when handling deleted users"

        pred_vague = predict_text(vague)
        pred_specific = predict_text(specific)
        pred_very_specific = predict_text(very_specific)

        print(f"\nVague ({len(pred_vague)} labels): {pred_vague}")
        print(f"Specific ({len(pred_specific)} labels): {pred_specific}")
        print(f"Very specific ({len(pred_very_specific)} labels): {pred_very_specific}")

        # More detail should generally add relevant skills, not remove them drastically
        # Allow some variance since very specific text might lose some general predictions
        assert len(pred_specific) >= len(pred_vague) * 0.5, (
            "Adding technical detail should not reduce predictions drastically"
        )
        # Fix: pred_very_specific was previously computed but never checked;
        # apply the same lenient directional assertion to it.
        assert len(pred_very_specific) >= len(pred_vague) * 0.5, (
            "Adding very specific technical detail should not reduce predictions drastically"
        )