|
|
"""
Directional Tests for Skill Classification Model

These tests verify that specific changes to the input lead to PREDICTABLE changes
in the model's predictions. For example:
- Adding skill-specific keywords should increase confidence in related skills
- Removing domain-specific terms should decrease confidence in those domains
- Adding context about a technology should add related skill predictions

Based on Ribeiro et al. (2020), "Beyond Accuracy: Behavioral Testing of NLP
Models with CheckList".
"""
|
|
import pytest |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
@pytest.mark.directional
class TestDirectional:
    """Directional-expectation tests for the skill classification model.

    Each test sends a reference text and one or more perturbed variants to a
    prediction fixture, prints the predictions for manual inspection, and
    asserts that the perturbation does not shrink the number of predictions
    below a fixed fraction of the reference count.

    The ``predict_with_labels`` and ``predict_text`` fixtures are not defined
    in this file (presumably in ``conftest.py`` - confirm). Both are called
    with a single text string. The result of ``predict_with_labels`` is
    wrapped in ``set()`` and its items are searched with ``in`` as label
    strings; only ``len()`` is taken of the result of ``predict_text``.
    """

    @staticmethod
    def _assert_count_retained(reference, candidate, min_ratio, message):
        """Assert ``len(candidate) >= len(reference) * min_ratio``.

        Args:
            reference: Sized collection of predictions used as the yardstick.
            candidate: Sized collection of predictions being checked.
            min_ratio: Fraction of the reference count that must be retained.
            message: Assertion message reported on failure.

        Raises:
            AssertionError: If ``candidate`` holds too few predictions.
        """
        assert len(candidate) >= len(reference) * min_ratio, message

    def test_adding_language_keyword(self, predict_with_labels, predict_text):
        """Naming a programming language must not halve the label count.

        "Java" and "Python" are inserted into the base sentence; each variant
        must yield at least 50% as many labels as the base text. Whether
        language-related labels actually appear is only printed, not asserted.

        ``predict_text`` is requested as a fixture but not used here.
        """
        base = "Fixed bug in authentication system"
        with_java = "Fixed bug in Java authentication system"
        with_python = "Fixed bug in Python authentication system"

        pred_base = set(predict_with_labels(base))
        pred_java = set(predict_with_labels(with_java))
        pred_python = set(predict_with_labels(with_python))

        print(f"\nBase predictions: {pred_base}")
        print(f"With Java: {pred_java}")
        print(f"With Python: {pred_python}")

        self._assert_count_retained(
            pred_base,
            pred_java,
            0.5,
            "Adding Java should not drastically reduce predictions",
        )
        self._assert_count_retained(
            pred_base,
            pred_python,
            0.5,
            "Adding Python should not drastically reduce predictions",
        )

    def test_adding_data_structure_keyword(self, predict_with_labels):
        """Naming a data structure must keep at least 80% of the label count.

        Variants mention "HashMap" and "binary tree". Only the number of
        labels is asserted; the label contents are printed for inspection.
        """
        base = "Implemented search functionality"
        with_hashmap = "Implemented search functionality using HashMap"
        with_tree = "Implemented search functionality using binary tree"

        pred_base = set(predict_with_labels(base))
        pred_hashmap = set(predict_with_labels(with_hashmap))
        pred_tree = set(predict_with_labels(with_tree))

        print(f"\nBase: {pred_base}")
        print(f"With HashMap: {pred_hashmap}")
        print(f"With Tree: {pred_tree}")

        self._assert_count_retained(
            pred_base,
            pred_hashmap,
            0.8,
            "Adding HashMap should not drastically reduce predictions",
        )
        self._assert_count_retained(
            pred_base,
            pred_tree,
            0.8,
            "Adding tree should not drastically reduce predictions",
        )

    def test_adding_error_handling_context(self, predict_with_labels):
        """Error-handling wording must not halve the label count.

        Only the "exception handling" variant is asserted (>= 50% of the base
        count). The "try-catch" variant contributes solely to the printed
        keyword flag.
        """
        base = "Updated user login flow"
        with_exception = "Updated user login flow with exception handling"
        with_try_catch = "Updated user login flow with try-catch blocks"

        pred_base = set(predict_with_labels(base))
        pred_exception = set(predict_with_labels(with_exception))
        pred_try_catch = set(predict_with_labels(with_try_catch))

        print(f"\nBase: {pred_base}")
        print(f"With exception: {pred_exception}")
        print(f"With try-catch: {pred_try_catch}")

        # Informational only: this flag is printed below and never asserted.
        # NOTE(review): presumably a deliberate soft check - confirm.
        has_error_handling = any(
            "Error" in label or "Exception" in label
            for label in pred_exception | pred_try_catch
        )

        self._assert_count_retained(
            pred_base,
            pred_exception,
            0.5,
            "Adding error handling context should not drastically reduce predictions",
        )

        print(f"Has error handling related labels: {has_error_handling}")

    def test_removing_specific_technology(self, predict_text):
        """Dropping a technology name must not inflate the prediction count.

        The text naming "PostgreSQL" must yield at least 70% as many
        predictions as the same text without it.
        """
        with_tech = "Fixed database connection pooling issue in PostgreSQL"
        without_tech = "Fixed database connection pooling issue"

        pred_with = predict_text(with_tech)
        pred_without = predict_text(without_tech)

        # Roles are reversed relative to the "adding" tests: the text without
        # the keyword is the yardstick, the text with it is checked.
        self._assert_count_retained(
            pred_without,
            pred_with,
            0.7,
            "Removing technology specifics should not drastically increase predictions",
        )

    def test_adding_api_context(self, predict_with_labels):
        """API wording must not halve the label count.

        Both the "REST API" and the "GraphQL" variants must yield at least
        50% as many labels as the base text.
        """
        base = "Fixed user authentication"
        with_api = "Fixed user authentication REST API endpoint"
        with_graphql = "Fixed user authentication GraphQL endpoint"

        pred_base = set(predict_with_labels(base))
        pred_api = set(predict_with_labels(with_api))
        pred_graphql = set(predict_with_labels(with_graphql))

        print(f"\nBase: {pred_base}")
        print(f"With REST API: {pred_api}")
        print(f"With GraphQL: {pred_graphql}")

        self._assert_count_retained(
            pred_base,
            pred_api,
            0.5,
            "Adding REST API should not drastically reduce predictions",
        )
        self._assert_count_retained(
            pred_base,
            pred_graphql,
            0.5,
            "Adding GraphQL should not drastically reduce predictions",
        )

    def test_adding_testing_keywords(self, predict_with_labels):
        """Testing wording must not halve the label count.

        Only the "unit tests" variant is asserted (>= 50% of the base count).
        The "integration tests" variant contributes solely to the printed
        keyword flag.
        """
        base = "Implemented new feature for user management"
        with_tests = "Implemented new feature for user management with unit tests"
        with_integration = "Implemented new feature for user management with integration tests"

        pred_base = set(predict_with_labels(base))
        pred_unit = set(predict_with_labels(with_tests))
        pred_integration = set(predict_with_labels(with_integration))

        print(f"\nBase: {pred_base}")
        print(f"With unit tests: {pred_unit}")
        print(f"With integration tests: {pred_integration}")

        # Informational only: this flag is printed below and never asserted.
        # NOTE(review): presumably a deliberate soft check - confirm.
        has_testing = any(
            "Test" in label or "Automated" in label
            for label in pred_unit | pred_integration
        )

        self._assert_count_retained(
            pred_base,
            pred_unit,
            0.5,
            "Adding testing keywords should not drastically reduce predictions",
        )

        print(f"Has testing related labels: {has_testing}")

    def test_adding_performance_keywords(self, predict_with_labels):
        """Performance wording must keep at least 70% of the label count.

        Only the "performance and reduced memory usage" variant is asserted;
        the "caching" variant is predicted and printed but not checked.
        """
        base = "Optimized search algorithm"
        with_perf = "Optimized search algorithm for better performance and reduced memory usage"
        with_cache = "Optimized search algorithm with caching"

        pred_base = set(predict_with_labels(base))
        pred_perf = set(predict_with_labels(with_perf))
        pred_cache = set(predict_with_labels(with_cache))

        print(f"\nBase: {pred_base}")
        print(f"With performance: {pred_perf}")
        print(f"With caching: {pred_cache}")

        self._assert_count_retained(
            pred_base,
            pred_perf,
            0.7,
            "Adding performance context should not drastically reduce predictions",
        )

    def test_adding_security_context(self, predict_with_labels):
        """Security wording must not halve the label count.

        Both the "OAuth2 security" and the "password encryption" variants
        must yield at least 50% as many labels as the base text.
        """
        base = "Updated authentication system"
        with_security = "Updated authentication system with OAuth2 security"
        with_encryption = "Updated authentication system with password encryption"

        pred_base = set(predict_with_labels(base))
        pred_oauth = set(predict_with_labels(with_security))
        pred_encryption = set(predict_with_labels(with_encryption))

        print(f"\nBase: {pred_base}")
        print(f"With OAuth: {pred_oauth}")
        print(f"With encryption: {pred_encryption}")

        self._assert_count_retained(
            pred_base,
            pred_oauth,
            0.5,
            "Adding OAuth2 should not drastically reduce predictions",
        )
        self._assert_count_retained(
            pred_base,
            pred_encryption,
            0.5,
            "Adding encryption should not drastically reduce predictions",
        )

    def test_adding_devops_keywords(self, predict_with_labels):
        """DevOps wording must not halve the label count.

        Only the "Docker containers" variant is asserted (>= 50% of the base
        count). The "CI/CD pipeline" variant contributes solely to the
        printed keyword flag.
        """
        base = "Deployed new version"
        with_docker = "Deployed new version using Docker containers"
        with_ci = "Deployed new version through CI/CD pipeline"

        pred_base = set(predict_with_labels(base))
        pred_docker = set(predict_with_labels(with_docker))
        pred_ci = set(predict_with_labels(with_ci))

        print(f"\nBase: {pred_base}")
        print(f"With Docker: {pred_docker}")
        print(f"With CI/CD: {pred_ci}")

        # Informational only: printed below, never asserted. The base labels
        # are part of the search, so the flag can be true without either
        # keyword having added anything.
        has_devops = any(
            "DevOps" in label or "Operations" in label or "Deployment" in label
            for label in pred_docker | pred_ci | pred_base
        )

        self._assert_count_retained(
            pred_base,
            pred_docker,
            0.5,
            "Adding Docker should not drastically reduce predictions",
        )

        print(f"Has DevOps related labels: {has_devops}")

    def test_increasing_technical_detail(self, predict_text):
        """Extra technical detail must not halve the prediction count.

        Three descriptions of increasing specificity are predicted and
        printed. Only the middle one is asserted: it must yield at least 50%
        as many predictions as the vague one. The most specific description
        is reported but not checked.
        """
        vague = "Fixed bug"
        specific = "Fixed null pointer exception in user service layer"
        very_specific = "Fixed null pointer exception in UserService.getUserById() method when handling deleted users"

        pred_vague = predict_text(vague)
        pred_specific = predict_text(specific)
        pred_very_specific = predict_text(very_specific)

        print(f"\nVague ({len(pred_vague)} labels): {pred_vague}")
        print(f"Specific ({len(pred_specific)} labels): {pred_specific}")
        print(f"Very specific ({len(pred_very_specific)} labels): {pred_very_specific}")

        self._assert_count_retained(
            pred_vague,
            pred_specific,
            0.5,
            "Adding technical detail should not reduce predictions drastically",
        )
|
|
|