# Source provenance (scrape artifact, not Python code):
# uploaded by "DaCrow13", commit 225af6a, "Deploy to HF Spaces (Clean)".
"""
Directional Tests for Skill Classification Model
These tests verify that specific changes to the input lead to PREDICTABLE changes
in the model's predictions. For example:
- Adding skill-specific keywords should increase confidence in related skills
- Removing domain-specific terms should decrease confidence in those domains
- Adding context about a technology should add related skill predictions
Based on Ribeiro et al. (2020) "Beyond Accuracy: Behavioral Testing of NLP models"
"""
import pytest
import numpy as np
@pytest.mark.directional
class TestDirectional:
    """Test suite for directional expectations of the model.

    Each test perturbs a base commit-message-style text in a controlled way
    (adding or removing domain keywords) and checks that the size of the
    predicted label set does not drop drastically.  The ratio thresholds
    (0.5-0.8) are deliberately loose: the model may legitimately predict the
    same labels before and after the perturbation.

    Relies on the ``predict_with_labels`` / ``predict_text`` fixtures
    (presumably defined in conftest — they are not visible in this file),
    each of which appears to return an iterable of predicted labels for a
    given input text.
    """

    @staticmethod
    def _assert_no_drastic_drop(pred_new, pred_base, factor, message):
        """Fail unless ``pred_new`` keeps at least ``factor * len(pred_base)`` labels.

        Args:
            pred_new: Predictions for the perturbed text (any sized iterable).
            pred_base: Predictions for the baseline text (any sized iterable).
            factor: Minimum allowed ratio of new label count to base count.
            message: Failure message shown when the ratio is violated.
        """
        assert len(pred_new) >= len(pred_base) * factor, message

    def test_adding_language_keyword(self, predict_with_labels):
        """
        Test that adding programming language keywords increases language-related predictions.
        Adding "Java" or "Python" should make language skills more likely.
        """
        # NOTE(review): the previously-requested ``predict_text`` fixture was
        # never used in this test, so it was dropped from the signature
        # (pytest injects fixtures by name, so this is safe).
        base = "Fixed bug in authentication system"
        with_java = "Fixed bug in Java authentication system"
        with_python = "Fixed bug in Python authentication system"
        pred_base = set(predict_with_labels(base))
        pred_java = set(predict_with_labels(with_java))
        pred_python = set(predict_with_labels(with_python))
        # Diagnostic output (visible with ``pytest -s``); the exact labels
        # depend on the dataset's label schema.
        print(f"\nBase predictions: {pred_base}")
        print(f"With Java: {pred_java}")
        print(f"With Python: {pred_python}")
        # It's acceptable if predictions stay the same (the model might
        # already predict a language label) -- only a drastic drop fails.
        self._assert_no_drastic_drop(
            pred_java, pred_base, 0.5,
            "Adding Java should not drastically reduce predictions",
        )
        self._assert_no_drastic_drop(
            pred_python, pred_base, 0.5,
            "Adding Python should not drastically reduce predictions",
        )

    def test_adding_data_structure_keyword(self, predict_with_labels):
        """
        Test that adding data structure keywords increases data structure predictions.
        """
        base = "Implemented search functionality"
        with_hashmap = "Implemented search functionality using HashMap"
        with_tree = "Implemented search functionality using binary tree"
        pred_base = set(predict_with_labels(base))
        pred_hashmap = set(predict_with_labels(with_hashmap))
        pred_tree = set(predict_with_labels(with_tree))
        print(f"\nBase: {pred_base}")
        print(f"With HashMap: {pred_hashmap}")
        print(f"With Tree: {pred_tree}")
        # Adding data structures should keep (or extend) the label set, so a
        # tighter 0.8 ratio is used here than in the language test above.
        self._assert_no_drastic_drop(
            pred_hashmap, pred_base, 0.8,
            "Adding HashMap should not drastically reduce predictions",
        )
        self._assert_no_drastic_drop(
            pred_tree, pred_base, 0.8,
            "Adding tree should not drastically reduce predictions",
        )

    def test_adding_error_handling_context(self, predict_with_labels):
        """
        Test that adding error handling keywords increases error handling predictions.
        """
        base = "Updated user login flow"
        with_exception = "Updated user login flow with exception handling"
        with_try_catch = "Updated user login flow with try-catch blocks"
        pred_base = set(predict_with_labels(base))
        pred_exception = set(predict_with_labels(with_exception))
        pred_try_catch = set(predict_with_labels(with_try_catch))
        print(f"\nBase: {pred_base}")
        print(f"With exception: {pred_exception}")
        print(f"With try-catch: {pred_try_catch}")
        # Soft directional signal: reported but deliberately not asserted,
        # because the presence of an error-related label depends on the
        # label schema.
        has_error_handling = any(
            "Error" in label or "Exception" in label
            for label in pred_exception | pred_try_catch
        )
        self._assert_no_drastic_drop(
            pred_exception, pred_base, 0.5,
            "Adding error handling context should not drastically reduce predictions",
        )
        print(f"Has error handling related labels: {has_error_handling}")

    def test_removing_specific_technology(self, predict_text):
        """
        Test that removing technology-specific keywords reduces related predictions.
        """
        with_tech = "Fixed database connection pooling issue in PostgreSQL"
        without_tech = "Fixed database connection pooling issue"
        pred_with = predict_text(with_tech)
        pred_without = predict_text(without_tech)
        # The specific text should keep at least 70% as many labels as the
        # generic one, i.e. dropping "PostgreSQL" should not massively
        # inflate the prediction set relative to the specific version.
        self._assert_no_drastic_drop(
            pred_with, pred_without, 0.7,
            "Removing technology specifics should not drastically increase predictions",
        )

    def test_adding_api_context(self, predict_with_labels):
        """
        Test that adding API-related keywords increases API/web service predictions.
        """
        base = "Fixed user authentication"
        with_api = "Fixed user authentication REST API endpoint"
        with_graphql = "Fixed user authentication GraphQL endpoint"
        pred_base = set(predict_with_labels(base))
        pred_api = set(predict_with_labels(with_api))
        pred_graphql = set(predict_with_labels(with_graphql))
        print(f"\nBase: {pred_base}")
        print(f"With REST API: {pred_api}")
        print(f"With GraphQL: {pred_graphql}")
        self._assert_no_drastic_drop(
            pred_api, pred_base, 0.5,
            "Adding REST API should not drastically reduce predictions",
        )
        self._assert_no_drastic_drop(
            pred_graphql, pred_base, 0.5,
            "Adding GraphQL should not drastically reduce predictions",
        )

    def test_adding_testing_keywords(self, predict_with_labels):
        """
        Test that adding testing-related keywords increases testing skill predictions.
        """
        base = "Implemented new feature for user management"
        with_tests = "Implemented new feature for user management with unit tests"
        with_integration = "Implemented new feature for user management with integration tests"
        pred_base = set(predict_with_labels(base))
        pred_unit = set(predict_with_labels(with_tests))
        pred_integration = set(predict_with_labels(with_integration))
        print(f"\nBase: {pred_base}")
        print(f"With unit tests: {pred_unit}")
        print(f"With integration tests: {pred_integration}")
        # Soft directional signal, reported only (schema-dependent).
        has_testing = any(
            "Test" in label or "Automated" in label
            for label in pred_unit | pred_integration
        )
        self._assert_no_drastic_drop(
            pred_unit, pred_base, 0.5,
            "Adding testing keywords should not drastically reduce predictions",
        )
        print(f"Has testing related labels: {has_testing}")

    def test_adding_performance_keywords(self, predict_with_labels):
        """
        Test that adding performance-related keywords affects predictions.
        """
        base = "Optimized search algorithm"
        with_perf = "Optimized search algorithm for better performance and reduced memory usage"
        with_cache = "Optimized search algorithm with caching"
        pred_base = set(predict_with_labels(base))
        pred_perf = set(predict_with_labels(with_perf))
        pred_cache = set(predict_with_labels(with_cache))
        print(f"\nBase: {pred_base}")
        print(f"With performance: {pred_perf}")
        print(f"With caching: {pred_cache}")
        # More specific descriptions should generally maintain or add labels.
        self._assert_no_drastic_drop(
            pred_perf, pred_base, 0.7,
            "Adding performance context should not drastically reduce predictions",
        )

    def test_adding_security_context(self, predict_with_labels):
        """
        Test that adding security keywords increases security-related predictions.
        """
        base = "Updated authentication system"
        with_security = "Updated authentication system with OAuth2 security"
        with_encryption = "Updated authentication system with password encryption"
        pred_base = set(predict_with_labels(base))
        pred_oauth = set(predict_with_labels(with_security))
        pred_encryption = set(predict_with_labels(with_encryption))
        print(f"\nBase: {pred_base}")
        print(f"With OAuth: {pred_oauth}")
        print(f"With encryption: {pred_encryption}")
        # Authentication is already security-related, so the prediction set
        # is expected to be stable under these perturbations.
        self._assert_no_drastic_drop(
            pred_oauth, pred_base, 0.5,
            "Adding OAuth2 should not drastically reduce predictions",
        )
        self._assert_no_drastic_drop(
            pred_encryption, pred_base, 0.5,
            "Adding encryption should not drastically reduce predictions",
        )

    def test_adding_devops_keywords(self, predict_with_labels):
        """
        Test that adding DevOps keywords increases DevOps-related predictions.
        """
        base = "Deployed new version"
        with_docker = "Deployed new version using Docker containers"
        with_ci = "Deployed new version through CI/CD pipeline"
        pred_base = set(predict_with_labels(base))
        pred_docker = set(predict_with_labels(with_docker))
        pred_ci = set(predict_with_labels(with_ci))
        print(f"\nBase: {pred_base}")
        print(f"With Docker: {pred_docker}")
        print(f"With CI/CD: {pred_ci}")
        # Soft directional signal, reported only (schema-dependent).
        has_devops = any(
            "DevOps" in label or "Operations" in label or "Deployment" in label
            for label in pred_docker | pred_ci | pred_base
        )
        self._assert_no_drastic_drop(
            pred_docker, pred_base, 0.5,
            "Adding Docker should not drastically reduce predictions",
        )
        print(f"Has DevOps related labels: {has_devops}")

    def test_increasing_technical_detail(self, predict_text):
        """
        Test that adding more technical detail generally increases or maintains predictions.
        More specific descriptions should not drastically reduce the number of relevant skills.
        """
        vague = "Fixed bug"
        specific = "Fixed null pointer exception in user service layer"
        very_specific = "Fixed null pointer exception in UserService.getUserById() method when handling deleted users"
        pred_vague = predict_text(vague)
        pred_specific = predict_text(specific)
        pred_very_specific = predict_text(very_specific)
        print(f"\nVague ({len(pred_vague)} labels): {pred_vague}")
        print(f"Specific ({len(pred_specific)} labels): {pred_specific}")
        print(f"Very specific ({len(pred_very_specific)} labels): {pred_very_specific}")
        # Allow some variance since very specific text might lose some
        # general predictions while gaining narrower ones.
        self._assert_no_drastic_drop(
            pred_specific, pred_vague, 0.5,
            "Adding technical detail should not reduce predictions drastically",
        )
        # FIX(review): previously computed and printed but never asserted --
        # the very specific variant must satisfy the same directional bound.
        self._assert_no_drastic_drop(
            pred_very_specific, pred_vague, 0.5,
            "Adding more technical detail should not reduce predictions drastically",
        )