|
|
""" |
|
|
Invariance Tests for Skill Classification Model

These tests verify that certain transformations to the input should NOT change
the model's predictions significantly. The model should be robust to:
- Typos and spelling variations
- Synonym substitutions
- Changes in formatting (punctuation, capitalization)
- Addition of neutral words

Based on Ribeiro et al. (2020) "Beyond Accuracy: Behavioral Testing of NLP models"
|
|
""" |
|
|
import pytest |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
@pytest.mark.invariance |
|
|
class TestInvariance: |
|
|
"""Test suite for invariance properties of the model.""" |
|
|
|
|
|
def test_typo_robustness(self, predict_text): |
|
|
""" |
|
|
Test that common typos do not significantly change predictions. |
|
|
|
|
|
Example: "revolutionized" vs "reovlutionized" |
|
|
""" |
|
|
original = "Fixed bug in data structure implementation using HashMap" |
|
|
typo_version = "Fixd bug in dat structure implemetation using HashMp" |
|
|
|
|
|
pred_original = set(predict_text(original)) |
|
|
pred_typo = set(predict_text(typo_version)) |
|
|
|
|
|
|
|
|
intersection = len(pred_original & pred_typo) |
|
|
union = len(pred_original | pred_typo) |
|
|
|
|
|
if union > 0: |
|
|
similarity = intersection / union |
|
|
assert similarity >= 0.7, ( |
|
|
f"Typos changed predictions too much. " |
|
|
f"Original: {pred_original}, Typo: {pred_typo}, " |
|
|
f"Similarity: {similarity:.2f}" |
|
|
) |
|
|
|
|
|
def test_synonym_substitution(self, predict_text): |
|
|
""" |
|
|
Test that synonym substitutions preserve predictions. |
|
|
|
|
|
Example: "fix" vs "resolve", "bug" vs "issue" |
|
|
""" |
|
|
test_cases = [ |
|
|
( |
|
|
"Fixed authentication bug in login system", |
|
|
"Resolved authentication issue in login system" |
|
|
), |
|
|
( |
|
|
"Implemented new feature for data processing", |
|
|
"Added new functionality for data processing" |
|
|
), |
|
|
( |
|
|
"Refactored code to improve performance", |
|
|
"Restructured code to enhance performance" |
|
|
), |
|
|
] |
|
|
|
|
|
for original, synonym_version in test_cases: |
|
|
pred_original = set(predict_text(original)) |
|
|
pred_synonym = set(predict_text(synonym_version)) |
|
|
|
|
|
intersection = len(pred_original & pred_synonym) |
|
|
union = len(pred_original | pred_synonym) |
|
|
|
|
|
if union > 0: |
|
|
similarity = intersection / union |
|
|
assert similarity >= 0.6, ( |
|
|
f"Synonyms changed predictions too much.\n" |
|
|
f"Original: '{original}' -> {pred_original}\n" |
|
|
f"Synonym: '{synonym_version}' -> {pred_synonym}\n" |
|
|
f"Similarity: {similarity:.2f}" |
|
|
) |
|
|
|
|
|
def test_case_insensitivity(self, predict_text): |
|
|
""" |
|
|
Test that capitalization changes do not affect predictions. |
|
|
""" |
|
|
original = "Fixed API endpoint for user authentication" |
|
|
uppercase = original.upper() |
|
|
lowercase = original.lower() |
|
|
mixed_case = "fIxEd ApI EnDpOiNt FoR uSeR aUtHeNtIcAtIoN" |
|
|
|
|
|
pred_original = set(predict_text(original)) |
|
|
pred_upper = set(predict_text(uppercase)) |
|
|
pred_lower = set(predict_text(lowercase)) |
|
|
pred_mixed = set(predict_text(mixed_case)) |
|
|
|
|
|
|
|
|
assert pred_original == pred_lower, ( |
|
|
f"Lowercase changed predictions: {pred_original} vs {pred_lower}" |
|
|
) |
|
|
assert pred_original == pred_upper, ( |
|
|
f"Uppercase changed predictions: {pred_original} vs {pred_upper}" |
|
|
) |
|
|
assert pred_original == pred_mixed, ( |
|
|
f"Mixed case changed predictions: {pred_original} vs {pred_mixed}" |
|
|
) |
|
|
|
|
|
def test_punctuation_robustness(self, predict_text): |
|
|
""" |
|
|
Test that punctuation changes do not significantly affect predictions. |
|
|
""" |
|
|
original = "Fixed bug in error handling logic" |
|
|
with_punctuation = "Fixed bug in error-handling logic!!!" |
|
|
extra_punctuation = "Fixed... bug in error, handling, logic." |
|
|
|
|
|
pred_original = set(predict_text(original)) |
|
|
pred_punct = set(predict_text(with_punctuation)) |
|
|
pred_extra = set(predict_text(extra_punctuation)) |
|
|
|
|
|
|
|
|
for pred, name in [(pred_punct, "with_punctuation"), (pred_extra, "extra_punctuation")]: |
|
|
intersection = len(pred_original & pred) |
|
|
union = len(pred_original | pred) |
|
|
|
|
|
if union > 0: |
|
|
similarity = intersection / union |
|
|
assert similarity >= 0.8, ( |
|
|
f"Punctuation in '{name}' changed predictions too much. " |
|
|
f"Similarity: {similarity:.2f}" |
|
|
) |
|
|
|
|
|
def test_neutral_word_addition(self, predict_text): |
|
|
""" |
|
|
Test that adding neutral/filler words does not change predictions. |
|
|
""" |
|
|
original = "Implemented authentication system" |
|
|
with_fillers = "Well, I actually implemented the authentication system here" |
|
|
|
|
|
pred_original = set(predict_text(original)) |
|
|
pred_fillers = set(predict_text(with_fillers)) |
|
|
|
|
|
intersection = len(pred_original & pred_fillers) |
|
|
union = len(pred_original | pred_fillers) |
|
|
|
|
|
if union > 0: |
|
|
similarity = intersection / union |
|
|
assert similarity >= 0.7, ( |
|
|
f"Neutral words changed predictions. " |
|
|
f"Original: {pred_original}, With fillers: {pred_fillers}, " |
|
|
f"Similarity: {similarity:.2f}" |
|
|
) |
|
|
|
|
|
def test_word_order_robustness(self, predict_text): |
|
|
""" |
|
|
Test that reasonable word reordering preserves key predictions. |
|
|
|
|
|
Note: This is a softer invariance test since word order can matter |
|
|
for some contexts, but core skills should remain. |
|
|
""" |
|
|
original = "Fixed database connection error in production" |
|
|
reordered = "In production, fixed error in database connection" |
|
|
|
|
|
pred_original = set(predict_text(original)) |
|
|
pred_reordered = set(predict_text(reordered)) |
|
|
|
|
|
intersection = len(pred_original & pred_reordered) |
|
|
union = len(pred_original | pred_reordered) |
|
|
|
|
|
if union > 0: |
|
|
similarity = intersection / union |
|
|
|
|
|
assert similarity >= 0.5, ( |
|
|
f"Word reordering changed predictions too drastically. " |
|
|
f"Similarity: {similarity:.2f}" |
|
|
) |
|
|
|
|
|
def test_whitespace_normalization(self, predict_text): |
|
|
""" |
|
|
Test that extra whitespace does not affect predictions. |
|
|
""" |
|
|
original = "Fixed memory leak in data processing pipeline" |
|
|
extra_spaces = "Fixed memory leak in data processing pipeline" |
|
|
tabs_and_newlines = "Fixed\tmemory leak\nin data\nprocessing pipeline" |
|
|
|
|
|
pred_original = set(predict_text(original)) |
|
|
pred_spaces = set(predict_text(extra_spaces)) |
|
|
pred_tabs = set(predict_text(tabs_and_newlines)) |
|
|
|
|
|
|
|
|
assert pred_original == pred_spaces, ( |
|
|
f"Extra spaces changed predictions: {pred_original} vs {pred_spaces}" |
|
|
) |
|
|
assert pred_original == pred_tabs, ( |
|
|
f"Tabs/newlines changed predictions: {pred_original} vs {pred_tabs}" |
|
|
) |
|
|
|
|
|
def test_url_removal_invariance(self, predict_text): |
|
|
""" |
|
|
Test that adding/removing URLs doesn't change skill predictions. |
|
|
""" |
|
|
without_url = "Fixed authentication bug in login system" |
|
|
with_url = "Fixed authentication bug in login system https://github.com/example/repo/issues/123" |
|
|
multiple_urls = ( |
|
|
"Fixed authentication bug https://example.com in login system " |
|
|
"See: http://docs.example.com/auth" |
|
|
) |
|
|
|
|
|
pred_original = set(predict_text(without_url)) |
|
|
pred_with_url = set(predict_text(with_url)) |
|
|
pred_multiple = set(predict_text(multiple_urls)) |
|
|
|
|
|
|
|
|
assert pred_original == pred_with_url, ( |
|
|
f"URL addition changed predictions: {pred_original} vs {pred_with_url}" |
|
|
) |
|
|
assert pred_original == pred_multiple, ( |
|
|
f"Multiple URLs changed predictions: {pred_original} vs {pred_multiple}" |
|
|
) |
|
|
|
|
|
def test_code_snippet_noise_robustness(self, predict_text): |
|
|
""" |
|
|
Test robustness to inline code snippets and markdown formatting. |
|
|
""" |
|
|
clean = "Fixed null pointer exception in user service" |
|
|
with_code = "Fixed null pointer exception in user service `getUserById()`" |
|
|
with_code_block = """ |
|
|
Fixed null pointer exception in user service |
|
|
```java |
|
|
public User getUserById(int id) { |
|
|
return null; // Fixed this |
|
|
} |
|
|
``` |
|
|
""" |
|
|
|
|
|
pred_clean = set(predict_text(clean)) |
|
|
pred_code = set(predict_text(with_code)) |
|
|
pred_block = set(predict_text(with_code_block)) |
|
|
|
|
|
|
|
|
for pred, name in [(pred_code, "inline code"), (pred_block, "code block")]: |
|
|
intersection = len(pred_clean & pred) |
|
|
union = len(pred_clean | pred) |
|
|
|
|
|
if union > 0: |
|
|
similarity = intersection / union |
|
|
assert similarity >= 0.6, ( |
|
|
f"Code snippets in '{name}' changed predictions too much. " |
|
|
f"Similarity: {similarity:.2f}" |
|
|
) |
|
|
|