"""
Invariance Tests for Skill Classification Model

These tests verify that certain transformations to the input should NOT change
the model's predictions significantly. The model should be robust to:
- Typos and spelling variations
- Synonym substitutions
- Changes in formatting (punctuation, capitalization)
- Addition of neutral words

Based on Ribeiro et al. (2020) "Beyond Accuracy: Behavioral Testing of NLP models"
"""
| import pytest |
| import numpy as np |
|
|
|
|
def _jaccard_similarity(first, second):
    """Return the Jaccard similarity of two sets: |A & B| / |A | B|.

    Two empty sets are treated as identical and yield 1.0, so a pair of
    empty predictions passes any similarity threshold instead of raising
    ZeroDivisionError.
    """
    union = first | second
    if not union:
        return 1.0
    return len(first & second) / len(union)


@pytest.mark.invariance
class TestInvariance:
    """Test suite for invariance properties of the model.

    Every test receives the ``predict_text`` fixture, which is called with a
    single string and whose result is converted to a ``set`` of predicted
    labels. The fixture is not defined in this file (presumably it lives in
    a ``conftest.py`` -- confirm).

    "Soft" invariances compare the perturbed and original label sets with
    Jaccard similarity against a per-test threshold; "hard" invariances
    (case, whitespace, URLs) require the label sets to be exactly equal.
    """

    def test_typo_robustness(self, predict_text):
        """
        Test that common typos do not significantly change predictions.

        Example: "implementation" vs "implemetation"
        """
        original = "Fixed bug in data structure implementation using HashMap"
        typo_version = "Fixd bug in dat structure implemetation using HashMp"

        pred_original = set(predict_text(original))
        pred_typo = set(predict_text(typo_version))

        similarity = _jaccard_similarity(pred_original, pred_typo)
        assert similarity >= 0.7, (
            "Typos changed predictions too much. "
            f"Original: {pred_original}, Typo: {pred_typo}, "
            f"Similarity: {similarity:.2f}"
        )

    def test_synonym_substitution(self, predict_text):
        """
        Test that synonym substitutions preserve predictions.

        Example: "fix" vs "resolve", "bug" vs "issue"
        """
        test_cases = [
            (
                "Fixed authentication bug in login system",
                "Resolved authentication issue in login system",
            ),
            (
                "Implemented new feature for data processing",
                "Added new functionality for data processing",
            ),
            (
                "Refactored code to improve performance",
                "Restructured code to enhance performance",
            ),
        ]

        for original, synonym_version in test_cases:
            pred_original = set(predict_text(original))
            pred_synonym = set(predict_text(synonym_version))

            similarity = _jaccard_similarity(pred_original, pred_synonym)
            assert similarity >= 0.6, (
                "Synonyms changed predictions too much.\n"
                f"Original: '{original}' -> {pred_original}\n"
                f"Synonym: '{synonym_version}' -> {pred_synonym}\n"
                f"Similarity: {similarity:.2f}"
            )

    def test_case_insensitivity(self, predict_text):
        """
        Test that capitalization changes do not affect predictions.
        """
        original = "Fixed API endpoint for user authentication"
        uppercase = original.upper()
        lowercase = original.lower()
        mixed_case = "fIxEd ApI EnDpOiNt FoR uSeR aUtHeNtIcAtIoN"

        pred_original = set(predict_text(original))
        pred_upper = set(predict_text(uppercase))
        pred_lower = set(predict_text(lowercase))
        pred_mixed = set(predict_text(mixed_case))

        # Hard invariance: the label sets must match exactly.
        assert pred_original == pred_lower, (
            f"Lowercase changed predictions: {pred_original} vs {pred_lower}"
        )
        assert pred_original == pred_upper, (
            f"Uppercase changed predictions: {pred_original} vs {pred_upper}"
        )
        assert pred_original == pred_mixed, (
            f"Mixed case changed predictions: {pred_original} vs {pred_mixed}"
        )

    def test_punctuation_robustness(self, predict_text):
        """
        Test that punctuation changes do not significantly affect predictions.
        """
        original = "Fixed bug in error handling logic"
        with_punctuation = "Fixed bug in error-handling logic!!!"
        extra_punctuation = "Fixed... bug in error, handling, logic."

        pred_original = set(predict_text(original))
        variants = [
            (set(predict_text(with_punctuation)), "with_punctuation"),
            (set(predict_text(extra_punctuation)), "extra_punctuation"),
        ]

        for pred, name in variants:
            similarity = _jaccard_similarity(pred_original, pred)
            assert similarity >= 0.8, (
                f"Punctuation in '{name}' changed predictions too much. "
                f"Original: {pred_original}, Variant: {pred}, "
                f"Similarity: {similarity:.2f}"
            )

    def test_neutral_word_addition(self, predict_text):
        """
        Test that adding neutral/filler words does not change predictions.
        """
        original = "Implemented authentication system"
        with_fillers = "Well, I actually implemented the authentication system here"

        pred_original = set(predict_text(original))
        pred_fillers = set(predict_text(with_fillers))

        similarity = _jaccard_similarity(pred_original, pred_fillers)
        assert similarity >= 0.7, (
            "Neutral words changed predictions. "
            f"Original: {pred_original}, With fillers: {pred_fillers}, "
            f"Similarity: {similarity:.2f}"
        )

    def test_word_order_robustness(self, predict_text):
        """
        Test that reasonable word reordering preserves key predictions.

        Note: This is a softer invariance test since word order can matter
        for some contexts, but core skills should remain.
        """
        original = "Fixed database connection error in production"
        reordered = "In production, fixed error in database connection"

        pred_original = set(predict_text(original))
        pred_reordered = set(predict_text(reordered))

        similarity = _jaccard_similarity(pred_original, pred_reordered)
        # Lowest threshold in the suite: only half of the labels must survive.
        assert similarity >= 0.5, (
            "Word reordering changed predictions too drastically. "
            f"Original: {pred_original}, Reordered: {pred_reordered}, "
            f"Similarity: {similarity:.2f}"
        )

    def test_whitespace_normalization(self, predict_text):
        """
        Test that extra whitespace does not affect predictions.
        """
        original = "Fixed memory leak in data processing pipeline"
        # Runs of several spaces between words; this must differ from
        # `original`, otherwise the first assertion below is vacuous.
        extra_spaces = "Fixed   memory  leak    in  data   processing     pipeline"
        tabs_and_newlines = "Fixed\tmemory leak\nin data\nprocessing pipeline"

        pred_original = set(predict_text(original))
        pred_spaces = set(predict_text(extra_spaces))
        pred_tabs = set(predict_text(tabs_and_newlines))

        # Hard invariance: the label sets must match exactly.
        assert pred_original == pred_spaces, (
            f"Extra spaces changed predictions: {pred_original} vs {pred_spaces}"
        )
        assert pred_original == pred_tabs, (
            f"Tabs/newlines changed predictions: {pred_original} vs {pred_tabs}"
        )

    def test_url_removal_invariance(self, predict_text):
        """
        Test that adding/removing URLs doesn't change skill predictions.
        """
        without_url = "Fixed authentication bug in login system"
        with_url = (
            "Fixed authentication bug in login system "
            "https://github.com/example/repo/issues/123"
        )
        multiple_urls = (
            "Fixed authentication bug https://example.com in login system "
            "See: http://docs.example.com/auth"
        )

        pred_original = set(predict_text(without_url))
        pred_with_url = set(predict_text(with_url))
        pred_multiple = set(predict_text(multiple_urls))

        # Hard invariance: the label sets must match exactly.
        assert pred_original == pred_with_url, (
            f"URL addition changed predictions: {pred_original} vs {pred_with_url}"
        )
        assert pred_original == pred_multiple, (
            f"Multiple URLs changed predictions: {pred_original} vs {pred_multiple}"
        )

    def test_code_snippet_noise_robustness(self, predict_text):
        """
        Test robustness to inline code snippets and markdown formatting.
        """
        clean = "Fixed null pointer exception in user service"
        with_code = "Fixed null pointer exception in user service `getUserById()`"
        # Built line by line so the text passed to the model does not depend
        # on how this test method happens to be indented.
        with_code_block = (
            "\n"
            "Fixed null pointer exception in user service\n"
            "```java\n"
            "public User getUserById(int id) {\n"
            "    return null; // Fixed this\n"
            "}\n"
            "```\n"
        )

        pred_clean = set(predict_text(clean))
        variants = [
            (set(predict_text(with_code)), "inline code"),
            (set(predict_text(with_code_block)), "code block"),
        ]

        for pred, name in variants:
            similarity = _jaccard_similarity(pred_clean, pred)
            assert similarity >= 0.6, (
                f"Code snippets in '{name}' changed predictions too much. "
                f"Clean: {pred_clean}, Variant: {pred}, "
                f"Similarity: {similarity:.2f}"
            )
|
|