""" Invariance Tests for Skill Classification Model These tests verify that certain transformations to the input should NOT change the model's predictions significantly. The model should be robust to: - Typos and spelling variations - Synonym substitutions - Changes in formatting (punctuation, capitalization) - Addition of neutral words Based on Ribeiro et al. (2020) "Beyond Accuracy: Behavioral Testing of NLP models" """ import pytest import numpy as np @pytest.mark.invariance class TestInvariance: """Test suite for invariance properties of the model.""" def test_typo_robustness(self, predict_text): """ Test that common typos do not significantly change predictions. Example: "revolutionized" vs "reovlutionized" """ original = "Fixed bug in data structure implementation using HashMap" typo_version = "Fixd bug in dat structure implemetation using HashMp" pred_original = set(predict_text(original)) pred_typo = set(predict_text(typo_version)) # Calculate Jaccard similarity (should be high) intersection = len(pred_original & pred_typo) union = len(pred_original | pred_typo) if union > 0: similarity = intersection / union assert similarity >= 0.7, ( f"Typos changed predictions too much. " f"Original: {pred_original}, Typo: {pred_typo}, " f"Similarity: {similarity:.2f}" ) def test_synonym_substitution(self, predict_text): """ Test that synonym substitutions preserve predictions. Example: "fix" vs "resolve", "bug" vs "issue" """ test_cases = [ ( "Fixed authentication bug in login system", "Resolved authentication issue in login system" ), ( "Implemented new feature for data processing", "Added new functionality for data processing" ), ( "Refactored code to improve performance", "Restructured code to enhance performance" ), ] for original, synonym_version in test_cases: pred_original = set(predict_text(original)) pred_synonym = set(predict_text(synonym_version)) intersection = len(pred_original & pred_synonym) union = len(pred_original | pred_synonym) if union > 0: similarity = intersection / union assert similarity >= 0.6, ( f"Synonyms changed predictions too much.\n" f"Original: '{original}' -> {pred_original}\n" f"Synonym: '{synonym_version}' -> {pred_synonym}\n" f"Similarity: {similarity:.2f}" ) def test_case_insensitivity(self, predict_text): """ Test that capitalization changes do not affect predictions. """ original = "Fixed API endpoint for user authentication" uppercase = original.upper() lowercase = original.lower() mixed_case = "fIxEd ApI EnDpOiNt FoR uSeR aUtHeNtIcAtIoN" pred_original = set(predict_text(original)) pred_upper = set(predict_text(uppercase)) pred_lower = set(predict_text(lowercase)) pred_mixed = set(predict_text(mixed_case)) # All should produce identical predictions assert pred_original == pred_lower, ( f"Lowercase changed predictions: {pred_original} vs {pred_lower}" ) assert pred_original == pred_upper, ( f"Uppercase changed predictions: {pred_original} vs {pred_upper}" ) assert pred_original == pred_mixed, ( f"Mixed case changed predictions: {pred_original} vs {pred_mixed}" ) def test_punctuation_robustness(self, predict_text): """ Test that punctuation changes do not significantly affect predictions. """ original = "Fixed bug in error handling logic" with_punctuation = "Fixed bug in error-handling logic!!!" extra_punctuation = "Fixed... bug in error, handling, logic." pred_original = set(predict_text(original)) pred_punct = set(predict_text(with_punctuation)) pred_extra = set(predict_text(extra_punctuation)) # Check similarity for pred, name in [(pred_punct, "with_punctuation"), (pred_extra, "extra_punctuation")]: intersection = len(pred_original & pred) union = len(pred_original | pred) if union > 0: similarity = intersection / union assert similarity >= 0.8, ( f"Punctuation in '{name}' changed predictions too much. " f"Similarity: {similarity:.2f}" ) def test_neutral_word_addition(self, predict_text): """ Test that adding neutral/filler words does not change predictions. """ original = "Implemented authentication system" with_fillers = "Well, I actually implemented the authentication system here" pred_original = set(predict_text(original)) pred_fillers = set(predict_text(with_fillers)) intersection = len(pred_original & pred_fillers) union = len(pred_original | pred_fillers) if union > 0: similarity = intersection / union assert similarity >= 0.7, ( f"Neutral words changed predictions. " f"Original: {pred_original}, With fillers: {pred_fillers}, " f"Similarity: {similarity:.2f}" ) def test_word_order_robustness(self, predict_text): """ Test that reasonable word reordering preserves key predictions. Note: This is a softer invariance test since word order can matter for some contexts, but core skills should remain. """ original = "Fixed database connection error in production" reordered = "In production, fixed error in database connection" pred_original = set(predict_text(original)) pred_reordered = set(predict_text(reordered)) intersection = len(pred_original & pred_reordered) union = len(pred_original | pred_reordered) if union > 0: similarity = intersection / union # Lower threshold since word order can affect meaning assert similarity >= 0.5, ( f"Word reordering changed predictions too drastically. " f"Similarity: {similarity:.2f}" ) def test_whitespace_normalization(self, predict_text): """ Test that extra whitespace does not affect predictions. """ original = "Fixed memory leak in data processing pipeline" extra_spaces = "Fixed memory leak in data processing pipeline" tabs_and_newlines = "Fixed\tmemory leak\nin data\nprocessing pipeline" pred_original = set(predict_text(original)) pred_spaces = set(predict_text(extra_spaces)) pred_tabs = set(predict_text(tabs_and_newlines)) # Should be identical after normalization assert pred_original == pred_spaces, ( f"Extra spaces changed predictions: {pred_original} vs {pred_spaces}" ) assert pred_original == pred_tabs, ( f"Tabs/newlines changed predictions: {pred_original} vs {pred_tabs}" ) def test_url_removal_invariance(self, predict_text): """ Test that adding/removing URLs doesn't change skill predictions. """ without_url = "Fixed authentication bug in login system" with_url = "Fixed authentication bug in login system https://github.com/example/repo/issues/123" multiple_urls = ( "Fixed authentication bug https://example.com in login system " "See: http://docs.example.com/auth" ) pred_original = set(predict_text(without_url)) pred_with_url = set(predict_text(with_url)) pred_multiple = set(predict_text(multiple_urls)) # URLs should not affect predictions assert pred_original == pred_with_url, ( f"URL addition changed predictions: {pred_original} vs {pred_with_url}" ) assert pred_original == pred_multiple, ( f"Multiple URLs changed predictions: {pred_original} vs {pred_multiple}" ) def test_code_snippet_noise_robustness(self, predict_text): """ Test robustness to inline code snippets and markdown formatting. """ clean = "Fixed null pointer exception in user service" with_code = "Fixed null pointer exception in user service `getUserById()`" with_code_block = """ Fixed null pointer exception in user service ```java public User getUserById(int id) { return null; // Fixed this } ``` """ pred_clean = set(predict_text(clean)) pred_code = set(predict_text(with_code)) pred_block = set(predict_text(with_code_block)) # Core skills should be preserved for pred, name in [(pred_code, "inline code"), (pred_block, "code block")]: intersection = len(pred_clean & pred) union = len(pred_clean | pred) if union > 0: similarity = intersection / union assert similarity >= 0.6, ( f"Code snippets in '{name}' changed predictions too much. " f"Similarity: {similarity:.2f}" )