Spaces:
Sleeping
Sleeping
"""
Invariance Tests for Skill Classification Model

These tests verify that certain transformations of the input should NOT change
the model's predictions significantly. The model should be robust to:
- Typos and spelling variations
- Synonym substitutions
- Changes in formatting (punctuation, capitalization, whitespace)
- Addition of neutral content (filler words, URLs, code snippets)

Based on Ribeiro et al. (2020), "Beyond Accuracy: Behavioral Testing of NLP
Models with CheckList".
"""
| import pytest | |
| import numpy as np | |
| class TestInvariance: | |
| """Test suite for invariance properties of the model.""" | |
| def test_typo_robustness(self, predict_text): | |
| """ | |
| Test that common typos do not significantly change predictions. | |
| Example: "revolutionized" vs "reovlutionized" | |
| """ | |
| original = "Fixed bug in data structure implementation using HashMap" | |
| typo_version = "Fixd bug in dat structure implemetation using HashMp" | |
| pred_original = set(predict_text(original)) | |
| pred_typo = set(predict_text(typo_version)) | |
| # Calculate Jaccard similarity (should be high) | |
| intersection = len(pred_original & pred_typo) | |
| union = len(pred_original | pred_typo) | |
| if union > 0: | |
| similarity = intersection / union | |
| assert similarity >= 0.7, ( | |
| f"Typos changed predictions too much. " | |
| f"Original: {pred_original}, Typo: {pred_typo}, " | |
| f"Similarity: {similarity:.2f}" | |
| ) | |
| def test_synonym_substitution(self, predict_text): | |
| """ | |
| Test that synonym substitutions preserve predictions. | |
| Example: "fix" vs "resolve", "bug" vs "issue" | |
| """ | |
| test_cases = [ | |
| ( | |
| "Fixed authentication bug in login system", | |
| "Resolved authentication issue in login system" | |
| ), | |
| ( | |
| "Implemented new feature for data processing", | |
| "Added new functionality for data processing" | |
| ), | |
| ( | |
| "Refactored code to improve performance", | |
| "Restructured code to enhance performance" | |
| ), | |
| ] | |
| for original, synonym_version in test_cases: | |
| pred_original = set(predict_text(original)) | |
| pred_synonym = set(predict_text(synonym_version)) | |
| intersection = len(pred_original & pred_synonym) | |
| union = len(pred_original | pred_synonym) | |
| if union > 0: | |
| similarity = intersection / union | |
| assert similarity >= 0.6, ( | |
| f"Synonyms changed predictions too much.\n" | |
| f"Original: '{original}' -> {pred_original}\n" | |
| f"Synonym: '{synonym_version}' -> {pred_synonym}\n" | |
| f"Similarity: {similarity:.2f}" | |
| ) | |
| def test_case_insensitivity(self, predict_text): | |
| """ | |
| Test that capitalization changes do not affect predictions. | |
| """ | |
| original = "Fixed API endpoint for user authentication" | |
| uppercase = original.upper() | |
| lowercase = original.lower() | |
| mixed_case = "fIxEd ApI EnDpOiNt FoR uSeR aUtHeNtIcAtIoN" | |
| pred_original = set(predict_text(original)) | |
| pred_upper = set(predict_text(uppercase)) | |
| pred_lower = set(predict_text(lowercase)) | |
| pred_mixed = set(predict_text(mixed_case)) | |
| # All should produce identical predictions | |
| assert pred_original == pred_lower, ( | |
| f"Lowercase changed predictions: {pred_original} vs {pred_lower}" | |
| ) | |
| assert pred_original == pred_upper, ( | |
| f"Uppercase changed predictions: {pred_original} vs {pred_upper}" | |
| ) | |
| assert pred_original == pred_mixed, ( | |
| f"Mixed case changed predictions: {pred_original} vs {pred_mixed}" | |
| ) | |
| def test_punctuation_robustness(self, predict_text): | |
| """ | |
| Test that punctuation changes do not significantly affect predictions. | |
| """ | |
| original = "Fixed bug in error handling logic" | |
| with_punctuation = "Fixed bug in error-handling logic!!!" | |
| extra_punctuation = "Fixed... bug in error, handling, logic." | |
| pred_original = set(predict_text(original)) | |
| pred_punct = set(predict_text(with_punctuation)) | |
| pred_extra = set(predict_text(extra_punctuation)) | |
| # Check similarity | |
| for pred, name in [(pred_punct, "with_punctuation"), (pred_extra, "extra_punctuation")]: | |
| intersection = len(pred_original & pred) | |
| union = len(pred_original | pred) | |
| if union > 0: | |
| similarity = intersection / union | |
| assert similarity >= 0.8, ( | |
| f"Punctuation in '{name}' changed predictions too much. " | |
| f"Similarity: {similarity:.2f}" | |
| ) | |
| def test_neutral_word_addition(self, predict_text): | |
| """ | |
| Test that adding neutral/filler words does not change predictions. | |
| """ | |
| original = "Implemented authentication system" | |
| with_fillers = "Well, I actually implemented the authentication system here" | |
| pred_original = set(predict_text(original)) | |
| pred_fillers = set(predict_text(with_fillers)) | |
| intersection = len(pred_original & pred_fillers) | |
| union = len(pred_original | pred_fillers) | |
| if union > 0: | |
| similarity = intersection / union | |
| assert similarity >= 0.7, ( | |
| f"Neutral words changed predictions. " | |
| f"Original: {pred_original}, With fillers: {pred_fillers}, " | |
| f"Similarity: {similarity:.2f}" | |
| ) | |
| def test_word_order_robustness(self, predict_text): | |
| """ | |
| Test that reasonable word reordering preserves key predictions. | |
| Note: This is a softer invariance test since word order can matter | |
| for some contexts, but core skills should remain. | |
| """ | |
| original = "Fixed database connection error in production" | |
| reordered = "In production, fixed error in database connection" | |
| pred_original = set(predict_text(original)) | |
| pred_reordered = set(predict_text(reordered)) | |
| intersection = len(pred_original & pred_reordered) | |
| union = len(pred_original | pred_reordered) | |
| if union > 0: | |
| similarity = intersection / union | |
| # Lower threshold since word order can affect meaning | |
| assert similarity >= 0.5, ( | |
| f"Word reordering changed predictions too drastically. " | |
| f"Similarity: {similarity:.2f}" | |
| ) | |
| def test_whitespace_normalization(self, predict_text): | |
| """ | |
| Test that extra whitespace does not affect predictions. | |
| """ | |
| original = "Fixed memory leak in data processing pipeline" | |
| extra_spaces = "Fixed memory leak in data processing pipeline" | |
| tabs_and_newlines = "Fixed\tmemory leak\nin data\nprocessing pipeline" | |
| pred_original = set(predict_text(original)) | |
| pred_spaces = set(predict_text(extra_spaces)) | |
| pred_tabs = set(predict_text(tabs_and_newlines)) | |
| # Should be identical after normalization | |
| assert pred_original == pred_spaces, ( | |
| f"Extra spaces changed predictions: {pred_original} vs {pred_spaces}" | |
| ) | |
| assert pred_original == pred_tabs, ( | |
| f"Tabs/newlines changed predictions: {pred_original} vs {pred_tabs}" | |
| ) | |
| def test_url_removal_invariance(self, predict_text): | |
| """ | |
| Test that adding/removing URLs doesn't change skill predictions. | |
| """ | |
| without_url = "Fixed authentication bug in login system" | |
| with_url = "Fixed authentication bug in login system https://github.com/example/repo/issues/123" | |
| multiple_urls = ( | |
| "Fixed authentication bug https://example.com in login system " | |
| "See: http://docs.example.com/auth" | |
| ) | |
| pred_original = set(predict_text(without_url)) | |
| pred_with_url = set(predict_text(with_url)) | |
| pred_multiple = set(predict_text(multiple_urls)) | |
| # URLs should not affect predictions | |
| assert pred_original == pred_with_url, ( | |
| f"URL addition changed predictions: {pred_original} vs {pred_with_url}" | |
| ) | |
| assert pred_original == pred_multiple, ( | |
| f"Multiple URLs changed predictions: {pred_original} vs {pred_multiple}" | |
| ) | |
| def test_code_snippet_noise_robustness(self, predict_text): | |
| """ | |
| Test robustness to inline code snippets and markdown formatting. | |
| """ | |
| clean = "Fixed null pointer exception in user service" | |
| with_code = "Fixed null pointer exception in user service `getUserById()`" | |
| with_code_block = """ | |
| Fixed null pointer exception in user service | |
| ```java | |
| public User getUserById(int id) { | |
| return null; // Fixed this | |
| } | |
| ``` | |
| """ | |
| pred_clean = set(predict_text(clean)) | |
| pred_code = set(predict_text(with_code)) | |
| pred_block = set(predict_text(with_code_block)) | |
| # Core skills should be preserved | |
| for pred, name in [(pred_code, "inline code"), (pred_block, "code block")]: | |
| intersection = len(pred_clean & pred) | |
| union = len(pred_clean | pred) | |
| if union > 0: | |
| similarity = intersection / union | |
| assert similarity >= 0.6, ( | |
| f"Code snippets in '{name}' changed predictions too much. " | |
| f"Similarity: {similarity:.2f}" | |
| ) | |