"""
Invariance Tests for Skill Classification Model

These tests verify that certain transformations to the input should NOT change
the model's predictions significantly. The model should be robust to:
- Typos and spelling variations
- Synonym substitutions  
- Changes in formatting (punctuation, capitalization)
- Addition of neutral words

Based on Ribeiro et al. (2020), "Beyond Accuracy: Behavioral Testing of NLP Models with CheckList"
"""
import pytest
import numpy as np


@pytest.mark.invariance
class TestInvariance:
    """Test suite for invariance properties of the model."""
    
    def test_typo_robustness(self, predict_text):
        """
        Test that common typos do not significantly change predictions.
        
        Example: "revolutionized" vs "reovlutionized"
        """
        original = "Fixed bug in data structure implementation using HashMap"
        typo_version = "Fixd bug in dat structure implemetation using HashMp"
        
        pred_original = set(predict_text(original))
        pred_typo = set(predict_text(typo_version))
        
        # Calculate Jaccard similarity (should be high)
        intersection = len(pred_original & pred_typo)
        union = len(pred_original | pred_typo)
        
        if union > 0:
            similarity = intersection / union
            assert similarity >= 0.7, (
                f"Typos changed predictions too much. "
                f"Original: {pred_original}, Typo: {pred_typo}, "
                f"Similarity: {similarity:.2f}"
            )
    
    def test_synonym_substitution(self, predict_text):
        """
        Test that synonym substitutions preserve predictions.
        
        Example: "fix" vs "resolve", "bug" vs "issue"
        """
        test_cases = [
            (
                "Fixed authentication bug in login system",
                "Resolved authentication issue in login system"
            ),
            (
                "Implemented new feature for data processing",
                "Added new functionality for data processing"
            ),
            (
                "Refactored code to improve performance",
                "Restructured code to enhance performance"
            ),
        ]
        
        for original, synonym_version in test_cases:
            pred_original = set(predict_text(original))
            pred_synonym = set(predict_text(synonym_version))
            
            intersection = len(pred_original & pred_synonym)
            union = len(pred_original | pred_synonym)
            
            if union > 0:
                similarity = intersection / union
                assert similarity >= 0.6, (
                    f"Synonyms changed predictions too much.\n"
                    f"Original: '{original}' -> {pred_original}\n"
                    f"Synonym: '{synonym_version}' -> {pred_synonym}\n"
                    f"Similarity: {similarity:.2f}"
                )
    
    def test_case_insensitivity(self, predict_text):
        """
        Test that capitalization changes do not affect predictions.
        """
        original = "Fixed API endpoint for user authentication"
        uppercase = original.upper()
        lowercase = original.lower()
        mixed_case = "fIxEd ApI EnDpOiNt FoR uSeR aUtHeNtIcAtIoN"
        
        pred_original = set(predict_text(original))
        pred_upper = set(predict_text(uppercase))
        pred_lower = set(predict_text(lowercase))
        pred_mixed = set(predict_text(mixed_case))
        
        # All should produce identical predictions
        assert pred_original == pred_lower, (
            f"Lowercase changed predictions: {pred_original} vs {pred_lower}"
        )
        assert pred_original == pred_upper, (
            f"Uppercase changed predictions: {pred_original} vs {pred_upper}"
        )
        assert pred_original == pred_mixed, (
            f"Mixed case changed predictions: {pred_original} vs {pred_mixed}"
        )
    
    def test_punctuation_robustness(self, predict_text):
        """
        Test that punctuation changes do not significantly affect predictions.
        """
        original = "Fixed bug in error handling logic"
        with_punctuation = "Fixed bug in error-handling logic!!!"
        extra_punctuation = "Fixed... bug in error, handling, logic."
        
        pred_original = set(predict_text(original))
        pred_punct = set(predict_text(with_punctuation))
        pred_extra = set(predict_text(extra_punctuation))
        
        # Check similarity
        for pred, name in [(pred_punct, "with_punctuation"), (pred_extra, "extra_punctuation")]:
            intersection = len(pred_original & pred)
            union = len(pred_original | pred)
            
            if union > 0:
                similarity = intersection / union
                assert similarity >= 0.8, (
                    f"Punctuation in '{name}' changed predictions too much. "
                    f"Similarity: {similarity:.2f}"
                )
    
    def test_neutral_word_addition(self, predict_text):
        """
        Test that adding neutral/filler words does not change predictions.
        """
        original = "Implemented authentication system"
        with_fillers = "Well, I actually implemented the authentication system here"
        
        pred_original = set(predict_text(original))
        pred_fillers = set(predict_text(with_fillers))
        
        intersection = len(pred_original & pred_fillers)
        union = len(pred_original | pred_fillers)
        
        if union > 0:
            similarity = intersection / union
            assert similarity >= 0.7, (
                f"Neutral words changed predictions. "
                f"Original: {pred_original}, With fillers: {pred_fillers}, "
                f"Similarity: {similarity:.2f}"
            )
    
    def test_word_order_robustness(self, predict_text):
        """
        Test that reasonable word reordering preserves key predictions.
        
        Note: This is a softer invariance test since word order can matter
        for some contexts, but core skills should remain.
        """
        original = "Fixed database connection error in production"
        reordered = "In production, fixed error in database connection"
        
        pred_original = set(predict_text(original))
        pred_reordered = set(predict_text(reordered))
        
        intersection = len(pred_original & pred_reordered)
        union = len(pred_original | pred_reordered)
        
        if union > 0:
            similarity = intersection / union
            # Lower threshold since word order can affect meaning
            assert similarity >= 0.5, (
                f"Word reordering changed predictions too drastically. "
                f"Similarity: {similarity:.2f}"
            )
    
    def test_whitespace_normalization(self, predict_text):
        """
        Test that extra whitespace does not affect predictions.
        """
        original = "Fixed memory leak in data processing pipeline"
        extra_spaces = "Fixed   memory    leak  in   data processing    pipeline"
        tabs_and_newlines = "Fixed\tmemory leak\nin data\nprocessing pipeline"
        
        pred_original = set(predict_text(original))
        pred_spaces = set(predict_text(extra_spaces))
        pred_tabs = set(predict_text(tabs_and_newlines))
        
        # Should be identical after normalization
        assert pred_original == pred_spaces, (
            f"Extra spaces changed predictions: {pred_original} vs {pred_spaces}"
        )
        assert pred_original == pred_tabs, (
            f"Tabs/newlines changed predictions: {pred_original} vs {pred_tabs}"
        )
    
    def test_url_removal_invariance(self, predict_text):
        """
        Test that adding/removing URLs doesn't change skill predictions.
        """
        without_url = "Fixed authentication bug in login system"
        with_url = "Fixed authentication bug in login system https://github.com/example/repo/issues/123"
        multiple_urls = (
            "Fixed authentication bug https://example.com in login system "
            "See: http://docs.example.com/auth"
        )
        
        pred_original = set(predict_text(without_url))
        pred_with_url = set(predict_text(with_url))
        pred_multiple = set(predict_text(multiple_urls))
        
        # URLs should not affect predictions
        assert pred_original == pred_with_url, (
            f"URL addition changed predictions: {pred_original} vs {pred_with_url}"
        )
        assert pred_original == pred_multiple, (
            f"Multiple URLs changed predictions: {pred_original} vs {pred_multiple}"
        )
    
    def test_code_snippet_noise_robustness(self, predict_text):
        """
        Test robustness to inline code snippets and markdown formatting.
        """
        clean = "Fixed null pointer exception in user service"
        with_code = "Fixed null pointer exception in user service `getUserById()`"
        with_code_block = """
        Fixed null pointer exception in user service
        ```java
        public User getUserById(int id) {
            return null; // Fixed this
        }
        ```
        """
        
        pred_clean = set(predict_text(clean))
        pred_code = set(predict_text(with_code))
        pred_block = set(predict_text(with_code_block))
        
        # Core skills should be preserved
        for pred, name in [(pred_code, "inline code"), (pred_block, "code block")]:
            intersection = len(pred_clean & pred)
            union = len(pred_clean | pred)
            
            if union > 0:
                similarity = intersection / union
                assert similarity >= 0.6, (
                    f"Code snippets in '{name}' changed predictions too much. "
                    f"Similarity: {similarity:.2f}"
                )