Spaces:

arjun-ms
/

Subtrans

Sleeping

File size: 10,598 Bytes

57bbccb

"""
TDD Tests for PrecisionPatch - NER + Confidence Correction.

Tests are based on OBSERVED spaCy behavior (verified via smoke test):
  - "NowCree" is tagged CARDINAL (unknown capitalized token)
  - "LinkedIn like Indeed" is grouped as ORG
  - "notebookklem.google.com" is NOT tagged by NER - caught by URL regex fallback
  - "Anthropic" is tagged GPE
  - "San Francisco" is tagged GPE, "Bay Area" is tagged LOC

Feature 1: find_entities - detect name-like tokens worth verifying
  - Must catch ORG, PRODUCT, PERSON, GPE, LOC, CARDINAL entities
  - Must catch URL-like tokens via regex fallback
  - Must return proper dict structure with text/start/end/label keys
  - Must return empty list for plain sentences with no proper nouns
"""
import pytest


class TestFindEntities:
    """Tests for ``PrecisionPatch.find_entities`` (proper nouns and URLs).

    ``PrecisionPatch`` is imported inside each test rather than at module
    level, so the import is attempted when the individual test runs.
    """

    def test_catches_unknown_capitalized_word_as_cardinal(self):
        """
        spaCy tags unknown capitalized brand names (like 'NowCree') as CARDINAL.
        Our ENTITY_LABELS must include CARDINAL to catch this.
        """
        from app.services.precision_patch import PrecisionPatch
        patcher = PrecisionPatch()
        text = "We can do the same thing on sites other than LinkedIn like Indeed or NowCree."
        entities = patcher.find_entities(text)
        entity_texts = [e["text"] for e in entities]
        # NowCree should be caught (as CARDINAL or ORG depending on context window),
        # so only the entity text is checked here, not its label.
        assert any("NowCree" in t for t in entity_texts), (
            f"Expected 'NowCree' to be flagged. Got: {entities}"
        )

    def test_catches_known_org_entities(self):
        """At least one name-like label (ORG/PRODUCT/GPE/CARDINAL) must be returned.

        NOTE(review): the test name suggests 'LinkedIn'/'Indeed' must come back
        as ORG, but the assertion accepts any of the four labels - confirm
        whether a stricter check is intended.
        """
        from app.services.precision_patch import PrecisionPatch
        patcher = PrecisionPatch()
        text = "We can do the same thing on sites other than LinkedIn like Indeed or NowCree."
        entities = patcher.find_entities(text)
        labels = {e["label"] for e in entities}
        assert labels & {"ORG", "PRODUCT", "GPE", "CARDINAL"}, (
            f"Expected at least one name-like entity. Got: {entities}"
        )

    def test_catches_location_entities(self):
        """A place name sentence must yield at least one GPE or LOC entity."""
        from app.services.precision_patch import PrecisionPatch
        patcher = PrecisionPatch()
        text = "Find me jobs in San Francisco or the Bay Area."
        entities = patcher.find_entities(text)
        labels = {e["label"] for e in entities}
        assert "GPE" in labels or "LOC" in labels, (
            f"Expected GPE/LOC entity for 'San Francisco'. Got: {entities}"
        )

    def test_url_regex_fallback_catches_garbled_url(self):
        """
        spaCy NER does NOT tag URLs like 'notebookklem.google.com'.
        The URL regex fallback must catch this.
        """
        from app.services.precision_patch import PrecisionPatch
        patcher = PrecisionPatch()
        text = "Let us go to notebookklem.google.com for interview prep."
        entities = patcher.find_entities(text)
        url_entities = [e for e in entities if e["label"] == "URL"]
        assert url_entities, (
            f"Expected URL entity for 'notebookklem.google.com'. Got: {entities}"
        )
        assert "notebookklem.google.com" in url_entities[0]["text"]

    def test_returns_empty_for_plain_sentence(self):
        """A sentence with no proper nouns or URLs should return an empty list."""
        from app.services.precision_patch import PrecisionPatch
        patcher = PrecisionPatch()
        text = "The quick brown fox jumps over the lazy dog."
        entities = patcher.find_entities(text)
        assert entities == [], f"Expected no entities, got: {entities}"

    def test_entity_dict_has_required_fields(self):
        """Each returned entity dict must have text, start, end, label keys."""
        from app.services.precision_patch import PrecisionPatch
        patcher = PrecisionPatch()
        text = "I applied to Anthropic last week."
        entities = patcher.find_entities(text)
        assert entities, "Expected at least one entity for 'Anthropic'"
        for ent in entities:
            for key in ("text", "start", "end", "label"):
                assert key in ent, f"Missing '{key}' key in {ent}"

    def test_character_offsets_are_correct(self):
        """start/end offsets must correctly point to the entity text within the original string."""
        from app.services.precision_patch import PrecisionPatch
        patcher = PrecisionPatch()
        text = "Find me jobs in San Francisco or the Bay Area."
        entities = patcher.find_entities(text)
        # Guard against a vacuous pass: with no entities the loop below would
        # never execute and the offsets would go unchecked.
        assert entities, f"Expected at least one entity to verify offsets. Got: {entities}"
        for ent in entities:
            extracted = text[ent["start"]:ent["end"]]
            assert extracted == ent["text"], (
                f"Offset mismatch: expected '{ent['text']}', got '{extracted}'"
            )


class TestConfidenceMapping:
    """PrecisionPatch should correctly map Whisper word probabilities to entities."""

    def test_maps_confidence_to_single_word_entity(self):
        """A one-word entity takes the probability of the word that contains it."""
        from app.services.precision_patch import PrecisionPatch
        from types import SimpleNamespace

        patcher = PrecisionPatch()
        text = "Hello NowCree."
        # Offsets 6..13 cover "NowCree" within ``text``.
        entities = [{"text": "NowCree", "start": 6, "end": 13, "label": "CARDINAL"}]

        # Mock Whisper words
        # Note: Whisper often includes spaces in the word text
        words = [
            SimpleNamespace(word="Hello", probability=0.99),
            SimpleNamespace(word=" NowCree.", probability=0.45),
        ]

        results = patcher.map_entities_to_confidence(entities, words, text)
        # Compare with a tolerance, like the multi-word test below, so the
        # check does not depend on exact float arithmetic in the implementation.
        assert results[0]["confidence"] == pytest.approx(0.45)

    def test_maps_confidence_to_multi_word_entity(self):
        """A multi-word entity takes the mean probability of its words."""
        from app.services.precision_patch import PrecisionPatch
        from types import SimpleNamespace

        patcher = PrecisionPatch()
        text = "Welcome to San Francisco."
        # Offsets 11..24 cover "San Francisco" within ``text``.
        entities = [{"text": "San Francisco", "start": 11, "end": 24, "label": "GPE"}]

        words = [
            SimpleNamespace(word="Welcome", probability=0.99),
            SimpleNamespace(word=" to", probability=0.99),
            SimpleNamespace(word=" San", probability=0.80),
            SimpleNamespace(word=" Francisco.", probability=0.90),
        ]

        results = patcher.map_entities_to_confidence(entities, words, text)
        # Average of 0.8 and 0.9 = 0.85
        assert results[0]["confidence"] == pytest.approx(0.85)

    def test_identifies_suspicious_segments(self):
        """Only the segment whose entity word has low probability is reported."""
        from app.services.precision_patch import PrecisionPatch
        from types import SimpleNamespace

        patcher = PrecisionPatch()

        segments = [
            SimpleNamespace(
                text="I applied to Indeed.",
                words=[
                    SimpleNamespace(word="I", probability=0.99),
                    SimpleNamespace(word=" applied", probability=0.99),
                    SimpleNamespace(word=" to", probability=0.99),
                    SimpleNamespace(word=" Indeed.", probability=0.95),
                ],
            ),
            SimpleNamespace(
                text="Then I checked NowCree.",
                words=[
                    SimpleNamespace(word="Then", probability=0.99),
                    SimpleNamespace(word=" I", probability=0.99),
                    SimpleNamespace(word=" checked", probability=0.99),
                    SimpleNamespace(word=" NowCree.", probability=0.40),
                ],
            ),
        ]

        suspicious = patcher.get_suspicious_indices(segments)
        # Only the second segment has a low-confidence entity
        assert suspicious == [1]


class TestLLMCorrection:
    """Checks that PrecisionPatch hands flagged segments to GeminiAdapter for fixing."""

    def test_apply_patch_calls_gemini_with_context(self, monkeypatch):
        """apply_patch rewrites the flagged segment; the other segments keep their text."""
        from app.services.precision_patch import PrecisionPatch
        from types import SimpleNamespace

        class MockGemini:
            """Stand-in adapter that swaps 'NowCree' for 'Naukri' in every line."""

            def correct_batch(self, lines, system_instruction=None):
                corrected = []
                for line in lines:
                    corrected.append(line.replace("NowCree", "Naukri"))
                return corrected

        # NOTE(review): this replaces the name on the gemini_adapter module;
        # it presumably relies on precision_patch looking GeminiAdapter up
        # through that module at call time - verify against the implementation.
        adapter_path = "app.services.translators.gemini_adapter.GeminiAdapter"
        monkeypatch.setattr(adapter_path, lambda: MockGemini())

        patcher = PrecisionPatch()
        sentences = (
            "I applied to Indeed.",
            "Then I checked NowCree.",
            "It was a great day.",
        )
        segments = [SimpleNamespace(text=sentence, words=[]) for sentence in sentences]

        # Supplied by hand to stand in for the earlier detection steps.
        suspicious_indices = [1]

        patcher.apply_patch(segments, suspicious_indices)

        first, second, third = (segment.text for segment in segments)
        assert second == "Then I checked Naukri."
        # Segment 0 may be sent along as context; the mock leaves it unchanged
        # because it contains no "NowCree".
        assert first == "I applied to Indeed."
        assert third == "It was a great day."


def test_apply_precision_patch_integration(monkeypatch):
    """End-to-end check of the ``apply_precision_patch`` convenience helper.

    A single segment containing one low-probability word ("NowCree", 0.10) is
    passed in; afterwards the segment text must contain the mock's correction.
    """
    from app.services.precision_patch import apply_precision_patch
    from types import SimpleNamespace

    class MockGemini:
        """Stand-in adapter that swaps 'NowCree' for 'Naukri' in every line."""

        def correct_batch(self, lines, system_instruction=None):
            return [line.replace("NowCree", "Naukri") for line in lines]

    monkeypatch.setattr(
        "app.services.translators.gemini_adapter.GeminiAdapter",
        lambda: MockGemini(),
    )

    # (word, probability) pairs; only the final word is low confidence.
    scored_words = [
        ("Check", 0.99),
        (" out", 0.99),
        (" LinkedIn", 0.99),
        (" like", 0.99),
        (" Indeed", 0.99),
        (" or", 0.99),
        (" NowCree.", 0.10),
    ]
    segment = SimpleNamespace(
        text="Check out LinkedIn like Indeed or NowCree.",
        words=[
            SimpleNamespace(word=token, probability=score)
            for token, score in scored_words
        ],
    )
    segments = [segment]

    apply_precision_patch(segments)

    assert "Naukri" in segments[0].text