""" TDD Tests for PrecisionPatch - NER + Confidence Correction. Tests are based on OBSERVED spaCy behavior (verified via smoke test): - "NowCree" is tagged CARDINAL (unknown capitalized token) - "LinkedIn like Indeed" is grouped as ORG - "notebookklem.google.com" is NOT tagged by NER - caught by URL regex fallback - "Anthropic" is tagged GPE - "San Francisco" is tagged GPE, "Bay Area" is tagged LOC Feature 1: find_entities - detect name-like tokens worth verifying - Must catch ORG, PRODUCT, PERSON, GPE, LOC, CARDINAL entities - Must catch URL-like tokens via regex fallback - Must return proper dict structure with text/start/end/label keys - Must return empty list for plain sentences with no proper nouns """ import pytest class TestFindEntities: """PrecisionPatch.find_entities should correctly identify proper nouns and URLs.""" def test_catches_unknown_capitalized_word_as_cardinal(self): """ spaCy tags unknown capitalized brand names (like 'NowCree') as CARDINAL. Our ENTITY_LABELS must include CARDINAL to catch this. """ from app.services.precision_patch import PrecisionPatch patcher = PrecisionPatch() text = "We can do the same thing on sites other than LinkedIn like Indeed or NowCree." entities = patcher.find_entities(text) entity_texts = [e["text"] for e in entities] # NowCree should be caught (as CARDINAL or ORG depending on context window) assert any("NowCree" in t for t in entity_texts), ( f"Expected 'NowCree' to be flagged. Got: {entities}" ) def test_catches_known_org_entities(self): """'LinkedIn' or 'Indeed' must be tagged as ORG.""" from app.services.precision_patch import PrecisionPatch patcher = PrecisionPatch() text = "We can do the same thing on sites other than LinkedIn like Indeed or NowCree." entities = patcher.find_entities(text) labels = {e["label"] for e in entities} assert labels & {"ORG", "PRODUCT", "GPE", "CARDINAL"}, ( f"Expected at least one name-like entity. Got: {entities}" ) def test_catches_location_entities(self): """'San Francisco' must be tagged as GPE.""" from app.services.precision_patch import PrecisionPatch patcher = PrecisionPatch() text = "Find me jobs in San Francisco or the Bay Area." entities = patcher.find_entities(text) labels = {e["label"] for e in entities} assert "GPE" in labels or "LOC" in labels, ( f"Expected GPE/LOC entity for 'San Francisco'. Got: {entities}" ) def test_url_regex_fallback_catches_garbled_url(self): """ spaCy NER does NOT tag URLs like 'notebookklem.google.com'. The URL regex fallback must catch this. """ from app.services.precision_patch import PrecisionPatch patcher = PrecisionPatch() text = "Let us go to notebookklem.google.com for interview prep." entities = patcher.find_entities(text) url_entities = [e for e in entities if e["label"] == "URL"] assert len(url_entities) > 0, ( f"Expected URL entity for 'notebookklem.google.com'. Got: {entities}" ) assert "notebookklem.google.com" in url_entities[0]["text"] def test_returns_empty_for_plain_sentence(self): """A sentence with no proper nouns or URLs should return an empty list.""" from app.services.precision_patch import PrecisionPatch patcher = PrecisionPatch() text = "The quick brown fox jumps over the lazy dog." entities = patcher.find_entities(text) assert entities == [], f"Expected no entities, got: {entities}" def test_entity_dict_has_required_fields(self): """Each returned entity dict must have text, start, end, label keys.""" from app.services.precision_patch import PrecisionPatch patcher = PrecisionPatch() text = "I applied to Anthropic last week." entities = patcher.find_entities(text) assert len(entities) > 0, "Expected at least one entity for 'Anthropic'" for ent in entities: assert "text" in ent, f"Missing 'text' key in {ent}" assert "start" in ent, f"Missing 'start' key in {ent}" assert "end" in ent, f"Missing 'end' key in {ent}" assert "label" in ent, f"Missing 'label' key in {ent}" def test_character_offsets_are_correct(self): """start/end offsets must correctly point to the entity text within the original string.""" from app.services.precision_patch import PrecisionPatch patcher = PrecisionPatch() text = "Find me jobs in San Francisco or the Bay Area." entities = patcher.find_entities(text) for ent in entities: extracted = text[ent["start"]:ent["end"]] assert extracted == ent["text"], ( f"Offset mismatch: expected '{ent['text']}', got '{extracted}'" ) class TestConfidenceMapping: """PrecisionPatch should correctly map Whisper word probabilities to entities.""" def test_maps_confidence_to_single_word_entity(self): from app.services.precision_patch import PrecisionPatch from types import SimpleNamespace patcher = PrecisionPatch() text = "Hello NowCree." entities = [{"text": "NowCree", "start": 6, "end": 13, "label": "CARDINAL"}] # Mock Whisper words # Note: Whisper often includes spaces in the word text words = [ SimpleNamespace(word="Hello", probability=0.99), SimpleNamespace(word=" NowCree.", probability=0.45) ] results = patcher.map_entities_to_confidence(entities, words, text) assert results[0]["confidence"] == 0.45 def test_maps_confidence_to_multi_word_entity(self): from app.services.precision_patch import PrecisionPatch from types import SimpleNamespace patcher = PrecisionPatch() text = "Welcome to San Francisco." entities = [{"text": "San Francisco", "start": 11, "end": 24, "label": "GPE"}] words = [ SimpleNamespace(word="Welcome", probability=0.99), SimpleNamespace(word=" to", probability=0.99), SimpleNamespace(word=" San", probability=0.80), SimpleNamespace(word=" Francisco.", probability=0.90) ] results = patcher.map_entities_to_confidence(entities, words, text) # Average of 0.8 and 0.9 = 0.85 assert results[0]["confidence"] == pytest.approx(0.85) def test_identifies_suspicious_segments(self): from app.services.precision_patch import PrecisionPatch from types import SimpleNamespace patcher = PrecisionPatch() segments = [ SimpleNamespace( text="I applied to Indeed.", words=[ SimpleNamespace(word="I", probability=0.99), SimpleNamespace(word=" applied", probability=0.99), SimpleNamespace(word=" to", probability=0.99), SimpleNamespace(word=" Indeed.", probability=0.95) ] ), SimpleNamespace( text="Then I checked NowCree.", words=[ SimpleNamespace(word="Then", probability=0.99), SimpleNamespace(word=" I", probability=0.99), SimpleNamespace(word=" checked", probability=0.99), SimpleNamespace(word=" NowCree.", probability=0.40) ] ) ] suspicious = patcher.get_suspicious_indices(segments) # Only the second segment has a low-confidence entity assert suspicious == [1] class TestLLMCorrection: """PrecisionPatch should integrate with GeminiAdapter to fix segments.""" def test_apply_patch_calls_gemini_with_context(self, monkeypatch): from app.services.precision_patch import PrecisionPatch from types import SimpleNamespace # Mock GeminiAdapter class MockGemini: def correct_batch(self, lines, system_instruction=None): # Simple mock fix return [l.replace("NowCree", "Naukri") for l in lines] monkeypatch.setattr("app.services.translators.gemini_adapter.GeminiAdapter", lambda: MockGemini()) patcher = PrecisionPatch() segments = [ SimpleNamespace(text="I applied to Indeed.", words=[]), SimpleNamespace(text="Then I checked NowCree.", words=[]), SimpleNamespace(text="It was a great day.", words=[]) ] # Manually set suspicious indices to simulate previous steps suspicious_indices = [1] patcher.apply_patch(segments, suspicious_indices) assert segments[1].text == "Then I checked Naukri." # Context segment 0 should also be processed (and in this case, replaced with itself if no NowCree) assert segments[0].text == "I applied to Indeed." assert segments[2].text == "It was a great day." def test_apply_precision_patch_integration(monkeypatch): """Verifies the convenience helper correctly orchestrates the patch.""" from app.services.precision_patch import apply_precision_patch from types import SimpleNamespace # Mock GeminiAdapter class MockGemini: def correct_batch(self, lines, system_instruction=None): return [l.replace("NowCree", "Naukri") for l in lines] monkeypatch.setattr("app.services.translators.gemini_adapter.GeminiAdapter", lambda: MockGemini()) # Mock segments with a low-confidence entity segments = [ SimpleNamespace( text="Check out LinkedIn like Indeed or NowCree.", words=[ SimpleNamespace(word="Check", probability=0.99), SimpleNamespace(word=" out", probability=0.99), SimpleNamespace(word=" LinkedIn", probability=0.99), SimpleNamespace(word=" like", probability=0.99), SimpleNamespace(word=" Indeed", probability=0.99), SimpleNamespace(word=" or", probability=0.99), SimpleNamespace(word=" NowCree.", probability=0.10) # LOW CONFIDENCE ] ) ] apply_precision_patch(segments) assert "Naukri" in segments[0].text