"""
TDD tests for PrecisionPatch - NER + confidence correction.

The tests are based on OBSERVED spaCy behavior (verified via a smoke test):
- "NowCree" is tagged CARDINAL (unknown capitalized token)
- "LinkedIn like Indeed" is grouped as one ORG entity
- "notebookklem.google.com" is NOT tagged by NER - it is caught by the URL regex fallback
- "Anthropic" is tagged GPE
- "San Francisco" is tagged GPE, "Bay Area" is tagged LOC

Feature 1: find_entities - detect name-like tokens worth verifying
- Must catch ORG, PRODUCT, PERSON, GPE, LOC, CARDINAL entities
- Must catch URL-like tokens via the regex fallback
- Must return the proper dict structure with text/start/end/label keys
- Must return an empty list for plain sentences with no proper nouns
"""
| import pytest |
|
|
|
|
class TestFindEntities:
    """PrecisionPatch.find_entities should correctly identify proper nouns and URLs.

    Every test calls ``find_entities`` on a fixed sentence and inspects the
    returned list of dicts (keys: ``text``, ``start``, ``end``, ``label``).
    """

    # Sentences shared by several tests.
    JOB_SITES_TEXT = (
        "We can do the same thing on sites other than LinkedIn like Indeed or NowCree."
    )
    LOCATIONS_TEXT = "Find me jobs in San Francisco or the Bay Area."

    # Labels the original ORG assertion accepted as "name-like".
    NAME_LIKE_LABELS = frozenset({"ORG", "PRODUCT", "GPE", "CARDINAL"})

    @staticmethod
    def _find_entities(text):
        """Run ``find_entities`` on *text* with a fresh ``PrecisionPatch``.

        The import stays inside the call (as in the original tests), so a
        missing module fails each test individually instead of breaking
        collection of the whole file.
        """
        from app.services.precision_patch import PrecisionPatch

        return PrecisionPatch().find_entities(text)

    def test_catches_unknown_capitalized_word_as_cardinal(self):
        """
        spaCy tags unknown capitalized brand names (like 'NowCree') as CARDINAL.
        Our ENTITY_LABELS must include CARDINAL to catch this.
        """
        entities = self._find_entities(self.JOB_SITES_TEXT)
        entity_texts = [e["text"] for e in entities]

        assert any("NowCree" in t for t in entity_texts), (
            f"Expected 'NowCree' to be flagged. Got: {entities}"
        )

    def test_catches_known_org_entities(self):
        """'LinkedIn' or 'Indeed' must be flagged with a name-like label.

        The previous assertion only checked that *some* name-like label was
        present, which 'NowCree' (CARDINAL) satisfies on its own. The check is
        now tied to an entity whose text actually contains 'LinkedIn' or
        'Indeed'. The accepted label set is unchanged.
        """
        entities = self._find_entities(self.JOB_SITES_TEXT)
        flagged = [
            e
            for e in entities
            if e["label"] in self.NAME_LIKE_LABELS
            and ("LinkedIn" in e["text"] or "Indeed" in e["text"])
        ]
        assert flagged, (
            "Expected 'LinkedIn' or 'Indeed' to be flagged as a name-like "
            f"entity. Got: {entities}"
        )

    def test_catches_location_entities(self):
        """The location sentence must yield at least one GPE or LOC entity."""
        entities = self._find_entities(self.LOCATIONS_TEXT)
        labels = {e["label"] for e in entities}
        assert "GPE" in labels or "LOC" in labels, (
            f"Expected GPE/LOC entity for 'San Francisco'. Got: {entities}"
        )

    def test_url_regex_fallback_catches_garbled_url(self):
        """
        spaCy NER does NOT tag URLs like 'notebookklem.google.com'.
        The URL regex fallback must catch this.
        """
        entities = self._find_entities(
            "Let us go to notebookklem.google.com for interview prep."
        )
        url_entities = [e for e in entities if e["label"] == "URL"]
        assert len(url_entities) > 0, (
            f"Expected URL entity for 'notebookklem.google.com'. Got: {entities}"
        )
        assert "notebookklem.google.com" in url_entities[0]["text"]

    def test_returns_empty_for_plain_sentence(self):
        """A sentence with no proper nouns or URLs should return an empty list."""
        entities = self._find_entities("The quick brown fox jumps over the lazy dog.")
        assert entities == [], f"Expected no entities, got: {entities}"

    def test_entity_dict_has_required_fields(self):
        """Each returned entity dict must have text, start, end, label keys."""
        entities = self._find_entities("I applied to Anthropic last week.")
        assert len(entities) > 0, "Expected at least one entity for 'Anthropic'"
        for ent in entities:
            for key in ("text", "start", "end", "label"):
                assert key in ent, f"Missing '{key}' key in {ent}"

    def test_character_offsets_are_correct(self):
        """start/end offsets must correctly point to the entity text within the original string."""
        text = self.LOCATIONS_TEXT
        entities = self._find_entities(text)

        # Without this guard the loop below checks nothing when no entities
        # are returned, and the test would pass vacuously.
        assert entities, f"Expected at least one entity to check offsets for in {text!r}"

        for ent in entities:
            extracted = text[ent["start"]:ent["end"]]
            assert extracted == ent["text"], (
                f"Offset mismatch: expected '{ent['text']}', got '{extracted}'"
            )
|
|
|
|
class TestConfidenceMapping:
    """PrecisionPatch should correctly map Whisper word probabilities to entities.

    Word and segment fixtures are ``SimpleNamespace`` objects exposing only
    the attributes these tests set: ``word``/``probability`` for words and
    ``text``/``words`` for segments.
    """

    @staticmethod
    def _words(*pairs):
        """Build word stand-ins from ``(word, probability)`` pairs."""
        from types import SimpleNamespace

        return [SimpleNamespace(word=w, probability=p) for w, p in pairs]

    @classmethod
    def _segment(cls, text, *pairs):
        """Build a segment stand-in with *text* and words built from *pairs*."""
        from types import SimpleNamespace

        return SimpleNamespace(text=text, words=cls._words(*pairs))

    def test_maps_confidence_to_single_word_entity(self):
        """An entity covered by one word takes that word's probability."""
        from app.services.precision_patch import PrecisionPatch

        patcher = PrecisionPatch()
        text = "Hello NowCree."
        entities = [{"text": "NowCree", "start": 6, "end": 13, "label": "CARDINAL"}]
        words = self._words(("Hello", 0.99), (" NowCree.", 0.45))

        results = patcher.map_entities_to_confidence(entities, words, text)

        # Compare with a tolerance, like the multi-word test, rather than
        # relying on exact float equality.
        assert results[0]["confidence"] == pytest.approx(0.45)

    def test_maps_confidence_to_multi_word_entity(self):
        """A two-word entity is expected to score 0.85, the average of 0.80 and 0.90."""
        from app.services.precision_patch import PrecisionPatch

        patcher = PrecisionPatch()
        text = "Welcome to San Francisco."
        entities = [{"text": "San Francisco", "start": 11, "end": 24, "label": "GPE"}]
        words = self._words(
            ("Welcome", 0.99),
            (" to", 0.99),
            (" San", 0.80),
            (" Francisco.", 0.90),
        )

        results = patcher.map_entities_to_confidence(entities, words, text)

        assert results[0]["confidence"] == pytest.approx(0.85)

    def test_identifies_suspicious_segments(self):
        """Only the segment whose entity word has low probability is reported."""
        from app.services.precision_patch import PrecisionPatch

        patcher = PrecisionPatch()
        segments = [
            # 'Indeed.' is recognised with high probability (0.95).
            self._segment(
                "I applied to Indeed.",
                ("I", 0.99),
                (" applied", 0.99),
                (" to", 0.99),
                (" Indeed.", 0.95),
            ),
            # 'NowCree.' is recognised with low probability (0.40).
            self._segment(
                "Then I checked NowCree.",
                ("Then", 0.99),
                (" I", 0.99),
                (" checked", 0.99),
                (" NowCree.", 0.40),
            ),
        ]

        suspicious = patcher.get_suspicious_indices(segments)

        assert suspicious == [1]
|
|
|
|
class TestLLMCorrection:
    """PrecisionPatch should integrate with GeminiAdapter to fix segments."""

    def test_apply_patch_calls_gemini_with_context(self, monkeypatch):
        """apply_patch must rewrite the suspicious segment via the (stubbed) adapter.

        ``GeminiAdapter`` is replaced by a stub whose ``correct_batch`` swaps
        'NowCree' for 'Naukri' and records every batch it receives. The record
        lets the test confirm the correction came from the stub.
        """
        from app.services.precision_patch import PrecisionPatch
        from types import SimpleNamespace

        received_batches = []

        class MockGemini:
            def correct_batch(self, lines, system_instruction=None):
                # Materialise once so recording cannot exhaust an iterator.
                lines = list(lines)
                received_batches.append(lines)
                return [line.replace("NowCree", "Naukri") for line in lines]

        monkeypatch.setattr("app.services.translators.gemini_adapter.GeminiAdapter", lambda: MockGemini())

        patcher = PrecisionPatch()
        segments = [
            SimpleNamespace(text="I applied to Indeed.", words=[]),
            SimpleNamespace(text="Then I checked NowCree.", words=[]),
            SimpleNamespace(text="It was a great day.", words=[]),
        ]
        suspicious_indices = [1]

        patcher.apply_patch(segments, suspicious_indices)

        assert received_batches, (
            "Stubbed GeminiAdapter.correct_batch was never called; "
            "the monkeypatch did not take effect"
        )

        # The flagged segment is corrected in place.
        assert segments[1].text == "Then I checked Naukri."

        # Neighbouring segments keep their text. The stub only rewrites
        # 'NowCree', so this does not show whether they were sent to the
        # adapter as well.
        assert segments[0].text == "I applied to Indeed."
        assert segments[2].text == "It was a great day."
|
|
|
|
def test_apply_precision_patch_integration(monkeypatch):
    """Verifies the convenience helper correctly orchestrates the patch.

    A single segment whose last word ('NowCree.') has probability 0.10 is
    passed to ``apply_precision_patch``. ``GeminiAdapter`` is replaced by a
    stub that swaps 'NowCree' for 'Naukri' and records the batches it
    receives, so the test can confirm the stub produced the correction.
    """
    from app.services.precision_patch import apply_precision_patch
    from types import SimpleNamespace

    received_batches = []

    class MockGemini:
        def correct_batch(self, lines, system_instruction=None):
            # Materialise once so recording cannot exhaust an iterator.
            lines = list(lines)
            received_batches.append(lines)
            return [line.replace("NowCree", "Naukri") for line in lines]

    monkeypatch.setattr("app.services.translators.gemini_adapter.GeminiAdapter", lambda: MockGemini())

    word_probabilities = [
        ("Check", 0.99),
        (" out", 0.99),
        (" LinkedIn", 0.99),
        (" like", 0.99),
        (" Indeed", 0.99),
        (" or", 0.99),
        (" NowCree.", 0.10),
    ]
    segments = [
        SimpleNamespace(
            text="Check out LinkedIn like Indeed or NowCree.",
            words=[
                SimpleNamespace(word=word, probability=probability)
                for word, probability in word_probabilities
            ],
        )
    ]

    apply_precision_patch(segments)

    assert received_batches, (
        "Stubbed GeminiAdapter.correct_batch was never called; "
        "the monkeypatch did not take effect"
    )
    assert "Naukri" in segments[0].text, (
        f"Expected 'NowCree' to be corrected to 'Naukri'. Got: {segments[0].text!r}"
    )
|
|