# Subtrans / app / tests / test_precision_patch.py
# arjun-ms's picture
# Initial commit: Subtrans Subtitle Pipeline
# 57bbccb
"""
TDD Tests for PrecisionPatch - NER + Confidence Correction.
Tests are based on OBSERVED spaCy behavior (verified via smoke test):
- "NowCree" is tagged CARDINAL (unknown capitalized token)
- "LinkedIn like Indeed" is grouped as ORG
- "notebookklem.google.com" is NOT tagged by NER - caught by URL regex fallback
- "Anthropic" is tagged GPE
- "San Francisco" is tagged GPE, "Bay Area" is tagged LOC
Feature 1: find_entities - detect name-like tokens worth verifying
- Must catch ORG, PRODUCT, PERSON, GPE, LOC, CARDINAL entities
- Must catch URL-like tokens via regex fallback
- Must return proper dict structure with text/start/end/label keys
- Must return empty list for plain sentences with no proper nouns
"""
import pytest
class TestFindEntities:
    """Checks that PrecisionPatch.find_entities flags proper nouns and URLs."""

    # Sentences shared by several tests below.
    JOB_SITES_TEXT = (
        "We can do the same thing on sites other than LinkedIn like Indeed or NowCree."
    )
    LOCATIONS_TEXT = "Find me jobs in San Francisco or the Bay Area."

    @staticmethod
    def _find(text):
        """Return ``PrecisionPatch().find_entities(text)``.

        The import is function-scoped, so a missing or broken module fails
        each test when it runs instead of failing at collection time.
        """
        from app.services.precision_patch import PrecisionPatch

        return PrecisionPatch().find_entities(text)

    def test_catches_unknown_capitalized_word_as_cardinal(self):
        """
        spaCy tags unknown capitalized brand names (like 'NowCree') as CARDINAL.
        Our ENTITY_LABELS must include CARDINAL to catch this.
        """
        entities = self._find(self.JOB_SITES_TEXT)
        entity_texts = [e["text"] for e in entities]
        # NowCree should be caught (as CARDINAL or ORG depending on context window)
        assert any("NowCree" in t for t in entity_texts), (
            f"Expected 'NowCree' to be flagged. Got: {entities}"
        )

    def test_catches_known_org_entities(self):
        """The job-site sentence must yield at least one name-like label.

        Any of ORG, PRODUCT, GPE or CARDINAL is accepted; the check is
        deliberately not pinned to ORG alone.
        """
        entities = self._find(self.JOB_SITES_TEXT)
        labels = {e["label"] for e in entities}
        assert labels & {"ORG", "PRODUCT", "GPE", "CARDINAL"}, (
            f"Expected at least one name-like entity. Got: {entities}"
        )

    def test_catches_location_entities(self):
        """The location sentence must yield a GPE or a LOC entity."""
        entities = self._find(self.LOCATIONS_TEXT)
        labels = {e["label"] for e in entities}
        assert "GPE" in labels or "LOC" in labels, (
            f"Expected GPE/LOC entity for 'San Francisco'. Got: {entities}"
        )

    def test_url_regex_fallback_catches_garbled_url(self):
        """
        spaCy NER does NOT tag URLs like 'notebookklem.google.com'.
        The URL regex fallback must catch this.
        """
        entities = self._find("Let us go to notebookklem.google.com for interview prep.")
        url_entities = [e for e in entities if e["label"] == "URL"]
        assert url_entities, (
            f"Expected URL entity for 'notebookklem.google.com'. Got: {entities}"
        )
        assert "notebookklem.google.com" in url_entities[0]["text"]

    def test_returns_empty_for_plain_sentence(self):
        """A sentence with no proper nouns or URLs should return an empty list."""
        entities = self._find("The quick brown fox jumps over the lazy dog.")
        assert entities == [], f"Expected no entities, got: {entities}"

    def test_entity_dict_has_required_fields(self):
        """Each returned entity dict must have text, start, end, label keys."""
        entities = self._find("I applied to Anthropic last week.")
        assert len(entities) > 0, "Expected at least one entity for 'Anthropic'"
        for ent in entities:
            for key in ("text", "start", "end", "label"):
                assert key in ent, f"Missing '{key}' key in {ent}"

    def test_character_offsets_are_correct(self):
        """start/end offsets must correctly point to the entity text within the original string."""
        text = self.LOCATIONS_TEXT
        entities = self._find(text)
        # Without this guard an empty result would skip the loop and the
        # test would pass without checking a single offset.
        assert entities, f"Expected at least one entity to check offsets. Got: {entities}"
        for ent in entities:
            extracted = text[ent["start"]:ent["end"]]
            assert extracted == ent["text"], (
                f"Offset mismatch: expected '{ent['text']}', got '{extracted}'"
            )
class TestConfidenceMapping:
    """Checks that PrecisionPatch maps Whisper word probabilities onto entities."""

    @staticmethod
    def _words(*pairs):
        """Build mock Whisper words from ``(word, probability)`` pairs.

        Each word is a ``SimpleNamespace`` exposing ``word`` and
        ``probability`` attributes.
        """
        from types import SimpleNamespace

        return [SimpleNamespace(word=w, probability=p) for w, p in pairs]

    def test_maps_confidence_to_single_word_entity(self):
        """A one-word entity must take the probability of its matching word."""
        from app.services.precision_patch import PrecisionPatch

        patcher = PrecisionPatch()
        text = "Hello NowCree."
        entities = [{"text": "NowCree", "start": 6, "end": 13, "label": "CARDINAL"}]
        # Mock Whisper words
        # Note: Whisper often includes spaces in the word text
        words = self._words(("Hello", 0.99), (" NowCree.", 0.45))
        results = patcher.map_entities_to_confidence(entities, words, text)
        # approx keeps this consistent with the multi-word test and avoids
        # depending on exact float equality.
        assert results[0]["confidence"] == pytest.approx(0.45)

    def test_maps_confidence_to_multi_word_entity(self):
        """A multi-word entity must take the mean probability of its words."""
        from app.services.precision_patch import PrecisionPatch

        patcher = PrecisionPatch()
        text = "Welcome to San Francisco."
        entities = [{"text": "San Francisco", "start": 11, "end": 24, "label": "GPE"}]
        words = self._words(
            ("Welcome", 0.99),
            (" to", 0.99),
            (" San", 0.80),
            (" Francisco.", 0.90),
        )
        results = patcher.map_entities_to_confidence(entities, words, text)
        # Average of 0.8 and 0.9 = 0.85
        assert results[0]["confidence"] == pytest.approx(0.85)

    def test_identifies_suspicious_segments(self):
        """Only the segment whose entity word has probability 0.40 must be flagged."""
        from types import SimpleNamespace

        from app.services.precision_patch import PrecisionPatch

        patcher = PrecisionPatch()
        segments = [
            SimpleNamespace(
                text="I applied to Indeed.",
                words=self._words(
                    ("I", 0.99),
                    (" applied", 0.99),
                    (" to", 0.99),
                    (" Indeed.", 0.95),
                ),
            ),
            SimpleNamespace(
                text="Then I checked NowCree.",
                words=self._words(
                    ("Then", 0.99),
                    (" I", 0.99),
                    (" checked", 0.99),
                    (" NowCree.", 0.40),
                ),
            ),
        ]
        suspicious = patcher.get_suspicious_indices(segments)
        # Only the second segment has a low-confidence entity
        assert suspicious == [1]
class TestLLMCorrection:
    """Checks that PrecisionPatch fixes segments through the GeminiAdapter."""

    def test_apply_patch_calls_gemini_with_context(self, monkeypatch):
        """apply_patch must rewrite the flagged segment and leave its neighbours unchanged."""
        from types import SimpleNamespace

        from app.services.precision_patch import PrecisionPatch

        class MockGemini:
            """Stand-in adapter that swaps 'NowCree' for 'Naukri' in every line."""

            def correct_batch(self, lines, system_instruction=None):
                corrected = []
                for line in lines:
                    corrected.append(line.replace("NowCree", "Naukri"))
                return corrected

        def make_adapter():
            # Zero-argument factory installed in place of the real adapter.
            return MockGemini()

        monkeypatch.setattr(
            "app.services.translators.gemini_adapter.GeminiAdapter",
            make_adapter,
        )

        patcher = PrecisionPatch()
        original_texts = (
            "I applied to Indeed.",
            "Then I checked NowCree.",
            "It was a great day.",
        )
        segments = [SimpleNamespace(text=t, words=[]) for t in original_texts]

        # Index 1 is supplied by hand, standing in for the detection steps.
        patcher.apply_patch(segments, [1])

        # The neighbours contain no 'NowCree', so the mock returns them as-is
        # even if they are sent along as context.
        expected_texts = (
            "I applied to Indeed.",
            "Then I checked Naukri.",
            "It was a great day.",
        )
        for segment, expected in zip(segments, expected_texts):
            assert segment.text == expected
def test_apply_precision_patch_integration(monkeypatch):
    """Run apply_precision_patch on one segment with a low-probability entity word.

    With the Gemini adapter replaced by a mock that rewrites 'NowCree' to
    'Naukri', the segment text must contain 'Naukri' afterwards.
    """
    from types import SimpleNamespace

    from app.services.precision_patch import apply_precision_patch

    class MockGemini:
        """Stand-in adapter that swaps 'NowCree' for 'Naukri' in every line."""

        def correct_batch(self, lines, system_instruction=None):
            corrected = []
            for line in lines:
                corrected.append(line.replace("NowCree", "Naukri"))
            return corrected

    def make_adapter():
        # Zero-argument factory installed in place of the real adapter.
        return MockGemini()

    monkeypatch.setattr(
        "app.services.translators.gemini_adapter.GeminiAdapter",
        make_adapter,
    )

    # (word text, probability) pairs; only the final word is low confidence.
    word_probabilities = [
        ("Check", 0.99),
        (" out", 0.99),
        (" LinkedIn", 0.99),
        (" like", 0.99),
        (" Indeed", 0.99),
        (" or", 0.99),
        (" NowCree.", 0.10),
    ]
    segment = SimpleNamespace(
        text="Check out LinkedIn like Indeed or NowCree.",
        words=[
            SimpleNamespace(word=word, probability=prob)
            for word, prob in word_probabilities
        ],
    )
    segments = [segment]

    apply_precision_patch(segments)

    assert "Naukri" in segments[0].text