Spaces:

arjun-ms
/

Subtrans

Sleeping

App Files Files Community

Subtrans / app /tests /test_precision_patch.py

arjun-ms

Initial commit: Subtrans Subtitle Pipeline

57bbccb 7 days ago

raw

history blame contribute delete

10.6 kB

	"""
	TDD Tests for PrecisionPatch - NER + Confidence Correction.

	Tests are based on OBSERVED spaCy behavior (verified via smoke test):
	- "NowCree" is tagged CARDINAL (unknown capitalized token)
	- "LinkedIn like Indeed" is grouped as ORG
	- "notebookklem.google.com" is NOT tagged by NER - caught by URL regex fallback
	- "Anthropic" is tagged GPE
	- "San Francisco" is tagged GPE, "Bay Area" is tagged LOC

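	Feature 1: find_entities - detect name-like tokens worth verifying
	- Must catch ORG, PRODUCT, PERSON, GPE, LOC, CARDINAL entities
	- Must catch URL-like tokens via regex fallback
	- Must return proper dict structure with text/start/end/label keys
	- Must return empty list for plain sentences with no proper nouns

	Feature 2: map_entities_to_confidence / get_suspicious_indices
	- Must attach a "confidence" value to each entity from Whisper word probabilities
	- Must average the probabilities of the words covered by a multi-word entity
	- Must report the indices of segments containing a low-confidence entity

	Feature 3: apply_patch / apply_precision_patch
	- Must rewrite suspicious segments using the (mocked) GeminiAdapter.correct_batch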
	"""
	import pytest


	class TestFindEntities:
	    """PrecisionPatch.find_entities should correctly identify proper nouns and URLs."""

	    @staticmethod
	    def _find_entities(text):
	        """Return ``PrecisionPatch().find_entities(text)``.

	        The import stays inside the helper, as in the individual tests it
	        replaces, so an import failure surfaces when a test runs rather
	        than when the module is collected.
	        """
	        from app.services.precision_patch import PrecisionPatch

	        return PrecisionPatch().find_entities(text)

	    def test_catches_unknown_capitalized_word_as_cardinal(self):
	        """
	        spaCy tags unknown capitalized brand names (like 'NowCree') as CARDINAL.
	        Our ENTITY_LABELS must include CARDINAL to catch this.
	        """
	        text = "We can do the same thing on sites other than LinkedIn like Indeed or NowCree."
	        entities = self._find_entities(text)
	        entity_texts = [e["text"] for e in entities]
	        # NowCree should be caught (as CARDINAL or ORG depending on context window)
	        assert any("NowCree" in t for t in entity_texts), (
	            f"Expected 'NowCree' to be flagged. Got: {entities}"
	        )

	    def test_catches_known_org_entities(self):
	        """'LinkedIn' or 'Indeed' must be flagged with a name-like label."""
	        text = "We can do the same thing on sites other than LinkedIn like Indeed or NowCree."
	        entities = self._find_entities(text)
	        name_like_labels = {"ORG", "PRODUCT", "GPE", "CARDINAL"}
	        # Require the match to mention LinkedIn/Indeed: checking labels alone
	        # would be satisfied by the 'NowCree' entity from the same sentence.
	        known_org_hits = [
	            e
	            for e in entities
	            if e["label"] in name_like_labels
	            and ("LinkedIn" in e["text"] or "Indeed" in e["text"])
	        ]
	        assert known_org_hits, (
	            f"Expected at least one name-like entity. Got: {entities}"
	        )

	    def test_catches_location_entities(self):
	        """'San Francisco' must be tagged as GPE (or 'Bay Area' as LOC)."""
	        text = "Find me jobs in San Francisco or the Bay Area."
	        entities = self._find_entities(text)
	        labels = {e["label"] for e in entities}
	        assert "GPE" in labels or "LOC" in labels, (
	            f"Expected GPE/LOC entity for 'San Francisco'. Got: {entities}"
	        )

	    def test_url_regex_fallback_catches_garbled_url(self):
	        """
	        spaCy NER does NOT tag URLs like 'notebookklem.google.com'.
	        The URL regex fallback must catch this.
	        """
	        text = "Let us go to notebookklem.google.com for interview prep."
	        entities = self._find_entities(text)
	        url_entities = [e for e in entities if e["label"] == "URL"]
	        assert len(url_entities) > 0, (
	            f"Expected URL entity for 'notebookklem.google.com'. Got: {entities}"
	        )
	        assert "notebookklem.google.com" in url_entities[0]["text"]

	    def test_returns_empty_for_plain_sentence(self):
	        """A sentence with no proper nouns or URLs should return an empty list."""
	        text = "The quick brown fox jumps over the lazy dog."
	        entities = self._find_entities(text)
	        assert entities == [], f"Expected no entities, got: {entities}"

	    def test_entity_dict_has_required_fields(self):
	        """Each returned entity dict must have text, start, end, label keys."""
	        text = "I applied to Anthropic last week."
	        entities = self._find_entities(text)
	        assert len(entities) > 0, "Expected at least one entity for 'Anthropic'"
	        for ent in entities:
	            for key in ("text", "start", "end", "label"):
	                assert key in ent, f"Missing '{key}' key in {ent}"

	    def test_character_offsets_are_correct(self):
	        """start/end offsets must correctly point to the entity text within the original string."""
	        text = "Find me jobs in San Francisco or the Bay Area."
	        entities = self._find_entities(text)
	        # Without this guard the loop below would pass vacuously on an empty result.
	        assert entities, f"Expected at least one entity to check offsets on. Got: {entities}"
	        for ent in entities:
	            extracted = text[ent["start"]:ent["end"]]
	            assert extracted == ent["text"], (
	                f"Offset mismatch: expected '{ent['text']}', got '{extracted}'"
	            )


	class TestConfidenceMapping:
	    """PrecisionPatch should correctly map Whisper word probabilities to entities."""

	    def test_maps_confidence_to_single_word_entity(self):
	        """An entity covering one word takes that word's probability as its confidence."""
	        from app.services.precision_patch import PrecisionPatch
	        from types import SimpleNamespace

	        patcher = PrecisionPatch()
	        text = "Hello NowCree."
	        entities = [{"text": "NowCree", "start": 6, "end": 13, "label": "CARDINAL"}]

	        # Mock Whisper words
	        # Note: Whisper often includes spaces in the word text
	        words = [
	            SimpleNamespace(word="Hello", probability=0.99),
	            SimpleNamespace(word=" NowCree.", probability=0.45),
	        ]

	        results = patcher.map_entities_to_confidence(entities, words, text)
	        # Compare with a tolerance, like the multi-word test, so the check does
	        # not depend on how the implementation computes the value.
	        assert results[0]["confidence"] == pytest.approx(0.45)

	    def test_maps_confidence_to_multi_word_entity(self):
	        """An entity spanning several words gets the mean of their probabilities."""
	        from app.services.precision_patch import PrecisionPatch
	        from types import SimpleNamespace

	        patcher = PrecisionPatch()
	        text = "Welcome to San Francisco."
	        entities = [{"text": "San Francisco", "start": 11, "end": 24, "label": "GPE"}]

	        words = [
	            SimpleNamespace(word="Welcome", probability=0.99),
	            SimpleNamespace(word=" to", probability=0.99),
	            SimpleNamespace(word=" San", probability=0.80),
	            SimpleNamespace(word=" Francisco.", probability=0.90),
	        ]

	        results = patcher.map_entities_to_confidence(entities, words, text)
	        # Average of 0.8 and 0.9 = 0.85
	        assert results[0]["confidence"] == pytest.approx(0.85)

	    def test_identifies_suspicious_segments(self):
	        """Only segments containing a low-confidence entity are reported, by index."""
	        from app.services.precision_patch import PrecisionPatch
	        from types import SimpleNamespace

	        patcher = PrecisionPatch()

	        segments = [
	            # Segment 0: its entity word ("Indeed") has high probability.
	            SimpleNamespace(
	                text="I applied to Indeed.",
	                words=[
	                    SimpleNamespace(word="I", probability=0.99),
	                    SimpleNamespace(word=" applied", probability=0.99),
	                    SimpleNamespace(word=" to", probability=0.99),
	                    SimpleNamespace(word=" Indeed.", probability=0.95),
	                ],
	            ),
	            # Segment 1: its entity word ("NowCree") has low probability.
	            SimpleNamespace(
	                text="Then I checked NowCree.",
	                words=[
	                    SimpleNamespace(word="Then", probability=0.99),
	                    SimpleNamespace(word=" I", probability=0.99),
	                    SimpleNamespace(word=" checked", probability=0.99),
	                    SimpleNamespace(word=" NowCree.", probability=0.40),
	                ],
	            ),
	        ]

	        suspicious = patcher.get_suspicious_indices(segments)
	        # Only the second segment has a low-confidence entity
	        assert suspicious == [1]


	class TestLLMCorrection:
	    """PrecisionPatch should integrate with GeminiAdapter to fix segments."""

	    def test_apply_patch_calls_gemini_with_context(self, monkeypatch):
	        """apply_patch must route the suspicious segment through the mocked adapter."""
	        from app.services.precision_patch import PrecisionPatch
	        from types import SimpleNamespace

	        # Every batch handed to the mock is recorded so the test can prove the
	        # mock (and not the real adapter) performed the correction.
	        received_batches = []

	        # Mock GeminiAdapter
	        class MockGemini:
	            def correct_batch(self, lines, system_instruction=None):
	                received_batches.append(list(lines))
	                # Simple mock fix
	                return [line.replace("NowCree", "Naukri") for line in lines]

	        # NOTE(review): this replaces the name inside the gemini_adapter module;
	        # it presumably relies on PrecisionPatch resolving GeminiAdapter from
	        # that module at call time - confirm against the implementation.
	        # The factory accepts any constructor arguments the caller may pass.
	        monkeypatch.setattr(
	            "app.services.translators.gemini_adapter.GeminiAdapter",
	            lambda *args, **kwargs: MockGemini(),
	        )

	        patcher = PrecisionPatch()
	        segments = [
	            SimpleNamespace(text="I applied to Indeed.", words=[]),
	            SimpleNamespace(text="Then I checked NowCree.", words=[]),
	            SimpleNamespace(text="It was a great day.", words=[]),
	        ]

	        # Manually set suspicious indices to simulate previous steps
	        suspicious_indices = [1]

	        patcher.apply_patch(segments, suspicious_indices)

	        assert received_batches, "Expected apply_patch to call GeminiAdapter.correct_batch"
	        assert any(
	            "NowCree" in line for batch in received_batches for line in batch
	        ), f"Expected the suspicious text to be sent for correction. Got: {received_batches}"

	        assert segments[1].text == "Then I checked Naukri."
	        # Context segment 0 should also be processed (and in this case, replaced with itself if no NowCree)
	        assert segments[0].text == "I applied to Indeed."
	        assert segments[2].text == "It was a great day."


	def test_apply_precision_patch_integration(monkeypatch):
	    """Verifies the convenience helper correctly orchestrates the patch."""
	    from app.services.precision_patch import apply_precision_patch
	    from types import SimpleNamespace

	    # Every batch handed to the mock is recorded so the test can prove the
	    # mock (and not the real adapter) performed the correction.
	    received_batches = []

	    # Mock GeminiAdapter
	    class MockGemini:
	        def correct_batch(self, lines, system_instruction=None):
	            received_batches.append(list(lines))
	            return [line.replace("NowCree", "Naukri") for line in lines]

	    # NOTE(review): this replaces the name inside the gemini_adapter module;
	    # it presumably relies on the patch code resolving GeminiAdapter from that
	    # module at call time - confirm against the implementation.
	    # The factory accepts any constructor arguments the caller may pass.
	    monkeypatch.setattr(
	        "app.services.translators.gemini_adapter.GeminiAdapter",
	        lambda *args, **kwargs: MockGemini(),
	    )

	    # Mock segments with a low-confidence entity
	    segments = [
	        SimpleNamespace(
	            text="Check out LinkedIn like Indeed or NowCree.",
	            words=[
	                SimpleNamespace(word="Check", probability=0.99),
	                SimpleNamespace(word=" out", probability=0.99),
	                SimpleNamespace(word=" LinkedIn", probability=0.99),
	                SimpleNamespace(word=" like", probability=0.99),
	                SimpleNamespace(word=" Indeed", probability=0.99),
	                SimpleNamespace(word=" or", probability=0.99),
	                SimpleNamespace(word=" NowCree.", probability=0.10),  # LOW CONFIDENCE
	            ],
	        )
	    ]

	    apply_precision_patch(segments)

	    assert received_batches, "Expected apply_precision_patch to call GeminiAdapter.correct_batch"
	    assert "Naukri" in segments[0].text