# oneocr/_archive/dedup.py
# OneOCR Dev
# OneOCR - reverse engineering complete, ONNX pipeline 53% match rate
# ce847d4
"""Smart OCR deduplication — stabilization-first approach.
Core principle: **don't read text until it STOPS CHANGING**.
Then check against read history to avoid repeats.
Architecture:
Phase 1 — **Snapshot Stabilization**
Each tick compares the full OCR output (all regions merged) with the
previous tick. If text is growing (typewriter effect), we wait.
Only when the snapshot is identical for ``stabilize_ticks`` consecutive
ticks do we consider it "stable" and proceed.
Phase 2 — **Line History Dedup**
Once stable, each line is fuzzy-compared against a history of previously
emitted lines. Only genuinely new lines pass through. History entries
expire via TTL so the same text can be re-read after a cooldown.
Phase 3 — **Significance Check**
Rejects composed output that is too short, has too few real words,
or is mostly non-alphanumeric (OCR garbage / UI artifacts).
This naturally handles:
- **Typewriter effects**: text grows → wait → stabilize → read complete sentence
- **Static UI** (HP bars, names): stabilizes → read once → in history → skip
- **OCR noise**: fuzzy matching tolerates minor variations
- **Dialog changes**: snapshot changes → re-stabilize → emit new parts only
- **Repeated dialog**: TTL expiry allows re-reading after cooldown
Usage::
from src.services.ocr.dedup import SmartDedup
dedup = SmartDedup()
text = dedup.process(region_labels, ocr_results)
if text is not None:
translate_and_speak(text)
"""
from __future__ import annotations
import time
from collections import deque
from dataclasses import dataclass
from difflib import SequenceMatcher
from src.services.ocr.models import OcrResult
from src.utils.logger import logger
# ── Constants (sensible defaults) ────────────────────────────────
DEFAULT_STABILIZE_TICKS: int = 3  # consecutive identical ticks before a snapshot counts as stable
DEFAULT_SNAPSHOT_SIMILARITY: float = 0.92  # fuzzy ratio at/above which two snapshots are "the same"
DEFAULT_LINE_SIMILARITY: float = 0.80  # fuzzy ratio for line-vs-history matching
DEFAULT_LINE_TTL: float = 120.0  # seconds before an emitted line may be re-read
DEFAULT_HISTORY_TTL: float = 90.0  # seconds before a global history entry expires
DEFAULT_HISTORY_SIZE: int = 30  # ring-buffer capacity for emitted text blocks
DEFAULT_MIN_NEW_CHARS: int = 8  # minimum characters for new text to be significant
DEFAULT_MIN_NEW_WORDS: int = 2  # minimum words for new text to be significant
DEFAULT_MIN_ALNUM_RATIO: float = 0.35  # minimum alphanumeric share — below this it is OCR garbage
# ── Data classes ─────────────────────────────────────────────────
@dataclass
class HistoryEntry:
    """An entry in the global text history ring buffer."""

    norm_text: str  # normalized form (lowercased, whitespace-collapsed) used for matching
    original_text: str  # text exactly as it was emitted
    first_seen: float  # monotonic timestamp when the entry was first recorded
    last_seen: float  # monotonic timestamp of the most recent match (refreshed on hit)
    hit_count: int = 1  # how many times incoming text matched this entry
@dataclass
class DedupConfig:
    """All tunable knobs for the dedup system.

    Attributes:
        stabilize_ticks: Consecutive identical ticks before text is considered "stable".
        snapshot_similarity: Fuzzy threshold for treating two snapshots as identical (0-1).
        line_similarity: Fuzzy threshold for line-level history matching (0-1).
        line_ttl: Seconds before a known line in history expires.
        history_ttl: Seconds before a global history entry expires.
        history_size: Max entries in the global history ring buffer.
        history_similarity: Alias for line_similarity (backward compat with bridge.py).
        min_new_chars: Minimum characters for a change to be significant.
        min_new_words: Minimum word count for significance.
        min_alnum_ratio: Minimum alphanumeric ratio for significance.
        debounce_time: Legacy field — not used internally, kept for bridge compat.
        instant_mode: If True, stabilization waiting is skipped and text is
            emitted on the first identical tick.
    """

    stabilize_ticks: int = DEFAULT_STABILIZE_TICKS
    snapshot_similarity: float = DEFAULT_SNAPSHOT_SIMILARITY
    line_similarity: float = DEFAULT_LINE_SIMILARITY
    line_ttl: float = DEFAULT_LINE_TTL
    history_ttl: float = DEFAULT_HISTORY_TTL
    history_size: int = DEFAULT_HISTORY_SIZE
    history_similarity: float = DEFAULT_LINE_SIMILARITY
    min_new_chars: int = DEFAULT_MIN_NEW_CHARS
    min_new_words: int = DEFAULT_MIN_NEW_WORDS
    min_alnum_ratio: float = DEFAULT_MIN_ALNUM_RATIO
    debounce_time: float = 0.0  # legacy — mapped to stabilize_ticks externally
    instant_mode: bool = False  # skip stabilization — emit text on first identical tick
# ── Helpers ──────────────────────────────────────────────────────
def _normalize(text: str) -> str:
"""Collapse whitespace, strip, lowercase — for comparison only."""
return " ".join(text.split()).strip().lower()
# ── Line History ─────────────────────────────────────────────────
class LineHistory:
    """Recently emitted lines with TTL-based expiry.

    Each emitted line is stored in normalized form together with its
    emission timestamp. Entries older than ``ttl`` seconds are garbage
    collected, so the same line can be read again after the cooldown.
    Short lines additionally get fuzzy matching to absorb OCR noise.
    """

    def __init__(
        self,
        ttl: float = DEFAULT_LINE_TTL,
        similarity: float = DEFAULT_LINE_SIMILARITY,
    ) -> None:
        self._ttl = ttl
        self._similarity = similarity
        # norm_line → monotonic timestamp of the last emission
        self._entries: dict[str, float] = {}

    def is_known(self, line: str) -> bool:
        """Return True when *line* was emitted recently (within TTL).

        Exact match is tried first; short lines fall through to a fuzzy
        pass because OCR noise matters most there.

        Args:
            line: Raw (non-normalized) line text.

        Returns:
            True if the line is in recent history (should be skipped).
        """
        norm = _normalize(line)
        # Anything under 2 chars is treated as garbage → "known" → skipped.
        if len(norm) < 2:
            return True
        self._gc(time.monotonic())
        # Fast path: exact lookup.
        if norm in self._entries:
            return True
        if len(norm) >= 60:
            return False
        # Slow path: fuzzy comparison against every stored short line.
        for known in self._entries:
            # Prune candidates whose length differs too much to ever match.
            if abs(len(norm) - len(known)) > max(5, len(known) * 0.25):
                continue
            if SequenceMatcher(None, norm, known).ratio() >= self._similarity:
                return True
        return False

    def mark_emitted(self, line: str) -> None:
        """Record *line* as just emitted."""
        norm = _normalize(line)
        if not norm:
            return
        self._entries[norm] = time.monotonic()

    def reset(self) -> None:
        """Forget every recorded line."""
        self._entries.clear()

    @property
    def size(self) -> int:
        return len(self._entries)

    def _gc(self, now: float) -> None:
        """Drop entries whose last emission is older than the TTL."""
        stale = [key for key, stamp in self._entries.items() if now - stamp > self._ttl]
        for key in stale:
            del self._entries[key]
# ── Global Text History (ring buffer for full text blocks) ───────
class GlobalTextHistory:
    """TTL-bounded ring buffer of recently emitted text blocks.

    Stops an identical (or fuzzily near-identical) composed block from
    being re-emitted inside the TTL window. Fuzzy matching absorbs
    OCR noise between near-duplicate captures.
    """

    def __init__(
        self,
        max_size: int = DEFAULT_HISTORY_SIZE,
        ttl: float = DEFAULT_HISTORY_TTL,
        similarity: float = DEFAULT_LINE_SIMILARITY,
    ) -> None:
        self._ttl = ttl
        self._similarity = similarity
        # Oldest entries fall off automatically once maxlen is reached.
        self._entries: deque[HistoryEntry] = deque(maxlen=max_size)

    def is_duplicate(self, text: str) -> tuple[bool, float]:
        """Check whether *text* duplicates something in recent history.

        On a hit, the matching entry's ``last_seen``/``hit_count`` are
        refreshed as a side effect.

        Args:
            text: Composed text block.

        Returns:
            ``(is_dup, best_similarity)``
        """
        now = time.monotonic()
        norm = _normalize(text)
        if not norm:
            # Nothing to emit → trivially a duplicate.
            return (True, 1.0)
        best = 0.0
        for entry in self._entries:
            # Expired entries are skipped (deque eviction removes them later).
            if now - entry.last_seen > self._ttl:
                continue
            if entry.norm_text == norm:
                entry.last_seen = now
                entry.hit_count += 1
                return (True, 1.0)
            sim = SequenceMatcher(None, norm, entry.norm_text).ratio()
            best = max(best, sim)
            if sim >= self._similarity:
                entry.last_seen = now
                entry.hit_count += 1
                return (True, sim)
        return (False, best)

    def add(self, text: str) -> None:
        """Record *text* as a fresh history entry."""
        norm = _normalize(text)
        stamp = time.monotonic()
        entry = HistoryEntry(
            norm_text=norm,
            original_text=text,
            first_seen=stamp,
            last_seen=stamp,
        )
        self._entries.append(entry)

    def reset(self) -> None:
        self._entries.clear()

    @property
    def size(self) -> int:
        return len(self._entries)
# ── Significance Check ───────────────────────────────────────────
class ChangeDetector:
    """Judge whether freshly extracted lines amount to a meaningful change.

    Filters out very short text, single-word fragments, and text that is
    mostly punctuation/symbols (typical OCR garbage or UI artifacts).
    """

    def __init__(
        self,
        min_chars: int = DEFAULT_MIN_NEW_CHARS,
        min_words: int = DEFAULT_MIN_NEW_WORDS,
        min_alnum_ratio: float = DEFAULT_MIN_ALNUM_RATIO,
    ) -> None:
        self._min_chars = min_chars
        self._min_words = min_words
        self._min_alnum_ratio = min_alnum_ratio

    def is_significant(self, new_lines: list[str]) -> bool:
        """Return True if the new lines represent real content, not OCR garbage."""
        text = " ".join(part.strip() for part in new_lines).strip()
        if len(text) < self._min_chars:
            return False
        if len(text.split()) < self._min_words:
            return False
        alnum_count = sum(char.isalnum() for char in text)
        # ``text`` is non-empty whenever min_chars > 0; the guard keeps a
        # zero-min_chars configuration from dividing by zero.
        ratio = alnum_count / len(text) if text else 0
        return ratio >= self._min_alnum_ratio
# ── Main Facade: SmartDedup ──────────────────────────────────────
class SmartDedup:
    """Stabilization-first OCR deduplication.

    Core algorithm:
    1. Each tick: merge all OCR results into a single text snapshot
    2. Compare snapshot with previous tick — growing? same? different?
    3. When snapshot is identical for ``stabilize_ticks`` consecutive ticks → STABLE
    4. Extract lines, filter against read history → emit only NEW lines
    5. Significance check → reject OCR garbage
    6. Add emitted lines to history, record in global ring buffer

    This replaces the old per-line-tracker approach which caused:
    - Sentence fragments (read partial text too early)
    - Infinite silence (partial lines marked "known" too aggressively)

    Example::

        dedup = SmartDedup()
        # On each pipeline tick:
        text = dedup.process(region_labels, ocr_results)
        if text is not None:
            await translate_and_speak(text)
        # On pipeline stop or config change:
        dedup.reset()
    """

    def __init__(self, config: DedupConfig | None = None) -> None:
        self._cfg = config or DedupConfig()
        # Stabilization state
        self._last_snapshot: str | None = None       # normalized snapshot from the previous tick
        self._last_raw: str | None = None            # raw text matching _last_snapshot
        self._stable_count: int = 0                  # consecutive ticks the snapshot stayed the same
        self._processed_snapshot: str | None = None  # last snapshot already evaluated for emission
        # Why: track last emitted text to detect post-emit growth
        # (e.g. we emitted 2 lines, then lines 3-4 appear → continuation, not new text)
        self._last_emitted_norm: str | None = None
        # History layers
        self._line_history = LineHistory(
            ttl=self._cfg.line_ttl,
            similarity=self._cfg.line_similarity,
        )
        self._global_history = GlobalTextHistory(
            max_size=self._cfg.history_size,
            ttl=self._cfg.history_ttl,
            similarity=self._cfg.history_similarity,
        )
        self._change_detector = ChangeDetector(
            min_chars=self._cfg.min_new_chars,
            min_words=self._cfg.min_new_words,
            min_alnum_ratio=self._cfg.min_alnum_ratio,
        )

    # ── Public API ───────────────────────────────────────────────
    def process(
        self,
        region_labels: list[str],
        ocr_results: list[OcrResult],
        *,
        force: bool = False,
    ) -> str | None:
        """Run stabilization-based dedup on multi-region OCR results.

        Args:
            region_labels: Label/ID for each region (for diagnostics;
                currently unused by the snapshot-based algorithm).
            ocr_results: OCR result per region (same order as labels).
            force: If True, skip all dedup and return all text immediately.

        Returns:
            Text to translate + speak, or None if suppressed by dedup.
        """
        # ── Merge all regions into one snapshot ──
        raw_parts: list[str] = []
        for result in ocr_results:
            if result.error or result.is_empty:
                continue
            text = result.text.strip()
            if text:
                raw_parts.append(text)
        if not raw_parts:
            return None
        full_raw = "\n".join(raw_parts)
        full_norm = _normalize(full_raw)
        if not full_norm or len(full_norm) < 2:
            return None
        # ── Force read: bypass all dedup ──
        if force:
            # Record everything so the forced text is not repeated later.
            self._global_history.add(full_raw)
            self._mark_all_lines_known(full_raw)
            self._last_snapshot = full_norm
            self._last_raw = full_raw
            self._processed_snapshot = full_norm
            self._stable_count = 0
            logger.info("Dedup: force read — emitting %d chars", len(full_raw))
            return full_raw
        # ── Phase 1: Stabilization check ──
        if self._last_snapshot is None:
            # First tick — record snapshot, wait for next
            self._last_snapshot = full_norm
            self._last_raw = full_raw
            self._stable_count = 0
            self._processed_snapshot = None
            # Why: in instant mode, skip waiting — proceed on the very first tick
            # (the snapshot then compares against itself below, counting as stable).
            if not self._cfg.instant_mode:
                return None
        # Compare current snapshot with previous
        snapshot_sim = self._snapshot_similarity(self._last_snapshot, full_norm)
        if snapshot_sim >= self._cfg.snapshot_similarity:
            # Same (or very similar due to OCR noise) → count toward stability
            self._stable_count += 1
        elif self._is_text_growing(self._last_snapshot, full_norm):
            # Text is expanding (typewriter effect) → reset, keep waiting
            self._stable_count = 0
            self._last_snapshot = full_norm
            self._last_raw = full_raw
            self._processed_snapshot = None
            logger.debug("Dedup: text growing, waiting for stabilization")
            return None
        elif (
            self._last_emitted_norm is not None
            and self._is_text_growing(self._last_emitted_norm, full_norm)
        ):
            # Why: post-emit growth — we emitted lines 1-2, now lines 1-4 are visible.
            # The new snapshot is a SUPERSET of what we emitted → continuation.
            # Reset stability and wait for the full text to settle.
            self._stable_count = 0
            self._last_snapshot = full_norm
            self._last_raw = full_raw
            self._processed_snapshot = None
            logger.debug("Dedup: post-emit growth detected, waiting for continuation")
            return None
        else:
            # Completely different content → new text, start fresh
            self._stable_count = 0
            self._last_snapshot = full_norm
            self._last_raw = full_raw
            self._processed_snapshot = None
            logger.debug("Dedup: snapshot changed, waiting for stabilization")
            return None
        # Update raw text (keep latest version even during stability counting)
        self._last_snapshot = full_norm
        self._last_raw = full_raw
        # Not stable yet?
        required_ticks = 1 if self._cfg.instant_mode else self._cfg.stabilize_ticks
        if self._stable_count < required_ticks:
            return None
        # ── Already processed this exact snapshot? ──
        if self._processed_snapshot is not None:
            sim = self._snapshot_similarity(full_norm, self._processed_snapshot)
            if sim >= self._cfg.snapshot_similarity:
                return None  # already evaluated, nothing new
        # ── Phase 2: Text is STABLE — extract new lines ──
        all_lines = self._extract_lines(full_raw, ocr_results)
        new_lines: list[str] = []
        for line in all_lines:
            if not self._line_history.is_known(line):
                new_lines.append(line)
        # Also check against global text history (full text block dedup)
        if new_lines:
            composed = "\n".join(new_lines)
            is_dup, sim = self._global_history.is_duplicate(composed)
            if is_dup:
                logger.debug("Dedup: global history match (sim=%.3f)", sim)
                new_lines = []
        if not new_lines:
            # All lines already known — mark snapshot as processed
            self._processed_snapshot = full_norm
            return None
        # ── Phase 3: Significance check ──
        if not self._change_detector.is_significant(new_lines):
            logger.debug(
                "Dedup: new lines not significant (%d lines, %d chars)",
                len(new_lines),
                sum(len(line) for line in new_lines),
            )
            self._processed_snapshot = full_norm
            return None
        # ── EMIT! ──
        composed = "\n".join(new_lines)
        self._mark_all_lines_known(composed)
        self._global_history.add(composed)
        self._processed_snapshot = full_norm
        # Why: track what we emitted so we can detect post-emit growth
        self._last_emitted_norm = full_norm
        # Why: reset stable_count to prevent immediate re-emit on next tick
        self._stable_count = 0
        logger.info(
            "Dedup: emitting %d new lines (%d chars, %d known lines in history)",
            len(new_lines),
            len(composed),
            self._line_history.size,
        )
        return composed

    def force_flush(self) -> str | None:
        """Force-emit whatever raw text is pending (for force-read button).

        NOTE(review): unlike the ``force=True`` path in :meth:`process`,
        this does not update ``_processed_snapshot`` or ``_stable_count`` —
        confirm whether that asymmetry is intentional.
        """
        if self._last_raw:
            raw = self._last_raw
            self._global_history.add(raw)
            self._mark_all_lines_known(raw)
            return raw
        return None

    def update_config(self, config: DedupConfig) -> None:
        """Apply new configuration. Rebuilds internal components.

        Why: histories are rebuilt from scratch, so previously known
        lines/blocks are forgotten when the config changes.
        """
        self._cfg = config
        self._line_history = LineHistory(
            ttl=config.line_ttl,
            similarity=config.line_similarity,
        )
        self._global_history = GlobalTextHistory(
            max_size=config.history_size,
            ttl=config.history_ttl,
            similarity=config.history_similarity,
        )
        self._change_detector = ChangeDetector(
            min_chars=config.min_new_chars,
            min_words=config.min_new_words,
            min_alnum_ratio=config.min_alnum_ratio,
        )
        logger.info("SmartDedup: config updated")

    def reset(self) -> None:
        """Clear all state (e.g. on scene change or pipeline restart)."""
        self._last_snapshot = None
        self._last_raw = None
        self._stable_count = 0
        self._processed_snapshot = None
        self._last_emitted_norm = None
        self._line_history.reset()
        self._global_history.reset()
        logger.info("SmartDedup: all state reset")

    def reset_region(self, label: str) -> None:
        """No-op in snapshot-based approach — kept for backward compat."""
        pass

    @property
    def stats(self) -> dict[str, int]:
        """Return diagnostic stats.

        ``tracked_regions`` is always 0 — kept for compatibility with the
        old per-region tracker's stats shape.
        """
        return {
            "tracked_regions": 0,
            "total_known_lines": self._line_history.size,
            "history_size": self._global_history.size,
            "stable_count": self._stable_count,
        }

    # ── Internal ─────────────────────────────────────────────────
    @staticmethod
    def _snapshot_similarity(a: str, b: str) -> float:
        """Fast similarity between two normalized snapshots.

        Short-circuits on equality/emptiness before paying for
        ``SequenceMatcher``.
        """
        if a == b:
            return 1.0
        if not a or not b:
            return 0.0
        return SequenceMatcher(None, a, b).ratio()

    @staticmethod
    def _is_text_growing(old_norm: str, new_norm: str) -> bool:
        """Check if new text is an expansion of old text (typewriter effect).

        Returns True if new_norm is longer AND contains most of old_norm's
        words at the beginning (prefix-like growth).
        """
        if len(new_norm) <= len(old_norm):
            return False
        # Simple prefix check — covers most typewriter cases
        if new_norm.startswith(old_norm):
            return True
        # Word-level check: old words appear at the start of new word sequence
        old_words = old_norm.split()
        new_words = new_norm.split()
        if len(new_words) <= len(old_words):
            return False
        # Count matching words at the beginning
        matching = 0
        for old_w, new_w in zip(old_words, new_words):
            if old_w == new_w:
                matching += 1
            elif SequenceMatcher(None, old_w, new_w).ratio() > 0.8:
                # Why: OCR noise may corrupt already-visible words slightly
                matching += 1
        # Why: 60% threshold — allows some OCR noise in the matching portion
        return matching >= len(old_words) * 0.6

    def _extract_lines(
        self, raw_text: str, ocr_results: list[OcrResult]
    ) -> list[str]:
        """Extract individual lines from OCR results.

        Prefers structured ``OcrResult.lines`` when available.
        Deduplicates across regions (overlapping capture areas).

        Args:
            raw_text: Fallback raw text (used if no structured lines).
            ocr_results: OCR results with structured lines.

        Returns:
            List of unique raw line texts.
        """
        lines: list[str] = []
        seen_norms: set[str] = set()
        for result in ocr_results:
            if result.error or result.is_empty:
                continue
            for ocr_line in result.lines:
                raw = ocr_line.text.strip()
                if not raw:
                    continue
                norm = _normalize(raw)
                if len(norm) < 2:
                    continue
                # Why: skip duplicate lines across regions (overlapping capture areas)
                if norm in seen_norms:
                    continue
                # Fuzzy cross-region dedup for short lines
                # Why: high threshold (0.95) because overlapping regions produce
                # near-identical text, not merely similar text
                is_cross_dup = False
                if len(norm) < 60:
                    for seen in seen_norms:
                        if abs(len(norm) - len(seen)) > 3:
                            continue
                        if SequenceMatcher(None, norm, seen).ratio() >= 0.95:
                            is_cross_dup = True
                            break
                if is_cross_dup:
                    continue
                seen_norms.add(norm)
                lines.append(raw)
        # Fallback: if no structured lines, split raw text
        if not lines:
            for line in raw_text.split("\n"):
                stripped = line.strip()
                if stripped and len(_normalize(stripped)) >= 2:
                    norm = _normalize(stripped)
                    if norm not in seen_norms:
                        seen_norms.add(norm)
                        lines.append(stripped)
        return lines

    def _mark_all_lines_known(self, text: str) -> None:
        """Add all lines in text to line history (so they won't re-emit)."""
        for line in text.split("\n"):
            stripped = line.strip()
            if stripped and len(_normalize(stripped)) >= 2:
                self._line_history.mark_emitted(stripped)