import re
from difflib import SequenceMatcher
from typing import List, Optional, Tuple


class NearDuplicateDetector:
    """Score text pairs for similarity and flag near-duplicates.

    On construction the detector tries to load a ``sentence_transformers``
    model. If that succeeds, similarity is the cosine similarity of the two
    sentence embeddings; otherwise (or when an individual embedding call
    fails) it falls back to a purely lexical score.
    """

    # Sentence-embedding model loaded when ``sentence_transformers`` is usable.
    _MODEL_NAME = "all-MiniLM-L6-v2"

    # In lexical mode the near-duplicate cutoff is never allowed above this.
    _LEXICAL_CUTOFF_CAP = 0.72

    # A token is a maximal run of Unicode letters/digits. ``[^\W_]`` is
    # "word character except underscore", so punctuation, whitespace and
    # underscores all act as separators. An ASCII-only class here would
    # silently drop every non-Latin character.
    _TOKEN_PATTERN = re.compile(r"[^\W_]+")

    def __init__(self, threshold: float = 0.88):
        """Create a detector.

        Args:
            threshold: Minimum similarity for two texts to count as
                near-duplicates (see ``is_near_duplicate`` for how it is
                applied in lexical mode).
        """
        self.threshold = float(threshold)
        self._embedder = None
        self._embeddings_enabled = False
        try:
            from sentence_transformers import SentenceTransformer  # type: ignore

            self._embedder = SentenceTransformer(self._MODEL_NAME)
            self._embeddings_enabled = True
        except Exception:
            # Deliberate best-effort: any failure to import the package or
            # to construct the model leaves the detector in lexical mode.
            self._embedder = None
            self._embeddings_enabled = False

    @property
    def method(self) -> str:
        """Return ``"embeddings"`` if the model loaded, else ``"lexical"``."""
        return "embeddings" if self._embeddings_enabled else "lexical"

    def normalize_tokens(self, text: str) -> List[str]:
        """Case-fold ``text`` and split it into alphanumeric tokens.

        Punctuation, whitespace and underscores separate tokens and are
        discarded. Letters and digits from any script are kept. ``None`` or
        an empty string yields an empty list.
        """
        return self._TOKEN_PATTERN.findall((text or "").casefold())

    def lexical_similarity(self, a: str, b: str) -> float:
        """Return a lexical similarity score in ``[0.0, 1.0]``.

        The score is the larger of the Jaccard overlap of the two token sets
        and the ``SequenceMatcher`` ratio of the normalized, space-joined
        token sequences. If either text has no tokens the score is ``0.0``.
        """
        toks_a = self.normalize_tokens(a)
        toks_b = self.normalize_tokens(b)
        if not toks_a or not toks_b:
            return 0.0

        set_a = set(toks_a)
        set_b = set(toks_b)
        # Both sets are non-empty here, so the union is never empty.
        jaccard = len(set_a & set_b) / len(set_a | set_b)
        ratio = SequenceMatcher(None, " ".join(toks_a), " ".join(toks_b)).ratio()
        return max(jaccard, ratio)

    def embedding_similarity(self, a: str, b: str) -> float:
        """Return the cosine similarity of the embeddings of ``a`` and ``b``.

        Falls back to ``lexical_similarity`` when embeddings are disabled or
        when importing numpy / encoding / the arithmetic raises. Returns
        ``0.0`` if either embedding has zero norm.
        """
        if not self._embeddings_enabled or not self._embedder:
            return self.lexical_similarity(a, b)
        try:
            import numpy as np  # type: ignore

            vecs = self._embedder.encode([a, b], convert_to_numpy=True)
            va, vb = vecs[0], vecs[1]
            denom = float(np.linalg.norm(va) * np.linalg.norm(vb))
            if denom <= 0:
                return 0.0
            return float(np.dot(va, vb) / denom)
        except Exception:
            # Best-effort: a failed embedding call degrades to lexical scoring
            # instead of propagating the error to the caller.
            return self.lexical_similarity(a, b)

    def similarity(self, a: str, b: str) -> float:
        """Return the similarity of ``a`` and ``b`` using the active method."""
        if self._embeddings_enabled:
            return self.embedding_similarity(a, b)
        return self.lexical_similarity(a, b)

    def find_best_match(
        self, text: str, candidates: List[Tuple[str, str]]
    ) -> Tuple[Optional[str], float]:
        """Return the id and score of the candidate most similar to ``text``.

        Args:
            text: The text to compare against every candidate.
            candidates: ``(item_id, candidate_text)`` pairs.

        Returns:
            ``(best_id, best_score)``. Only a strictly positive score can
            win, and ties keep the earliest candidate; when no candidate
            scores above zero the result is ``(None, 0.0)``.
        """
        best_id = None
        best_score = 0.0
        for item_id, candidate_text in candidates:
            score = self.similarity(text, candidate_text)
            if score > best_score:
                best_score = score
                best_id = item_id
        return best_id, best_score

    def is_near_duplicate(self, a: str, b: str) -> bool:
        """Return True when the similarity of ``a`` and ``b`` meets the cutoff.

        With embeddings enabled the cutoff is ``self.threshold``; in lexical
        mode it is ``min(self.threshold, 0.72)``. The cutoff is chosen from
        the detector's mode, even if an individual embedding comparison fell
        back to the lexical score.
        """
        if self._embeddings_enabled:
            cutoff = self.threshold
        else:
            cutoff = min(self.threshold, self._LEXICAL_CUTOFF_CAP)
        return self.similarity(a, b) >= cutoff