Spaces:
Sleeping
Sleeping
| import re | |
| from difflib import SequenceMatcher | |
| from typing import List, Optional, Tuple | |
class NearDuplicateDetector:
    """Score how similar two texts are and flag near-duplicates.

    When the optional ``sentence_transformers`` package and its model can be
    loaded, similarity is the cosine similarity of sentence embeddings.
    Otherwise (or whenever encoding fails at call time) a lexical score is
    used: the larger of token-set Jaccard overlap and the character-level
    ``difflib.SequenceMatcher`` ratio of the normalized texts.
    """

    # Model name handed to ``SentenceTransformer``.
    _MODEL_NAME = "all-MiniLM-L6-v2"

    # Lexical scores are compared against ``min(threshold, this cap)``, so the
    # lexical path never uses a cutoff above this value.
    _LEXICAL_CUTOFF_CAP = 0.72

    # One or more characters that are not Unicode letters/digits. ``\W``
    # already covers whitespace and punctuation; ``_`` is listed explicitly
    # because ``\w`` would otherwise keep it inside a token.
    _SEPARATORS = re.compile(r"[\W_]+")

    def __init__(self, threshold: float = 0.88):
        """Create a detector.

        Args:
            threshold: Minimum similarity for ``is_near_duplicate`` to report
                a duplicate when the score comes from embeddings. Lexical
                scores use ``min(threshold, 0.72)``.
        """
        self.threshold = float(threshold)
        self._embedder = None
        self._embeddings_enabled = False
        try:
            from sentence_transformers import SentenceTransformer  # type: ignore

            self._embedder = SentenceTransformer(self._MODEL_NAME)
            self._embeddings_enabled = True
        except Exception:
            # Deliberate best-effort: any failure while importing the package
            # or constructing the model leaves the detector in lexical mode.
            self._embedder = None
            self._embeddings_enabled = False

    def method(self) -> str:
        """Return ``"embeddings"`` if a model was loaded, else ``"lexical"``."""
        return "embeddings" if self._embeddings_enabled else "lexical"

    def normalize_tokens(self, text: str) -> List[str]:
        """Lower-case ``text`` and split it into alphanumeric tokens.

        Punctuation, underscores and whitespace act as separators. Letters and
        digits of any script are kept, so non-ASCII text is tokenized instead
        of being discarded. ``None`` or an empty string yields ``[]``.
        """
        lowered = (text or "").lower()
        # str.split() with no argument drops leading/trailing/empty pieces.
        return self._SEPARATORS.sub(" ", lowered).split()

    def lexical_similarity(self, a: str, b: str) -> float:
        """Return a lexical similarity score in ``[0.0, 1.0]``.

        The score is the maximum of the Jaccard overlap of the two token sets
        and the ``SequenceMatcher`` ratio of the space-joined token strings.
        If either text has no tokens the score is ``0.0``.
        """
        toks_a = self.normalize_tokens(a)
        toks_b = self.normalize_tokens(b)
        if not toks_a or not toks_b:
            return 0.0

        set_a = set(toks_a)
        set_b = set(toks_b)
        # Both sets are non-empty here, so the union is never empty.
        jaccard = len(set_a & set_b) / len(set_a | set_b)

        # NOTE(review): SequenceMatcher's default ``autojunk`` heuristic kicks
        # in for sequences of 200+ items; confirm that is acceptable for long
        # texts before relying on ``ratio`` alone.
        ratio = SequenceMatcher(None, " ".join(toks_a), " ".join(toks_b)).ratio()
        return max(jaccard, ratio)

    def _embedding_cosine(self, a: str, b: str) -> Optional[float]:
        """Return the embedding cosine similarity, or ``None`` if unavailable.

        ``None`` means no embedding score could be produced (embeddings are
        disabled, or importing numpy / encoding raised), and the caller should
        fall back to the lexical score.
        """
        if not self._embeddings_enabled or self._embedder is None:
            return None
        try:
            import numpy as np  # type: ignore

            vecs = self._embedder.encode([a, b], convert_to_numpy=True)
            va, vb = vecs[0], vecs[1]
            denom = float(np.linalg.norm(va) * np.linalg.norm(vb))
            if denom <= 0:
                # A zero-length vector has no direction; treat as dissimilar.
                return 0.0
            return float(np.dot(va, vb) / denom)
        except Exception:
            # Best-effort: a failing encoder must not break scoring.
            return None

    def _score(self, a: str, b: str) -> Tuple[float, bool]:
        """Return ``(score, used_embeddings)`` for the pair ``a``, ``b``."""
        cosine = self._embedding_cosine(a, b)
        if cosine is not None:
            return cosine, True
        return self.lexical_similarity(a, b), False

    def embedding_similarity(self, a: str, b: str) -> float:
        """Return the embedding cosine similarity of ``a`` and ``b``.

        Falls back to ``lexical_similarity`` when embeddings are disabled or
        the embedding computation raises.
        """
        return self._score(a, b)[0]

    def similarity(self, a: str, b: str) -> float:
        """Return the similarity of ``a`` and ``b`` using the best available method."""
        return self._score(a, b)[0]

    def find_best_match(
        self, text: str, candidates: List[Tuple[str, str]]
    ) -> Tuple[Optional[str], float]:
        """Find the candidate most similar to ``text``.

        Args:
            text: The query text.
            candidates: ``(item_id, candidate_text)`` pairs to compare against.

        Returns:
            ``(best_id, best_score)``. Only strictly positive scores count as
            a match, and the earliest candidate wins ties; if nothing scores
            above zero the result is ``(None, 0.0)``.
        """
        best_id = None
        best_score = 0.0
        for item_id, candidate_text in candidates:
            score = self.similarity(text, candidate_text)
            if score > best_score:
                best_score = score
                best_id = item_id
        return best_id, best_score

    def is_near_duplicate(self, a: str, b: str) -> bool:
        """Return ``True`` if ``a`` and ``b`` are similar enough to be duplicates.

        The cutoff follows the scorer that actually produced the score:
        ``threshold`` for an embedding score, ``min(threshold, 0.72)`` for a
        lexical score, including the case where embeddings are enabled but the
        computation fell back to the lexical path.
        """
        score, used_embeddings = self._score(a, b)
        if used_embeddings:
            cutoff = self.threshold
        else:
            cutoff = min(self.threshold, self._LEXICAL_CUTOFF_CAP)
        return score >= cutoff