ups-contract-faq2 / question_dedupe.py
Justin Tippins
Upgrade HF Space question workflow
a9271a0
import re
from difflib import SequenceMatcher
from typing import List, Optional, Tuple
class NearDuplicateDetector:
    """Score how similar two pieces of text are and flag near-duplicates.

    On construction the detector tries to load the ``all-MiniLM-L6-v2``
    sentence-transformers model.  If that fails for any reason (package not
    installed, model cannot be loaded, ...) it silently degrades to a purely
    lexical comparison; the ``method`` property reports which mode is active.
    """

    # Runs of characters that are neither letters nor digits (in any script).
    # ``\W`` is Unicode-aware for ``str`` patterns; ``_`` is added because
    # ``\w`` would otherwise keep it.  For ASCII input this is equivalent to
    # the narrower ``[^a-z0-9\s]`` filter applied to lower-cased text.
    _NON_TOKEN_CHARS = re.compile(r"[\W_]+")

    # Upper bound applied to the cutoff when only lexical scoring is available.
    _LEXICAL_CUTOFF_CAP = 0.72

    def __init__(self, threshold: float = 0.88):
        """Create a detector.

        Args:
            threshold: Minimum similarity for ``is_near_duplicate`` to report
                a match when embeddings are enabled.  In lexical mode the
                effective cutoff is ``min(threshold, 0.72)``.
        """
        self.threshold = float(threshold)
        self._embedder = None
        self._embeddings_enabled = False
        try:
            from sentence_transformers import SentenceTransformer  # type: ignore

            self._embedder = SentenceTransformer("all-MiniLM-L6-v2")
            self._embeddings_enabled = True
        except Exception:
            # Deliberate best-effort: any failure to import or load the model
            # leaves the detector in lexical mode instead of raising.
            self._embedder = None
            self._embeddings_enabled = False

    @property
    def method(self) -> str:
        """Return ``"embeddings"`` or ``"lexical"`` depending on the active mode."""
        return "embeddings" if self._embeddings_enabled else "lexical"

    def normalize_tokens(self, text: str) -> List[str]:
        """Case-fold *text* and split it into alphanumeric tokens.

        Punctuation, underscores and whitespace act as separators.  Letters
        and digits from any script are kept, so accented or non-Latin text is
        tokenized rather than discarded.  ``None`` or empty input yields ``[]``.
        """
        folded = (text or "").casefold()
        # ``str.split()`` with no argument collapses whitespace runs and drops
        # leading/trailing blanks, so no separate strip/filter step is needed.
        return self._NON_TOKEN_CHARS.sub(" ", folded).split()

    def lexical_similarity(self, a: str, b: str) -> float:
        """Return a lexical similarity score in ``[0.0, 1.0]``.

        The score is the larger of the Jaccard overlap of the two token sets
        and the ``SequenceMatcher`` ratio of the normalized strings.  If either
        input has no tokens the score is ``0.0``.
        """
        toks_a = self.normalize_tokens(a)
        toks_b = self.normalize_tokens(b)
        if not toks_a or not toks_b:
            return 0.0
        set_a = set(toks_a)
        set_b = set(toks_b)
        # Both sets are non-empty here, so the union is never empty.
        jaccard = len(set_a & set_b) / len(set_a | set_b)
        ratio = SequenceMatcher(None, " ".join(toks_a), " ".join(toks_b)).ratio()
        return max(jaccard, ratio)

    @staticmethod
    def _cosine(va, vb) -> float:
        """Return the cosine similarity of two vectors (``0.0`` if either has zero norm)."""
        import numpy as np  # type: ignore

        denom = float(np.linalg.norm(va) * np.linalg.norm(vb))
        if denom <= 0:
            return 0.0
        return float(np.dot(va, vb) / denom)

    def embedding_similarity(self, a: str, b: str) -> float:
        """Return the cosine similarity of the embeddings of *a* and *b*.

        Falls back to ``lexical_similarity`` when embeddings are disabled or
        when encoding/scoring raises.
        """
        if not self._embeddings_enabled or not self._embedder:
            return self.lexical_similarity(a, b)
        try:
            vecs = self._embedder.encode([a, b], convert_to_numpy=True)
            return self._cosine(vecs[0], vecs[1])
        except Exception:
            return self.lexical_similarity(a, b)

    def similarity(self, a: str, b: str) -> float:
        """Return the similarity of *a* and *b* using the active method."""
        if self._embeddings_enabled:
            return self.embedding_similarity(a, b)
        return self.lexical_similarity(a, b)

    def _batch_embedding_scores(
        self, text: str, candidates: List[Tuple[str, str]]
    ) -> Optional[List[float]]:
        """Score *text* against every candidate with a single encode call.

        Returns ``None`` when embeddings are unavailable, there is nothing to
        score, or the batch fails, so the caller can use pairwise scoring.
        """
        if not candidates or not self._embeddings_enabled or not self._embedder:
            return None
        try:
            texts = [text] + [candidate_text for _, candidate_text in candidates]
            vecs = self._embedder.encode(texts, convert_to_numpy=True)
            query_vec = vecs[0]
            return [self._cosine(query_vec, vec) for vec in vecs[1:]]
        except Exception:
            return None

    def find_best_match(self, text: str, candidates: List[Tuple[str, str]]) -> Tuple[Optional[str], float]:
        """Return the ``(item_id, score)`` of the candidate most similar to *text*.

        Args:
            text: The text to compare.
            candidates: ``(item_id, candidate_text)`` pairs.

        Returns:
            The id and score of the highest-scoring candidate; the earliest
            candidate wins ties.  ``(None, 0.0)`` if there are no candidates
            or none scores above ``0.0``.
        """
        candidates = list(candidates)
        # Encode the query once together with all candidates instead of
        # re-encoding it for every pair.
        scores = self._batch_embedding_scores(text, candidates)
        if scores is None:
            scores = [self.similarity(text, candidate_text) for _, candidate_text in candidates]

        best_id = None
        best_score = 0.0
        for (item_id, _), score in zip(candidates, scores):
            if score > best_score:
                best_score = score
                best_id = item_id
        return best_id, best_score

    def is_near_duplicate(self, a: str, b: str) -> bool:
        """Return ``True`` when *a* and *b* score at or above the cutoff.

        The cutoff is ``threshold`` in embeddings mode and
        ``min(threshold, 0.72)`` in lexical mode.  Note that if an embedding
        call fails at runtime the score falls back to the lexical one while
        the embeddings cutoff still applies.
        """
        if self._embeddings_enabled:
            cutoff = self.threshold
        else:
            cutoff = min(self.threshold, self._LEXICAL_CUTOFF_CAP)
        return self.similarity(a, b) >= cutoff