Spaces:

Justinroy
/

ups-contract-faq2

Sleeping

Justin Tippins

Upgrade HF Space question workflow

a9271a0 3 months ago

2.77 kB

	import re
	from difflib import SequenceMatcher
	from typing import List, Optional, Tuple


	class NearDuplicateDetector:
	    """Score pairs of texts for similarity and flag near-duplicates.

	    Scores come from sentence-transformer embeddings (cosine similarity)
	    when the ``sentence_transformers`` package and the ``all-MiniLM-L6-v2``
	    model load successfully, and from a lexical measure otherwise.
	    """

	    # Lexical scores are compared against ``min(threshold, this value)``
	    # instead of the raw embedding threshold.
	    _LEXICAL_CUTOFF_CAP = 0.72

	    def __init__(self, threshold: float = 0.88):
	        """Store the cutoff and try to load the embedding model.

	        Args:
	            threshold: Minimum score at which two texts count as
	                near-duplicates when they are scored with embeddings.
	        """
	        self.threshold = float(threshold)
	        self._embedder = None
	        self._embeddings_enabled = False
	        try:
	            from sentence_transformers import SentenceTransformer  # type: ignore

	            self._embedder = SentenceTransformer("all-MiniLM-L6-v2")
	            self._embeddings_enabled = True
	        except Exception:
	            # Deliberate best effort: a missing package or a failed model
	            # load leaves the detector in lexical mode instead of raising.
	            self._embedder = None
	            self._embeddings_enabled = False

	    @property
	    def method(self) -> str:
	        """Return ``"embeddings"`` if the model was loaded, else ``"lexical"``."""
	        return "embeddings" if self._embeddings_enabled else "lexical"

	    def normalize_tokens(self, text: str) -> List[str]:
	        """Lowercase *text*, drop everything but ``a-z0-9``, and tokenise.

	        ``None`` and empty input yield an empty list.
	        """
	        cleaned = re.sub(r"[^a-z0-9\s]", " ", (text or "").lower())
	        # str.split() with no argument collapses whitespace runs and never
	        # yields empty tokens.
	        return cleaned.split()

	    def lexical_similarity(self, a: str, b: str) -> float:
	        """Return the larger of token-set Jaccard and character-level ratio.

	        Both measures are computed on the normalised tokens. If either text
	        has no tokens the score is ``0.0``.
	        """
	        toks_a = self.normalize_tokens(a)
	        toks_b = self.normalize_tokens(b)
	        if not toks_a or not toks_b:
	            return 0.0
	        set_a = set(toks_a)
	        set_b = set(toks_b)
	        # The union cannot be empty here because both token lists are
	        # non-empty.
	        jaccard = len(set_a & set_b) / len(set_a | set_b)
	        ratio = SequenceMatcher(None, " ".join(toks_a), " ".join(toks_b)).ratio()
	        return max(jaccard, ratio)

	    def _embedding_cosine(self, a: str, b: str) -> Optional[float]:
	        """Return the cosine similarity of the two embeddings.

	        Returns ``None`` when numpy cannot be imported or encoding fails, so
	        the caller can tell a real embedding score from a fallback.
	        """
	        try:
	            import numpy as np  # type: ignore

	            vecs = self._embedder.encode([a, b], convert_to_numpy=True)
	            va, vb = vecs[0], vecs[1]
	            denom = float(np.linalg.norm(va) * np.linalg.norm(vb))
	            if denom <= 0:
	                # A zero-length vector has no direction; avoid dividing by 0.
	                return 0.0
	            return float(np.dot(va, vb) / denom)
	        except Exception:
	            return None

	    def _score(self, a: str, b: str) -> Tuple[float, bool]:
	        """Return ``(score, used_embeddings)`` for the two texts.

	        The flag is ``False`` whenever the lexical scorer produced the score,
	        including when the embedding path failed at runtime.
	        """
	        if self._embeddings_enabled and self._embedder is not None:
	            cosine = self._embedding_cosine(a, b)
	            if cosine is not None:
	                return cosine, True
	        return self.lexical_similarity(a, b), False

	    def embedding_similarity(self, a: str, b: str) -> float:
	        """Return the embedding cosine similarity, or the lexical score.

	        The lexical score is used when embeddings are unavailable or the
	        encoding step raises.
	        """
	        return self._score(a, b)[0]

	    def similarity(self, a: str, b: str) -> float:
	        """Return the similarity of *a* and *b* using the best available scorer."""
	        return self._score(a, b)[0]

	    def find_best_match(self, text: str, candidates: List[Tuple[str, str]]) -> Tuple[Optional[str], float]:
	        """Return the id and score of the candidate most similar to *text*.

	        Args:
	            text: The text to look up.
	            candidates: ``(item_id, candidate_text)`` pairs.

	        Returns:
	            ``(best_id, best_score)``. Only strictly positive scores count, so
	            the result is ``(None, 0.0)`` when there are no candidates or no
	            candidate scores above zero. On ties the earliest candidate wins.
	        """
	        best_id = None
	        best_score = 0.0
	        for item_id, candidate_text in candidates:
	            score = self.similarity(text, candidate_text)
	            if score > best_score:
	                best_score = score
	                best_id = item_id
	        return best_id, best_score

	    def is_near_duplicate(self, a: str, b: str) -> bool:
	        """Return True when *a* and *b* score at or above the cutoff.

	        The cutoff follows the scorer that actually produced the score:
	        ``threshold`` for embedding scores, ``min(threshold, 0.72)`` for
	        lexical scores. That includes lexical scores produced because the
	        embedding path failed for this pair.
	        """
	        score, used_embeddings = self._score(a, b)
	        if used_embeddings:
	            cutoff = self.threshold
	        else:
	            cutoff = min(self.threshold, self._LEXICAL_CUTOFF_CAP)
	        return score >= cutoff