import re
from difflib import SequenceMatcher
from typing import List, Optional, Tuple


class NearDuplicateDetector:
    """Score pairs of texts for near-duplication.

    On construction the detector tries to load the ``all-MiniLM-L6-v2``
    sentence-transformers model. If that fails for any reason, it scores
    pairs lexically instead (token Jaccard and character-level
    ``SequenceMatcher`` ratio, whichever is higher).
    """

    # Anything that is neither a (Unicode) word character nor whitespace,
    # plus the underscore, is treated as a token separator. For ASCII input
    # this is equivalent to stripping everything outside [a-z0-9] after
    # lowercasing, while letters and digits of other scripts are preserved.
    _TOKEN_SEPARATORS = re.compile(r"[^\w\s]|_")

    # Lexical scores run lower than embedding cosine scores for the same
    # pair, so the cutoff applied to a lexical score is capped at this value.
    _LEXICAL_CUTOFF_CAP = 0.72

    def __init__(self, threshold: float = 0.88):
        """Create a detector.

        Args:
            threshold: Minimum similarity for two texts to count as near
                duplicates. Applied as-is to embedding scores and capped at
                0.72 for lexical scores.
        """
        self.threshold = float(threshold)
        self._embedder = None
        self._embeddings_enabled = False
        try:
            from sentence_transformers import SentenceTransformer  # type: ignore

            self._embedder = SentenceTransformer("all-MiniLM-L6-v2")
            self._embeddings_enabled = True
        except Exception:
            # Deliberate best-effort: a missing package or a model that
            # cannot be loaded simply leaves the detector in lexical mode.
            self._embedder = None
            self._embeddings_enabled = False

    @property
    def method(self) -> str:
        """Return ``"embeddings"`` if the model was loaded, else ``"lexical"``."""
        return "embeddings" if self._embeddings_enabled else "lexical"

    def normalize_tokens(self, text: str) -> List[str]:
        """Lowercase *text* and split it into alphanumeric tokens.

        Punctuation, symbols and underscores act as separators. Letters and
        digits of any script are kept, so non-English text is not reduced to
        an empty token list. ``None`` or an empty string yields ``[]``.
        """
        lowered = (text or "").lower()
        return self._TOKEN_SEPARATORS.sub(" ", lowered).split()

    def lexical_similarity(self, a: str, b: str) -> float:
        """Return a lexical similarity in ``[0, 1]`` for *a* and *b*.

        The score is the larger of the Jaccard overlap of the token sets and
        the ``SequenceMatcher`` ratio of the normalized texts. If either text
        has no tokens the score is 0.0.
        """
        toks_a = self.normalize_tokens(a)
        toks_b = self.normalize_tokens(b)
        if not toks_a or not toks_b:
            return 0.0
        set_a = set(toks_a)
        set_b = set(toks_b)
        jaccard = len(set_a & set_b) / len(set_a | set_b)
        # autojunk must be off: with character sequences of 200+ items the
        # heuristic discards almost every character as "popular", which can
        # drive the ratio of long near-identical texts down to 0.
        ratio = SequenceMatcher(
            None, " ".join(toks_a), " ".join(toks_b), autojunk=False
        ).ratio()
        return max(jaccard, ratio)

    def _embedding_score(self, a: str, b: str) -> Optional[float]:
        """Return the cosine similarity of the embeddings, or ``None``.

        ``None`` means no embedding score could be produced (embeddings are
        disabled, numpy is unavailable, or encoding failed) and the caller
        should fall back to the lexical score.
        """
        if not self._embeddings_enabled or self._embedder is None:
            return None
        try:
            import numpy as np  # type: ignore

            vecs = self._embedder.encode([a, b], convert_to_numpy=True)
            va, vb = vecs[0], vecs[1]
            denom = float(np.linalg.norm(va) * np.linalg.norm(vb))
            if denom <= 0:
                # A zero-length vector has no direction to compare.
                return 0.0
            return float(np.dot(va, vb) / denom)
        except Exception:
            # Best-effort: any failure here is reported as "no score".
            return None

    def _score(self, a: str, b: str) -> Tuple[float, bool]:
        """Return ``(score, used_embeddings)`` for the pair."""
        score = self._embedding_score(a, b)
        if score is None:
            return self.lexical_similarity(a, b), False
        return score, True

    def embedding_similarity(self, a: str, b: str) -> float:
        """Return the embedding cosine similarity of *a* and *b*.

        Falls back to :meth:`lexical_similarity` when an embedding score
        cannot be computed.
        """
        return self._score(a, b)[0]

    def similarity(self, a: str, b: str) -> float:
        """Return the similarity of *a* and *b* using the best available method."""
        return self._score(a, b)[0]

    def find_best_match(self, text: str, candidates: List[Tuple[str, str]]) -> Tuple[Optional[str], float]:
        """Return the id and score of the candidate most similar to *text*.

        Args:
            text: The text to compare against every candidate.
            candidates: ``(item_id, candidate_text)`` pairs.

        Returns:
            ``(item_id, score)`` of the highest-scoring candidate; the first
            one wins on ties. Candidates whose score is not strictly positive
            are never selected, so the result is ``(None, 0.0)`` when there
            are no candidates or none scores above zero.
        """
        best_id = None
        best_score = 0.0
        for item_id, candidate_text in candidates:
            score = self.similarity(text, candidate_text)
            if score > best_score:
                best_score = score
                best_id = item_id
        return best_id, best_score

    def is_near_duplicate(self, a: str, b: str) -> bool:
        """Return whether *a* and *b* score at or above the cutoff.

        The cutoff follows the method that actually produced the score:
        ``threshold`` for an embedding score, ``min(threshold, 0.72)`` for a
        lexical one. This includes the case where embeddings are enabled but
        scoring this pair fell back to the lexical method.
        """
        score, used_embeddings = self._score(a, b)
        if used_embeddings:
            cutoff = self.threshold
        else:
            cutoff = min(self.threshold, self._LEXICAL_CUTOFF_CAP)
        return score >= cutoff