File size: 1,924 Bytes
"""Exact and near-duplicate detection helpers."""
from __future__ import annotations
import hashlib
import re
from collections import defaultdict
from typing import Iterable
# Tokenizer pattern: each match is a maximal run of word characters (``\w``),
# so whitespace and punctuation act purely as token separators.
TOKEN_RE = re.compile(r"\w+")
def exact_content_hash(text: str) -> str:
    """Return the hexadecimal SHA-1 digest of *text* encoded as UTF-8.

    Two strings produce the same digest only when their UTF-8 bytes are
    identical, so the value serves as an exact-match fingerprint.
    """
    hasher = hashlib.sha1()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
def shingles(text: str, n: int = 5) -> set[str]:
    """Build the set of token n-grams ("shingles") used for near-duplicate detection.

    The text is lowercased and tokenized with ``TOKEN_RE``; every run of *n*
    consecutive tokens, joined by single spaces, becomes one shingle.

    Args:
        text: Input text.
        n: Number of tokens per shingle; must be at least 1.

    Returns:
        The set of distinct shingles. A text with fewer than *n* tokens yields
        a single shingle made of all its tokens, and a text with no tokens
        yields an empty set.

    Raises:
        ValueError: If *n* is less than 1.
    """
    if n < 1:
        # Without this guard n == 0 would yield {""} and a negative n would
        # slice arbitrary token ranges; neither is a meaningful shingle set.
        raise ValueError(f"n must be a positive integer, got {n!r}")
    tokens = TOKEN_RE.findall(text.lower())
    if not tokens:
        return set()
    if len(tokens) < n:
        # Too short for a full window: treat the whole text as one shingle.
        return {" ".join(tokens)}
    return {" ".join(tokens[i : i + n]) for i in range(len(tokens) - n + 1)}
def jaccard_similarity(left: str, right: str, n: int = 5) -> float:
    """Return the Jaccard similarity of the *n*-token shingle sets of two texts.

    The score is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the shingle
    sets of *left* and *right*. Two texts that both produce no shingles are
    treated as identical (1.0); if only one of them is empty the score is 0.0.
    """
    first, second = shingles(left, n), shingles(right, n)
    union_size = len(first.union(second))
    if union_size == 0:
        # Both shingle sets are empty.
        return 1.0
    # An empty set on one side gives an empty intersection, hence 0.0.
    return len(first.intersection(second)) / union_size
def deduplicate_records(
    records: Iterable[dict[str, object]],
    near_dup_threshold: float = 0.92,
) -> list[dict[str, object]]:
    """Drop exact and near-duplicate records, keeping the first occurrence.

    Records are processed in iteration order. A record is dropped when the
    ``exact_content_hash`` of its ``"text"`` value (converted with ``str``)
    was already seen, or when its shingle-level Jaccard similarity with an
    already-kept record is at least *near_dup_threshold*.

    Near-duplicate candidates come from an inverted index mapping each shingle
    to the kept records containing it. Candidates must not be grouped by a
    prefix of the content hash: similar but non-identical texts have unrelated
    digests, so such buckets would almost never bring near-duplicates
    together. The index holds every shingle of every kept record, trading
    memory for an exact search with no missed pairs.

    Args:
        records: Mappings that each provide a ``"text"`` entry.
        near_dup_threshold: Minimum Jaccard similarity (using the default
            shingle size of ``shingles``) at which a record counts as a
            near-duplicate of a kept one.

    Returns:
        The retained records, in their original relative order.

    Raises:
        KeyError: If a record has no ``"text"`` key.
    """
    exact_seen: set[str] = set()
    kept: list[dict[str, object]] = []
    # Shingle count of each kept record, parallel to ``kept``.
    kept_sizes: list[int] = []
    # shingle -> positions (indices into ``kept``) of records containing it.
    shingle_index: dict[str, list[int]] = defaultdict(list)
    # Whether some kept record produced no shingles at all (token-less text).
    kept_tokenless = False

    for record in records:
        text = str(record["text"])
        digest = exact_content_hash(text)
        if digest in exact_seen:
            continue
        # Remember the digest even if the record is dropped below, so later
        # exact copies are skipped without repeating the similarity search.
        exact_seen.add(digest)

        current = shingles(text)
        if kept and _has_near_duplicate(
            current, kept_sizes, shingle_index, kept_tokenless, near_dup_threshold
        ):
            continue

        position = len(kept)
        kept.append(record)
        kept_sizes.append(len(current))
        if current:
            for shingle in current:
                shingle_index[shingle].append(position)
        else:
            kept_tokenless = True
    return kept


def _has_near_duplicate(
    current: set[str],
    kept_sizes: list[int],
    shingle_index: dict[str, list[int]],
    kept_tokenless: bool,
    threshold: float,
) -> bool:
    """Return whether some kept record is at least *threshold* similar to *current*.

    The caller must ensure at least one record has been kept.
    """
    if threshold <= 0:
        # Jaccard similarity is never negative, so any kept record qualifies.
        return True
    if not current:
        # Token-less text scores 1.0 against token-less text and 0.0 otherwise.
        return kept_tokenless and threshold <= 1.0

    # Count shared shingles per kept record; records sharing none score 0.0
    # and cannot reach a positive threshold, so they are never visited.
    overlap: dict[int, int] = defaultdict(int)
    for shingle in current:
        for position in shingle_index.get(shingle, ()):
            overlap[position] += 1

    size = len(current)
    # |A | B| = |A| + |B| - |A & B|
    return any(
        shared / (size + kept_sizes[position] - shared) >= threshold
        for position, shared in overlap.items()
    )
|