| """Exact and near-duplicate detection helpers.""" |
|
|
| from __future__ import annotations |
|
|
| import hashlib |
| import re |
| from collections import defaultdict |
| from typing import Iterable |
|
|
|
|
# Tokenizer pattern: each match is one maximal run of word characters (``\w``).
# Compiled once at import time so ``shingles`` does not rebuild it per call.
TOKEN_RE = re.compile(r"\w+")
|
|
|
|
def exact_content_hash(text: str) -> str:
    """Return the SHA-1 hex digest of ``text`` for exact-duplicate detection.

    The digest is used purely as a content fingerprint, not for security.

    Args:
        text: The text to fingerprint.

    Returns:
        A 40-character lowercase hexadecimal SHA-1 digest of the UTF-8
        encoding of ``text``.
    """
    # ``surrogatepass`` lets strings containing lone surrogates (e.g. decoded
    # from JSON with an unpaired ``\\udXXX`` escape) be hashed instead of
    # raising UnicodeEncodeError. Well-formed text encodes exactly as before,
    # so existing digests are unchanged.
    encoded = text.encode("utf-8", errors="surrogatepass")
    return hashlib.sha1(encoded).hexdigest()
|
|
|
|
def shingles(text: str, n: int = 5) -> set[str]:
    """Build the set of word-token shingles used for near-duplicate detection.

    The text is lowercased and split into ``TOKEN_RE`` matches; each shingle
    is ``n`` consecutive tokens joined by single spaces.

    Args:
        text: Text to shingle.
        n: Number of tokens per shingle; must be at least 1.

    Returns:
        The distinct shingles of ``text``. A text with fewer than ``n`` tokens
        yields a single shingle holding all of its tokens; a text with no
        tokens yields an empty set.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # n == 0 would produce {""} for every input (making all texts look
        # identical to each other) and a negative n slices arbitrarily.
        raise ValueError(f"n must be a positive integer, got {n!r}")
    tokens = TOKEN_RE.findall(text.lower())
    if not tokens:
        return set()
    if len(tokens) < n:
        # Too short for a full window: treat the whole text as one shingle.
        return {" ".join(tokens)}
    return {" ".join(tokens[start : start + n]) for start in range(len(tokens) - n + 1)}
|
|
|
|
def jaccard_similarity(left: str, right: str, n: int = 5) -> float:
    """Return the Jaccard similarity of the two texts' shingle sets.

    Args:
        left: First text.
        right: Second text.
        n: Shingle size forwarded to ``shingles`` for both texts.

    Returns:
        ``|A & B| / |A | B|`` over the shingle sets. When neither text
        produces any shingles the result is 1.0; when exactly one of them
        produces none it is 0.0.
    """
    first, second = shingles(left, n), shingles(right, n)
    if first and second:
        return len(first & second) / len(first | second)
    # At least one side is empty here: two empty sets count as identical,
    # an empty set against a non-empty one as completely different.
    return 0.0 if (first or second) else 1.0
|
|
|
|
def deduplicate_records(records: Iterable[dict[str, object]], near_dup_threshold: float = 0.92) -> list[dict[str, object]]:
    """Drop exact and near-duplicate records, keeping the first occurrence.

    A record is dropped when its text hashes identically to an earlier record
    (exact duplicate) or when its shingle-level Jaccard similarity to an
    already-kept record is at least ``near_dup_threshold`` (near duplicate).

    Near-duplicate candidates are found through an inverted index from
    shingle to kept records. Bucketing by a prefix of the exact content hash
    cannot work here: similar-but-different texts have unrelated digests and
    would almost never be compared. Any pair with a positive Jaccard
    similarity shares at least one shingle, so the index misses no pair that
    could reach a positive threshold.

    Args:
        records: Records to filter; each must have a ``"text"`` entry, which
            is converted with ``str`` before comparison.
        near_dup_threshold: Minimum Jaccard similarity at which a record is
            treated as a near duplicate of a kept record.

    Returns:
        The kept records, in their original order.

    Raises:
        KeyError: If a record has no ``"text"`` key.
    """
    exact_seen: set[str] = set()
    kept: list[dict[str, object]] = []
    # Inverted index: shingle -> positions in ``kept`` of records containing it.
    shingle_index: dict[str, list[int]] = defaultdict(list)
    # Positions in ``kept`` of records whose text yields no shingles at all;
    # ``jaccard_similarity`` scores two such texts as 1.0.
    tokenless: list[int] = []

    for record in records:
        text = str(record["text"])
        digest = exact_content_hash(text)
        if digest in exact_seen:
            continue
        # Remember the digest even if the record turns out to be a near
        # duplicate: a later exact copy would be dropped for the same reason.
        exact_seen.add(digest)

        # Uses the default shingle size, matching ``jaccard_similarity`` below.
        record_shingles = shingles(text)
        if near_dup_threshold <= 0:
            # Every pair satisfies a non-positive threshold, including pairs
            # that share no shingles, so the index cannot be used to prune.
            candidates: Iterable[int] = range(len(kept))
        elif record_shingles:
            # ``.get`` avoids creating empty index entries on lookup.
            candidates = {
                slot
                for shingle in record_shingles
                for slot in shingle_index.get(shingle, ())
            }
        else:
            candidates = tokenless

        if any(
            jaccard_similarity(text, str(kept[slot]["text"])) >= near_dup_threshold
            for slot in candidates
        ):
            continue

        new_slot = len(kept)
        kept.append(record)
        if record_shingles:
            for shingle in record_shingles:
                shingle_index[shingle].append(new_slot)
        else:
            tokenless.append(new_slot)
    return kept
|
|