# sage/data/dedup.py
# sage002's picture
# feat: rewrite SAGE 1B architecture and replace legacy repo contents
# ef18673 verified
"""Exact and near-duplicate detection helpers."""
from __future__ import annotations
import hashlib
import re
from collections import defaultdict
from typing import Iterable
# Matches maximal runs of word characters (``\w``); ``shingles`` uses it to
# split lower-cased text into tokens.
TOKEN_RE = re.compile(r"\w+")
def exact_content_hash(text: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``text`` encoded as UTF-8.

    The digest is used as an exact-duplicate fingerprint, not for security.

    Lone surrogate code points (which can appear in strings decoded from
    malformed JSON escapes or scraped data) are encoded with the
    ``surrogatepass`` error handler, so a single malformed record yields a
    stable hash instead of raising ``UnicodeEncodeError``. Digests of
    well-formed text are unaffected.

    Args:
        text: The content to fingerprint.

    Returns:
        A 40-character lowercase hex string.
    """
    payload = text.encode("utf-8", errors="surrogatepass")
    return hashlib.sha1(payload).hexdigest()
def shingles(text: str, n: int = 5) -> set[str]:
    """Return the set of space-joined ``n``-token windows found in ``text``.

    The text is lower-cased and tokenized with ``TOKEN_RE``. A text with
    fewer than ``n`` tokens produces a single shingle containing all of its
    tokens; a text with no tokens produces an empty set.
    """
    words = TOKEN_RE.findall(text.lower())
    if len(words) >= n:
        window_count = len(words) - n + 1
        return {" ".join(words[start : start + n]) for start in range(window_count)}
    if not words:
        return set()
    # Too short for a full window: treat the whole token run as one shingle.
    return {" ".join(words)}
def jaccard_similarity(left: str, right: str, n: int = 5) -> float:
    """Return the Jaccard index of the ``n``-token shingle sets of two texts.

    Two texts that both yield no shingles are treated as identical (``1.0``);
    if only one of them yields no shingles the similarity is ``0.0``.
    """
    first = shingles(left, n)
    second = shingles(right, n)
    if first and second:
        return len(first & second) / len(first | second)
    # At least one side is empty: the texts match only when both are empty.
    return 0.0 if (first or second) else 1.0
# Modulus of the hash family ``h -> (a * h + b) mod p`` used for MinHash.
_MINHASH_PRIME = (1 << 61) - 1
# The signature is split into bands; two texts become comparison candidates
# when they agree on every row of at least one band.
_MINHASH_BANDS = 16
_MINHASH_ROWS = 2


def _minhash_coefficients(count: int) -> tuple[tuple[int, int], ...]:
    """Derive ``count`` deterministic ``(a, b)`` pairs for the MinHash hash family."""
    pairs = []
    for index in range(count):
        seed = hashlib.sha1(f"minhash-{index}".encode("utf-8")).digest()
        multiplier = int.from_bytes(seed[:8], "big") % (_MINHASH_PRIME - 1) + 1
        offset = int.from_bytes(seed[8:16], "big") % _MINHASH_PRIME
        pairs.append((multiplier, offset))
    return tuple(pairs)


_MINHASH_COEFFICIENTS = _minhash_coefficients(_MINHASH_BANDS * _MINHASH_ROWS)


def _lsh_band_keys(shingle_set: set[str]) -> list[tuple[int, ...]]:
    """Return one locality-sensitive bucket key per band for a shingle set."""
    if not shingle_set:
        # Texts without any shingles all share one bucket so they are compared
        # with each other; real band keys start with a non-negative band index.
        return [(-1,)]
    base_hashes = [
        int.from_bytes(hashlib.sha1(item.encode("utf-8")).digest()[:8], "big")
        for item in shingle_set
    ]
    signature = [
        min((a * value + b) % _MINHASH_PRIME for value in base_hashes)
        for a, b in _MINHASH_COEFFICIENTS
    ]
    return [
        (band, *signature[band * _MINHASH_ROWS : (band + 1) * _MINHASH_ROWS])
        for band in range(_MINHASH_BANDS)
    ]


def deduplicate_records(records: Iterable[dict[str, object]], near_dup_threshold: float = 0.92) -> list[dict[str, object]]:
    """Drop exact and near-duplicate records, keeping first occurrences in order.

    Exact duplicates are detected with ``exact_content_hash``. Near-duplicate
    candidates are found by bucketing each kept record under MinHash LSH band
    keys computed from its shingle set, and every candidate is then confirmed
    with ``jaccard_similarity``, so a record is only dropped when its real
    Jaccard score against an already kept record reaches the threshold.

    Bucketing by a prefix of the exact content hash (the previous approach)
    cannot work: texts that differ in even one character have unrelated
    digests, so near-duplicates were practically never compared.

    Candidate lookup is probabilistic. With 16 bands of 2 rows, a pair at
    Jaccard 0.92 is missed with probability of roughly 1e-13; recall degrades
    for thresholds well below 0.5.

    Args:
        records: Iterable of mappings; each must provide a ``"text"`` entry,
            which is converted with ``str``.
        near_dup_threshold: Minimum Jaccard similarity at which a record is
            considered a near-duplicate of a kept record.

    Returns:
        The kept records, in their original relative order.

    Raises:
        KeyError: If a record has no ``"text"`` entry.
    """
    exact_seen: set[str] = set()
    # Band key -> positions (in ``kept``) of the records filed under that key.
    buckets: dict[tuple[int, ...], list[int]] = defaultdict(list)
    kept: list[dict[str, object]] = []
    for record in records:
        text = str(record["text"])
        digest = exact_content_hash(text)
        if digest in exact_seen:
            continue
        # Remember the digest even if the record turns out to be a
        # near-duplicate: a later exact copy would be dropped either way, and
        # this avoids repeating the similarity checks for it.
        exact_seen.add(digest)
        keys = _lsh_band_keys(shingles(text))
        # ``dict.fromkeys`` de-duplicates candidates found via several bands
        # while preserving their order.
        candidates = dict.fromkeys(
            position for key in keys for position in buckets.get(key, ())
        )
        if any(
            jaccard_similarity(text, str(kept[position]["text"])) >= near_dup_threshold
            for position in candidates
        ):
            continue
        new_position = len(kept)
        for key in keys:
            buckets[key].append(new_position)
        kept.append(record)
    return kept