# Page-header residue kept for provenance (not part of the program):
#   LogicGoInfotechSpaces's picture
#   Initial duplicate detector
#   e28a7b2
"""Utilities to resolve merchants that are aliases of each other."""
from __future__ import annotations

import re
import unicodedata
from dataclasses import dataclass, field
from typing import Dict, Iterable, Set
# Matches every run of characters other than lowercase ASCII letters and digits.
MERCHANT_CLEAN_RE = re.compile(r"[^a-z0-9]+")


def normalize_merchant(name: str) -> str:
    """Return a comparison key for a merchant name.

    The name is Unicode-decomposed (NFKD) and case-folded, then every
    character that is not ``a-z`` or ``0-9`` is removed.  Decomposing first
    splits accented letters into a base letter plus a combining mark, so the
    accent is dropped while the letter survives ("Café" -> "cafe") instead
    of the whole letter being deleted.

    Args:
        name: Raw merchant name.  ``None`` or an empty string is accepted.

    Returns:
        The normalized key; ``""`` when the input is empty/``None`` or
        contains no ASCII-foldable letters or digits.
    """
    if not name:
        return ""
    # casefold() is a superset of lower() for ASCII and also expands
    # characters such as "ß" to "ss" so they are not lost to the regex.
    folded = unicodedata.normalize("NFKD", name).casefold()
    # Whitespace is outside [a-z0-9], so no separate strip() is needed.
    return MERCHANT_CLEAN_RE.sub("", folded)
@dataclass
class MerchantAliasResolver:
    """A simple in-memory alias lookup.

    The resolver can hydrate itself from Mongo (through a repository) or
    fall back to a small bootstrapped dictionary.

    ``alias_sets`` maps a normalized canonical merchant name to the set of
    normalized names (canonical name included) that refer to that merchant.
    Two names count as aliases only when a single group contains both;
    groups are not merged transitively.
    """

    alias_sets: Dict[str, Set[str]] = field(default_factory=dict)

    def load_from_cursor(self, alias_documents: Iterable[dict]) -> None:
        """Add one alias group per document to ``alias_sets``.

        Each document is read through its ``"name"`` (canonical merchant
        name) and ``"aliases"`` (iterable of alternative names) keys.  A
        missing key and an explicit ``None`` value are treated the same.
        Names that normalize to an empty string are left out of the group,
        and a document that yields no usable name at all is skipped.

        A group replaces any group already stored under the same
        normalized canonical name.

        Args:
            alias_documents: Iterable of mapping-like alias documents.
        """
        for doc in alias_documents:
            # ``dict.get(key, default)`` only applies the default when the
            # key is absent; ``or`` also covers a stored null value.
            canonical = normalize_merchant(doc.get("name") or "")
            group = {
                normalize_merchant(alias)
                for alias in (doc.get("aliases") or ())
                if alias
            }
            group.add(canonical)
            # Blank keys can never match in are_aliases(); keep them out.
            group.discard("")
            if not group:
                continue
            # NOTE(review): documents without a usable name are all keyed
            # under "" (as before), so only the last such group survives.
            self.alias_sets[canonical] = group

    def are_aliases(self, a: str, b: str) -> tuple[bool, str]:
        """Tell whether two merchant names refer to the same merchant.

        Args:
            a: First merchant name.
            b: Second merchant name.

        Returns:
            A ``(matched, reason)`` pair where ``reason`` is:

            * ``"blank"`` - either name is empty/``None`` or normalizes to
              an empty string (``matched`` is ``False``);
            * ``"exact"`` - both names normalize to the same key;
            * ``"alias"`` - one loaded alias group contains both keys;
            * ``"none"``  - no relation found (``matched`` is ``False``).
        """
        norm_a = normalize_merchant(a or "")
        norm_b = normalize_merchant(b or "")
        if not norm_a or not norm_b:
            return False, "blank"
        if norm_a == norm_b:
            return True, "exact"
        # Only the groups matter here, not the canonical keys.
        if any(
            norm_a in group and norm_b in group
            for group in self.alias_sets.values()
        ):
            return True, "alias"
        return False, "none"