"""Utilities to resolve merchants that are aliases of each other."""

from __future__ import annotations

import re
from dataclasses import dataclass, field
from typing import Dict, Iterable, Set

# Matches every run of characters that is not a lowercase ASCII letter or digit.
MERCHANT_CLEAN_RE = re.compile(r"[^a-z0-9]+")


def normalize_merchant(name: str | None) -> str:
    """Return *name* lowercased with everything but ``a-z0-9`` removed.

    The result is the key used for all merchant comparisons.  ``None`` is
    treated like an empty name and yields ``""`` so that null fields coming
    from stored documents do not raise.

    Note that non-ASCII characters are removed as well, so a name made up
    entirely of such characters normalizes to the empty string.
    """
    if name is None:
        return ""
    return MERCHANT_CLEAN_RE.sub("", name.strip().lower())


@dataclass
class MerchantAliasResolver:
    """A simple in-memory alias graph.

    The resolver can hydrate itself from Mongo (through a repository) or
    fall back to a small bootstrapped dictionary.

    ``alias_sets`` maps a normalized canonical merchant name to the set of
    normalized names (the canonical name included) that belong to the same
    merchant.
    """

    alias_sets: Dict[str, Set[str]] = field(default_factory=dict)

    def load_from_cursor(self, alias_documents: Iterable[dict]) -> None:
        """Add the alias groups described by *alias_documents*.

        Each document is read through its ``"name"`` and ``"aliases"`` keys;
        a missing or null value for either is treated as empty.

        * Documents whose name normalizes to the empty string are skipped:
          they have no canonical key, and would otherwise all collide on
          ``""``.
        * Aliases that normalize to the empty string are dropped.
        * When a canonical key already exists (from an earlier document or
          a bootstrapped dictionary), the new aliases are merged into it
          instead of replacing it, so names that differ only in punctuation
          or case do not silently discard each other's aliases.
        """
        for doc in alias_documents:
            canonical = normalize_merchant(doc.get("name"))
            if not canonical:
                continue

            group = {canonical}
            group.update(
                normalize_merchant(alias) for alias in doc.get("aliases") or ()
            )
            group.discard("")

            # Build a new set rather than updating in place so a set object
            # supplied by the caller (e.g. a shared bootstrap dict) is not
            # mutated behind its back.
            existing = self.alias_sets.get(canonical, set())
            self.alias_sets[canonical] = existing | group

    def are_aliases(self, a: str, b: str) -> tuple[bool, str]:
        """Tell whether *a* and *b* name the same merchant.

        Returns a ``(matched, reason)`` pair where ``reason`` is one of:

        * ``"blank"`` - at least one name normalizes to the empty string
          (never a match);
        * ``"exact"`` - both names normalize to the same string;
        * ``"alias"`` - both normalized names appear in one alias group;
        * ``"none"``  - no group contains both names.

        Groups are checked one at a time; two names that only share an
        alias across different groups are not considered a match.
        """
        norm_a = normalize_merchant(a)
        norm_b = normalize_merchant(b)

        if not norm_a or not norm_b:
            return False, "blank"
        if norm_a == norm_b:
            return True, "exact"

        in_same_group = any(
            norm_a in group and norm_b in group
            for group in self.alias_sets.values()
        )
        return (True, "alias") if in_same_group else (False, "none")