|
|
"""Utilities to resolve merchants that are aliases of each other.""" |
|
|
|
|
|
from __future__ import annotations

import re
import unicodedata
from dataclasses import dataclass, field
from typing import Dict, Iterable, Set
|
|
|
|
|
|
|
|
# Matches every run of characters other than ASCII lowercase letters and digits.
MERCHANT_CLEAN_RE = re.compile(r"[^a-z0-9]+")


def normalize_merchant(name: str) -> str:
    """Return a comparison key for a merchant name.

    The name is case-folded, accented letters are reduced to their base
    letter, and everything that is not an ASCII letter or digit (whitespace,
    punctuation, symbols) is removed.

    Args:
        name: Raw merchant name. Falsy values (``""`` or ``None``) are
            tolerated and yield an empty key.

    Returns:
        The normalized key; ``""`` when nothing usable remains.
    """
    if not name:
        return ""
    # NFD splits "é" into "e" + a combining accent. The accent is then removed
    # by the regex, so the base letter survives instead of the whole character
    # being deleted. casefold() additionally maps "ß" to "ss".
    folded = unicodedata.normalize("NFD", name).casefold()
    return MERCHANT_CLEAN_RE.sub("", folded)
|
|
|
|
|
|
|
|
@dataclass
class MerchantAliasResolver:
    """In-memory lookup of merchant names that refer to the same merchant.

    Alias groups live in ``alias_sets``: each key is a normalized canonical
    merchant name and each value is the set of normalized names (canonical
    included) treated as interchangeable. Populate it with
    ``load_from_cursor`` or by passing a pre-built mapping to the constructor.

    NOTE(review): the original notes say the documents come from Mongo through
    a repository, with a small bootstrapped dictionary as a fallback; neither
    is visible in this class -- confirm against the callers.
    """

    # normalized canonical name -> normalized names in the same alias group
    alias_sets: Dict[str, Set[str]] = field(default_factory=dict)

    def load_from_cursor(self, alias_documents: Iterable[dict]) -> None:
        """Add the alias groups described by ``alias_documents``.

        Each document is read through ``doc.get("name")`` (canonical name) and
        ``doc.get("aliases")`` (iterable of alternative names). Missing or
        null values are treated as empty, names that normalize to an empty
        string are dropped, and a document left with no usable name at all is
        skipped. Documents sharing a canonical name are merged into one group
        rather than the later document replacing the earlier one.
        """
        for doc in alias_documents:
            # ``or`` (instead of a .get default) also covers keys that are
            # present but hold None.
            canonical = normalize_merchant(doc.get("name") or "")
            group = {
                normalize_merchant(alias)
                for alias in doc.get("aliases") or ()
                if alias
            }
            group.add(canonical)
            # Blank names can never match in are_aliases; keeping them would
            # only let unrelated nameless documents collide on the "" key.
            group.discard("")
            if not group:
                continue
            # Without a usable canonical name, root the group at its smallest
            # alias so the key is deterministic.
            root = canonical or min(group)
            # Build a new set so a caller-supplied set is never mutated.
            self.alias_sets[root] = self.alias_sets.get(root, set()) | group

    def are_aliases(self, a: str, b: str) -> tuple[bool, str]:
        """Report whether ``a`` and ``b`` name the same merchant.

        Returns:
            A ``(matched, reason)`` pair where ``reason`` is one of:

            * ``"blank"`` -- at least one name normalizes to an empty string
              (never a match);
            * ``"exact"`` -- both names normalize to the same key;
            * ``"alias"`` -- both normalized names appear in one alias group;
            * ``"none"``  -- no relationship was found.
        """
        norm_a = normalize_merchant(a)
        norm_b = normalize_merchant(b)
        if not norm_a or not norm_b:
            return False, "blank"
        if norm_a == norm_b:
            return True, "exact"
        # Only group membership matters here; the canonical keys are unused.
        if any(
            norm_a in group and norm_b in group
            for group in self.alias_sets.values()
        ):
            return True, "alias"
        return False, "none"
|
|
|
|
|
|