# merchant_map.py
# Normalizes raw merchant strings to clean display names
# Format: "fragment_to_match_lowercase": "Clean Name"

# Lowercase substring fragment -> display name.
#
# Fragments are tested with `fragment in cleaned_string`, so:
#   * Order matters for any consumer that scans in insertion order and stops
#     at the first hit: keep a specific fragment ahead of a generic one it
#     contains (e.g. "t-mobile" before "mobil", "ubereats" before "uber").
#   * Keys must be written the way the string looks AFTER normalize_merchant
#     has cleaned it: lowercase, with "*" and "#" already turned into spaces.
MERCHANT_ALIASES = {
    # Amazon
    "amzn": "Amazon",
    "amazon": "Amazon",
    # "AMZ*..." descriptors reach the lookup as "amz ..." because "*" is
    # rewritten to a space before matching; a literal "amz*" key never hits.
    "amz ": "Amazon",
    # Streaming
    "netflix": "Netflix",
    "nflx": "Netflix",
    "spotify": "Spotify",
    "hulu": "Hulu",
    "disney": "Disney+",
    "disneyplus": "Disney+",
    "hbo": "HBO Max",
    "max.com": "HBO Max",
    "peacock": "Peacock",
    "paramount": "Paramount+",
    "appletv": "Apple TV+",
    "apple.com/bill": "Apple Services",
    "apple services": "Apple Services",
    "itunes": "Apple Services",
    "youtube": "YouTube Premium",
    "youtubepremium": "YouTube Premium",
    # Food delivery (must stay ahead of Rideshare: "uber" is a substring of
    # both Uber Eats fragments)
    "doordash": "DoorDash",
    "ubereats": "Uber Eats",
    "uber eats": "Uber Eats",  # "UBER *EATS" / "UBER EATS" after cleaning
    "grubhub": "Grubhub",
    "seamless": "Seamless",
    "instacart": "Instacart",
    # Rideshare
    "uber": "Uber",
    "lyft": "Lyft",
    # Grocery
    "wholefds": "Whole Foods",
    "whole foods": "Whole Foods",
    "trader joe": "Trader Joe's",
    "kroger": "Kroger",
    "safeway": "Safeway",
    "wegmans": "Wegmans",
    "shoprite": "ShopRite",
    "costco": "Costco",
    "sams club": "Sam's Club",
    "target": "Target",
    "walmart": "Walmart",
    # Utilities/Telecom (must stay ahead of Fuel: "mobil" is a substring of
    # "t-mobile" and "tmobile" and would otherwise claim those charges)
    "verizon": "Verizon",
    "at&t": "AT&T",
    "att ": "AT&T",
    "t-mobile": "T-Mobile",
    "tmobile": "T-Mobile",
    "comcast": "Comcast/Xfinity",
    "xfinity": "Comcast/Xfinity",
    "spectrum": "Spectrum",
    # Fuel
    "shell": "Shell",
    "exxon": "ExxonMobil",
    "mobil": "ExxonMobil",
    "bp ": "BP",
    "chevron": "Chevron",
    "sunoco": "Sunoco",
    "wawa": "Wawa",
    "quick chek": "Quick Chek",
    "quickchek": "Quick Chek",
    # Coffee
    "starbucks": "Starbucks",
    "dunkin": "Dunkin'",
    "dutch bros": "Dutch Bros",
    "caribou": "Caribou Coffee",
    # Fast food
    "mcdonald": "McDonald's",
    "mcdonalds": "McDonald's",
    "chick-fil-a": "Chick-fil-A",
    "chickfila": "Chick-fil-A",
    "chipotle": "Chipotle",
    "taco bell": "Taco Bell",
    "tacobell": "Taco Bell",
    "burger king": "Burger King",
    "burgerking": "Burger King",
    "wendy": "Wendy's",
    "subway": "Subway",
    "panera": "Panera Bread",
    # Tech / Cloud
    "google": "Google",
    "microsoft": "Microsoft",
    "msft": "Microsoft",
    "adobe": "Adobe",
    "dropbox": "Dropbox",
    "github": "GitHub",
    "openai": "OpenAI",
    "chatgpt": "OpenAI",
    "zoom": "Zoom",
    "slack": "Slack",
    "notion": "Notion",
    "1password": "1Password",
    "lastpass": "LastPass",
    # Fitness
    "planet fitness": "Planet Fitness",
    "la fitness": "LA Fitness",
    "lafitness": "LA Fitness",
    "peloton": "Peloton",
    "equinox": "Equinox",
    "anytime fitness": "Anytime Fitness",
    "ymca": "YMCA",
    # Insurance
    "geico": "GEICO",
    "progressive": "Progressive",
    "statefarm": "State Farm",
    "state farm": "State Farm",
    "allstate": "Allstate",
    # Shopping
    "etsy": "Etsy",
    "ebay": "eBay",
    "bestbuy": "Best Buy",
    "best buy": "Best Buy",
    "home depot": "Home Depot",
    "homedepot": "Home Depot",
    "lowes": "Lowe's",
    "wayfair": "Wayfair",
    "chewy": "Chewy",
    # Travel
    "airbnb": "Airbnb",
    "vrbo": "VRBO",
    "expedia": "Expedia",
    "hotels.com": "Hotels.com",
    "booking.com": "Booking.com",
    "united air": "United Airlines",
    "delta air": "Delta Airlines",
    "american air": "American Airlines",
    "southwest": "Southwest Airlines",
    "jetblue": "JetBlue",
}


# Maps the noise characters "*" and "#" to spaces in a single pass.
_NOISE_TABLE = str.maketrans("*#", "  ")


def normalize_merchant(raw: str) -> str:
    """
    Normalize a raw merchant string to a clean display name.

    The input is lowercased, "*" and "#" are turned into spaces, and every
    run of whitespace is collapsed to a single space. Each fragment in
    MERCHANT_ALIASES is then tested as a substring of that cleaned text; when
    several fragments match, the longest one wins (ties go to the fragment
    listed first), so a specific alias such as "t-mobile" is not shadowed by
    a shorter one it contains such as "mobil".

    If no alias matches, a best-effort name is built from the raw string:
    purely numeric tokens and all-caps alphabetic tokens of three letters or
    fewer are dropped wherever they appear, at most four of the remaining
    tokens are kept, and the result is title-cased.

    Args:
        raw: Merchant descriptor as it appears on the statement.

    Returns:
        The alias display name, the title-cased fallback, or "Unknown" when
        *raw* is empty, None, or only whitespace.
    """
    if not raw or not raw.strip():
        return "Unknown"

    cleaned = " ".join(raw.lower().translate(_NOISE_TABLE).split())
    # Pad both ends so fragments that rely on a boundary space ("bp ", "att ")
    # can still match when the token sits at the edge of the string.
    haystack = f" {cleaned} "

    best_fragment = ""
    best_name = None
    for fragment, clean_name in MERCHANT_ALIASES.items():
        # Apply the same noise mapping to the key, otherwise a key written
        # with "*" or "#" could never match the cleaned input.
        needle = fragment.translate(_NOISE_TABLE)
        # Strict ">" keeps the earliest entry on equal length and ignores
        # empty fragments.
        if len(needle) > len(best_fragment) and needle in haystack:
            best_fragment, best_name = needle, clean_name
    if best_name is not None:
        return best_name

    # Fallback: work from the original casing so all-caps short tokens
    # (likely state abbreviations or processor noise) can be recognised.
    kept = [
        word
        for word in raw.split()
        if not word.isdigit()
        and not (len(word) <= 3 and word.isupper() and word.isalpha())
    ]
    if not kept:
        # Everything looked like noise; return the trimmed input rather than
        # an empty name.
        return raw.strip().title()
    return " ".join(kept[:4]).title()