from __future__ import annotations

import re
import unicodedata

# Maps a normalized spelling of a player name to the canonical model-side name.
#
# NOTE: normalize_player_name() looks names up *after* lower-casing, stripping
# accents and removing periods, so only the plain-ASCII, period-free keys can
# be hit through that function. The accented / "jr." keys are kept unchanged
# for any consumer that reads this table directly.
NAME_ALIASES: dict[str, str] = {
    # ---- Batters ----
    "shohei ohtani": "shohei ohtani",
    "shōhei ohtani": "shohei ohtani",
    "seiya suzuki": "seiya suzuki",
    "masataka yoshida": "masataka yoshida",
    "mookie betts": "mookie betts",
    "fernando tatis jr": "fernando tatis jr",
    "fernando tatis jr.": "fernando tatis jr",
    "vladimir guerrero jr": "vladimir guerrero jr",
    "vladimir guerrero jr.": "vladimir guerrero jr",
    "ronald acuña jr": "ronald acuna jr",
    "ronald acuna jr": "ronald acuna jr",
    "ronald acuña jr.": "ronald acuna jr",
    "ronald acuna jr.": "ronald acuna jr",
    "juan soto": "juan soto",
    "julio rodriguez": "julio rodriguez",
    "julio rodríguez": "julio rodriguez",
    "jose ramirez": "jose ramirez",
    "josé ramírez": "jose ramirez",
    "yordan alvarez": "yordan alvarez",
    "yordan álvarez": "yordan alvarez",
    "luis robert jr": "luis robert jr",
    "luis robert jr.": "luis robert jr",
    "bo bichette": "bo bichette",
    "manny machado": "manny machado",
    "xander bogaerts": "xander bogaerts",
    "rafael devers": "rafael devers",
    "ketel marte": "ketel marte",
    "isaac paredes": "isaac paredes",
    "andy pages": "andy pages",
    # ---- Pitchers ----
    # Jr./Sr. variants — canonical form keeps suffix when that is how
    # statcast stores the name
    "nestor cortes jr": "nestor cortes jr",
    "nestor cortes": "nestor cortes jr",  # odds APIs sometimes omit Jr
    "néstor cortés jr": "nestor cortes jr",
    "néstor cortés": "nestor cortes jr",
    # International names with diacritics commonly mis-encoded by odds sources
    "framber valdéz": "framber valdez",
    "framber valdez": "framber valdez",
    "sandy alcántara": "sandy alcantara",
    "sandy alcantara": "sandy alcantara",
    "pablo lópez": "pablo lopez",
    "pablo lopez": "pablo lopez",
    "ranger suárez": "ranger suarez",
    "ranger suarez": "ranger suarez",
    "josé berríos": "jose berrios",
    "jose berrios": "jose berrios",
    "josé quintana": "jose quintana",
    "jose quintana": "jose quintana",
    "martín pérez": "martin perez",
    "martin perez": "martin perez",
    "eduardo rodríguez": "eduardo rodriguez",
    "eduardo rodriguez": "eduardo rodriguez",
    "cristopher sánchez": "cristopher sanchez",
    "cristopher sanchez": "cristopher sanchez",
    "adrián houser": "adrian houser",
    "adrian houser": "adrian houser",
    "julio urías": "julio urias",
    "julio urias": "julio urias",
    "yonny chirinos": "yonny chirinos",
    "yusei kikuchi": "yusei kikuchi",
    "yoshinobu yamamoto": "yoshinobu yamamoto",
    "kodai senga": "kodai senga",
    "shōta imanaga": "shota imanaga",
    "shota imanaga": "shota imanaga",
}

# Any run of whitespace; compiled once because normalization is called per name.
_WHITESPACE_RUN = re.compile(r"\s+")


def _strip_accents(text: str) -> str:
    """Return *text* with combining marks (accents, macrons, ...) removed.

    The text is NFKD-decomposed so that accented letters split into a base
    character followed by combining marks, and the marks are then dropped.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def normalize_player_name(name: str) -> str:
    """Return the canonical lookup form of a player name.

    Pipeline: coerce to ``str`` (falsy input such as ``None`` becomes ``""``),
    lower-case, strip accents, turn the typographic apostrophe ``’`` into
    ``'``, remove periods, collapse whitespace runs to a single space, trim,
    and finally apply ``NAME_ALIASES``.

    Args:
        name: Raw player name from any source; may be ``None`` or empty.

    Returns:
        The alias-table value when the cleaned name is a key of
        ``NAME_ALIASES``, otherwise the cleaned name itself.
    """
    text = _strip_accents(str(name or "").lower())
    text = text.replace("’", "'").replace(".", "")
    # Trim only after periods are gone and whitespace is collapsed: removing a
    # detached period ("Jr .") would otherwise leave a stray edge space that
    # defeats the alias lookup below.
    text = _WHITESPACE_RUN.sub(" ", text).strip()
    return NAME_ALIASES.get(text, text)


def normalize_pitcher_name(name: str) -> str:
    """Canonical normalizer for pitcher names.

    Delegates to normalize_player_name() — strips accents, removes periods,
    collapses whitespace, applies NAME_ALIASES. It is kept as a separate
    entry point so pitcher-side imports are unambiguous and pitcher-specific
    handling can be added here later without touching the batter path.
    """
    return normalize_player_name(name)


def map_odds_name_to_model_name(name: str) -> str:
    """Map a player name as given by an odds source to the model-side name.

    Currently identical to normalize_player_name().
    """
    return normalize_player_name(name)