# 2026_MLB_Model/data/odds_name_map.py
# Last commit 21151ce (Syntrex): Accuracy overhaul: pitcher resolution logging,
# baseline recalibration, vig fix, XGBoost blend
from __future__ import annotations
import re
import unicodedata
# Alias table: lower-cased spelling of a player name -> canonical spelling.
# Every canonical value is lower-case ASCII with no periods.
#
# NOTE: normalize_player_name() strips accents and periods *before* it looks a
# name up here, so the accented / dotted keys are only reachable by code that
# indexes this table directly with a raw (merely lower-cased) name.
NAME_ALIASES = {
    # ---------------- Batters ----------------
    "shohei ohtani": "shohei ohtani",
    "shōhei ohtani": "shohei ohtani",
    "seiya suzuki": "seiya suzuki",
    "masataka yoshida": "masataka yoshida",
    "mookie betts": "mookie betts",
    "fernando tatis jr": "fernando tatis jr",
    "fernando tatis jr.": "fernando tatis jr",
    "vladimir guerrero jr": "vladimir guerrero jr",
    "vladimir guerrero jr.": "vladimir guerrero jr",
    "ronald acuña jr": "ronald acuna jr",
    "ronald acuna jr": "ronald acuna jr",
    "ronald acuña jr.": "ronald acuna jr",
    "ronald acuna jr.": "ronald acuna jr",
    "juan soto": "juan soto",
    "julio rodriguez": "julio rodriguez",
    "julio rodríguez": "julio rodriguez",
    "jose ramirez": "jose ramirez",
    "josé ramírez": "jose ramirez",
    "yordan alvarez": "yordan alvarez",
    "yordan álvarez": "yordan alvarez",
    "luis robert jr": "luis robert jr",
    "luis robert jr.": "luis robert jr",
    "bo bichette": "bo bichette",
    "manny machado": "manny machado",
    "xander bogaerts": "xander bogaerts",
    "rafael devers": "rafael devers",
    "ketel marte": "ketel marte",
    "isaac paredes": "isaac paredes",
    "andy pages": "andy pages",
    # ---------------- Pitchers ----------------
    # Generational suffixes: the canonical spelling keeps "jr" where the
    # stats side stores the name that way (per the original note: statcast).
    "nestor cortes jr": "nestor cortes jr",
    "nestor cortes": "nestor cortes jr",  # odds feeds sometimes drop the "Jr"
    "néstor cortés jr": "nestor cortes jr",
    "néstor cortés": "nestor cortes jr",
    # Names carrying diacritics, listed in both accented and plain spellings
    # because odds sources deliver them inconsistently.
    "framber valdéz": "framber valdez",
    "framber valdez": "framber valdez",
    "sandy alcántara": "sandy alcantara",
    "sandy alcantara": "sandy alcantara",
    "pablo lópez": "pablo lopez",
    "pablo lopez": "pablo lopez",
    "ranger suárez": "ranger suarez",
    "ranger suarez": "ranger suarez",
    "josé berríos": "jose berrios",
    "jose berrios": "jose berrios",
    "josé quintana": "jose quintana",
    "jose quintana": "jose quintana",
    "martín pérez": "martin perez",
    "martin perez": "martin perez",
    "eduardo rodríguez": "eduardo rodriguez",
    "eduardo rodriguez": "eduardo rodriguez",
    "cristopher sánchez": "cristopher sanchez",
    "cristopher sanchez": "cristopher sanchez",
    "adrián houser": "adrian houser",
    "adrian houser": "adrian houser",
    "julio urías": "julio urias",
    "julio urias": "julio urias",
    "yonny chirinos": "yonny chirinos",
    "yusei kikuchi": "yusei kikuchi",
    "yoshinobu yamamoto": "yoshinobu yamamoto",
    "kodai senga": "kodai senga",
    "shōta imanaga": "shota imanaga",
    "shota imanaga": "shota imanaga",
}
def _strip_accents(text: str) -> str:
normalized = unicodedata.normalize("NFKD", text)
return "".join(ch for ch in normalized if not unicodedata.combining(ch))
def normalize_player_name(name: str) -> str:
    """Return the canonical lookup form of a player name.

    Steps, in order: coerce to ``str`` (falsy input such as ``None`` becomes
    ``""``), lower-case, strip accents via ``_strip_accents``, turn the
    typographic apostrophe (U+2019) into ``'``, delete periods, collapse
    whitespace runs to one space, trim both ends, and finally translate the
    result through ``NAME_ALIASES`` (unknown names are returned as-is).

    Args:
        name: Raw player name; any value accepted by ``str()`` is tolerated.

    Returns:
        The alias-resolved, normalized name.
    """
    text = _strip_accents(str(name or "").lower())
    text = text.replace("’", "'").replace(".", "")
    # Trim only after punctuation removal and whitespace collapsing: an input
    # like "Luis Robert Jr ." otherwise keeps a trailing space once the period
    # is dropped and then misses its NAME_ALIASES entry.
    text = re.sub(r"\s+", " ", text).strip()
    # The former " jr " -> " jr " and " sr " -> " sr " replacements rewrote a
    # string to itself and have been removed; period removal above already
    # unifies "jr." / "sr." with "jr" / "sr".
    return NAME_ALIASES.get(text, text)
def normalize_pitcher_name(name: str) -> str:
    """Return the canonical form of a pitcher's name.

    This is a thin delegate to normalize_player_name(): the same
    lower-casing, accent stripping, period removal, whitespace collapsing
    and NAME_ALIASES lookup apply. It exists as its own entry point so that
    pitcher-side imports read unambiguously and pitcher-specific alias
    handling can later be added here without touching the batter path.
    """
    canonical = normalize_player_name(name)
    return canonical
def map_odds_name_to_model_name(name: str) -> str:
    """Translate a name as supplied by an odds source into the model's spelling.

    Currently this simply returns whatever normalize_player_name() produces
    for *name*.
    """
    model_name = normalize_player_name(name)
    return model_name