import math
import re
from typing import Dict, Optional

# Stat aliases to help robust detection.
# Maps each canonical column key to the header spellings seen in the wild.
# Aliases are compared after `normalize_header`, so case, spaces and dashes
# do not matter here, but '.', '%' and '#' do.
KNOWN_COLUMNS: Dict[str, list[str]] = {
    "num": ["#", "no", "no.", "number", "jersey", "nr"],
    "name": ["player", "name", "athlete", "roster", "nom"],
    "min": ["min", "mins", "time", "played", "duration"],
    "pts": ["pts", "points", "score"],
    "fg": ["fg", "fgm-a", "fg m-a", "field goals", "fg-a"],
    "fg_pct": ["fg%", "fg %", "field goal %", "pct"],
    "2p": ["2p", "2pt", "2fg", "2pm-a", "2fga"],
    "2p_pct": ["2p%", "2pt%", "2fg%"],
    "3p": ["3p", "3pt", "3fg", "3pm-a", "3fga"],
    "3p_pct": ["3p%", "3pt%", "3fg%"],
    "ft": ["ft", "ftm-a", "free throws", "ft-a"],
    "ft_pct": ["ft%", "ft %", "free throw %"],
    "off": ["off", "or", "of", "oreb", "offensive"],
    "def": ["def", "dr", "df", "dreb", "defensive"],
    "reb": ["reb", "tot", "tr", "trb", "rebounds", "rb"],
    "ast": ["ast", "as", "assists"],
    "to": ["to", "tov", "turnovers"],
    "stl": ["stl", "st", "steals"],
    "blk": ["blk", "bs", "blocks"],
    "pf": ["pf", "fouls", "fl", "f"],
    "pos": ["pos", "position"],
}

# Everything except lowercase ASCII letters, digits, '%', '#' and '.'.
_HEADER_NOISE_RE = re.compile(r'[^a-z0-9%#\.]')

# "<made><separator(s)><attempted>", separators being '-', '/', '|' or
# whitespace.
_MADE_ATTEMPTED_RE = re.compile(r'^(\d+)[-/|\s]+(\d+)$')


def normalize_header(header_text: str) -> str:
    """Return a canonical comparison form of a table header.

    The text is lowercased, then every character that is not a lowercase
    ASCII letter, a digit, '%', '#' or '.' is deleted. Spaces and dashes
    therefore vanish ("FG M-A" -> "fgma") while "no." keeps its dot.
    """
    return _HEADER_NOISE_RE.sub('', header_text.lower())


def identify_column(raw_header: str) -> Optional[str]:
    """Match a raw table-header string to one of the KNOWN_COLUMNS keys.

    Both the header and every alias are passed through `normalize_header`
    before comparison. Keys are tried in KNOWN_COLUMNS order and the first
    key owning a matching alias wins.

    Returns:
        The canonical key, or None when nothing matches or when
        `raw_header` is not a string (e.g. a None/NaN/int column label).
    """
    # Non-string labels cannot be normalized; treat them as unknown instead
    # of raising AttributeError on .lower().
    if not isinstance(raw_header, str):
        return None

    normalized = normalize_header(raw_header)
    for canonical_key, aliases in KNOWN_COLUMNS.items():
        if any(normalized == normalize_header(alias) for alias in aliases):
            return canonical_key
    return None


def pd_isna(val: object) -> bool:
    """Report whether a cell value should be treated as missing.

    Missing means: None, a float NaN, or a string that is empty,
    whitespace-only, or spells "nan" in any case (surrounding whitespace
    ignored). Every other value is considered present.
    """
    if val is None:
        return True
    if isinstance(val, float):
        return math.isnan(val)
    if isinstance(val, str):
        text = val.strip()
        return text == "" or text.lower() == "nan"
    return False


def extract_made_attempted(value_str: str) -> tuple[int, int]:
    """Parse a "made-attempted" cell into a (made, attempted) pair.

    Accepted shapes are two digit runs separated by one or more of '-',
    '/', '|' or whitespace, e.g. '4-16', '4/16' or '4 16'. Surrounding
    whitespace is ignored.

    Anything else falls back to (0, 0): missing values (see `pd_isna`) and
    text without a separator. In particular a separator-less OCR result
    such as '416' is NOT split, because the made/attempted boundary is
    ambiguous.
    """
    if pd_isna(value_str):
        return 0, 0

    # str() so that non-string cell values are tolerated as well.
    match = _MADE_ATTEMPTED_RE.match(str(value_str).strip())
    if match is None:
        return 0, 0
    return int(match.group(1)), int(match.group(2))


def parse_percentage(value_str: str) -> Optional[float]:
    """Parse a percentage cell into a float.

    '%' signs and surrounding whitespace are removed before conversion.

    Returns:
        None for missing values (see `pd_isna`), for text that is not a
        number, and for NaN/infinite results. Otherwise:
        - text containing a '.' is returned as parsed ('45.5%' -> 45.5);
        - text without a '.' is returned as parsed when <= 100, and
          divided by 100 when > 100 ('4550' -> 45.5).
    """
    # Only genuinely missing values short-circuit; a numeric 0 is valid
    # data and must come back as 0.0, not None.
    if pd_isna(value_str):
        return None

    text = str(value_str).replace('%', '').strip()
    try:
        number = float(text)
    except ValueError:
        return None

    # float() accepts 'nan' / 'inf' spellings (e.g. 'nan%'); neither is a
    # usable percentage.
    if not math.isfinite(number):
        return None

    if '.' in text:
        return number
    # NOTE(review): presumably compensates for a decimal point dropped
    # upstream (e.g. OCR reading '45.50' as '4550') - confirm the intended
    # scale; a three-digit value such as '455' becomes 4.55.
    return number / 100.0 if number > 100 else number