import math
import re
from typing import Dict, Optional


# Canonical stat key -> header spellings accepted for that key.
# Aliases are written in readable form; identify_column() runs both the alias
# and the incoming header through normalize_header() before comparing, so
# case, spaces and hyphens in these strings do not affect matching.
KNOWN_COLUMNS: dict[str, list[str]] = {
    "num": ["#", "no", "no.", "number", "jersey", "nr"],
    "name": ["player", "name", "athlete", "roster", "nom"],
    "min": ["min", "mins", "time", "played", "duration"],
    "pts": ["pts", "points", "score"],
    "fg": ["fg", "fgm-a", "fg m-a", "field goals", "fg-a"],
    "fg_pct": ["fg%", "fg %", "field goal %", "pct"],
    "2p": ["2p", "2pt", "2fg", "2pm-a", "2fga"],
    "2p_pct": ["2p%", "2pt%", "2fg%"],
    "3p": ["3p", "3pt", "3fg", "3pm-a", "3fga"],
    "3p_pct": ["3p%", "3pt%", "3fg%"],
    "ft": ["ft", "ftm-a", "free throws", "ft-a"],
    "ft_pct": ["ft%", "ft %", "free throw %"],
    "off": ["off", "or", "of", "oreb", "offensive"],
    "def": ["def", "dr", "df", "dreb", "defensive"],
    "reb": ["reb", "tot", "tr", "trb", "rebounds", "rb"],
    "ast": ["ast", "as", "assists"],
    "to": ["to", "tov", "turnovers"],
    "stl": ["stl", "st", "steals"],
    "blk": ["blk", "bs", "blocks"],
    "pf": ["pf", "fouls", "fl", "f"],
    "pos": ["pos", "position"],
}


# Matches every character that is NOT kept in a normalized header.
_HEADER_DROP_RE = re.compile(r"[^a-z0-9%#.]")


def normalize_header(header_text: str) -> str:
    """Return a comparison-friendly form of a table header.

    The text is lowercased and then every character other than ``a-z``,
    ``0-9``, ``%``, ``#`` and ``.`` is removed. Whitespace, hyphens, slashes
    and non-ASCII letters are therefore dropped, not just trimmed:
    ``"FG M-A"`` becomes ``"fgma"`` and ``"No."`` becomes ``"no."``.

    Args:
        header_text: Raw header cell. ``None`` yields ``""``; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The normalized header, possibly empty.
    """
    if header_text is None:
        return ""
    # Lowercase before filtering so ASCII capitals survive as lowercase
    # letters instead of being deleted by the a-z-only character class.
    lowered = str(header_text).lower()
    return _HEADER_DROP_RE.sub("", lowered)


def identify_column(raw_header: str) -> Optional[str]:
    """Map a raw table-header string to its canonical column key.

    The header and every alias in ``KNOWN_COLUMNS`` are passed through
    ``normalize_header`` and compared for exact equality. Keys are tried in
    the dictionary's insertion order and the first key with a matching alias
    wins.

    Args:
        raw_header: Header text as read from the table.

    Returns:
        The matching key of ``KNOWN_COLUMNS``, or ``None`` when the header is
        ``None``, normalizes to an empty string, or matches no alias.
    """
    if raw_header is None:
        return None

    normalized = normalize_header(raw_header)
    if not normalized:
        # Nothing usable is left after normalization; skip the scan.
        return None

    for canonical_key, aliases in KNOWN_COLUMNS.items():
        # any() stops at the first hit instead of normalizing every alias
        # into a throwaway list for each key.
        if any(normalized == normalize_header(alias) for alias in aliases):
            return canonical_key
    return None


# "<digits><separator(s)><digits>". Separators: hyphen, slash, pipe,
# whitespace, en dash (U+2013) and em dash (U+2014), in any mix or repetition.
# NOTE(review): '|' is presumably accepted as a mis-read '/'; confirm.
_MADE_ATTEMPTED_RE = re.compile(r"(\d+)[-/|\s\u2013\u2014]+(\d+)")


def extract_made_attempted(value_str: str) -> tuple[int, int]:
    """Parse a "made-attempted" cell into a ``(made, attempted)`` pair.

    Accepts forms such as ``'4-16'``, ``'4/16'``, ``'4 16'`` and ``'4 - 16'``
    (surrounding whitespace is ignored). The whole cell must consist of two
    digit runs joined by one or more separator characters.

    Fused digits such as ``'416'`` are NOT split, because there is no
    unambiguous way to tell made from attempted; they fall through to the
    default like any other unparseable value.

    Args:
        value_str: Cell content. Non-string values are converted with
            ``str()`` before matching.

    Returns:
        ``(made, attempted)`` as ints, or ``(0, 0)`` when the value is
        missing (per ``pd_isna``) or does not match the expected shape.
    """
    if pd_isna(value_str):
        return 0, 0

    match = _MADE_ATTEMPTED_RE.fullmatch(str(value_str).strip())
    if match is None:
        return 0, 0
    return int(match.group(1)), int(match.group(2))


def pd_isna(val: object) -> bool:
    """Report whether a cell value should be treated as missing.

    Dependency-free check; despite the name it does not call pandas.

    A value counts as missing when it is:
      * ``None``;
      * a float NaN (``numpy.float64`` is a ``float`` subclass, so NaN cells
        coming out of a DataFrame are covered too);
      * a string that is empty, whitespace-only, or spells ``nan`` in any
        letter case, with or without surrounding whitespace.

    Every other value, including ``0`` and ``0.0``, is not missing.
    """
    if val is None:
        return True
    if isinstance(val, float):
        return math.isnan(val)
    if isinstance(val, str):
        # strip() folds the empty, whitespace-only and padded-"nan" cases
        # into one comparison.
        return val.strip().lower() in ("", "nan")
    return False


def parse_percentage(value_str: str) -> Optional[float]:
    """Parse a percentage cell such as ``'45.5%'`` or ``'45'`` into a float.

    Any ``%`` characters and surrounding whitespace are removed before
    conversion. A value written without a decimal point that is greater than
    100 is divided by 100 (``'4550'`` -> ``45.5``); values containing a
    decimal point are returned as parsed.
    NOTE(review): the >100 rescale presumably repairs cells whose decimal
    point was lost in extraction; confirm against real inputs.

    Args:
        value_str: Cell content. Non-string values are converted with
            ``str()``, so a numeric ``0`` parses to ``0.0`` rather than being
            treated as missing.

    Returns:
        The parsed number, or ``None`` when the value is missing (per
        ``pd_isna``), is not numeric, or is not finite (NaN / infinity).
    """
    if pd_isna(value_str):
        return None

    text = str(value_str).replace("%", "").strip()
    try:
        number = float(text)
    except ValueError:
        return None

    # float() accepts "nan" and "inf"; neither is a usable percentage.
    if not math.isfinite(number):
        return None

    if "." not in text and number > 100:
        return number / 100.0
    return number

