BakoAI / app /stat_import /parsing /column_mapper.py
Okidi Norbert
Deployment fix: clean backend only
c6abe34
import math
import re
from typing import Dict, Optional
# Canonical stat key -> header spellings accepted for that stat.
# Aliases are compared after header normalization, so case, spaces and
# dashes in the entries below do not matter.
KNOWN_COLUMNS = {
    # Player identity and playing time
    "num": ["#", "no", "no.", "number", "jersey", "nr"],
    "name": ["player", "name", "athlete", "roster", "nom"],
    "min": ["min", "mins", "time", "played", "duration"],
    # Scoring
    "pts": ["pts", "points", "score"],
    "fg": ["fg", "fgm-a", "fg m-a", "field goals", "fg-a"],
    "fg_pct": ["fg%", "fg %", "field goal %", "pct"],
    "2p": ["2p", "2pt", "2fg", "2pm-a", "2fga"],
    "2p_pct": ["2p%", "2pt%", "2fg%"],
    "3p": ["3p", "3pt", "3fg", "3pm-a", "3fga"],
    "3p_pct": ["3p%", "3pt%", "3fg%"],
    "ft": ["ft", "ftm-a", "free throws", "ft-a"],
    "ft_pct": ["ft%", "ft %", "free throw %"],
    # Rebounding
    "off": ["off", "or", "of", "oreb", "offensive"],
    "def": ["def", "dr", "df", "dreb", "defensive"],
    "reb": ["reb", "tot", "tr", "trb", "rebounds", "rb"],
    # Remaining box-score columns
    "ast": ["ast", "as", "assists"],
    "to": ["to", "tov", "turnovers"],
    "stl": ["stl", "st", "steals"],
    "blk": ["blk", "bs", "blocks"],
    "pf": ["pf", "fouls", "fl", "f"],
    "pos": ["pos", "position"],
}
def normalize_header(header_text: str) -> str:
    """Reduce a header cell to a canonical form for comparison.

    The text is lowercased and every character other than ``a-z``, ``0-9``,
    ``%``, ``#`` and ``.`` is removed, so ``"FG M-A"`` and ``"fgm-a"`` both
    become ``"fgma"``.

    Args:
        header_text: Raw header cell. ``None`` is treated as an empty header
            and any other non-string value is converted with ``str`` instead
            of raising ``AttributeError``.

    Returns:
        The normalized header, possibly an empty string.
    """
    if header_text is None:
        return ''
    # Lowercase first: the character whitelist below only contains a-z.
    lowered = str(header_text).lower()
    # Drop spaces and punctuation to reduce spelling variations.
    return re.sub(r'[^a-z0-9%#\.]', '', lowered)
def identify_column(raw_header: str) -> Optional[str]:
    """Map a raw table-header string to its canonical stat key.

    Both the header and every alias in ``KNOWN_COLUMNS`` are passed through
    ``normalize_header`` before they are compared.

    Args:
        raw_header: Header text as read from the table.

    Returns:
        The first key of ``KNOWN_COLUMNS`` (in definition order) that has an
        alias equal to the normalized header, or ``None`` when nothing
        matches.
    """
    wanted = normalize_header(raw_header)
    return next(
        (
            stat_key
            for stat_key, alias_list in KNOWN_COLUMNS.items()
            if any(normalize_header(alias) == wanted for alias in alias_list)
        ),
        None,
    )
def extract_made_attempted(value_str: str) -> tuple[int, int]:
    """Parse a "made-attempted" cell into a ``(made, attempted)`` pair.

    Accepted forms are two digit runs separated by one or more of: ``-``,
    ``/``, ``|``, whitespace, or a Unicode dash / minus sign (U+2010..U+2015,
    U+2212) as commonly produced by PDF and OCR text. Examples: ``'4-16'``,
    ``'4/16'``, ``'4 16'``, ``'4 - 16'``, ``'4\u201316'``.

    Digits that have run together (e.g. ``'416'``) are NOT split: there is
    no separator to tell where "made" ends, so they fall back to the default.

    Args:
        value_str: Raw cell value. ``None`` and non-string values are
            tolerated; non-strings are converted with ``str``.

    Returns:
        ``(made, attempted)`` as ints, or ``(0, 0)`` when the value is
        missing or does not match the expected shape.
    """
    if value_str is None:
        return 0, 0
    # Empty strings, "nan" and any other junk simply fail to match below.
    cell = str(value_str).strip()
    match = re.match(r'^(\d+)[-/|\s\u2010-\u2015\u2212]+(\d+)$', cell)
    if match is None:
        return 0, 0
    made, attempted = match.groups()
    return int(made), int(attempted)
def pd_isna(val) -> bool:
    """Return True when *val* should be treated as a missing cell.

    Missing means: ``None``, a float NaN, or a string that is empty,
    whitespace-only, or spells "nan" (any case, surrounding whitespace
    ignored). Every other value, including numeric zero, is present.
    """
    if val is None:
        return True
    if isinstance(val, float):
        # NaN floats previously slipped through as "present".
        return math.isnan(val)
    if isinstance(val, str):
        text = val.strip()
        return text == "" or text.lower() == "nan"
    return False


def parse_percentage(value_str: str) -> Optional[float]:
    """Parse a percentage cell into a float.

    A ``%`` sign and surrounding whitespace are ignored. Text containing a
    decimal point is returned as parsed. Text without a decimal point is
    returned as parsed unless it is greater than 100, in which case it is
    divided by 100 (e.g. ``'4550'`` -> ``45.5``).
    NOTE(review): the divide-by-100 rule presumably compensates for a
    decimal point lost upstream -- confirm against the importer.

    Args:
        value_str: Raw cell value; non-strings are converted with ``str``.

    Returns:
        The parsed number, or ``None`` when the cell is missing, is not
        numeric, or parses to NaN / infinity.
    """
    # Do not use plain truthiness here: a numeric 0 is a valid percentage.
    if pd_isna(value_str):
        return None
    text = str(value_str).replace('%', '').strip()
    try:
        number = float(text)
    except ValueError:
        return None
    # float() accepts "nan"/"inf" spellings; those are not usable values.
    if not math.isfinite(number):
        return None
    if '.' not in text and number > 100:
        return number / 100.0
    return number