File size: 2,809 Bytes
c6abe34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import math
import re
from typing import Dict, Optional

# Alias table for table-header detection.
#
# Each key is the canonical column name returned by identify_column(); the
# list holds the header spellings accepted for that column. Both the incoming
# header and every alias are passed through normalize_header() before they are
# compared, so letter case, spaces and hyphens in the spellings below do not
# need to match the source document exactly.
#
# identify_column() walks the entries in the order written here and returns
# the first key with a matching alias.
KNOWN_COLUMNS = {
    # Player identity columns.
    "num": ["#", "no", "no.", "number", "jersey", "nr"],
    "name": ["player", "name", "athlete", "roster", "nom"],
    "min": ["min", "mins", "time", "played", "duration"],
    "pts": ["pts", "points", "score"],
    # Shooting columns: a value column plus a matching "%" column.
    "fg": ["fg", "fgm-a", "fg m-a", "field goals", "fg-a"],
    # NOTE(review): a bare "pct" header resolves to fg_pct — confirm that is
    # intended for tables that carry several percentage columns.
    "fg_pct": ["fg%", "fg %", "field goal %", "pct"],
    "2p": ["2p", "2pt", "2fg", "2pm-a", "2fga"],
    "2p_pct": ["2p%", "2pt%", "2fg%"],
    "3p": ["3p", "3pt", "3fg", "3pm-a", "3fga"],
    "3p_pct": ["3p%", "3pt%", "3fg%"],
    "ft": ["ft", "ftm-a", "free throws", "ft-a"],
    "ft_pct": ["ft%", "ft %", "free throw %"],
    # Rebound columns (presumably offensive / defensive / total — the
    # "oreb", "dreb" and "trb" aliases suggest so).
    "off": ["off", "or", "of", "oreb", "offensive"],
    "def": ["def", "dr", "df", "dreb", "defensive"],
    "reb": ["reb", "tot", "tr", "trb", "rebounds", "rb"],
    # Remaining counting stats.
    "ast": ["ast", "as", "assists"],
    "to": ["to", "tov", "turnovers"],
    "stl": ["stl", "st", "steals"],
    "blk": ["blk", "bs", "blocks"],
    "pf": ["pf", "fouls", "fl", "f"],
    "pos": ["pos", "position"]
}

def normalize_header(header_text: str) -> str:
    """Reduce a header label to its canonical comparison form.

    The text is lowercased, then every character other than ``a-z``,
    ``0-9``, ``%``, ``#`` and ``.`` is deleted. Spaces, hyphens, slashes and
    accented letters therefore disappear, so ``"FG M-A"`` and ``"fgm-a"``
    both become ``"fgma"``.
    """
    lowered = header_text.lower()
    # Whitelist filter: anything outside the allowed set is dropped, not replaced.
    return re.sub(r'[^a-z0-9%#\.]', '', lowered)

def identify_column(raw_header: str) -> Optional[str]:
    """Resolve a raw table-header string to a canonical column key.

    The header and every alias in ``KNOWN_COLUMNS`` are compared in their
    ``normalize_header`` form. Entries are scanned in dictionary order and
    the first key owning a matching alias is returned; ``None`` means the
    header is not recognised.
    """
    target = normalize_header(raw_header)
    return next(
        (
            key
            for key, spellings in KNOWN_COLUMNS.items()
            if any(normalize_header(spelling) == target for spelling in spellings)
        ),
        None,
    )

# "<made><separator(s)><attempted>". Separators: ASCII hyphen, slash, pipe,
# whitespace, plus the Unicode hyphen/dash range (U+2010-U+2015) and the minus
# sign (U+2212), which can stand in for a plain hyphen in OCR'd or typeset text.
_MADE_ATTEMPTED_RE = re.compile(r'^(\d+)[-/|\s\u2010-\u2015\u2212]+(\d+)$')


def extract_made_attempted(value_str: str) -> tuple[int, int]:
    """Parse a "made-attempted" cell such as ``'4-16'`` into two integers.

    Accepted forms are two digit runs joined by one or more separator
    characters: ``'4-16'``, ``'4/16'``, ``'4|16'``, ``'4 16'``, ``'4 - 16'``,
    and the same with an en dash, em dash or minus sign. Surrounding
    whitespace is ignored.

    A digit run with no separator (for example OCR output ``'416'``) is
    ambiguous -- it could be 4-16 or 41-6 -- and is deliberately not split.

    Args:
        value_str: The raw cell value. Non-string values are converted with
            ``str()`` first.

    Returns:
        ``(made, attempted)``. Anything that does not match the pattern --
        including ``None``, empty/blank strings and ``'nan'`` -- yields
        ``(0, 0)``.
    """
    # No explicit missing-value guard is needed: None, "", blanks and "nan"
    # all stringify to text the pattern cannot match, so they fall through to
    # the (0, 0) default without relying on the value's truthiness.
    text = str(value_str).strip()

    match = _MADE_ATTEMPTED_RE.match(text)
    if match is None:
        return 0, 0

    made, attempted = match.groups()
    return int(made), int(attempted)

def pd_isna(val) -> bool:
    """Report whether a cell value should be treated as missing.

    A lightweight, pandas-free stand-in for a missing-value check. A value
    counts as missing when it is:

    * ``None``;
    * a float NaN (this includes NumPy float scalars, which subclass
      ``float``);
    * a string that is empty, whitespace-only, or spells ``nan`` in any
      letter case, with or without surrounding whitespace.

    Every other value, including numeric zero, is considered present.

    Args:
        val: Any cell value.

    Returns:
        ``True`` if the value is missing, ``False`` otherwise.
    """
    if val is None:
        return True
    if isinstance(val, float):
        # Real NaN floats are what a DataFrame cell holds for a missing value.
        return math.isnan(val)
    if isinstance(val, str):
        # Strip first so padded cells like " NaN " are recognised too.
        text = val.strip()
        return text == "" or text.lower() == "nan"
    return False

def parse_percentage(value_str: str) -> Optional[float]:
    """Parse a percentage cell such as ``'45.5%'`` into a float.

    Any ``%`` characters and surrounding whitespace are removed before the
    remainder is parsed as a number.

    Args:
        value_str: The raw cell value. Non-string values are converted with
            ``str()`` first, so a numeric ``0`` parses to ``0.0``.

    Returns:
        The parsed number, or ``None`` when the cell is missing (``None``,
        empty, blank, ``'nan'``), is not numeric, or parses to NaN/infinity.
        A value written without a decimal point that exceeds 100 is divided
        by 100 (``'250'`` -> ``2.5``); values containing a ``.`` are returned
        unchanged.
    """
    if value_str is None:
        return None

    text = str(value_str).replace('%', '').strip()
    try:
        number = float(text)
    except ValueError:
        # Covers empty/blank cells as well as non-numeric text.
        return None

    # float() happily parses "nan" and "inf"; neither is a usable percentage.
    if not math.isfinite(number):
        return None

    # NOTE(review): presumably this rescales values whose decimal point was
    # lost upstream (e.g. by OCR) — confirm that dividing by 100 is the
    # intended correction.
    if '.' not in text and number > 100:
        return number / 100.0
    return number