File size: 5,141 Bytes
c6abe34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import re
from typing import List, Optional

from app.stat_import.schemas import RawOCRBlock, ParsedPlayerRow
from app.stat_import.parsing.column_mapper import identify_column, extract_made_attempted, parse_percentage

def reconstruct_lines(blocks: List[RawOCRBlock], y_tolerance: float = 20.0) -> List[List[RawOCRBlock]]:
    """
    Cluster OCR blocks into horizontal text rows.

    Blocks are visited top-to-bottom (ascending ``y``). A block joins the row
    being built unless its ``y`` exceeds the row's running mean ``y`` by more
    than ``y_tolerance``, in which case it opens a new row. Every returned row
    is ordered left-to-right (ascending ``x``).

    Args:
        blocks: OCR blocks exposing ``x`` and ``y`` attributes.
        y_tolerance: Largest vertical gap, measured against the mean ``y`` of
            the current row, that still counts as the same row.

    Returns:
        Rows from top to bottom; an empty list when ``blocks`` is empty.
    """
    if not blocks:
        return []

    rows = []
    bucket = []
    # Sentinel reference so the first block normally opens a row of its own.
    anchor_y = -100.0

    for block in sorted(blocks, key=lambda item: item.y):
        gap = block.y - anchor_y
        if gap > y_tolerance:
            # Too far below the current row: flush it and start a new one.
            if bucket:
                rows.append(bucket)
            bucket = [block]
        else:
            bucket.append(block)

        # Compare the next block against the mean height of the row so far.
        anchor_y = sum(member.y for member in bucket) / len(bucket)

    if bucket:
        rows.append(bucket)

    # Within each row, order the blocks by horizontal position.
    return [sorted(row, key=lambda item: item.x) for row in rows]

def parse_stat_line(tokens: List[str], header_map: dict) -> Optional[ParsedPlayerRow]:
    """
    Build a ParsedPlayerRow from the cleaned tokens of one box-score line.

    The line is read as: an optional leading jersey number, then name tokens,
    then stat tokens consumed in the fixed order
    MIN PTS FG 3P FT OR DR REB AST TO STL BLK PF. Missing trailing stats
    default to "0".

    Args:
        tokens: Cleaned text tokens of a single line, left to right.
        header_map: Column index -> column name mapping from the header,
            e.g. { 0: "num", 1: "name", 2: "min", 3: "pts", ... }.
            Currently unused: stats are read positionally.

    Returns:
        None when fewer than three tokens are given, or when the name is
        shorter than two characters and the line mentions "total" or "team".
        Otherwise a ParsedPlayerRow whose ``row_confidence`` is 0.8 when all
        stats were extracted, and 0.3 when no stat tokens were found or the
        extraction failed part-way.
    """
    if not tokens or len(tokens) < 3:
        return None

    jersey_num = None
    name_start = 0

    # A leading number is the jersey. isdecimal() is used rather than
    # isdigit() because isdigit() also accepts characters (e.g. superscript
    # digits from OCR noise) that int() rejects with ValueError.
    if tokens[0].isdecimal():
        jersey_num = int(tokens[0])
        name_start = 1

    # Stats begin at the first token made only of digits and stat punctuation
    # (number, percentage, made/attempted, or MM:SS clock such as 30:41).
    # If no such token exists, the whole remainder is the name and there are
    # no stats at all.
    stats_start = next(
        (
            i
            for i in range(name_start, len(tokens))
            if re.match(r'^[\d.:/%-]+$', tokens[i])
        ),
        len(tokens),
    )

    player_name = " ".join(tokens[name_start:stats_start]).strip()
    if len(player_name) < 2:
        # Ignore team total rows or noise
        joined = " ".join(tokens).lower()
        if "total" in joined or "team" in joined:
            return None

    stats = tokens[stats_start:]

    row = ParsedPlayerRow(player_name_raw=player_name, jersey_number=jersey_num)

    if not stats:
        # Nothing on the line looks like a stat. Do not re-read name tokens
        # as stats (which would store a name word in ``minutes``); just flag
        # the row as low confidence.
        row.row_confidence = 0.3
        return row

    # Positional reader: hands out the next stat token, or the default once
    # the line runs out of tokens.
    remaining = iter(stats)

    def next_stat(default="0"):
        return next(remaining, default)

    # Extracting logic built for typical FIBA density assuming strict order
    try:
        row.minutes = next_stat()
        row.points = int(next_stat())

        row.fg_made, row.fg_attempted = extract_made_attempted(next_stat())
        row.three_p_made, row.three_p_attempted = extract_made_attempted(next_stat())
        row.ft_made, row.ft_attempted = extract_made_attempted(next_stat())

        for field in ("off_reb", "def_reb", "reb", "ast", "turnovers", "stl", "blk", "pf"):
            setattr(row, field, int(next_stat()))

        # Derived percentages; skipped when there were no attempts so we
        # never divide by zero.
        if row.fg_attempted > 0:
            row.fg_pct = round((row.fg_made / row.fg_attempted) * 100, 1)
        if row.three_p_attempted > 0:
            row.three_p_pct = round((row.three_p_made / row.three_p_attempted) * 100, 1)
        if row.ft_attempted > 0:
            row.ft_pct = round((row.ft_made / row.ft_attempted) * 100, 1)

        # Two-pointers are field goals minus three-pointers.
        row.two_p_made = row.fg_made - row.three_p_made
        row.two_p_attempted = row.fg_attempted - row.three_p_attempted
        if row.two_p_attempted > 0:
            row.two_p_pct = round((row.two_p_made / row.two_p_attempted) * 100, 1)

        row.row_confidence = 0.8  # Default ok confidence if parsing passed
    except Exception:
        # Deliberately broad: parsing is best-effort over noisy OCR text and
        # the made/attempted helper's failure modes are not visible here.
        # Fields assigned before the failure are kept; the row is only
        # penalized.
        row.row_confidence = 0.3

    return row

def extract_team_summary(lines: List[List[RawOCRBlock]]) -> dict:
    """
    Scan reconstructed lines for team footer stats such as
    'Points from turnovers: 14'.

    A line is considered when its lower-cased text contains "points from" or
    "turnovers:". The value taken is the first integer that follows that
    label; if no integer follows it, the first integer anywhere on the line
    is used instead. When several lines match, the last one wins.

    Args:
        lines: Lines of OCR blocks; the ``text`` of each block in a line is
            joined with single spaces before matching.

    Returns:
        A dict holding ``'points_from_turnovers'`` (int) when a matching line
        with a number was found, otherwise an empty dict.
    """
    summary = {}
    for line in lines:
        text = " ".join(b.text for b in line).lower()

        # Leftmost occurrence of either label on the line.
        # NOTE(review): a bare "turnovers: N" line is also stored as
        # points_from_turnovers - confirm that this is intended.
        label = re.search(r'points from|turnovers:', text)
        if label is None:
            continue

        # Prefer the number after the label so that an unrelated number
        # earlier on the same line (e.g. "2nd chance points 8 ...") is not
        # mistaken for the value; fall back to the first number on the line.
        value = re.search(r'\d+', text[label.end():]) or re.search(r'\d+', text)
        if value:
            summary['points_from_turnovers'] = int(value.group())
        # Add more regexes based on typical footers
    return summary