Spaces:

icanedit2
/

BakoAI

Sleeping

File size: 5,141 Bytes

c6abe34

import re
from typing import List, Optional

from app.stat_import.schemas import RawOCRBlock, ParsedPlayerRow
from app.stat_import.parsing.column_mapper import identify_column, extract_made_attempted, parse_percentage

def reconstruct_lines(blocks: List[RawOCRBlock], y_tolerance: float = 20.0) -> List[List[RawOCRBlock]]:
    """Cluster OCR blocks into rows (top to bottom), each ordered left to right.

    Blocks are visited in ascending ``y`` order. A block stays in the row being
    built unless its ``y`` exceeds the mean ``y`` of that row by more than
    ``y_tolerance``; in that case the row is closed and the block opens a new one.

    Args:
        blocks: OCR blocks; only their ``x`` and ``y`` attributes are read here.
        y_tolerance: Maximum distance below the current row's mean ``y`` at which
            a block is still considered part of that row.

    Returns:
        A list of rows, each a list of blocks sorted by ``x``. Empty when
        ``blocks`` is empty.
    """
    if not blocks:
        return []

    rows = []
    row = []
    row_y_total = 0  # running sum of y over `row`, so the mean is cheap to refresh

    for block in sorted(blocks, key=lambda blk: blk.y):
        # Compare against the row's mean y, not just the previous block's y.
        if row and block.y - row_y_total / len(row) > y_tolerance:
            rows.append(row)
            row = []
            row_y_total = 0
        row.append(block)
        row_y_total += block.y

    if row:
        rows.append(row)

    # Within a row, reading order is left-to-right.
    return [sorted(members, key=lambda blk: blk.x) for members in rows]

# A token made only of digits and stat punctuation, e.g. "12", "30:41", "5/9", "45.5%", "-".
_STAT_TOKEN_RE = re.compile(r'^[\d.:/%-]+$')
# Whole-word labels that mark an aggregate (team / totals) row rather than a player.
_TEAM_ROW_RE = re.compile(r'\b(?:team|totals?)\b', re.IGNORECASE)


def parse_stat_line(tokens: List[str], header_map: dict) -> Optional[ParsedPlayerRow]:
    """Build a ParsedPlayerRow from the cleaned tokens of one box-score line.

    Expected token layout: an optional jersey number, one or more name tokens,
    then stat tokens read in the fixed order
    MIN PTS FG 3P FT OR DR REB AST TO STL BLK PF. Missing trailing stats
    default to "0".

    Args:
        tokens: Cleaned text tokens of a single line, left to right.
        header_map: Column index -> column key mapping taken from the header,
            e.g. ``{0: "num", 1: "name", 2: "min", 3: "pts", ...}``.
            NOTE: currently not consulted; the fixed positional order is used.

    Returns:
        ``None`` when the line has fewer than three tokens or is a team/totals
        row. Otherwise a ParsedPlayerRow whose ``row_confidence`` is 0.8 when
        every stat parsed and 0.3 when the line had no stat tokens or
        extraction failed part-way (fields set before the failure are kept).
    """
    if not tokens or len(tokens) < 3:
        return None

    jersey_num = None
    name_start = 0
    # isdecimal() accepts only characters int() can convert; isdigit() also
    # accepts e.g. superscript digits, which would make int() raise here.
    if tokens[0].isdecimal():
        jersey_num = int(tokens[0])
        name_start = 1

    # Name tokens run until the first numeric / clock-like token. If none is
    # found the line carries no stats, so the stats slice must be empty rather
    # than re-reading the name tokens as stats.
    stats_start = len(tokens)
    for i in range(name_start, len(tokens)):
        if _STAT_TOKEN_RE.match(tokens[i]):
            stats_start = i
            break

    player_name = " ".join(tokens[name_start:stats_start]).strip()
    stats = tokens[stats_start:]

    if len(player_name) < 2:
        # Nameless line: drop it if it mentions a team/total label anywhere.
        row_text = " ".join(tokens).lower()
        if "total" in row_text or "team" in row_text:
            return None
    elif _TEAM_ROW_RE.search(player_name):
        # "Totals", "Team/Coach", ... are aggregate rows, not players.
        return None

    row = ParsedPlayerRow(player_name_raw=player_name, jersey_number=jersey_num)

    if not stats:
        # A name with no stats at all: keep the row but flag it as unreliable.
        row.row_confidence = 0.3
        return row

    remaining = iter(stats)

    def take(default: str = "0") -> str:
        """Return the next unread stat token, or ``default`` once exhausted."""
        return next(remaining, default)

    # Deliberately best-effort: any failure (non-numeric token, or whatever
    # extract_made_attempted raises) keeps the fields parsed so far and only
    # lowers the confidence.
    try:
        row.minutes = take()
        row.points = int(take())

        # Each shooting column is unpacked as a (made, attempted) pair.
        row.fg_made, row.fg_attempted = extract_made_attempted(take())
        row.three_p_made, row.three_p_attempted = extract_made_attempted(take())
        row.ft_made, row.ft_attempted = extract_made_attempted(take())

        row.off_reb = int(take())
        row.def_reb = int(take())
        row.reb = int(take())
        row.ast = int(take())
        row.turnovers = int(take())
        row.stl = int(take())
        row.blk = int(take())
        row.pf = int(take())

        # Percentages are only derived when there was at least one attempt.
        if row.fg_attempted > 0:
            row.fg_pct = round((row.fg_made / row.fg_attempted) * 100, 1)
        if row.three_p_attempted > 0:
            row.three_p_pct = round((row.three_p_made / row.three_p_attempted) * 100, 1)
        if row.ft_attempted > 0:
            row.ft_pct = round((row.ft_made / row.ft_attempted) * 100, 1)

        # Two-pointers are field goals minus three-pointers.
        row.two_p_made = row.fg_made - row.three_p_made
        row.two_p_attempted = row.fg_attempted - row.three_p_attempted
        if row.two_p_attempted > 0:
            row.two_p_pct = round((row.two_p_made / row.two_p_attempted) * 100, 1)

        row.row_confidence = 0.8  # every column parsed
    except Exception:
        row.row_confidence = 0.3  # extraction failed part-way

    return row

# A points-from-turnovers label followed (after any non-digits) by its value.
_TURNOVER_POINTS_RE = re.compile(r'(?:points from|turnovers:)\D*(\d+)')


def extract_team_summary(lines: List[List[RawOCRBlock]]) -> dict:
    """Collect team footer figures such as 'Points from turnovers: 14'.

    Each line's block texts are joined with spaces and lower-cased. A line is
    considered when it contains "turnovers:" or "points from". The value taken
    is the first number that follows the label; if no number follows it, the
    first number anywhere on the line is used. This matters for multi-column
    footers where another stat precedes the label on the same line.

    Args:
        lines: Reconstructed lines; only each block's ``text`` is read.

    Returns:
        A dict with ``'points_from_turnovers'`` (int) when found, otherwise
        empty. When several lines match, the last one wins.
    """
    summary = {}
    for line in lines:
        text = " ".join(b.text for b in line).lower()
        if "turnovers:" not in text and "points from" not in text:
            continue

        labelled = _TURNOVER_POINTS_RE.search(text)
        if labelled:
            summary['points_from_turnovers'] = int(labelled.group(1))
            continue

        # Label present but no number after it (e.g. "14 points from turnovers").
        nums = re.findall(r'\d+', text)
        if nums:
            summary['points_from_turnovers'] = int(nums[0])
        # Add more regexes based on typical footers
    return summary