import re
from typing import List, Optional

from app.stat_import.schemas import RawOCRBlock, ParsedPlayerRow
from app.stat_import.parsing.column_mapper import (
    identify_column,
    extract_made_attempted,
    parse_percentage,
)

# A token made only of digits and stat punctuation ("12", "30:41", "5/9", "45.5%", "-")
# marks the point where the player name ends and the stat columns begin.
_STAT_TOKEN_RE = re.compile(r'^[\d.:/%-]+$')

# Footer entry such as "points from turnovers: 14" (matched against lower-cased text).
_POINTS_FROM_TURNOVERS_RE = re.compile(r'points\s+from\s+turnovers?[\s:]*(\d+)')

# Row confidence assigned when extraction completed / when it failed or had no stats.
_CONFIDENCE_OK = 0.8
_CONFIDENCE_LOW = 0.3


def reconstruct_lines(blocks: List[RawOCRBlock], y_tolerance: float = 20.0) -> List[List[RawOCRBlock]]:
    """Group OCR blocks into horizontal lines based on Y-coordinate proximity.

    Blocks are visited top-to-bottom. A block starts a new line when its ``y``
    exceeds the running mean ``y`` of the line being built by more than
    ``y_tolerance``; otherwise it joins that line.

    Args:
        blocks: OCR blocks exposing numeric ``x`` and ``y`` attributes.
        y_tolerance: Maximum vertical distance from the current line's mean
            ``y`` for a block to be considered part of that line.

    Returns:
        Lines ordered top-to-bottom, each line's blocks ordered left-to-right.
        An empty list when ``blocks`` is empty.
    """
    if not blocks:
        return []

    lines: List[List[RawOCRBlock]] = []
    current_line: List[RawOCRBlock] = []
    line_y = 0.0  # mean y of current_line; only meaningful once it is non-empty

    for block in sorted(blocks, key=lambda b: b.y):
        if current_line and block.y - line_y > y_tolerance:
            lines.append(current_line)
            current_line = []
        current_line.append(block)
        # Compare against the mean rather than the last block so that a slowly
        # drifting baseline does not chain unrelated rows together.
        line_y = sum(b.y for b in current_line) / len(current_line)

    lines.append(current_line)

    # Sort each line left-to-right
    for line in lines:
        line.sort(key=lambda b: b.x)

    return lines


def _percentage(made, attempted):
    """Return made/attempted as a percentage rounded to one decimal place."""
    return round((made / attempted) * 100, 1)


def parse_stat_line(tokens: List[str], header_map: dict) -> Optional[ParsedPlayerRow]:
    """Build a ParsedPlayerRow from the cleaned tokens of one box-score line.

    Layout assumed: an optional leading jersey number, one or more name tokens,
    then the stat columns in the fixed order
    ``MIN PTS FG 3P FT OR DR REB AST TO STL BLK PF``. Missing trailing columns
    default to ``"0"``.

    Args:
        tokens: Cleaned text tokens of the line, left-to-right.
        header_map: Column index -> column name mapping from the header, e.g.
            ``{0: "num", 1: "name", 2: "min", 3: "pts", ...}``. Currently
            unused: the fixed positional order above is always applied.

    Returns:
        ``None`` when there are fewer than three tokens, or when the line has
        no usable name (fewer than two characters) and mentions "total" or
        "team". Otherwise a row whose ``row_confidence`` is 0.8 when every
        column parsed, or 0.3 when no stat tokens were found or extraction
        failed part-way (fields set before the failure are kept).
    """
    if not tokens or len(tokens) < 3:
        return None

    # Optional leading jersey number. isdecimal() rather than isdigit(): the
    # latter accepts characters such as superscript digits that int() rejects.
    jersey_num = None
    name_start = 0
    if tokens[0].isdecimal():
        jersey_num = int(tokens[0])
        name_start = 1

    # The name runs until the first number / percentage / clock time (30:41).
    # If no such token exists, the stats slice is empty instead of re-reading
    # the name tokens as stats.
    stat_start = next(
        (i for i in range(name_start, len(tokens)) if _STAT_TOKEN_RE.match(tokens[i])),
        len(tokens),
    )
    player_name = " ".join(tokens[name_start:stat_start]).strip()
    stats = tokens[stat_start:]

    if len(player_name) < 2:
        # Ignore team total rows or noise
        # NOTE(review): a row whose *name* is e.g. "Totals" is not filtered
        # here, only nameless rows are - confirm callers handle those.
        line_text = " ".join(tokens).lower()
        if "total" in line_text or "team" in line_text:
            return None

    row = ParsedPlayerRow(player_name_raw=player_name, jersey_number=jersey_num)

    if not stats:
        # Nothing to extract; keep the row but flag it as unreliable.
        row.row_confidence = _CONFIDENCE_LOW
        return row

    remaining = iter(stats)

    def grab(default="0"):
        """Consume and return the next stat token, or ``default`` when exhausted."""
        return next(remaining, default)

    # Extraction assumes the strict column order documented above.
    # extract_made_attempted is expected to return a (made, attempted) pair.
    try:
        row.minutes = grab()
        row.points = int(grab())

        row.fg_made, row.fg_attempted = extract_made_attempted(grab())
        row.three_p_made, row.three_p_attempted = extract_made_attempted(grab())
        row.ft_made, row.ft_attempted = extract_made_attempted(grab())

        row.off_reb = int(grab())
        row.def_reb = int(grab())
        row.reb = int(grab())
        row.ast = int(grab())
        row.turnovers = int(grab())
        row.stl = int(grab())
        row.blk = int(grab())
        row.pf = int(grab())

        # Derived percentages; skipped when there were no attempts so we never
        # divide by zero (the field then keeps its model default).
        if row.fg_attempted > 0:
            row.fg_pct = _percentage(row.fg_made, row.fg_attempted)
        if row.three_p_attempted > 0:
            row.three_p_pct = _percentage(row.three_p_made, row.three_p_attempted)
        if row.ft_attempted > 0:
            row.ft_pct = _percentage(row.ft_made, row.ft_attempted)

        # Two-pointers are field goals minus three-pointers.
        row.two_p_made = row.fg_made - row.three_p_made
        row.two_p_attempted = row.fg_attempted - row.three_p_attempted
        if row.two_p_attempted > 0:
            row.two_p_pct = _percentage(row.two_p_made, row.two_p_attempted)

        row.row_confidence = _CONFIDENCE_OK  # Default ok confidence if parsing passed
    except Exception:
        # Deliberately broad: OCR noise can break any column and the helper's
        # failure modes are not visible here. Keep what was parsed, but
        # penalize the row so downstream review can pick it up.
        row.row_confidence = _CONFIDENCE_LOW

    return row


def extract_team_summary(lines: List[List[RawOCRBlock]]) -> dict:
    """Extract team-level footer stats such as 'Points from turnovers: 14'.

    Args:
        lines: Reconstructed lines; each block must expose a ``text`` attribute.

    Returns:
        A dict containing ``'points_from_turnovers'`` (int) when a matching
        footer entry was found; empty otherwise. If several lines match, the
        last one wins.
    """
    summary = {}
    for line in lines:
        text = " ".join(b.text for b in line).lower()
        # Take the number that directly follows the label, so neither other
        # "points from ..." entries nor earlier numbers on the same line are
        # mistaken for this stat.
        match = _POINTS_FROM_TURNOVERS_RE.search(text)
        if match:
            summary['points_from_turnovers'] = int(match.group(1))
        # Add more regexes based on typical footers
    return summary