# BakoAI/app/stat_import/parsing/row_parser.py
# Okidi Norbert
# Deployment fix: clean backend only
# c6abe34
import re
from typing import List, Optional
from app.stat_import.schemas import RawOCRBlock, ParsedPlayerRow
from app.stat_import.parsing.column_mapper import identify_column, extract_made_attempted, parse_percentage
def reconstruct_lines(blocks: List[RawOCRBlock], y_tolerance: float = 20.0) -> List[List[RawOCRBlock]]:
    """Cluster OCR blocks into text rows, top to bottom, each ordered left to right.

    Blocks are visited in ascending ``y`` order. A block joins the row being
    built while its ``y`` is no more than ``y_tolerance`` below that row's
    mean ``y``; otherwise it opens a new row.

    Args:
        blocks: OCR blocks exposing ``x`` and ``y`` attributes.
        y_tolerance: Largest vertical gap (block ``y`` minus the current row's
            mean ``y``) that still counts as the same row.

    Returns:
        A list of rows; every row is a list of blocks sorted by ``x``.
        An empty input yields an empty list.
    """
    if not blocks:
        return []

    rows = []
    for block in sorted(blocks, key=lambda item: item.y):
        if rows:
            open_row = rows[-1]
            # Compare against the mean y of the row so far, not just its last block.
            row_mean_y = sum(member.y for member in open_row) / len(open_row)
            starts_new_row = block.y - row_mean_y > y_tolerance
        else:
            # The very first block always opens the first row.
            starts_new_row = True

        if starts_new_row:
            rows.append([block])
        else:
            rows[-1].append(block)

    # Within a row, reading order is left to right.
    for row in rows:
        row.sort(key=lambda item: item.x)
    return rows
# Words that mark the name column as a team/aggregate label rather than a player.
_TEAM_ROW_WORDS = frozenset({"total", "totals", "team"})


def _is_stat_token(token: str) -> bool:
    """Return True when *token* holds only digits and stat punctuation (``30:41``, ``5/9``, ``45.5%``)."""
    return re.match(r'^[\d.:/%-]+$', token) is not None


def _is_team_row(player_name: str) -> bool:
    """Return True when the name column carries a team/total label instead of a player."""
    return any(word in _TEAM_ROW_WORDS for word in re.findall(r'[a-z]+', player_name.lower()))


def parse_stat_line(tokens: List[str], header_map: dict) -> Optional[ParsedPlayerRow]:
    """Build a ParsedPlayerRow from the cleaned tokens of one box-score line.

    Layout assumed for a line: optional jersey number, one or more name
    tokens, then the stat columns in the fixed order
    ``MIN PTS FG 3P FT OR DR REB AST TO STL BLK PF``.

    Args:
        tokens: Cleaned text tokens of a single line, left to right.
        header_map: Column index -> column name mapping taken from the header,
            e.g. ``{0: "num", 1: "name", 2: "min", 3: "pts", ...}``.
            Accepted for interface compatibility; the current implementation
            does not consult it and always uses the fixed column order above.

    Returns:
        ``None`` when the line has fewer than three tokens or is a team/total
        row. Otherwise a ``ParsedPlayerRow`` whose ``row_confidence`` is
        ``0.8`` when every column was extracted and ``0.3`` when the line has
        no stat tokens at all or extraction failed part-way (fields set before
        the failure are kept).
    """
    if not tokens or len(tokens) < 3:
        return None

    # A leading all-decimal token is the jersey number. isdecimal() (not
    # isdigit()) is used because int() rejects digit-like characters such as
    # superscripts that isdigit() accepts.
    jersey_num = None
    name_start = 0
    if tokens[0].isdecimal():
        jersey_num = int(tokens[0])
        name_start = 1

    # The name runs until the first number / percentage / clock-time token.
    # When there is no such token the whole remainder is the name and the
    # stat section is empty (name tokens must never be re-read as stats).
    stats_start = next(
        (i for i in range(name_start, len(tokens)) if _is_stat_token(tokens[i])),
        len(tokens),
    )
    player_name = " ".join(tokens[name_start:stats_start]).strip()
    stats = tokens[stats_start:]

    # Ignore team total rows: labels such as "Totals" or "Team" sit in the
    # name column, so they have to be recognised from the name itself.
    if _is_team_row(player_name):
        return None
    if len(player_name) < 2:
        # Nameless/noise line: drop it when a team label appears anywhere.
        joined = " ".join(tokens).lower()
        if "total" in joined or "team" in joined:
            return None

    row = ParsedPlayerRow(player_name_raw=player_name, jersey_number=jersey_num)

    if not stats:
        # Nothing to map; hand the row back flagged as low confidence.
        row.row_confidence = 0.3
        return row

    stat_iter = iter(stats)

    def take(default: str = "0") -> str:
        """Return the next stat token, or *default* once the line is exhausted."""
        return next(stat_iter, default)

    # Positional extraction assuming the strict column order documented above.
    try:
        row.minutes = take()
        row.points = int(take())
        # extract_made_attempted is expected to return a (made, attempted) pair.
        row.fg_made, row.fg_attempted = extract_made_attempted(take())
        row.three_p_made, row.three_p_attempted = extract_made_attempted(take())
        row.ft_made, row.ft_attempted = extract_made_attempted(take())
        row.off_reb = int(take())
        row.def_reb = int(take())
        row.reb = int(take())
        row.ast = int(take())
        row.turnovers = int(take())
        row.stl = int(take())
        row.blk = int(take())
        row.pf = int(take())

        # Derived percentages; skipped when there were no attempts to avoid
        # dividing by zero.
        if row.fg_attempted > 0:
            row.fg_pct = round((row.fg_made / row.fg_attempted) * 100, 1)
        if row.three_p_attempted > 0:
            row.three_p_pct = round((row.three_p_made / row.three_p_attempted) * 100, 1)
        if row.ft_attempted > 0:
            row.ft_pct = round((row.ft_made / row.ft_attempted) * 100, 1)

        # Two-pointers are field goals minus three-pointers.
        row.two_p_made = row.fg_made - row.three_p_made
        row.two_p_attempted = row.fg_attempted - row.three_p_attempted
        if row.two_p_attempted > 0:
            row.two_p_pct = round((row.two_p_made / row.two_p_attempted) * 100, 1)

        row.row_confidence = 0.8  # Every column was extracted.
    except Exception:
        # Deliberately best-effort: OCR noise (non-numeric tokens, malformed
        # made/attempted pairs) must not abort the import. Keep what was
        # parsed and flag the row instead.
        row.row_confidence = 0.3
    return row
def extract_team_summary(lines: List[List[RawOCRBlock]]) -> dict:
    """Collect team-level footer values such as ``Points from turnovers: 14``.

    Each line's block texts are joined with spaces and lower-cased. A line is
    treated as the "points from turnovers" footer only when it contains both
    ``"points from"`` and ``"turnover"``; a plain ``Turnovers: 9`` count or
    another ``Points from ...`` footer is therefore not mistaken for it.

    Args:
        lines: Reconstructed rows of OCR blocks; each block exposes ``text``.

    Returns:
        A dict that holds ``'points_from_turnovers'`` (int, the first number
        on the matching line) when such a line exists. If several lines
        match, the last one wins. Empty when nothing matched.
    """
    summary = {}
    for line in lines:
        text = " ".join(block.text for block in line).lower()
        if "points from" in text and "turnover" in text:
            first_number = re.search(r'\d+', text)
            if first_number:
                summary['points_from_turnovers'] = int(first_number.group())
        # Add more regexes based on typical footers
    return summary