Spaces:

icanedit2
/

BakoAI

Sleeping

BakoAI / app /stat_import /parsing /row_parser.py

Okidi Norbert

Deployment fix: clean backend only

c6abe34 13 days ago

5.14 kB

	import re
	from typing import List, Optional

	from app.stat_import.schemas import RawOCRBlock, ParsedPlayerRow
	from app.stat_import.parsing.column_mapper import identify_column, extract_made_attempted, parse_percentage

	def reconstruct_lines(blocks: List[RawOCRBlock], y_tolerance: float = 20.0) -> List[List[RawOCRBlock]]:
	    """Cluster OCR blocks into text rows by vertical proximity.

	    Blocks are visited top-to-bottom (ascending ``y``). A block joins the
	    row being built unless its ``y`` exceeds the mean ``y`` of that row by
	    more than ``y_tolerance``, in which case it opens a new row.

	    Args:
	        blocks: OCR blocks; only their ``x`` and ``y`` attributes are read.
	        y_tolerance: Largest allowed distance between a block's ``y`` and
	            the running mean ``y`` of the current row.

	    Returns:
	        Rows in top-to-bottom order, each ordered left-to-right by ``x``.
	        An empty input yields an empty list.
	    """
	    if not blocks:
	        return []

	    rows = []
	    bucket = []
	    # Far-away sentinel so the very first block always ends up alone in
	    # the first bucket, whichever branch it takes.
	    mean_y = -100.0

	    for block in sorted(blocks, key=lambda item: item.y):
	        starts_new_row = block.y - mean_y > y_tolerance
	        if starts_new_row:
	            if bucket:
	                rows.append(bucket)
	            bucket = [block]
	        else:
	            bucket.append(block)

	        # Compare the next block against the row's average height, not
	        # against the last block only.
	        mean_y = sum(member.y for member in bucket) / len(bucket)

	    # Flush the row that was still open when the input ran out.
	    if bucket:
	        rows.append(bucket)

	    # Reading order inside a row is left-to-right.
	    return [sorted(row, key=lambda item: item.x) for row in rows]

	def parse_stat_line(tokens: List[str], header_map: dict) -> Optional[ParsedPlayerRow]:
	    """Build a ParsedPlayerRow from the cleaned tokens of one box-score line.

	    The line is read as ``[jersey] name... stats...``: an optional leading
	    all-decimal token is the jersey number, the tokens up to the first
	    stat-looking token (made only of digits, ``.``, ``:``, ``/``, ``%``,
	    ``-``) form the player name, and the rest is consumed positionally in
	    the fixed order MIN PTS FG 3P FT OR DR REB AST TO STL BLK PF. Missing
	    trailing stats default to ``"0"``.

	    Args:
	        tokens: Cleaned text tokens of the line, left to right.
	        header_map: Column-index -> column-name mapping from the header,
	            e.g. ``{0: "num", 1: "name", 2: "min", 3: "pts", ...}``.
	            Kept for interface compatibility; the positional heuristic
	            below does not consult it.

	    Returns:
	        ``None`` for lines with fewer than three tokens and for any line
	        containing "total" or "team" (aggregate rows). Otherwise a
	        ParsedPlayerRow whose ``row_confidence`` is 0.8 when every stat
	        parsed, or 0.3 when the line had no stat tokens or extraction
	        failed part-way (fields set before the failure are kept). Rows
	        whose name could not be read are still returned with an empty
	        ``player_name_raw``.
	    """
	    if not tokens or len(tokens) < 3:
	        return None

	    # Aggregate rows are not players. This check used to be nested under
	    # "name is empty", which made it unreachable for rows actually named
	    # "Totals" / "Team Totals".
	    joined = " ".join(tokens).lower()
	    if "total" in joined or "team" in joined:
	        return None

	    # Optional jersey number. isdecimal() (unlike isdigit()) rejects
	    # characters such as superscripts that int() cannot convert.
	    jersey_num = None
	    name_start = 0
	    if tokens[0].isdecimal():
	        jersey_num = int(tokens[0])
	        name_start = 1

	    # The name runs until the first number, percentage, made/attempted
	    # pair or clock time (30:41). If no such token exists, the line has
	    # no stats at all and name tokens must not be read as stats.
	    stats_start = len(tokens)
	    for pos in range(name_start, len(tokens)):
	        if re.match(r'^[\d.:/%-]+$', tokens[pos]):
	            stats_start = pos
	            break

	    player_name = " ".join(tokens[name_start:stats_start]).strip()
	    stats = tokens[stats_start:]

	    row = ParsedPlayerRow(player_name_raw=player_name, jersey_number=jersey_num)

	    if not stats:
	        # Nothing to extract (e.g. "John Smith DNP"): low confidence.
	        row.row_confidence = 0.3
	        return row

	    remaining = iter(stats)

	    def take(default="0"):
	        """Return the next unread stat token, or *default* when exhausted."""
	        return next(remaining, default)

	    # Extraction assumes the dense FIBA-style column order listed above.
	    try:
	        row.minutes = take()
	        row.points = int(take())

	        row.fg_made, row.fg_attempted = extract_made_attempted(take())
	        row.three_p_made, row.three_p_attempted = extract_made_attempted(take())
	        row.ft_made, row.ft_attempted = extract_made_attempted(take())

	        row.off_reb = int(take())
	        row.def_reb = int(take())
	        row.reb = int(take())
	        row.ast = int(take())
	        row.turnovers = int(take())
	        row.stl = int(take())
	        row.blk = int(take())
	        row.pf = int(take())

	        # Percentages are derived only when there were attempts, to avoid
	        # dividing by zero.
	        if row.fg_attempted > 0:
	            row.fg_pct = round((row.fg_made / row.fg_attempted) * 100, 1)
	        if row.three_p_attempted > 0:
	            row.three_p_pct = round((row.three_p_made / row.three_p_attempted) * 100, 1)
	        if row.ft_attempted > 0:
	            row.ft_pct = round((row.ft_made / row.ft_attempted) * 100, 1)

	        # Two-pointers are all field goals minus the three-pointers.
	        row.two_p_made = row.fg_made - row.three_p_made
	        row.two_p_attempted = row.fg_attempted - row.three_p_attempted
	        if row.two_p_attempted > 0:
	            row.two_p_pct = round((row.two_p_made / row.two_p_attempted) * 100, 1)

	        row.row_confidence = 0.8  # Every column parsed.
	    except Exception:
	        # Deliberate best-effort: a malformed token must not abort the
	        # import. Keep what was parsed and flag the row for review.
	        row.row_confidence = 0.3

	    return row

	def extract_team_summary(lines: List[List[RawOCRBlock]]) -> dict:
	    """Collect team-level footer stats such as 'Points from turnovers: 14'.

	    Only lines carrying the full "points from turnover(s)" label are used.
	    Matching on "turnovers:" or "points from" alone would also pick up
	    unrelated footers (plain turnover counts, bench points, ...) and store
	    them under the wrong key.

	    Args:
	        lines: OCR blocks grouped into lines; each block's ``text`` is read.

	    Returns:
	        A dict holding ``points_from_turnovers`` when a matching footer line
	        with a number was found (the last such line wins); otherwise empty.
	    """
	    label = re.compile(r'points\s+from\s+turnovers?')
	    number = re.compile(r'\d+')

	    summary = {}
	    for line in lines:
	        text = " ".join(b.text for b in line).lower()
	        hit = label.search(text)
	        if hit is None:
	            continue

	        # Prefer the number that follows the label, since a line may hold
	        # several stats. Fall back to the first number on the line for
	        # layouts that print the value before the label.
	        value = number.search(text, hit.end()) or number.search(text)
	        if value:
	            summary['points_from_turnovers'] = int(value.group())
	    # Add more patterns here for other typical footers.
	    return summary