File size: 5,141 Bytes
c6abe34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | import re
from typing import List, Optional
from app.stat_import.schemas import RawOCRBlock, ParsedPlayerRow
from app.stat_import.parsing.column_mapper import identify_column, extract_made_attempted, parse_percentage
def reconstruct_lines(blocks: List[RawOCRBlock], y_tolerance: float = 20.0) -> List[List[RawOCRBlock]]:
    """Cluster OCR blocks into text rows, top to bottom, each row ordered left to right.

    Blocks are visited in ascending ``y``. A block joins the row currently
    being built unless its ``y`` exceeds that row's mean ``y`` by more than
    ``y_tolerance``; in that case the row is closed and the block opens a
    new one. Returns an empty list when ``blocks`` is empty.
    """
    if not blocks:
        return []

    rows = []
    active = []
    row_mean_y = -100.0  # placeholder reference until the first block is placed
    for block in sorted(blocks, key=lambda item: item.y):
        starts_new_row = block.y - row_mean_y > y_tolerance
        if not starts_new_row:
            active.append(block)
        else:
            if active:
                rows.append(active)
            active = [block]
        # The reference is the mean y of the row under construction, so it
        # moves as members are added.
        row_mean_y = sum(member.y for member in active) / len(active)

    if active:
        rows.append(active)

    # Reading order inside a row is by x coordinate.
    return [sorted(row, key=lambda item: item.x) for row in rows]
def parse_stat_line(tokens: List[str], header_map: dict) -> Optional[ParsedPlayerRow]:
    """Build a ParsedPlayerRow from the cleaned tokens of one box-score line.

    The line is read as ``[jersey] name... stat stat ...``: an optional
    all-decimal first token is the jersey number, the tokens up to the first
    numeric-looking token (digits, ``.``, ``:``, ``/``, ``%``, ``-``) form the
    player name, and everything after that is consumed positionally in the
    fixed order MIN PTS FG 3P FT OR DR REB AST TO STL BLK PF. Missing
    trailing stats are read as ``"0"``.

    Args:
        tokens: Cleaned text tokens of a single line, left to right.
        header_map: Column index -> column name mapping taken from the header
            row, e.g. ``{0: "num", 1: "name", 2: "min", 3: "pts", ...}``.
            Currently not consulted (stats are mapped by position); accepted
            so the signature stays stable.

    Returns:
        ``None`` when the line has fewer than three tokens or mentions
        "total"/"team" (summary rows, not players). Otherwise a
        ParsedPlayerRow whose ``row_confidence`` is 0.8 after a clean parse
        and 0.3 when no stat tokens were found or extraction failed part-way.
    """
    if not tokens or len(tokens) < 3:
        return None

    # Summary rows ("Totals", "Team rebounds", ...) must be dropped whether or
    # not a name was recognised; "Totals" itself parses as a perfectly valid
    # name, so this check cannot depend on the name being empty.
    line_text = " ".join(tokens).lower()
    if "total" in line_text or "team" in line_text:
        return None

    jersey_num = None
    name_start = 0
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as a superscript two, which int() rejects with ValueError.
    if tokens[0].isdecimal():
        jersey_num = int(tokens[0])
        name_start = 1

    # Stats begin at the first number / percentage / clock time (e.g. 30:41).
    # When there is none, the stats slice must be empty rather than falling
    # back to the name tokens.
    stats_start = next(
        (i for i in range(name_start, len(tokens))
         if re.match(r'^[\d.:/%-]+$', tokens[i])),
        len(tokens),
    )
    player_name = " ".join(tokens[name_start:stats_start]).strip()
    stats = tokens[stats_start:]

    row = ParsedPlayerRow(player_name_raw=player_name, jersey_number=jersey_num)

    if not stats:
        # Nothing numeric on the line (e.g. a "did not play" note): keep the
        # row so the name is not lost, but flag it as unreliable.
        row.row_confidence = 0.3
        return row

    remaining = iter(stats)

    def grab(default="0"):
        """Return the next unread stat token, or ``default`` once exhausted."""
        return next(remaining, default)

    # Positional extraction assuming the dense, strictly ordered layout
    # described in the docstring.
    try:
        row.minutes = grab()
        row.points = int(grab())
        # extract_made_attempted is expected to yield a (made, attempted) pair.
        row.fg_made, row.fg_attempted = extract_made_attempted(grab())
        row.three_p_made, row.three_p_attempted = extract_made_attempted(grab())
        row.ft_made, row.ft_attempted = extract_made_attempted(grab())
        for field in ("off_reb", "def_reb", "reb", "ast", "turnovers", "stl", "blk", "pf"):
            setattr(row, field, int(grab()))

        # Percentages are derived only when there were attempts, which avoids
        # division by zero.
        if row.fg_attempted > 0:
            row.fg_pct = round((row.fg_made / row.fg_attempted) * 100, 1)
        if row.three_p_attempted > 0:
            row.three_p_pct = round((row.three_p_made / row.three_p_attempted) * 100, 1)
        if row.ft_attempted > 0:
            row.ft_pct = round((row.ft_made / row.ft_attempted) * 100, 1)

        # Two-pointers are all field goals minus the three-pointers.
        row.two_p_made = row.fg_made - row.three_p_made
        row.two_p_attempted = row.fg_attempted - row.three_p_attempted
        if row.two_p_attempted > 0:
            row.two_p_pct = round((row.two_p_made / row.two_p_attempted) * 100, 1)

        row.row_confidence = 0.8  # every field parsed
    except Exception:
        # Deliberately broad best-effort: OCR tokens are noisy and the helper's
        # failure modes are not visible here. Fields assigned before the
        # failure are kept; the row is returned with a low confidence.
        row.row_confidence = 0.3
    return row
def extract_team_summary(lines: List[List[RawOCRBlock]]) -> dict:
    """Collect team-level footer values such as 'Points from turnovers: 14'.

    Args:
        lines: OCR blocks grouped into rows; each block's ``text`` is joined
            with single spaces and lower-cased before matching.

    Returns:
        A dict that contains ``'points_from_turnovers'`` (int) when a row
        carries that label followed by a number, and is empty otherwise. If
        several rows carry the label, the last one wins.
    """
    # The number must follow the full label. Matching on "points from" or
    # "turnovers:" alone would also pick up rows like "Points from the bench"
    # or a plain "Turnovers: 15" count and store them under the wrong key.
    label = re.compile(r'points\s+from\s+turnovers?\b\D*(\d+)')

    summary = {}
    for line in lines:
        text = " ".join(b.text for b in line).lower()
        found = label.search(text)
        if found:
            summary['points_from_turnovers'] = int(found.group(1))
    # Add more patterns here for other typical footer rows.
    return summary
|