| import re |
| from typing import List, Optional |
|
|
| from app.stat_import.schemas import RawOCRBlock, ParsedPlayerRow |
| from app.stat_import.parsing.column_mapper import identify_column, extract_made_attempted, parse_percentage |
|
|
def reconstruct_lines(blocks: List[RawOCRBlock], y_tolerance: float = 20.0) -> List[List[RawOCRBlock]]:
    """Cluster OCR blocks into text rows by vertical position.

    Blocks are visited in ascending ``y`` order. A block opens a new row when
    its ``y`` exceeds the mean ``y`` of the row being built by more than
    ``y_tolerance``; otherwise it joins that row. Each finished row is then
    ordered by ascending ``x``.

    Args:
        blocks: OCR blocks exposing ``x`` and ``y`` attributes.
        y_tolerance: Largest allowed gap between a block's ``y`` and the
            current row's mean ``y`` for the block to stay in that row.

    Returns:
        Rows in top-to-bottom order, each a list of blocks sorted by ``x``.
        An empty input yields an empty list.
    """
    if not blocks:
        return []

    rows = []
    # Reference height the next block is compared against; once a row exists
    # this is always the mean y of the row under construction.
    row_mean_y = -100.0

    for block in sorted(blocks, key=lambda item: item.y):
        opens_new_row = block.y - row_mean_y > y_tolerance
        if opens_new_row or not rows:
            rows.append([block])
        else:
            rows[-1].append(block)

        members = rows[-1]
        row_mean_y = sum(member.y for member in members) / len(members)

    for row in rows:
        row.sort(key=lambda item: item.x)

    return rows
|
|
def parse_stat_line(tokens: List[str], header_map: dict) -> Optional[ParsedPlayerRow]:
    """
    Build a ParsedPlayerRow from the cleaned tokens of one box-score line.

    Expected token layout: an optional leading jersey number, one or more
    name tokens, then stat tokens. Stat tokens are consumed positionally in
    this fixed order: minutes, points, FG, 3P, FT, offensive rebounds,
    defensive rebounds, total rebounds, assists, turnovers, steals, blocks,
    personal fouls. Missing trailing stats default to "0".

    Args:
        tokens: Cleaned text tokens of a single line, left to right.
        header_map: Column index -> column key mapping from the header, e.g.
            { 0: "num", 1: "name", 2: "min", 3: "pts", ... }.
            Kept for interface compatibility; this function does not
            currently consult it.

    Returns:
        None when fewer than three tokens are given, or when no usable
        player name (at least two characters) was found and the line
        mentions "total" or "team". Otherwise a ParsedPlayerRow whose
        ``row_confidence`` is 0.8 after a clean parse and 0.3 when stat
        parsing failed part-way or the line had no stat tokens at all.
    """
    if not tokens or len(tokens) < 3:
        return None

    jersey_num = None
    name_start = 0
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as "²" or "①" (plausible OCR output) that int() cannot convert,
    # which used to raise ValueError outside any handler.
    if tokens[0].isdecimal():
        jersey_num = int(tokens[0])
        name_start = 1

    # The name runs until the first token made only of digits and stat
    # punctuation. If there is no such token, the line carries no stats.
    stats_start = len(tokens)
    for i in range(name_start, len(tokens)):
        if re.match(r'^[\d.:/%-]+$', tokens[i]):
            stats_start = i
            break

    player_name = " ".join(tokens[name_start:stats_start]).strip()
    if not player_name or len(player_name) < 2:
        # Nameless lines are dropped only when they look like a totals row.
        line_text = " ".join(tokens).lower()
        if "total" in line_text or "team" in line_text:
            return None

    stats = tokens[stats_start:]
    row = ParsedPlayerRow(player_name_raw=player_name, jersey_number=jersey_num)

    if not stats:
        # No numeric token was found. Previously the name tokens were fed to
        # the stat parser, leaving a name fragment in ``minutes`` before the
        # first int() failed; report the same low confidence without that.
        row.row_confidence = 0.3
        return row

    remaining = iter(stats)

    def take(default="0"):
        """Return the next unread stat token, or ``default`` when exhausted."""
        return next(remaining, default)

    def pct(made, attempted):
        """Shooting percentage rounded to one decimal place."""
        return round((made / attempted) * 100, 1)

    try:
        row.minutes = take()
        row.points = int(take())

        row.fg_made, row.fg_attempted = extract_made_attempted(take())
        row.three_p_made, row.three_p_attempted = extract_made_attempted(take())
        row.ft_made, row.ft_attempted = extract_made_attempted(take())

        for field in ("off_reb", "def_reb", "reb", "ast",
                      "turnovers", "stl", "blk", "pf"):
            setattr(row, field, int(take()))

        # Percentages are derived from made/attempted and only set when
        # there was at least one attempt.
        if row.fg_attempted > 0:
            row.fg_pct = pct(row.fg_made, row.fg_attempted)
        if row.three_p_attempted > 0:
            row.three_p_pct = pct(row.three_p_made, row.three_p_attempted)
        if row.ft_attempted > 0:
            row.ft_pct = pct(row.ft_made, row.ft_attempted)

        # Two-pointers are whatever field goals were not three-pointers.
        row.two_p_made = row.fg_made - row.three_p_made
        row.two_p_attempted = row.fg_attempted - row.three_p_attempted
        if row.two_p_attempted > 0:
            row.two_p_pct = pct(row.two_p_made, row.two_p_attempted)

        row.row_confidence = 0.8
    except Exception:
        # Best-effort parse: keep whatever fields were filled before the
        # failure and flag the row as low confidence instead of raising.
        row.row_confidence = 0.3

    return row
|
|
def extract_team_summary(lines: List[List[RawOCRBlock]]) -> dict:
    """Scan OCR lines for a team summary entry such as 'Points from turnovers: 14'.

    A line qualifies when its lowercased, space-joined text contains
    "turnovers:" or "points from". The first run of digits on a qualifying
    line is stored under 'points_from_turnovers'; a later qualifying line
    that contains digits overwrites an earlier one.

    Args:
        lines: Rows of OCR blocks, each block exposing a ``text`` attribute.

    Returns:
        A dict that is empty, or holds the single key
        'points_from_turnovers' mapped to an int.
    """
    team_totals = {}
    for ocr_line in lines:
        joined = " ".join(block.text for block in ocr_line).lower()
        if "turnovers:" not in joined and "points from" not in joined:
            continue

        first_number = re.search(r'\d+', joined)
        if first_number is None:
            continue
        team_totals['points_from_turnovers'] = int(first_number.group())

    return team_totals
|
|