Spaces:

Syntrex
/

2026_MLB_Model

Running

App Files Files

2026_MLB_Model / data /data.py

Syntrex

Create data.py

e0d0e3c verified 2 months ago

raw

history blame

5.63 kB

	from __future__ import annotations

	import re
	from datetime import datetime
	from typing import Any

	import pandas as pd
	import requests

	# HTTP headers sent with the scores-page request in fetch_scores_for_date.
	# NOTE(review): presumably a browser-like User-Agent is needed for the site
	# to serve the page to a script -- confirm.
	HEADERS = {
	"User-Agent": "Mozilla/5.0",
	"Accept-Language": "en-US,en;q=0.9",
	}

	# URL of the scores page for one day; ``{date_str}`` is filled in verbatim
	# from the caller's argument.
	# NOTE(review): the expected date format is not visible in this file
	# (presumably YYYY-MM-DD) -- confirm against the caller.
	SCORES_URL_TEMPLATE = "https://www.mlb.com/world-baseball-classic/scores/{date_str}"

	# Three-letter team code -> display name. Codes missing from this table are
	# passed through unchanged by _full_team.
	TEAM_MAP = {
	"AUS": "Australia",
	"BRA": "Brazil",
	"CAN": "Canada",
	"CHN": "China",
	"TPE": "Chinese Taipei",
	"COL": "Colombia",
	"CUB": "Cuba",
	"CZE": "Czechia",
	"DOM": "Dominican Republic",
	"GBR": "Great Britain",
	"ISR": "Israel",
	"ITA": "Italy",
	"JPN": "Japan",
	"KOR": "Korea",
	"MEX": "Mexico",
	"NED": "Netherlands",
	"NCA": "Nicaragua",
	"PAN": "Panama",
	"PUR": "Puerto Rico",
	"USA": "United States",
	"VEN": "Venezuela",
	}

	# Whole-line patterns applied to the text lines extracted from the page.
	# FINAL_RE: a finished game such as "USA 5, JPN 3"; the parser reads the
	#   groups as (away code, away score, home code, home score).
	# TEAM_AT_RE: a matchup header such as "USA @ JPN", read as away @ home.
	# TIME_RE: a start time such as "7:00 PM ET".
	FINAL_RE = re.compile(r"^([A-Z]{3})\s+(\d+),\s+([A-Z]{3})\s+(\d+)$")
	TEAM_AT_RE = re.compile(r"^([A-Z]{3})\s+@\s+([A-Z]{3})$")
	TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
	# Lines accepted as a broadcaster label when they directly follow a "LIVE"
	# or start-time line.
	TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}


	def _full_team(code: str) -> str:
	    """Translate a team code into its display name.

	    Args:
	        code: Team abbreviation as it appears on the scores page.

	    Returns:
	        The matching ``TEAM_MAP`` value, or ``code`` itself when the
	        abbreviation has no entry in the table.
	    """
	    try:
	        return TEAM_MAP[code]
	    except KeyError:
	        # Unknown abbreviation: pass it through rather than fail.
	        return code


	def _strip_html_to_lines(html: str) -> list[str]:
	text = re.sub(r"<script.*?</script>", " ", html, flags=re.DOTALL \| re.IGNORECASE)
	text = re.sub(r"<style.*?</style>", " ", text, flags=re.DOTALL \| re.IGNORECASE)
	text = re.sub(r"<[^>]+>", "\n", text)
	text = text.replace(" ", " ")
	text = re.sub(r"\r", "\n", text)
	text = re.sub(r"\n+", "\n", text)

	lines = [line.strip() for line in text.split("\n") if line.strip()]

	cleaned: list[str] = []
	for line in lines:
	if line.startswith("Image:"):
	continue
	if line.startswith("calendar-"):
	continue
	if line in {"Wrap", "Box", "Story", "Preview", "Tickets"}:
	continue
	cleaned.append(line)

	deduped: list[str] = []
	for line in cleaned:
	if not deduped or deduped[-1] != line:
	deduped.append(line)

	return deduped


	# Column order of the DataFrame returned by fetch_scores_for_date.
	_SCORE_COLUMNS = (
	    "score_fetch_time",
	    "game_date",
	    "away_team",
	    "home_team",
	    "away_score",
	    "home_score",
	    "status",
	    "start_time_et",
	    "tv",
	)


	def _score_row(
	    fetched_at: datetime,
	    date_str: str,
	    away_team: str,
	    home_team: str,
	    *,
	    status: str,
	    away_score: int | None = None,
	    home_score: int | None = None,
	    start_time_et: str = "",
	    tv: str = "",
	) -> dict[str, Any]:
	    """Build one game record with the keys listed in ``_SCORE_COLUMNS``."""
	    return {
	        "score_fetch_time": fetched_at,
	        "game_date": date_str,
	        "away_team": away_team,
	        "home_team": home_team,
	        "away_score": away_score,
	        "home_score": home_score,
	        "status": status,
	        "start_time_et": start_time_et,
	        "tv": tv,
	    }


	def _parse_score_lines(
	    lines: list[str], date_str: str, fetched_at: datetime
	) -> list[dict[str, Any]]:
	    """Turn extracted page lines into game records (no I/O)."""
	    rows: list[dict[str, Any]] = []
	    # Teams from the most recent "AAA @ BBB" line that has not yet been
	    # paired with a "LIVE" or start-time line.
	    pending: tuple[str, str] | None = None

	    total = len(lines)
	    i = 0
	    while i < total:
	        line = lines[i]
	        i += 1  # from here on, lines[i] is the look-ahead line

	        final = FINAL_RE.match(line)
	        if final:
	            away_code, away_score, home_code, home_score = final.groups()
	            rows.append(
	                _score_row(
	                    fetched_at,
	                    date_str,
	                    _full_team(away_code),
	                    _full_team(home_code),
	                    status="Final",
	                    away_score=int(away_score),
	                    home_score=int(home_score),
	                )
	            )
	            continue

	        matchup = TEAM_AT_RE.match(line)
	        if matchup:
	            away_code, home_code = matchup.groups()
	            pending = (_full_team(away_code), _full_team(home_code))
	            continue

	        if pending is None:
	            continue

	        if line == "LIVE":
	            status, start_time_et = "Live", ""
	        elif TIME_RE.match(line):
	            status, start_time_et = "Scheduled", line
	        else:
	            # Unrelated line; keep waiting for this matchup's status.
	            continue

	        # A broadcaster label, when present, directly follows the status
	        # line and is consumed together with it.
	        tv = ""
	        if i < total and lines[i] in TV_MARKERS:
	            tv = lines[i]
	            i += 1

	        away_team, home_team = pending
	        rows.append(
	            _score_row(
	                fetched_at,
	                date_str,
	                away_team,
	                home_team,
	                status=status,
	                start_time_et=start_time_et,
	                tv=tv,
	            )
	        )
	        pending = None

	    return rows


	def fetch_scores_for_date(date_str: str) -> pd.DataFrame:
	    """Download the scores page for one date and parse it into a table.

	    The page text is scanned line by line for three shapes:

	    * ``"AAA n, BBB m"`` -> a ``"Final"`` row with integer scores;
	    * ``"AAA @ BBB"`` followed later by ``"LIVE"`` -> a ``"Live"`` row;
	    * ``"AAA @ BBB"`` followed later by a time such as ``"7:00 PM ET"``
	      -> a ``"Scheduled"`` row with that time in ``start_time_et``.

	    For live and scheduled games a broadcaster label from ``TV_MARKERS``
	    directly after the status line is stored in ``tv``; their scores are
	    ``None``. Team codes are expanded through ``TEAM_MAP``.

	    Args:
	        date_str: Date inserted verbatim into ``SCORES_URL_TEMPLATE`` and
	            copied into the ``game_date`` column.
	            NOTE(review): expected format is not visible here -- confirm.

	    Returns:
	        A DataFrame with the columns in ``_SCORE_COLUMNS``, de-duplicated on
	        date/teams/status/scores and re-indexed from 0. It has zero rows
	        when nothing on the page matched.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	    """
	    from datetime import timezone

	    url = SCORES_URL_TEMPLATE.format(date_str=date_str)
	    response = requests.get(url, headers=HEADERS, timeout=30)
	    response.raise_for_status()

	    # One naive-UTC timestamp for the whole fetch: every row of a single
	    # snapshot carries the same value, and the deprecated
	    # datetime.utcnow() is avoided while keeping a tz-naive column.
	    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None)

	    rows = _parse_score_lines(
	        _strip_html_to_lines(response.text), date_str, fetched_at
	    )
	    if not rows:
	        return pd.DataFrame(columns=list(_SCORE_COLUMNS))

	    df = pd.DataFrame(rows, columns=list(_SCORE_COLUMNS))
	    df = df.drop_duplicates(
	        subset=[
	            "game_date",
	            "away_team",
	            "home_team",
	            "status",
	            "away_score",
	            "home_score",
	        ]
	    )
	    return df.reset_index(drop=True)