# 2026_MLB_Model / data / data.py
# Syntrex's picture
# Create data.py
# e0d0e3c verified
# raw
# history blame
# 5.63 kB
from __future__ import annotations
import re
from datetime import datetime
from typing import Any
import pandas as pd
import requests
# HTTP headers passed to requests.get() by fetch_scores_for_date.
HEADERS = {
"User-Agent": "Mozilla/5.0",
"Accept-Language": "en-US,en;q=0.9",
}
# Scores-page URL; the {date_str} placeholder is filled with str.format().
# NOTE(review): the expected date format is not visible here -- confirm against callers.
SCORES_URL_TEMPLATE = "https://www.mlb.com/world-baseball-classic/scores/{date_str}"
# Three-letter team code -> full team name (looked up by _full_team).
TEAM_MAP = {
"AUS": "Australia",
"BRA": "Brazil",
"CAN": "Canada",
"CHN": "China",
"TPE": "Chinese Taipei",
"COL": "Colombia",
"CUB": "Cuba",
"CZE": "Czechia",
"DOM": "Dominican Republic",
"GBR": "Great Britain",
"ISR": "Israel",
"ITA": "Italy",
"JPN": "Japan",
"KOR": "Korea",
"MEX": "Mexico",
"NED": "Netherlands",
"NCA": "Nicaragua",
"PAN": "Panama",
"PUR": "Puerto Rico",
"USA": "United States",
"VEN": "Venezuela",
}
# Whole-line match for a finished game such as "USA 5, JPN 3".
# Groups: code, score, code, score; fetch_scores_for_date reads the first
# pair as the away side and the second pair as the home side.
FINAL_RE = re.compile(r"^([A-Z]{3})\s+(\d+),\s+([A-Z]{3})\s+(\d+)$")
# Whole-line match for a matchup such as "USA @ JPN" (groups: away code, home code).
TEAM_AT_RE = re.compile(r"^([A-Z]{3})\s+@\s+([A-Z]{3})$")
# Whole-line match for a start time such as "7:00 PM ET".
TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
# Lines treated as a broadcaster label when they directly follow a
# "LIVE" or start-time line.
TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}
def _full_team(code: str) -> str:
    """Translate a team code into its full team name.

    Codes that are not keys of ``TEAM_MAP`` are handed back unchanged.
    """
    if code in TEAM_MAP:
        return TEAM_MAP[code]
    return code
def _strip_html_to_lines(html: str) -> list[str]:
text = re.sub(r"<script.*?</script>", " ", html, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r"<style.*?</style>", " ", text, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r"<[^>]+>", "\n", text)
text = text.replace("&nbsp;", " ")
text = re.sub(r"\r", "\n", text)
text = re.sub(r"\n+", "\n", text)
lines = [line.strip() for line in text.split("\n") if line.strip()]
cleaned: list[str] = []
for line in lines:
if line.startswith("Image:"):
continue
if line.startswith("calendar-"):
continue
if line in {"Wrap", "Box", "Story", "Preview", "Tickets"}:
continue
cleaned.append(line)
deduped: list[str] = []
for line in cleaned:
if not deduped or deduped[-1] != line:
deduped.append(line)
return deduped
# Column order of the frame returned by fetch_scores_for_date.
_SCORE_COLUMNS = [
    "score_fetch_time",
    "game_date",
    "away_team",
    "home_team",
    "away_score",
    "home_score",
    "status",
    "start_time_et",
    "tv",
]


def _score_row(
    fetch_time: datetime,
    game_date: str,
    away_team: str,
    home_team: str,
    status: str,
    *,
    away_score: int | None = None,
    home_score: int | None = None,
    start_time_et: str = "",
    tv: str = "",
) -> dict[str, Any]:
    """Build one result row whose keys follow ``_SCORE_COLUMNS`` order."""
    return {
        "score_fetch_time": fetch_time,
        "game_date": game_date,
        "away_team": away_team,
        "home_team": home_team,
        "away_score": away_score,
        "home_score": home_score,
        "status": status,
        "start_time_et": start_time_et,
        "tv": tv,
    }


def _parse_score_lines(
    lines: list[str], date_str: str, fetch_time: datetime
) -> list[dict[str, Any]]:
    """Turn the text lines of a scores page into game rows.

    Three kinds of lines drive the parser:

    * a ``FINAL_RE`` line (e.g. ``"USA 5, JPN 3"``) yields a ``"Final"`` row;
    * a ``TEAM_AT_RE`` line (e.g. ``"USA @ JPN"``) is remembered as the
      pending matchup;
    * while a matchup is pending, a ``"LIVE"`` line yields a ``"Live"`` row
      and a ``TIME_RE`` line yields a ``"Scheduled"`` row; either one
      clears the pending matchup.

    If the line directly after a ``"LIVE"``/start-time line is one of
    ``TV_MARKERS`` it is recorded as the broadcaster.
    """
    rows: list[dict[str, Any]] = []
    pending: tuple[str, str] | None = None
    total = len(lines)

    for idx, line in enumerate(lines):
        final = FINAL_RE.match(line)
        if final:
            away_code, away_score, home_code, home_score = final.groups()
            rows.append(
                _score_row(
                    fetch_time,
                    date_str,
                    _full_team(away_code),
                    _full_team(home_code),
                    "Final",
                    away_score=int(away_score),
                    home_score=int(home_score),
                )
            )
            continue

        matchup = TEAM_AT_RE.match(line)
        if matchup:
            away_code, home_code = matchup.groups()
            pending = (_full_team(away_code), _full_team(home_code))
            continue

        if pending is None:
            continue

        if line == "LIVE":
            status, start_time_et = "Live", ""
        elif TIME_RE.match(line):
            status, start_time_et = "Scheduled", line
        else:
            continue

        # Peek at the next line for a broadcaster label. The label line is
        # not skipped explicitly: it matches none of the patterns above, so
        # it is ignored when the loop reaches it.
        following = lines[idx + 1] if idx + 1 < total else ""
        tv = following if following in TV_MARKERS else ""

        away_team, home_team = pending
        rows.append(
            _score_row(
                fetch_time,
                date_str,
                away_team,
                home_team,
                status,
                start_time_et=start_time_et,
                tv=tv,
            )
        )
        pending = None

    return rows


def fetch_scores_for_date(date_str: str) -> pd.DataFrame:
    """Download the scores page for one date and parse it into a DataFrame.

    Args:
        date_str: Value substituted into ``SCORES_URL_TEMPLATE``; it is also
            copied verbatim into the ``game_date`` column.
            NOTE(review): presumably ``YYYY-MM-DD`` -- confirm against callers.

    Returns:
        One row per parsed game with the columns ``score_fetch_time``,
        ``game_date``, ``away_team``, ``home_team``, ``away_score``,
        ``home_score``, ``status`` (``"Final"``, ``"Live"`` or
        ``"Scheduled"``), ``start_time_et`` and ``tv``. Scores are ``None``
        for games that are not final. Exact duplicate games are removed and
        the index is reset. An empty frame with the same columns is returned
        when nothing could be parsed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    from datetime import timezone

    url = SCORES_URL_TEMPLATE.format(date_str=date_str)
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()

    # One timestamp per fetch so every row of the same scrape carries the
    # same value. datetime.utcnow() is deprecated since Python 3.12; the
    # tzinfo is stripped to keep the column naive UTC as before.
    fetch_time = datetime.now(timezone.utc).replace(tzinfo=None)

    rows = _parse_score_lines(_strip_html_to_lines(response.text), date_str, fetch_time)
    if not rows:
        return pd.DataFrame(columns=_SCORE_COLUMNS)

    df = pd.DataFrame(rows)
    df = df.drop_duplicates(
        subset=["game_date", "away_team", "home_team", "status", "away_score", "home_score"]
    )
    return df.reset_index(drop=True)