# MLBmod / app.py
# tygiorgio's picture
# Update app.py
# 788efbc verified
# Raw
# History Blame Contribute Delete
# 19.2 kB
import gradio as gr
import requests
import re
import unicodedata
from datetime import datetime
from zoneinfo import ZoneInfo
from playwright.sync_api import sync_playwright
# MLB Stats API endpoints: the daily schedule and the per-game live feed.
MLB_SCHEDULE_URL = "https://statsapi.mlb.com/api/v1/schedule"
MLB_LIVE_URL = "https://statsapi.mlb.com/api/v1.1/game/{game_pk}/feed/live"

# Baseball Savant page templates; placeholders are filled with str.format().
SAVANT_PREVIEW_URL = (
    "https://baseballsavant.mlb.com/preview?"
    "game_pk={game_pk}"
    "&game_date={game_date}"
    "&date={game_date}"
)
SAVANT_HITTER_X_URL = (
    "https://baseballsavant.mlb.com/savant-player/{slug}"
    "?stats=statcast-r-hitting-mlb"
)
SAVANT_HITTER_SPLIT_URL = (
    "https://baseballsavant.mlb.com/savant-player/{slug}"
    "?stats=splits-r-hitting-mlb"
    "&season={season}"
)
SAVANT_PITCHER_X_URL = (
    "https://baseballsavant.mlb.com/savant-player/{slug}"
    "?stats=statcast-r-pitching-mlb"
)

# CSS selectors for the Savant tables that the parse_* functions read.
HITTER_X_TABLE_SELECTOR = "#statcast_glance_batter > table"
HITTER_PLATOON_TABLE_SELECTOR = "#date-platoon-mlb > table"
PITCHER_X_TABLE_SELECTOR = "#statcast_stats_pitching > table"

# Sample-size floors: at or above these, the 2026 row is used on its own;
# below them the blend functions mix in the 2025 row.
PA_THRESHOLD_HITTER = 50
PITCH_THRESHOLD_PITCHER = 500
def today_pacific() -> str:
    """Return the current calendar date in America/Los_Angeles as YYYY-MM-DD."""
    pacific_now = datetime.now(tz=ZoneInfo("America/Los_Angeles"))
    return pacific_now.date().isoformat()
def normalize_name(name: str) -> str:
    """Reduce a player name to a comparison key.

    Accents are stripped (NFKD decomposition with combining marks dropped),
    the text is lower-cased, every character that is neither a word character
    nor whitespace is removed, and generational suffixes (jr, sr, ii, iii,
    iv, v) are discarded. The remaining tokens are joined by single spaces.
    A falsy input yields an empty string.
    """
    if not name:
        return ""

    decomposed = unicodedata.normalize("NFKD", name)
    without_marks = "".join(
        filter(lambda ch: not unicodedata.combining(ch), decomposed)
    )
    cleaned = re.sub(r"[^\w\s]", "", without_marks.lower().strip())

    generational = ("jr", "sr", "ii", "iii", "iv", "v")
    return " ".join(tok for tok in cleaned.split() if tok not in generational)
def get_json(url: str, params=None) -> dict:
    """GET ``url`` (25 s timeout) and return the decoded JSON body.

    ``raise_for_status()`` is called on the response before decoding, so an
    HTTP error status surfaces as an exception rather than a parsed payload.
    """
    response = requests.get(url, params=params, timeout=25)
    response.raise_for_status()
    payload = response.json()
    return payload
def safe_int(text: str) -> int:
    """Parse table-cell text as an int, returning 0 when it is not one.

    Surrounding whitespace and thousands separators (commas) are removed
    first; ``None`` or an empty string is treated as unparseable.
    """
    cleaned = (text or "").strip().replace(",", "")
    try:
        parsed = int(cleaned)
    except ValueError:
        # Blank cells, placeholders and decimals all fall back to zero.
        return 0
    return parsed
def safe_float(text: str) -> float:
    """Parse table-cell text as a float, returning 0.0 when it is not one.

    Surrounding whitespace, percent signs and commas are removed before
    parsing; ``None`` or an empty string is treated as unparseable.
    """
    cleaned = (text or "").strip()
    for noise in ("%", ","):
        cleaned = cleaned.replace(noise, "")
    try:
        parsed = float(cleaned)
    except ValueError:
        # Blank cells and placeholders such as "--" fall back to zero.
        return 0.0
    return parsed
def get_games():
    """Build the dropdown update listing today's (Pacific date) MLB games.

    Each choice is a ``(label, gamePk)`` pair with the label formatted as
    ``"Away @ Home | detailedState"``. The first game is pre-selected; when
    the schedule has no games the dropdown is emptied and its value cleared.
    """
    schedule = get_json(
        MLB_SCHEDULE_URL,
        params={"sportId": 1, "date": today_pacific()},
    )

    def club(game, side, fallback):
        # Team display name for one side, or the fallback when it is absent.
        return game.get("teams", {}).get(side, {}).get("team", {}).get("name", fallback)

    choices = []
    for day in schedule.get("dates", []):
        for game in day.get("games", []):
            state = game.get("status", {}).get("detailedState", "")
            visitor = club(game, "away", "Away")
            host = club(game, "home", "Home")
            choices.append((f"{visitor} @ {host} | {state}", game.get("gamePk")))

    selected = choices[0][1] if choices else None
    return gr.update(choices=choices, value=selected)
def fetch_game_players(game_pk: int):
    """Collect hitters and probable starters for one game from the live feed.

    Returns ``(players, meta)``. ``players`` is a list of dicts with the keys
    ``type``, ``order``, ``side``, ``team``, ``player_id``, ``name``, ``hand``
    and ``norm_name``. For each side (away first) it holds up to nine hitters
    followed by the probable pitcher, when the feed names one. ``meta``
    carries ``away_team`` and ``home_team``.
    """
    feed = get_json(MLB_LIVE_URL.format(game_pk=game_pk))
    game_data = feed.get("gameData", {})
    boxscore_teams = feed.get("liveData", {}).get("boxscore", {}).get("teams", {})
    bios = game_data.get("players", {})
    probables = game_data.get("probablePitchers", {})
    team_names = {
        "away": game_data.get("teams", {}).get("away", {}).get("name", "Away"),
        "home": game_data.get("teams", {}).get("home", {}).get("name", "Home"),
    }

    def make_record(kind, order, side, player_id, full_name, hand):
        # One uniform row shape for hitters and pitchers alike.
        return {
            "type": kind,
            "order": order,
            "side": side,
            "team": team_names[side],
            "player_id": player_id,
            "name": full_name,
            "hand": hand,
            "norm_name": normalize_name(full_name),
        }

    roster = []
    for side in ("away", "home"):
        side_box = boxscore_teams.get(side, {})
        box_players = side_box.get("players", {})

        # NOTE(review): treats the first nine ids in `batters` as the lineup
        # in batting order -- confirm this holds once substitutions occur.
        for slot, batter_id in enumerate(side_box.get("batters", [])[:9], start=1):
            key = f"ID{batter_id}"
            full_name = box_players.get(key, {}).get("person", {}).get("fullName", "")
            bats = bios.get(key, {}).get("batSide", {}).get("code", "")
            roster.append(make_record("hitter", slot, side, batter_id, full_name, bats))

        starter = probables.get(side, {})
        starter_id = starter.get("id")
        starter_name = starter.get("fullName", "")
        if not (starter_name and starter_id):
            # No probable pitcher announced for this side.
            continue
        throws = bios.get(f"ID{starter_id}", {}).get("pitchHand", {}).get("code", "")
        roster.append(make_record("pitcher", None, side, starter_id, starter_name, throws))

    meta = {"away_team": team_names["away"], "home_team": team_names["home"]}
    return roster, meta
def scrape_savant_player_links_for_game(page, game_pk: int, game_date: str):
    """Collect player-page slugs from the Savant preview page of a game.

    Opens the preview URL, waits five seconds, then walks every anchor whose
    href contains ``/savant-player/``. Returns a list of
    ``{"name", "norm_name", "slug"}`` dicts in page order, keeping only the
    first anchor seen for each normalized name. Anchors with no visible text
    or no recognisable slug are skipped.
    """
    page.goto(
        SAVANT_PREVIEW_URL.format(game_pk=game_pk, game_date=game_date),
        wait_until="domcontentloaded",
        timeout=60000,
    )
    page.wait_for_timeout(5000)

    anchors = page.locator('a[href*="/savant-player/"]')
    slug_pattern = re.compile(r"/savant-player/([a-z0-9-]+)")

    # Keyed by normalized name; dict insertion order preserves page order.
    found = {}
    for position in range(anchors.count()):
        anchor = anchors.nth(position)
        href = anchor.get_attribute("href") or ""
        display_name = anchor.inner_text().strip()
        if not display_name:
            continue
        hit = slug_pattern.search(href)
        if hit is None:
            continue
        key = normalize_name(display_name)
        if key not in found:
            found[key] = {
                "name": display_name,
                "norm_name": key,
                "slug": hit.group(1),
            }
    return list(found.values())
def match_players_to_savant(players, savant_rows):
    """Attach a Savant slug to each player by normalized-name lookup.

    Returns new dicts (the inputs are not mutated), each a copy of the player
    plus ``matched`` (bool) and ``savant_slug`` (the slug, or ``None`` when no
    Savant row shares the player's ``norm_name``).
    """
    lookup = {entry["norm_name"]: entry for entry in savant_rows}

    def annotate(player):
        hit = lookup.get(player["norm_name"])
        return {
            **player,
            "matched": bool(hit),
            "savant_slug": hit["slug"] if hit else None,
        }

    return [annotate(player) for player in players]
def parse_hitter_x_stats_year_table(page):
    """Read the hitter Statcast table on the current page, keyed by season.

    Waits for the table, then converts every body row that has at least 19
    cells and a numeric first cell into a stats dict. Column positions are
    hard-coded below.
    NOTE(review): indices assume the current Savant column layout -- confirm
    if the page changes.
    """
    page.wait_for_selector(HITTER_X_TABLE_SELECTOR, timeout=30000)
    page.wait_for_timeout(800)
    body_rows = page.locator(f"{HITTER_X_TABLE_SELECTOR} tbody tr")

    # (output key, column index) for every float-valued stat, in output order.
    float_columns = (
        ("xwoba", 14),
        ("xba", 11),
        ("xslg", 12),
        ("hard_hit_pct", 16),
        ("barrel_pct", 5),
        ("k_pct", 17),
        ("bb_pct", 18),
    )

    seasons = {}
    for row_index in range(body_rows.count()):
        cells = body_rows.nth(row_index).locator("td")
        cell_total = cells.count()
        if cell_total < 19:
            continue
        texts = [cells.nth(col).inner_text().strip() for col in range(cell_total)]
        if not texts[0].isdigit():
            # Rows whose first cell is not a year are skipped.
            continue
        season = int(texts[0])
        stats = {"season": season, "pa": safe_int(texts[2])}
        stats.update((label, safe_float(texts[col])) for label, col in float_columns)
        seasons[season] = stats
    return seasons
def parse_hitter_platoon_rows(page, season: int):
    """Read the first two platoon-split rows on the current splits page.

    Returns a dict with up to two entries, ``"vs_L"`` and ``"vs_R"``, each
    holding ``season``, ``split`` (the row's label text), ``pa`` and ``ops``.
    A row is filed under ``"vs_L"`` when its label contains "Left" and under
    ``"vs_R"`` otherwise. Rows with fewer than 20 cells are ignored.
    """
    page.wait_for_selector(HITTER_PLATOON_TABLE_SELECTOR, timeout=30000)
    page.wait_for_timeout(800)
    body_rows = page.locator(f"{HITTER_PLATOON_TABLE_SELECTOR} tbody tr")

    splits = {}
    for position in range(min(body_rows.count(), 2)):
        cells = body_rows.nth(position).locator("td")
        if cells.count() < 20:
            continue
        label = cells.nth(2).inner_text().strip()
        if "Left" in label:
            key = "vs_L"
        else:
            key = "vs_R"
        plate_appearances = safe_int(cells.nth(3).inner_text())
        ops = safe_float(cells.nth(19).inner_text())
        splits[key] = {
            "season": season,
            "split": label,
            "pa": plate_appearances,
            "ops": ops,
        }
    return splits
def combine_hitter_split_rows(row_2026, row_2025):
    """Pick or blend one platoon split across the 2026 and 2025 seasons.

    * Both rows missing -> ``{}``.
    * 2026 row with at least ``PA_THRESHOLD_HITTER`` PA -> a copy of it,
      tagged ``source="2026_only"``.
    * No 2025 row -> a copy of the 2026 row tagged
      ``source="2026_only_no_2025"``.
    * Otherwise -> OPS averaged with PA as weights, tagged
      ``source="2026_plus_2025"``; ``{}`` when the combined PA is zero.
    """
    if not (row_2026 or row_2025):
        return {}

    if row_2026 and row_2026.get("pa", 0) >= PA_THRESHOLD_HITTER:
        return {**row_2026, "source": "2026_only"}

    if not row_2025:
        return {**row_2026, "source": "2026_only_no_2025"} if row_2026 else {}

    current = row_2026 or {}
    weight_now = current.get("pa", 0)
    weight_prev = row_2025.get("pa", 0)
    combined = weight_now + weight_prev
    if combined == 0:
        return {}

    ops_now = current.get("ops", 0.0)
    ops_prev = row_2025.get("ops", 0.0)
    blended_ops = ((ops_now * weight_now) + (ops_prev * weight_prev)) / combined

    return {
        "split": row_2026["split"] if row_2026 else row_2025["split"],
        "pa_2026": weight_now,
        "pa_2025": weight_prev,
        "pa_total_used": combined,
        "ops": blended_ops,
        "source": "2026_plus_2025",
    }
def blend_hitter_overall_xstats(x2026, x2025):
    """Pick or blend a hitter's overall expected stats across two seasons.

    Args:
        x2026: Stats dict for 2026 (keys such as ``pa``, ``xwoba``, ``xba``),
            or an empty/None value when the season is missing.
        x2025: Same shape for 2025.

    Returns:
        * ``{}`` when both rows are missing, or when neither row carries any
          plate appearances to weight by.
        * A copy of the 2026 row tagged ``source="2026_only"`` when it has at
          least ``PA_THRESHOLD_HITTER`` PA.
        * A copy of the 2026 row tagged ``source="2026_only_no_2025"`` when
          there is no 2025 row to blend with.
        * Otherwise a PA-weighted average of every rate stat, tagged
          ``source="2026_plus_2025"``.
    """
    if not x2026 and not x2025:
        return {}

    if x2026 and x2026.get("pa", 0) >= PA_THRESHOLD_HITTER:
        out = dict(x2026)
        out["source"] = "2026_only"
        return out

    if not x2025:
        out = dict(x2026) if x2026 else {}
        if out:
            out["source"] = "2026_only_no_2025"
        return out

    pa_26 = x2026.get("pa", 0) if x2026 else 0
    pa_25 = x2025.get("pa", 0)
    total = pa_26 + pa_25
    if total == 0:
        # No plate appearances in either season: there is nothing to weight
        # by, and dividing would raise ZeroDivisionError. Mirrors the guard in
        # combine_hitter_split_rows.
        return {}

    def weighted(key):
        v26 = x2026.get(key, 0.0) if x2026 else 0.0
        v25 = x2025.get(key, 0.0)
        return ((v26 * pa_26) + (v25 * pa_25)) / total

    blended = {
        "pa_2026": pa_26,
        "pa_2025": pa_25,
        "pa_total_used": total,
    }
    for key in (
        "xwoba",
        "xba",
        "xslg",
        "hard_hit_pct",
        "barrel_pct",
        "k_pct",
        "bb_pct",
    ):
        blended[key] = weighted(key)
    blended["source"] = "2026_plus_2025"
    return blended
def parse_pitcher_xstats(page):
    """Read the pitcher Statcast table on the current page, keyed by season.

    Waits for the table, then converts every body row that has at least 21
    cells and a numeric first cell into a stats dict. Column positions are
    hard-coded below.
    NOTE(review): indices assume the current Savant column layout -- confirm
    if the page changes.
    """
    page.wait_for_selector(PITCHER_X_TABLE_SELECTOR, timeout=30000)
    page.wait_for_timeout(800)
    body_rows = page.locator(f"{PITCHER_X_TABLE_SELECTOR} tbody tr")

    # (output key, column index) for every float-valued stat, in output order.
    float_columns = (
        ("xwoba", 14),
        ("xba", 11),
        ("xslg", 12),
        ("hard_hit_pct", 16),
        ("barrel_pct", 5),
        ("k_pct", 17),
        ("bb_pct", 18),
        ("era", 19),
        ("xera", 20),
    )

    seasons = {}
    for row_index in range(body_rows.count()):
        cells = body_rows.nth(row_index).locator("td")
        cell_total = cells.count()
        if cell_total < 21:
            continue
        texts = [cells.nth(col).inner_text().strip() for col in range(cell_total)]
        if not texts[0].isdigit():
            # Rows whose first cell is not a year are skipped.
            continue
        season = int(texts[0])
        stats = {"season": season, "pitches": safe_int(texts[2])}
        stats.update((label, safe_float(texts[col])) for label, col in float_columns)
        seasons[season] = stats
    return seasons
def blend_pitcher_xstats(x2026, x2025):
    """Pick or blend a pitcher's expected stats across two seasons.

    Args:
        x2026: Stats dict for 2026 (keys such as ``pitches``, ``xwoba``,
            ``era``), or an empty/None value when the season is missing.
        x2025: Same shape for 2025.

    Returns:
        * ``{}`` when both rows are missing, or when neither row carries any
          pitches to weight by.
        * A copy of the 2026 row tagged ``source="2026_only"`` when it has at
          least ``PITCH_THRESHOLD_PITCHER`` pitches.
        * A copy of the 2026 row tagged ``source="2026_only_no_2025"`` when
          there is no 2025 row to blend with.
        * Otherwise a pitch-count-weighted average of every stat, tagged
          ``source="2026_plus_2025"``.
    """
    if not x2026 and not x2025:
        return {}

    if x2026 and x2026.get("pitches", 0) >= PITCH_THRESHOLD_PITCHER:
        out = dict(x2026)
        out["source"] = "2026_only"
        return out

    if not x2025:
        out = dict(x2026) if x2026 else {}
        if out:
            out["source"] = "2026_only_no_2025"
        return out

    p26 = x2026.get("pitches", 0) if x2026 else 0
    p25 = x2025.get("pitches", 0)
    total = p26 + p25
    if total == 0:
        # No pitches recorded in either season: there is nothing to weight
        # by, and dividing would raise ZeroDivisionError.
        return {}

    def weighted(key):
        v26 = x2026.get(key, 0.0) if x2026 else 0.0
        v25 = x2025.get(key, 0.0)
        return ((v26 * p26) + (v25 * p25)) / total

    blended = {
        "pitches_2026": p26,
        "pitches_2025": p25,
        "pitches_total_used": total,
    }
    for key in (
        "xwoba",
        "xba",
        "xslg",
        "hard_hit_pct",
        "barrel_pct",
        "k_pct",
        "bb_pct",
        "era",
        "xera",
    ):
        blended[key] = weighted(key)
    blended["source"] = "2026_plus_2025"
    return blended
def scrape_hitter_savant_data(page, player_slug: str):
    """Scrape one hitter's overall and platoon numbers from Savant.

    Visits three pages in order -- the Statcast page, then the 2026 and 2025
    splits pages -- and returns ``overall_used``, ``vs_L_used`` and
    ``vs_R_used``, each already passed through the season blend/combine
    helpers.
    """

    def open_page(url):
        # Navigate, then pause 2.2 s before the parser looks for its table.
        page.goto(url, wait_until="domcontentloaded", timeout=60000)
        page.wait_for_timeout(2200)

    open_page(SAVANT_HITTER_X_URL.format(slug=player_slug))
    by_year = parse_hitter_x_stats_year_table(page)

    platoon = {}
    for season in (2026, 2025):
        open_page(SAVANT_HITTER_SPLIT_URL.format(slug=player_slug, season=season))
        platoon[season] = parse_hitter_platoon_rows(page, season)

    overall = blend_hitter_overall_xstats(by_year.get(2026, {}), by_year.get(2025, {}))
    against_left = combine_hitter_split_rows(
        platoon[2026].get("vs_L"), platoon[2025].get("vs_L")
    )
    against_right = combine_hitter_split_rows(
        platoon[2026].get("vs_R"), platoon[2025].get("vs_R")
    )
    return {
        "overall_used": overall,
        "vs_L_used": against_left,
        "vs_R_used": against_right,
    }
def scrape_pitcher_savant_data(page, player_slug: str):
    """Scrape one pitcher's Statcast page and blend the 2026/2025 rows.

    Returns ``{"overall_used": <blended stats dict>}``.
    """
    page.goto(
        SAVANT_PITCHER_X_URL.format(slug=player_slug),
        wait_until="domcontentloaded",
        timeout=60000,
    )
    page.wait_for_timeout(2500)

    by_year = parse_pitcher_xstats(page)
    blended = blend_pitcher_xstats(by_year.get(2026, {}), by_year.get(2025, {}))
    return {"overall_used": blended}
def compact_player_block(player, data):
    """Format one player's scraped numbers as a single ``" | "``-joined line.

    Pitchers render as ``SP <team> | <name> (<hand>) | ...`` using only
    ``overall_used``; hitters render as ``<order>. <name> (<hand>) | ...`` and
    append OPS against left- and right-handed pitching. Any stat missing from
    ``data`` is shown as zero.
    """
    overall = data.get("overall_used", {})
    identity = f"{player['name']} ({player['hand']})"

    if player["type"] == "pitcher":
        fields = [
            f"SP {player['team']}",
            identity,
            f"xwOBA {overall.get('xwoba', 0):.3f}",
            f"xERA {overall.get('xera', 0):.2f}",
            f"K% {overall.get('k_pct', 0):.1f}",
            f"BB% {overall.get('bb_pct', 0):.1f}",
            f"HH% {overall.get('hard_hit_pct', 0):.1f}",
            f"Barrel% {overall.get('barrel_pct', 0):.1f}",
        ]
        return " | ".join(fields)

    against_left = data.get("vs_L_used", {})
    against_right = data.get("vs_R_used", {})
    fields = [
        f"{player['order']:>2}. {identity}",
        f"xwOBA {overall.get('xwoba', 0):.3f}",
        f"xSLG {overall.get('xslg', 0):.3f}",
        f"HH% {overall.get('hard_hit_pct', 0):.1f}",
        f"K% {overall.get('k_pct', 0):.1f}",
        f"BB% {overall.get('bb_pct', 0):.1f}",
        f"OPS vL {against_left.get('ops', 0):.3f}",
        f"OPS vR {against_right.get('ops', 0):.3f}",
    ]
    return " | ".join(fields)
def build_final_report(meta, matched, scraped_map, started_at):
    """Assemble the plain-text report shown once scraping has finished.

    The report lists the matchup, the Pacific date, the elapsed build time,
    both starting-pitcher lines, and then each lineup. A player with no entry
    (or an empty entry) in ``scraped_map`` gets a "No Savant match"
    placeholder line. ``scraped_map`` is keyed by the player's display name.
    """
    elapsed = int((datetime.now() - started_at).total_seconds())

    starter_line = {"away": None, "home": None}
    lineup = {"away": [], "home": []}

    for player in matched:
        stats = scraped_map.get(player["name"], {})
        if stats:
            line = compact_player_block(player, stats)
        else:
            line = (
                f"{player['type'].upper()} | {player['team']} | {player['name']} ({player['hand']}) | No Savant match"
            )

        # Any side other than "away" is filed under home.
        bucket = "away" if player["side"] == "away" else "home"
        if player["type"] == "pitcher":
            starter_line[bucket] = line
        else:
            lineup[bucket].append(line)

    sections = [
        f"{meta['away_team']} @ {meta['home_team']}",
        f"Date (PT): {today_pacific()}",
        f"Build time: {elapsed}s",
        "",
        starter_line["away"] or "Away SP not found",
        starter_line["home"] or "Home SP not found",
        "",
        f"{meta['away_team']} lineup",
    ]
    sections.extend(lineup["away"])
    sections.append("")
    sections.append(f"{meta['home_team']} lineup")
    sections.extend(lineup["home"])
    return "\n".join(sections)
def run_selected_game(game_pk):
    """Scrape every player in the selected game, streaming progress text.

    This is a generator: each yielded string replaces the contents of the
    output box. It yields short status messages while loading, the last 12
    progress lines while scraping, and finally the full report (or an
    ``Error: ...`` line if an unrecoverable step fails).

    Args:
        game_pk: MLB gamePk of the selected game; a falsy value yields
            "No game selected." and stops.
    """
    if not game_pk:
        yield "No game selected."
        return

    started_at = datetime.now()
    game_date = today_pacific()

    try:
        yield f"Loading selected game...\nDate (PT): {game_date}"
        players, meta = fetch_game_players(game_pk)
        yield (
            f"Loading selected game...\n"
            f"{meta['away_team']} @ {meta['home_team']}\n"
            f"Found {len(players)} MLB players to process."
        )

        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-dev-shm-usage"],
            )
            # try/finally so the browser is closed on errors and also when the
            # consumer abandons the generator part-way through.
            try:
                page = browser.new_page()
                savant_rows = scrape_savant_player_links_for_game(page, game_pk, game_date)
                matched = match_players_to_savant(players, savant_rows)
                total = len(matched)
                scraped_map = {}
                progress_lines = [
                    f"{meta['away_team']} @ {meta['home_team']}",
                    f"Matched {sum(1 for x in matched if x['matched'])}/{total} players to Savant.",
                    "",
                ]

                for idx, player in enumerate(matched, start=1):
                    if not player["matched"] or not player["savant_slug"]:
                        progress_lines.append(
                            f"[{idx}/{total}] {player['type']} {player['name']} -> no Savant match"
                        )
                        yield "\n".join(progress_lines[-12:])
                        continue

                    progress_lines.append(
                        f"[{idx}/{total}] loading {player['type']} {player['name']}..."
                    )
                    yield "\n".join(progress_lines[-12:])

                    try:
                        if player["type"] == "hitter":
                            pdata = scrape_hitter_savant_data(page, player["savant_slug"])
                        else:
                            pdata = scrape_pitcher_savant_data(page, player["savant_slug"])
                    except Exception as exc:
                        # One player's page failing (e.g. a table that never
                        # renders) must not discard the rest of the game. The
                        # player is left out of scraped_map, so the final
                        # report shows its placeholder line instead.
                        progress_lines.append(
                            f"[{idx}/{total}] failed {player['name']}: {exc!r}"
                        )
                        yield "\n".join(progress_lines[-12:])
                        continue

                    scraped_map[player["name"]] = pdata
                    progress_lines.append(
                        f"[{idx}/{total}] done {player['name']}"
                    )
                    # Measured after the scrape so the notice reflects the
                    # time actually spent so far.
                    elapsed = int((datetime.now() - started_at).total_seconds())
                    if elapsed >= 60:
                        progress_lines.append(
                            "Still working normally — full lineup scraping just takes a bit."
                        )
                    yield "\n".join(progress_lines[-12:])
            finally:
                browser.close()

        final_report = build_final_report(meta, matched, scraped_map, started_at)
        yield final_report
    except Exception as e:
        # Top-level boundary for the UI: surface the failure as output text.
        yield f"Error: {repr(e)}"
# UI layout: a game picker, a run button and a streaming text output.
with gr.Blocks() as app:
    gr.Markdown("## MLB Savant Quick Runner")
    gr.Markdown("Today's games auto-load using Pacific time. Click Run for the selected game.")

    dropdown = gr.Dropdown(label="Today's Games", choices=[])
    button = gr.Button("Run Selected Game")
    output = gr.Textbox(label="Output", lines=30)

    # Fill the dropdown on page load; the button streams the generator's
    # yielded text into the output box.
    app.load(fn=get_games, outputs=dropdown)
    button.click(fn=run_selected_game, inputs=dropdown, outputs=output)

app.queue()

if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)