"""Gradio app that builds a quick Baseball Savant report for one MLB game.

The flow is: list today's games (Pacific time) from the MLB Stats API, read the
selected game's lineups and probable pitchers from the live feed, resolve each
player to a Baseball Savant slug through the game preview page, scrape the
per-player Statcast tables with Playwright, and render a compact text report.
"""
import re
import unicodedata
from datetime import datetime
from zoneinfo import ZoneInfo

import gradio as gr
import requests
from playwright.sync_api import Error as PlaywrightError
from playwright.sync_api import sync_playwright

MLB_SCHEDULE_URL = "https://statsapi.mlb.com/api/v1/schedule"
MLB_LIVE_URL = "https://statsapi.mlb.com/api/v1.1/game/{game_pk}/feed/live"

SAVANT_PREVIEW_URL = (
    "https://baseballsavant.mlb.com/preview"
    "?game_pk={game_pk}&game_date={game_date}&date={game_date}"
)
SAVANT_HITTER_X_URL = (
    "https://baseballsavant.mlb.com/savant-player/"
    "{slug}?stats=statcast-r-hitting-mlb"
)
SAVANT_HITTER_SPLIT_URL = (
    "https://baseballsavant.mlb.com/savant-player/"
    "{slug}?stats=splits-r-hitting-mlb&season={season}"
)
SAVANT_PITCHER_X_URL = (
    "https://baseballsavant.mlb.com/savant-player/"
    "{slug}?stats=statcast-r-pitching-mlb"
)

HITTER_X_TABLE_SELECTOR = "#statcast_glance_batter > table"
HITTER_PLATOON_TABLE_SELECTOR = "#date-platoon-mlb > table"
PITCHER_X_TABLE_SELECTOR = "#statcast_stats_pitching > table"

# Minimum 2026 sample size before the 2026 row is used on its own; below
# this the 2026 and 2025 rows are blended, weighted by sample size.
PA_THRESHOLD_HITTER = 50
PITCH_THRESHOLD_PITCHER = 500

# Name suffixes dropped by normalize_name so "Foo Jr." matches "Foo".
_NAME_SUFFIXES = frozenset({"jr", "sr", "ii", "iii", "iv", "v"})

_HITTER_STAT_KEYS = (
    "xwoba",
    "xba",
    "xslg",
    "hard_hit_pct",
    "barrel_pct",
    "k_pct",
    "bb_pct",
)
_PITCHER_STAT_KEYS = _HITTER_STAT_KEYS + ("era", "xera")


def today_pacific() -> str:
    """Return today's date in America/Los_Angeles as ``YYYY-MM-DD``."""
    return datetime.now(ZoneInfo("America/Los_Angeles")).strftime("%Y-%m-%d")


def normalize_name(name: str) -> str:
    """Return a lowercase, accent-free, punctuation-free key for a player name.

    Generational suffixes (jr, sr, ii, iii, iv, v) are removed so that the
    MLB feed spelling and the Savant link text compare equal.
    """
    if not name:
        return ""
    # NFKD splits accented letters into base letter + combining mark, which
    # lets the combining marks be filtered out.
    name = unicodedata.normalize("NFKD", name)
    name = "".join(ch for ch in name if not unicodedata.combining(ch))
    name = name.lower().strip()
    name = re.sub(r"[^\w\s]", "", name)
    parts = [p for p in name.split() if p not in _NAME_SUFFIXES]
    return " ".join(parts)


def get_json(url: str, params=None) -> dict:
    """GET ``url`` and return the decoded JSON body.

    Raises:
        requests.HTTPError: if the response status is 4xx/5xx.
    """
    r = requests.get(url, params=params, timeout=25)
    r.raise_for_status()
    return r.json()


def safe_int(text: str) -> int:
    """Parse ``text`` as an int (commas ignored); return 0 if unparseable."""
    text = (text or "").strip().replace(",", "")
    try:
        return int(text)
    except ValueError:
        return 0


def safe_float(text: str) -> float:
    """Parse ``text`` as a float (``%`` and commas ignored); 0.0 if unparseable."""
    text = (text or "").strip().replace("%", "").replace(",", "")
    try:
        return float(text)
    except ValueError:
        return 0.0


def get_games():
    """Return a Gradio dropdown update listing today's (Pacific) MLB games.

    Each choice is ``(label, game_pk)``; the first game is preselected.
    """
    date_str = today_pacific()
    data = get_json(MLB_SCHEDULE_URL, params={"sportId": 1, "date": date_str})

    games = []
    for d in data.get("dates", []):
        for g in d.get("games", []):
            game_pk = g.get("gamePk")
            away = g.get("teams", {}).get("away", {}).get("team", {}).get("name", "Away")
            home = g.get("teams", {}).get("home", {}).get("team", {}).get("name", "Home")
            status = g.get("status", {}).get("detailedState", "")
            label = f"{away} @ {home} | {status}"
            games.append((label, game_pk))

    if not games:
        return gr.update(choices=[], value=None)
    return gr.update(choices=games, value=games[0][1])


def fetch_game_players(game_pk: int):
    """Read hitters and probable pitchers for a game from the MLB live feed.

    Returns:
        ``(players, meta)`` where ``players`` is a list of dicts (keys:
        type, order, side, team, player_id, name, hand, norm_name) and
        ``meta`` holds ``away_team`` / ``home_team`` names.
    """
    data = get_json(MLB_LIVE_URL.format(game_pk=game_pk))
    game_data = data.get("gameData", {})
    live_data = data.get("liveData", {})

    box_teams = live_data.get("boxscore", {}).get("teams", {})
    players_meta = game_data.get("players", {})
    probable = game_data.get("probablePitchers", {})

    away_team = game_data.get("teams", {}).get("away", {}).get("name", "Away")
    home_team = game_data.get("teams", {}).get("home", {}).get("name", "Home")

    players = []
    for side in ["away", "home"]:
        team_name = away_team if side == "away" else home_team
        team_box = box_teams.get(side, {})
        batter_ids = team_box.get("batters", [])
        players_box = team_box.get("players", {})

        # NOTE(review): takes the first nine entries of "batters" as the
        # lineup; confirm the feed lists starters first for in-progress games.
        for idx, pid in enumerate(batter_ids[:9], start=1):
            player = players_box.get(f"ID{pid}", {})
            name = player.get("person", {}).get("fullName", "")
            hand = players_meta.get(f"ID{pid}", {}).get("batSide", {}).get("code", "")
            players.append(
                {
                    "type": "hitter",
                    "order": idx,
                    "side": side,
                    "team": team_name,
                    "player_id": pid,
                    "name": name,
                    "hand": hand,
                    "norm_name": normalize_name(name),
                }
            )

        p = probable.get(side, {})
        pid = p.get("id")
        name = p.get("fullName", "")
        hand = (
            players_meta.get(f"ID{pid}", {}).get("pitchHand", {}).get("code", "")
            if pid
            else ""
        )
        if name and pid:
            players.append(
                {
                    "type": "pitcher",
                    "order": None,
                    "side": side,
                    "team": team_name,
                    "player_id": pid,
                    "name": name,
                    "hand": hand,
                    "norm_name": normalize_name(name),
                }
            )

    return players, {"away_team": away_team, "home_team": home_team}


def scrape_savant_player_links_for_game(page, game_pk: int, game_date: str):
    """Collect ``/savant-player/<slug>`` links from the Savant game preview page.

    Returns a list of ``{"name", "norm_name", "slug"}`` dicts, one per
    distinct normalized name (first occurrence wins).
    """
    preview_url = SAVANT_PREVIEW_URL.format(game_pk=game_pk, game_date=game_date)
    page.goto(preview_url, wait_until="domcontentloaded", timeout=60000)
    # Fixed pause after DOMContentLoaded before reading links.
    page.wait_for_timeout(5000)

    links = page.locator('a[href*="/savant-player/"]')
    link_count = links.count()

    rows = []
    seen = set()
    for i in range(link_count):
        link = links.nth(i)
        href = link.get_attribute("href") or ""
        name = link.inner_text().strip()
        if not name:
            continue
        m = re.search(r"/savant-player/([a-z0-9-]+)", href)
        if not m:
            continue
        norm = normalize_name(name)
        if norm in seen:
            continue
        seen.add(norm)
        rows.append({"name": name, "norm_name": norm, "slug": m.group(1)})
    return rows


def match_players_to_savant(players, savant_rows):
    """Attach ``matched`` / ``savant_slug`` to each player by normalized name.

    Returns new dicts; the input player dicts are not mutated.
    """
    savant_map = {r["norm_name"]: r for r in savant_rows}
    matched = []
    for player in players:
        savant = savant_map.get(player["norm_name"])
        row = dict(player)
        row["matched"] = bool(savant)
        row["savant_slug"] = savant["slug"] if savant else None
        matched.append(row)
    return matched


def parse_hitter_x_stats_year_table(page):
    """Parse the hitter Statcast year table into ``{season: stats_dict}``.

    Rows with fewer than 19 cells or a non-numeric first cell are skipped.
    Column positions are hard-coded to the table layout this was written
    against.
    """
    page.wait_for_selector(HITTER_X_TABLE_SELECTOR, timeout=30000)
    page.wait_for_timeout(800)

    rows = page.locator(f"{HITTER_X_TABLE_SELECTOR} tbody tr")
    out = {}
    for i in range(rows.count()):
        cells = rows.nth(i).locator("td")
        cell_count = cells.count()
        if cell_count < 19:
            continue
        vals = [cells.nth(c).inner_text().strip() for c in range(cell_count)]
        if not vals[0].isdigit():
            continue
        yr = int(vals[0])
        out[yr] = {
            "season": yr,
            "pa": safe_int(vals[2]),
            "xwoba": safe_float(vals[14]),
            "xba": safe_float(vals[11]),
            "xslg": safe_float(vals[12]),
            "hard_hit_pct": safe_float(vals[16]),
            "barrel_pct": safe_float(vals[5]),
            "k_pct": safe_float(vals[17]),
            "bb_pct": safe_float(vals[18]),
        }
    return out


def parse_hitter_platoon_rows(page, season: int):
    """Parse the first two platoon-split rows into ``{"vs_L": ..., "vs_R": ...}``.

    A row whose split label contains "Left" is stored as ``vs_L``; any
    other row is stored as ``vs_R``. Rows with fewer than 20 cells are
    skipped.
    """
    page.wait_for_selector(HITTER_PLATOON_TABLE_SELECTOR, timeout=30000)
    page.wait_for_timeout(800)

    rows = page.locator(f"{HITTER_PLATOON_TABLE_SELECTOR} tbody tr")
    out = {}
    for i in range(min(rows.count(), 2)):
        cells = rows.nth(i).locator("td")
        if cells.count() < 20:
            continue
        split_label = cells.nth(2).inner_text().strip()
        split_key = "vs_L" if "Left" in split_label else "vs_R"
        out[split_key] = {
            "season": season,
            "split": split_label,
            "pa": safe_int(cells.nth(3).inner_text()),
            "ops": safe_float(cells.nth(19).inner_text()),
        }
    return out


def _blend_seasons(row_2026, row_2025, weight_key, threshold, stat_keys):
    """Pick the 2026 row or a sample-weighted 2026+2025 blend.

    * 2026 sample >= ``threshold``: copy of the 2026 row, source "2026_only".
    * no 2025 row: copy of the 2026 row, source "2026_only_no_2025".
    * otherwise: each key in ``stat_keys`` averaged, weighted by
      ``weight_key``, source "2026_plus_2025".

    Returns ``{}`` when both rows are empty or the combined sample is zero
    (the latter avoids dividing by zero).
    """
    if not row_2026 and not row_2025:
        return {}

    if row_2026 and row_2026.get(weight_key, 0) >= threshold:
        return {**row_2026, "source": "2026_only"}

    if not row_2025:
        # row_2026 is necessarily non-empty here (both-empty returned above).
        return {**row_2026, "source": "2026_only_no_2025"}

    w26 = row_2026.get(weight_key, 0) if row_2026 else 0
    w25 = row_2025.get(weight_key, 0)
    total = w26 + w25
    if total == 0:
        return {}

    def weighted(key):
        v26 = row_2026.get(key, 0.0) if row_2026 else 0.0
        v25 = row_2025.get(key, 0.0)
        return ((v26 * w26) + (v25 * w25)) / total

    blended = {
        f"{weight_key}_2026": w26,
        f"{weight_key}_2025": w25,
        f"{weight_key}_total_used": total,
    }
    for key in stat_keys:
        blended[key] = weighted(key)
    blended["source"] = "2026_plus_2025"
    return blended


def combine_hitter_split_rows(row_2026, row_2025):
    """Return the platoon-split row to use, blending OPS by PA when needed.

    See ``_blend_seasons`` for the selection rules; the blended result also
    carries the ``split`` label from whichever row is present.
    """
    out = _blend_seasons(row_2026, row_2025, "pa", PA_THRESHOLD_HITTER, ("ops",))
    if out.get("source") == "2026_plus_2025":
        label = row_2026["split"] if row_2026 else row_2025["split"]
        out = {"split": label, **out}
    return out


def blend_hitter_overall_xstats(x2026, x2025):
    """Return the hitter's overall expected stats, PA-weighted across seasons.

    Returns ``{}`` when no data exists or the combined PA is zero.
    """
    return _blend_seasons(x2026, x2025, "pa", PA_THRESHOLD_HITTER, _HITTER_STAT_KEYS)


def parse_pitcher_xstats(page):
    """Parse the pitcher Statcast year table into ``{season: stats_dict}``.

    Rows with fewer than 21 cells or a non-numeric first cell are skipped.
    Column positions are hard-coded to the table layout this was written
    against.
    """
    page.wait_for_selector(PITCHER_X_TABLE_SELECTOR, timeout=30000)
    page.wait_for_timeout(800)

    rows = page.locator(f"{PITCHER_X_TABLE_SELECTOR} tbody tr")
    out = {}
    for i in range(rows.count()):
        cells = rows.nth(i).locator("td")
        cell_count = cells.count()
        if cell_count < 21:
            continue
        vals = [cells.nth(c).inner_text().strip() for c in range(cell_count)]
        if not vals[0].isdigit():
            continue
        yr = int(vals[0])
        out[yr] = {
            "season": yr,
            "pitches": safe_int(vals[2]),
            "xwoba": safe_float(vals[14]),
            "xba": safe_float(vals[11]),
            "xslg": safe_float(vals[12]),
            "hard_hit_pct": safe_float(vals[16]),
            "barrel_pct": safe_float(vals[5]),
            "k_pct": safe_float(vals[17]),
            "bb_pct": safe_float(vals[18]),
            "era": safe_float(vals[19]),
            "xera": safe_float(vals[20]),
        }
    return out


def blend_pitcher_xstats(x2026, x2025):
    """Return the pitcher's expected stats, pitch-count-weighted across seasons.

    Returns ``{}`` when no data exists or the combined pitch count is zero.
    """
    return _blend_seasons(
        x2026, x2025, "pitches", PITCH_THRESHOLD_PITCHER, _PITCHER_STAT_KEYS
    )


def scrape_hitter_savant_data(page, player_slug: str):
    """Scrape a hitter's overall expected stats and 2026/2025 platoon splits.

    Returns a dict with ``overall_used``, ``vs_L_used`` and ``vs_R_used``.
    """
    x_url = SAVANT_HITTER_X_URL.format(slug=player_slug)
    split_2026_url = SAVANT_HITTER_SPLIT_URL.format(slug=player_slug, season=2026)
    split_2025_url = SAVANT_HITTER_SPLIT_URL.format(slug=player_slug, season=2025)

    page.goto(x_url, wait_until="domcontentloaded", timeout=60000)
    page.wait_for_timeout(2200)
    x_years = parse_hitter_x_stats_year_table(page)

    page.goto(split_2026_url, wait_until="domcontentloaded", timeout=60000)
    page.wait_for_timeout(2200)
    split_2026 = parse_hitter_platoon_rows(page, 2026)

    page.goto(split_2025_url, wait_until="domcontentloaded", timeout=60000)
    page.wait_for_timeout(2200)
    split_2025 = parse_hitter_platoon_rows(page, 2025)

    x_2026 = x_years.get(2026, {})
    x_2025 = x_years.get(2025, {})

    return {
        "overall_used": blend_hitter_overall_xstats(x_2026, x_2025),
        "vs_L_used": combine_hitter_split_rows(
            split_2026.get("vs_L"), split_2025.get("vs_L")
        ),
        "vs_R_used": combine_hitter_split_rows(
            split_2026.get("vs_R"), split_2025.get("vs_R")
        ),
    }


def scrape_pitcher_savant_data(page, player_slug: str):
    """Scrape a pitcher's expected stats; returns ``{"overall_used": ...}``."""
    x_url = SAVANT_PITCHER_X_URL.format(slug=player_slug)
    page.goto(x_url, wait_until="domcontentloaded", timeout=60000)
    page.wait_for_timeout(2500)
    x_years = parse_pitcher_xstats(page)

    x_2026 = x_years.get(2026, {})
    x_2025 = x_years.get(2025, {})
    return {"overall_used": blend_pitcher_xstats(x_2026, x_2025)}


def compact_player_block(player, data):
    """Format one player's scraped stats as a single report line.

    Missing stats render as 0 via ``dict.get`` defaults.
    """
    overall = data.get("overall_used", {})

    if player["type"] == "pitcher":
        return (
            f"SP {player['team']} | {player['name']} ({player['hand']}) | "
            f"xwOBA {overall.get('xwoba', 0):.3f} | xERA {overall.get('xera', 0):.2f} | "
            f"K% {overall.get('k_pct', 0):.1f} | BB% {overall.get('bb_pct', 0):.1f} | "
            f"HH% {overall.get('hard_hit_pct', 0):.1f} | Barrel% {overall.get('barrel_pct', 0):.1f}"
        )

    vs_l = data.get("vs_L_used", {})
    vs_r = data.get("vs_R_used", {})
    return (
        f"{player['order']:>2}. {player['name']} ({player['hand']}) | "
        f"xwOBA {overall.get('xwoba', 0):.3f} | xSLG {overall.get('xslg', 0):.3f} | "
        f"HH% {overall.get('hard_hit_pct', 0):.1f} | K% {overall.get('k_pct', 0):.1f} | "
        f"BB% {overall.get('bb_pct', 0):.1f} | "
        f"OPS vL {vs_l.get('ops', 0):.3f} | OPS vR {vs_r.get('ops', 0):.3f}"
    )


def build_final_report(meta, matched, scraped_map, started_at):
    """Assemble the final multi-line text report.

    Args:
        meta: dict with ``away_team`` and ``home_team``.
        matched: player dicts as returned by ``match_players_to_savant``.
        scraped_map: scraped data keyed by player name.
        started_at: naive ``datetime`` taken when the run began.
    """
    elapsed = int((datetime.now() - started_at).total_seconds())

    pitcher_lines = {"away": None, "home": None}
    hitter_lines = {"away": [], "home": []}

    for player in matched:
        pdata = scraped_map.get(player["name"], {})
        if pdata:
            line = compact_player_block(player, pdata)
        else:
            # A matched player with no data means the scrape itself failed.
            reason = (
                "Savant data unavailable"
                if player.get("matched")
                else "No Savant match"
            )
            line = (
                f"{player['type'].upper()} | {player['team']} | "
                f"{player['name']} ({player['hand']}) | {reason}"
            )

        side = "away" if player["side"] == "away" else "home"
        if player["type"] == "pitcher":
            pitcher_lines[side] = line
        else:
            hitter_lines[side].append(line)

    report = [
        f"{meta['away_team']} @ {meta['home_team']}",
        f"Date (PT): {today_pacific()}",
        f"Build time: {elapsed}s",
        "",
        pitcher_lines["away"] or "Away SP not found",
        pitcher_lines["home"] or "Home SP not found",
        "",
        f"{meta['away_team']} lineup",
        *hitter_lines["away"],
        "",
        f"{meta['home_team']} lineup",
        *hitter_lines["home"],
    ]
    return "\n".join(report)


def run_selected_game(game_pk):
    """Generator driving the full scrape; yields progress text, then the report.

    Each yielded string replaces the output textbox contents. A Playwright
    failure on one player is reported and skipped; any other exception ends
    the run with a single ``Error: ...`` message.
    """
    if not game_pk:
        yield "No game selected."
        return

    started_at = datetime.now()
    game_date = today_pacific()

    try:
        yield f"Loading selected game...\nDate (PT): {game_date}"

        players, meta = fetch_game_players(game_pk)
        yield (
            f"Loading selected game...\n"
            f"{meta['away_team']} @ {meta['home_team']}\n"
            f"Found {len(players)} MLB players to process."
        )

        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-dev-shm-usage"],
            )
            try:
                page = browser.new_page()
                savant_rows = scrape_savant_player_links_for_game(
                    page, game_pk, game_date
                )
                matched = match_players_to_savant(players, savant_rows)

                total = len(matched)
                scraped_map = {}
                progress_lines = [
                    f"{meta['away_team']} @ {meta['home_team']}",
                    f"Matched {sum(1 for x in matched if x['matched'])}/{total} players to Savant.",
                    "",
                ]

                for idx, player in enumerate(matched, start=1):
                    if not player["matched"] or not player["savant_slug"]:
                        progress_lines.append(
                            f"[{idx}/{total}] {player['type']} {player['name']} -> no Savant match"
                        )
                        yield "\n".join(progress_lines[-12:])
                        continue

                    progress_lines.append(
                        f"[{idx}/{total}] loading {player['type']} {player['name']}..."
                    )
                    yield "\n".join(progress_lines[-12:])

                    try:
                        if player["type"] == "hitter":
                            pdata = scrape_hitter_savant_data(
                                page, player["savant_slug"]
                            )
                        else:
                            pdata = scrape_pitcher_savant_data(
                                page, player["savant_slug"]
                            )
                    except PlaywrightError as exc:
                        # One missing table / timeout should not sink the
                        # whole report; note it and move on.
                        progress_lines.append(
                            f"[{idx}/{total}] failed {player['name']} "
                            f"({type(exc).__name__})"
                        )
                        yield "\n".join(progress_lines[-12:])
                        continue

                    scraped_map[player["name"]] = pdata
                    progress_lines.append(f"[{idx}/{total}] done {player['name']}")

                    # Measured after the scrape so the notice reflects the
                    # time actually spent so far.
                    elapsed = int((datetime.now() - started_at).total_seconds())
                    if elapsed >= 60:
                        progress_lines.append(
                            "Still working normally — full lineup scraping just takes a bit."
                        )
                    yield "\n".join(progress_lines[-12:])
            finally:
                # Always release the browser, including on errors or when
                # the consumer stops iterating early.
                browser.close()

        final_report = build_final_report(meta, matched, scraped_map, started_at)
        yield final_report

    except Exception as e:
        # Top-level UI boundary: surface the failure in the output box.
        yield f"Error: {repr(e)}"


with gr.Blocks() as app:
    gr.Markdown("## MLB Savant Quick Runner")
    gr.Markdown(
        "Today's games auto-load using Pacific time. Click Run for the selected game."
    )

    dropdown = gr.Dropdown(label="Today's Games", choices=[])
    button = gr.Button("Run Selected Game")
    output = gr.Textbox(label="Output", lines=30)

    app.load(get_games, outputs=dropdown)
    button.click(run_selected_game, inputs=dropdown, outputs=output)

app.queue()

if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)