# 2026_MLB_Model/data/provider_scrape.py
# Commit 7e836ba (Syntrex): "Add DraftKings-first HR supplemental coverage"
"""
data/provider_scrape.py
Fallback scraper for HR props when The Odds API hasn't yet indexed player props.
Hits each book's semi-public JSON API directly (for DraftKings, the public HR
props page and the JSON embedded in it) using requests only (no browser).
Books: DraftKings, FanDuel, BetMGM, Caesars
Each book's fetch is independent — one failure does not block the others.
Results are concatenated across all books that respond successfully.
"""
from __future__ import annotations
import json
import logging
import re
import threading
import time
from typing import Any
import pandas as pd
import requests
from data.market_provider_base import MarketProviderBase
from data.odds_name_map import map_odds_name_to_model_name
_log = logging.getLogger(__name__)
_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36"
),
"Accept": "application/json, text/plain, */*",
"Accept-Language": "en-US,en;q=0.9",
}
_DK_BASE = "https://sportsbook-nash.draftkings.com/api/odds/v1/league/84240"
_DK_HR_ROUTE = (
"https://sportsbook.draftkings.com/leagues/baseball/mlb"
"?category=batter-props&subcategory=home-runs"
)
_FD_BASE = "https://sbapi.il.sportsbook.fanduel.com/api"
_FD_AK = "FhMFpcPWXMeyZxOx"
_BETMGM_ACCESS_ID = "NmFjNjYzMmQtMmZlNS00MDQ3LWIzZjctNGMxMjhmNjNmNWVm"
_BOOK_FETCHERS = {
"draftkings": "_fetch_draftkings",
"fanduel": "_fetch_fanduel",
"betmgm": "_fetch_betmgm",
"williamhill_us": "_fetch_caesars",
}
_HR_MARKET_KEYWORDS = (
"home run",
"home_run",
"homer",
"to hit a home run",
"to record a home run",
"player to hit a home run",
"player home runs",
"anytime hr",
"anytime home run",
)
_ADAPTER_STATE_LOCK = threading.Lock()
_ADAPTER_STATE: dict[str, dict[str, Any]] = {}
_HARD_FAILURE_RETRY_SECONDS = 60 * 30
_SOFT_FAILURE_RETRY_SECONDS = 60 * 10
def _looks_like_hr_market(*values: Any) -> bool:
    """Report whether any of the given descriptors mentions a home-run market.

    Each value is stringified (falsy values become ""), trimmed and
    lower-cased; blank results are ignored.  The remaining fragments are
    joined with " | " and searched for every entry of ``_HR_MARKET_KEYWORDS``.

    Returns:
        True if at least one keyword occurs in the joined text, else False
        (including when every value is blank).
    """
    fragments = []
    for value in values:
        text = str(value or "").strip()
        if text:
            fragments.append(text.lower())
    if not fragments:
        return False

    haystack = " | ".join(fragments)
    for keyword in _HR_MARKET_KEYWORDS:
        if keyword in haystack:
            return True
    return False
def _extract_american_odds(*values: Any) -> int | None:
for value in values:
if value is None:
continue
try:
text = str(value).strip()
if not text:
continue
return int(text.replace("+", ""))
except (TypeError, ValueError):
continue
return None
def _utc_epoch_seconds() -> float:
return float(time.time())
def _utc_iso_now() -> str:
return pd.Timestamp.utcnow().isoformat()
def _book_display_name(book_key: str) -> str:
labels = {
"draftkings": "DraftKings",
"fanduel": "FanDuel",
"betmgm": "BetMGM",
"williamhill_us": "Caesars",
}
return labels.get(str(book_key or "").strip().lower(), str(book_key or "").strip())
def _is_hard_failure(status: str, error_text: str) -> bool:
haystack = f"{status} {error_text}".lower()
hard_markers = ("404", "400", "tls", "ssl", "parse", "json", "not found")
return any(marker in haystack for marker in hard_markers)
def _get_adapter_state(book_key: str) -> dict[str, Any]:
    """Return a shallow copy of the stored adapter state for ``book_key``.

    The lookup happens while holding ``_ADAPTER_STATE_LOCK``.  An empty dict
    is returned when nothing has been recorded for the book yet.
    """
    with _ADAPTER_STATE_LOCK:
        stored = _ADAPTER_STATE.get(book_key, {})
        snapshot = dict(stored)
    return snapshot
def _set_adapter_state(
    book_key: str,
    *,
    status: str,
    error: str = "",
    rows_returned: int = 0,
) -> dict[str, Any]:
    """Record the outcome of a fetch attempt for ``book_key`` and return it.

    A ``"healthy"`` status resets the failure streak and clears the retry
    deadline.  Any other status increments the streak and sets a retry
    deadline ``_HARD_FAILURE_RETRY_SECONDS`` or ``_SOFT_FAILURE_RETRY_SECONDS``
    from now, depending on ``_is_hard_failure(status, error)``.

    Args:
        book_key: Sportsbook key the state belongs to.
        status: Outcome label; only ``"healthy"`` counts as success.
        error: Error text for the attempt (empty on success).
        rows_returned: Number of rows the attempt produced.

    Returns:
        A shallow copy of the state dict that was stored.
    """
    now_epoch = _utc_epoch_seconds()
    now_iso = _utc_iso_now()
    is_healthy = status == "healthy"

    if is_healthy:
        retry_after_epoch = None
        retry_after_iso = ""
    else:
        delay_seconds = (
            _HARD_FAILURE_RETRY_SECONDS
            if _is_hard_failure(status, error)
            else _SOFT_FAILURE_RETRY_SECONDS
        )
        retry_after_epoch = now_epoch + delay_seconds
        retry_after_iso = pd.Timestamp.fromtimestamp(
            retry_after_epoch, tz="UTC"
        ).isoformat()

    # Read the previous streak and store the new state inside one critical
    # section so concurrent updates for the same book cannot lose an increment.
    with _ADAPTER_STATE_LOCK:
        previous = _ADAPTER_STATE.get(book_key, {})
        if is_healthy:
            failure_streak = 0
        else:
            failure_streak = int(previous.get("failure_streak") or 0) + 1
        state = {
            "adapter_status": status,
            "adapter_error": str(error or ""),
            "adapter_rows_returned": int(rows_returned or 0),
            "last_attempted_at": now_iso,
            "retry_after": retry_after_iso,
            "failure_streak": failure_streak,
            "_retry_after_epoch": retry_after_epoch,
        }
        _ADAPTER_STATE[book_key] = state
    return dict(state)
def _throttled_adapter_state(book_key: str) -> dict[str, Any] | None:
    """Return the stored state marked "throttled" while a retry deadline is pending.

    Returns:
        ``None`` when the book has no retry deadline or the deadline has
        passed; otherwise a copy of the stored state whose
        ``"adapter_status"`` is replaced with ``"throttled"``.
    """
    snapshot = _get_adapter_state(book_key)
    deadline = snapshot.get("_retry_after_epoch")
    still_backing_off = (
        deadline is not None and float(deadline) > _utc_epoch_seconds()
    )
    if not still_backing_off:
        return None
    return {**snapshot, "adapter_status": "throttled"}
def _make_row(
    provider_name: str,
    event_id: str,
    commence_time: str,
    away_team: str,
    home_team: str,
    sportsbook: str,
    sportsbook_key: str,
    player_name_raw: str,
    odds_american: int,
    line: float = 0.5,
    selection_label: str | None = None,
) -> dict[str, Any]:
    """Assemble one output record for a scraped home-run price.

    The market is always tagged ``batter_home_runs`` / ``hr``.  The raw
    player name is kept as ``player_name_raw`` and also passed through
    ``map_odds_name_to_model_name`` to fill ``player_name``.

    Returns:
        A dict with the keys provider, event_id, commence_time, away_team,
        home_team, sportsbook, sportsbook_key, market_key, market,
        player_name_raw, selection_label, player_name, odds_american, line.
    """
    model_name = map_odds_name_to_model_name(player_name_raw)

    record: dict[str, Any] = {}
    # Event / book identification.
    record["provider"] = provider_name
    record["event_id"] = event_id
    record["commence_time"] = commence_time
    record["away_team"] = away_team
    record["home_team"] = home_team
    record["sportsbook"] = sportsbook
    record["sportsbook_key"] = sportsbook_key
    # Fixed market tags for home-run props.
    record["market_key"] = "batter_home_runs"
    record["market"] = "hr"
    # Selection and price.
    record["player_name_raw"] = player_name_raw
    record["selection_label"] = selection_label
    record["player_name"] = model_name
    record["odds_american"] = odds_american
    record["line"] = line
    return record
class ScrapeFallbackProvider(MarketProviderBase):
    """Fallback provider that scrapes home-run prop prices from sportsbooks.

    Each supported book (see ``_BOOK_FETCHERS``) has a ``_fetch_*`` method
    that performs one HTTP request and a ``_parse_*`` method that turns the
    payload into rows built by ``_make_row``.  Books are fetched
    independently: a failure in one is recorded in the adapter state and does
    not stop the others.

    The parsers are deliberately tolerant of payload shape: any level that is
    missing, ``null`` or of an unexpected type is treated as empty instead of
    raising, so one malformed entry does not discard a book's other rows.
    """

    provider_name = "scrape_fallback"

    # Script-tag patterns that may hold the JSON state embedded in the
    # DraftKings page; group 1 is the candidate JSON text.
    _DK_SCRIPT_PATTERNS = tuple(
        re.compile(pattern, flags=re.IGNORECASE | re.DOTALL)
        for pattern in (
            r'<script[^>]*id="__NEXT_DATA__"[^>]*>\s*(.*?)\s*</script>',
            r'<script[^>]*>\s*window\.__NEXT_DATA__\s*=\s*(\{.*?\})\s*;?\s*</script>',
            r'<script[^>]*>\s*window\.__PRELOADED_STATE__\s*=\s*(\{.*?\})\s*;?\s*</script>',
            r'<script[^>]*type="application/json"[^>]*>\s*(.*?)\s*</script>',
        )
    )

    # Columns that together identify one DraftKings price; used to drop the
    # duplicates produced when the same data is reached via several JSON nodes.
    _DK_DEDUPE_COLUMNS = (
        "event_id",
        "player_name",
        "sportsbook_key",
        "market_key",
        "selection_label",
        "line",
        "odds_american",
    )

    # ------------------------------------------------------------------
    # Payload-shape helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _as_dict(value: Any) -> dict[str, Any]:
        """Return ``value`` if it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _as_list(value: Any) -> list[Any]:
        """Return ``value`` if it is a list, otherwise an empty list."""
        return value if isinstance(value, list) else []

    @staticmethod
    def _name_text(value: Any) -> str:
        """Return the text of a name field given as a string or ``{"value": ...}``.

        Missing / ``null`` values yield "" (never the literal string "None").
        """
        if isinstance(value, dict):
            value = value.get("value")
        return str(value or "").strip()

    @staticmethod
    def _split_matchup(description: str) -> tuple[str, str]:
        """Split "Away @ Home" (or "Away vs Home") into ``(away, home)``.

        Returns ``("", "")`` when the text contains neither separator.
        """
        parts = [
            part.strip()
            for part in description.replace(" vs ", " @ ").split(" @ ")
        ]
        if len(parts) < 2:
            return "", ""
        return parts[0], parts[1]

    @staticmethod
    def _first_present(mapping: dict[str, Any], keys: tuple[str, ...], default: Any = None) -> Any:
        """Return ``mapping[key]`` for the first of ``keys`` present, else ``default``."""
        for key in keys:
            if key in mapping:
                return mapping[key]
        return default

    # ------------------------------------------------------------------
    # Public provider interface
    # ------------------------------------------------------------------
    def fetch_live_prop_odds(
        self,
        game_context,
        sportsbooks=None,
        markets=None,
    ) -> pd.DataFrame:
        """Return an empty DataFrame; this provider has no per-game lookup."""
        return pd.DataFrame()

    def fetch_all_upcoming_hr_props(self, sportsbooks=None, markets=None) -> pd.DataFrame:
        """Return the scraped rows only, discarding the adapter metadata."""
        result, _ = self.fetch_all_upcoming_hr_props_with_meta(
            sportsbooks=sportsbooks,
            markets=markets,
        )
        return result

    def fetch_all_upcoming_hr_props_with_meta(
        self,
        sportsbooks=None,
        markets=None,
    ) -> tuple[pd.DataFrame, dict[str, Any]]:
        """Scrape every requested book and report per-book adapter health.

        Args:
            sportsbooks: Sportsbook key or iterable of keys; defaults to every
                key in ``_BOOK_FETCHERS``.  Keys without a fetcher are ignored.
            markets: Accepted for interface compatibility and ignored.

        Returns:
            ``(rows, meta)`` where ``rows`` concatenates the frames of every
            book that returned data (empty DataFrame if none did) and ``meta``
            maps ``adapter_*_by_book`` names to per-book dicts of status,
            error, row count, last attempt time and retry deadline.
        """
        del markets
        # A bare string would otherwise be iterated character by character.
        if isinstance(sportsbooks, str):
            sportsbooks = [sportsbooks]
        # dict.fromkeys de-duplicates while keeping the caller's order, so a
        # book listed twice is not fetched (and its rows not emitted) twice.
        requested_books = list(
            dict.fromkeys(
                str(book or "").strip().lower()
                for book in (sportsbooks or _BOOK_FETCHERS)
            )
        )

        frames: list[pd.DataFrame] = []
        adapter_status_by_book: dict[str, str] = {}
        adapter_error_by_book: dict[str, str] = {}
        adapter_rows_by_book: dict[str, int] = {}
        adapter_last_attempted_at_by_book: dict[str, str] = {}
        adapter_retry_after_by_book: dict[str, str] = {}

        def record(book_key: str, state: dict[str, Any], default_status: str = "") -> None:
            """Copy one adapter state snapshot into the per-book meta dicts."""
            adapter_status_by_book[book_key] = str(state.get("adapter_status") or default_status)
            adapter_error_by_book[book_key] = str(state.get("adapter_error") or "")
            adapter_rows_by_book[book_key] = int(state.get("adapter_rows_returned") or 0)
            adapter_last_attempted_at_by_book[book_key] = str(state.get("last_attempted_at") or "")
            adapter_retry_after_by_book[book_key] = str(state.get("retry_after") or "")

        for book_key in requested_books:
            fetch_name = _BOOK_FETCHERS.get(book_key)
            if not fetch_name:
                continue

            # Skip books that are still inside their back-off window.
            throttled_state = _throttled_adapter_state(book_key)
            if throttled_state is not None:
                record(book_key, throttled_state, "throttled")
                _log.warning(
                    "[scrape_fallback] %s (%s) throttled until %s",
                    fetch_name,
                    book_key,
                    adapter_retry_after_by_book[book_key] or "unknown",
                )
                continue

            fetch_fn = getattr(self, fetch_name)
            try:
                df = fetch_fn()
                state = _set_adapter_state(
                    book_key,
                    status="healthy" if not df.empty else "empty_result",
                    rows_returned=len(df),
                )
                if not df.empty:
                    frames.append(df)
                _log.warning(
                    "[scrape_fallback] %s (%s) returned %d rows",
                    fetch_fn.__name__,
                    book_key,
                    len(df),
                )
            except Exception as exc:
                # Deliberate catch-all boundary: one book failing for any
                # reason must not prevent the remaining books from running.
                state = _set_adapter_state(
                    book_key,
                    status=exc.__class__.__name__.lower() or "error",
                    error=str(exc),
                    rows_returned=0,
                )
                _log.warning(
                    "[scrape_fallback] %s (%s) failed: %s",
                    fetch_fn.__name__,
                    book_key,
                    exc,
                )
            record(book_key, state)

        result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        _log.warning(
            "[scrape_fallback] SUMMARY requested_books=%s total_rows=%d",
            requested_books,
            len(result),
        )
        return result, {
            "adapter_status_by_book": adapter_status_by_book,
            "adapter_error_by_book": adapter_error_by_book,
            "adapter_rows_by_book": adapter_rows_by_book,
            "adapter_last_attempted_at_by_book": adapter_last_attempted_at_by_book,
            "adapter_retry_after_by_book": adapter_retry_after_by_book,
        }

    # ---------------------------------------------------------------------------
    # DraftKings
    # ---------------------------------------------------------------------------
    def _fetch_draftkings(self) -> pd.DataFrame:
        """Download the DraftKings HR props page and parse its embedded JSON.

        Raises:
            requests.RequestException: On network failure or an HTTP error
                status (via ``raise_for_status``).
        """
        headers = dict(_HEADERS)
        # This route serves HTML, so ask for it instead of JSON.
        headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        r = requests.get(_DK_HR_ROUTE, headers=headers, timeout=20)
        _log.warning("[dk_scrape] HTTP %s route=%s", r.status_code, _DK_HR_ROUTE)
        r.raise_for_status()
        return self._parse_dk_route_html(r.text)

    def _extract_dk_json_candidates(self, html_text: str) -> list[Any]:
        """Return every JSON document found in the page's known script tags.

        Script bodies that do not start with ``{`` / ``[`` or that fail to
        decode are skipped.  The same document may be returned more than once
        when several patterns match the same tag.
        """
        if not html_text:
            return []
        candidates: list[Any] = []
        for pattern in self._DK_SCRIPT_PATTERNS:
            for match in pattern.finditer(html_text):
                blob = str(match.group(1) or "").strip()
                if not blob.startswith(("{", "[")):
                    continue
                try:
                    candidates.append(json.loads(blob))
                except (ValueError, RecursionError):
                    # Not valid JSON (or too deeply nested): not a candidate.
                    continue
        return candidates

    def _walk_json(self, payload: Any):
        """Yield every dict inside ``payload`` depth-first, parents first."""
        if isinstance(payload, dict):
            yield payload
            for value in payload.values():
                yield from self._walk_json(value)
        elif isinstance(payload, list):
            for value in payload:
                yield from self._walk_json(value)

    def _parse_dk_route_html(self, html_text: str) -> pd.DataFrame:
        """Parse the DraftKings page HTML into de-duplicated HR rows.

        Every dict in the embedded JSON is inspected: one with an
        ``eventGroup`` dict, or that itself carries ``offerCategories``, is
        parsed as an event group; one that carries only
        ``offerSubcategoryDescriptors`` is parsed as a single offer category.
        """
        frames: list[pd.DataFrame] = []
        for candidate in self._extract_dk_json_candidates(html_text):
            for node in self._walk_json(candidate):
                if isinstance(node.get("eventGroup"), dict):
                    event_group = node["eventGroup"]
                elif "offerCategories" in node:
                    event_group = node
                elif "offerSubcategoryDescriptors" in node:
                    # Such a node is an offer category, not an event group;
                    # wrap it so _parse_dk (which iterates offerCategories)
                    # can actually read it.
                    event_group = {"offerCategories": [node]}
                else:
                    continue
                parsed = self._parse_dk({"eventGroup": event_group})
                if not parsed.empty:
                    frames.append(parsed)
        if not frames:
            return pd.DataFrame()
        deduped = pd.concat(frames, ignore_index=True).drop_duplicates(
            subset=list(self._DK_DEDUPE_COLUMNS),
            keep="first",
        )
        return deduped.reset_index(drop=True)

    def _iter_dk_offer_groups(self, event_group: dict[str, Any]):
        """Yield each offer-group dict nested under a DraftKings event group."""
        for offer_cat in self._as_list(event_group.get("offerCategories")):
            descriptors = self._as_dict(offer_cat).get("offerSubcategoryDescriptors")
            for sub_desc in self._as_list(descriptors):
                for offer_group in self._as_list(self._as_dict(sub_desc).get("offerGroups")):
                    if isinstance(offer_group, dict):
                        yield offer_group

    def _parse_dk(self, data: dict) -> pd.DataFrame:
        """Convert a ``{"eventGroup": ...}`` payload into HR rows.

        NOTE(review): unlike the other parsers, no ``_looks_like_hr_market``
        filter is applied here; every offer found is emitted as an HR row,
        presumably relying on the route's home-run subcategory -- confirm.
        """
        rows: list[dict[str, Any]] = []
        event_group = self._as_dict(self._as_dict(data).get("eventGroup"))
        for offer_group in self._iter_dk_offer_groups(event_group):
            event_id = str(offer_group.get("eventId", "") or "")
            event_desc = str(offer_group.get("eventDescription", "") or "")
            away_team, home_team = self._split_matchup(event_desc)
            commence_time = str(offer_group.get("startDate", "") or "")
            for offer_list in self._as_list(offer_group.get("offers")):
                # An entry is either a list of offers or a single offer.
                offers = offer_list if isinstance(offer_list, list) else [offer_list]
                for offer in offers:
                    if not isinstance(offer, dict):
                        continue
                    for outcome in self._as_list(offer.get("outcomes")):
                        if not isinstance(outcome, dict):
                            continue
                        player_name_raw = str(
                            offer.get("label", "")
                            or outcome.get("label", "")
                            or outcome.get("participant", "")
                            or outcome.get("name", "")
                            or ""
                        ).strip()
                        if not player_name_raw:
                            continue
                        price = _extract_american_odds(
                            outcome.get("oddsAmerican"),
                            outcome.get("odds_american"),
                            outcome.get("price"),
                        )
                        if price is None:
                            continue
                        rows.append(_make_row(
                            self.provider_name, event_id, commence_time,
                            away_team, home_team,
                            "DraftKings", "draftkings",
                            player_name_raw, price,
                            selection_label=str(outcome.get("label", "") or outcome.get("name", "") or "").strip() or None,
                        ))
        return pd.DataFrame(rows)

    # ---------------------------------------------------------------------------
    # FanDuel
    # ---------------------------------------------------------------------------
    def _fetch_fanduel(self) -> pd.DataFrame:
        """Request FanDuel's baseball league page payload and parse it.

        Raises:
            requests.RequestException: On network failure or an HTTP error
                status (via ``raise_for_status``).
            ValueError: If the response body is not valid JSON.
        """
        url = (
            f"{_FD_BASE}/content-managed-page"
            f"?page=SPORT_LEAGUE&countryCode=US&regionCode=IL"
            f"&channel=BASEBALL&lang=en-US&_ak={_FD_AK}"
        )
        r = requests.get(url, headers=_HEADERS, timeout=20)
        _log.warning("[fd_scrape] HTTP %s", r.status_code)
        r.raise_for_status()
        return self._parse_fd(r.json())

    def _parse_fd(self, data: dict) -> pd.DataFrame:
        """Convert a FanDuel payload into HR rows.

        Reads ``attachments.markets`` (keeping those whose type or name looks
        like a home-run market) and resolves each market's event through
        ``attachments.events``.
        """
        rows: list[dict[str, Any]] = []
        attachments = self._as_dict(self._as_dict(data).get("attachments"))
        events = self._as_dict(attachments.get("events"))
        markets = self._as_dict(attachments.get("markets"))
        for market in markets.values():
            if not isinstance(market, dict):
                continue
            market_type = str(market.get("marketType", "") or "")
            market_name = str(market.get("marketName", "") or market.get("name", "") or market.get("title", "") or "")
            if not _looks_like_hr_market(market_type, market_name):
                continue
            event_id = str(market.get("eventId", "") or "")
            event = self._as_dict(events.get(event_id))
            away_team = str(
                self._as_dict(event.get("awayTeam")).get("name", "")
                or event.get("awayTeamName", "")
                or ""
            )
            home_team = str(
                self._as_dict(event.get("homeTeam")).get("name", "")
                or event.get("homeTeamName", "")
                or ""
            )
            commence_time = str(event.get("openDate", "") or "")
            for runner in self._as_list(market.get("runners")):
                if not isinstance(runner, dict):
                    continue
                player_name_raw = str(
                    runner.get("runnerName", "")
                    or runner.get("runnerTitle", "")
                    or runner.get("name", "")
                    or ""
                ).strip()
                if not player_name_raw:
                    continue
                display_odds = self._as_dict(
                    self._as_dict(runner.get("winRunnerOdds")).get("americanDisplayOdds")
                )
                price = _extract_american_odds(
                    display_odds.get("americanOdds"),
                    runner.get("price"),
                    runner.get("odds"),
                )
                if price is None:
                    continue
                rows.append(_make_row(
                    self.provider_name, event_id, commence_time,
                    away_team, home_team,
                    "FanDuel", "fanduel",
                    player_name_raw, price,
                    selection_label=str(runner.get("result", "") or runner.get("name", "") or "").strip() or None,
                ))
        return pd.DataFrame(rows)

    # ---------------------------------------------------------------------------
    # BetMGM
    # ---------------------------------------------------------------------------
    def _fetch_betmgm(self) -> pd.DataFrame:
        """Request BetMGM's MLB events payload and parse it.

        Raises:
            requests.RequestException: On network failure or an HTTP error
                status (via ``raise_for_status``).
            ValueError: If the response body is not valid JSON.
        """
        url = (
            "https://sports.nj.betmgm.com/en/sports/api/v2/leagues/baseball-mlb/events"
            f"?lang=en-us&x-bwin-accessid={_BETMGM_ACCESS_ID}"
        )
        r = requests.get(url, headers=_HEADERS, timeout=20)
        _log.warning("[betmgm_scrape] HTTP %s", r.status_code)
        r.raise_for_status()
        return self._parse_betmgm(r.json())

    def _parse_betmgm(self, data: dict | list) -> pd.DataFrame:
        """Convert a BetMGM payload into HR rows.

        The events are the payload itself when it is a list, otherwise
        ``result.dataList`` (falling back to ``events`` when that key is
        absent).  Only markets that look like home-run markets are kept.
        """
        rows: list[dict[str, Any]] = []
        if isinstance(data, list):
            events = data
        else:
            payload = self._as_dict(data)
            events = self._as_dict(payload.get("result")).get(
                "dataList", payload.get("events")
            )
        for event in self._as_list(events):
            if not isinstance(event, dict):
                continue
            event_id = str(event.get("id", "") or "")
            away_team, home_team = self._split_matchup(self._name_text(event.get("name")))
            commence_time = str(event.get("startDate", "") or "")
            for fixture in self._as_list(event.get("markets")):
                if not isinstance(fixture, dict):
                    continue
                mkt_name = self._name_text(fixture.get("name"))
                if not _looks_like_hr_market(mkt_name, fixture.get("type"), fixture.get("key")):
                    continue
                for selection in self._as_list(fixture.get("selections")):
                    if not isinstance(selection, dict):
                        continue
                    player_name_raw = self._name_text(selection.get("name"))
                    if not player_name_raw:
                        continue
                    # "price" may be an object holding americanOdds or a bare
                    # value; only treat it as a price itself in the latter case.
                    price_field = selection.get("price")
                    price = _extract_american_odds(
                        self._as_dict(price_field).get("americanOdds"),
                        selection.get("americanOdds"),
                        None if isinstance(price_field, dict) else price_field,
                    )
                    if price is None:
                        continue
                    rows.append(_make_row(
                        self.provider_name, event_id, commence_time,
                        away_team, home_team,
                        "BetMGM", "betmgm",
                        player_name_raw, price,
                        selection_label=str(selection.get("result", "") or "").strip() or None,
                    ))
        return pd.DataFrame(rows)

    # ---------------------------------------------------------------------------
    # Caesars
    # ---------------------------------------------------------------------------
    def _fetch_caesars(self) -> pd.DataFrame:
        """Request the Caesars MLB home-run props payload and parse it.

        Raises:
            requests.RequestException: On network failure or an HTTP error
                status (via ``raise_for_status``).
            ValueError: If the response body is not valid JSON.
        """
        url = (
            "https://api.levelmgr.caesarssportsbook.com/api/v1"
            "/leagues/baseball-mlb/player-props/home-run"
        )
        r = requests.get(url, headers=_HEADERS, timeout=20)
        _log.warning("[caesars_scrape] HTTP %s", r.status_code)
        r.raise_for_status()
        return self._parse_caesars(r.json())

    def _parse_caesars(self, data: dict | list) -> pd.DataFrame:
        """Convert a Caesars payload into HR rows.

        Items are the payload itself when it is a list, otherwise the first
        present of its ``data`` / ``events`` / ``items`` keys.  Each item's
        selections come from the first present of ``participants`` /
        ``props`` / ``selections``; a selection is kept only when the item's
        or selection's market name / name looks like a home-run market.
        """
        rows: list[dict[str, Any]] = []
        if isinstance(data, list):
            items = data
        else:
            items = self._first_present(
                self._as_dict(data), ("data", "events", "items"), []
            )
        for item in self._as_list(items):
            if not isinstance(item, dict):
                continue
            event_id = str(self._first_present(item, ("eventId", "id"), "") or "")
            away_team = str(self._first_present(item, ("awayTeamName", "away_team"), "") or "")
            home_team = str(self._first_present(item, ("homeTeamName", "home_team"), "") or "")
            commence_time = str(self._first_present(item, ("eventDate", "startTime"), "") or "")
            selections = self._first_present(
                item, ("participants", "props", "selections"), []
            )
            for prop in self._as_list(selections):
                if not isinstance(prop, dict):
                    continue
                if not _looks_like_hr_market(
                    item.get("marketName"),
                    item.get("name"),
                    prop.get("marketName"),
                    prop.get("name"),
                ):
                    continue
                player_name_raw = str(
                    self._first_present(prop, ("name", "participantName", "playerName"), "") or ""
                ).strip()
                if not player_name_raw:
                    continue
                price = _extract_american_odds(
                    self._as_dict(prop.get("odds")).get("american"),
                    prop.get("americanOdds"),
                    prop.get("price"),
                )
                if price is None:
                    continue
                rows.append(_make_row(
                    self.provider_name, event_id, commence_time,
                    away_team, home_team,
                    "Caesars", "williamhill_us",
                    player_name_raw, price,
                    selection_label=str(prop.get("result", "") or prop.get("selection", "") or "").strip() or None,
                ))
        return pd.DataFrame(rows)