""" data/provider_scrape.py Fallback scraper for HR props when The Odds API hasn't yet indexed player props. Hits each book's semi-public JSON API directly using requests only (no browser). Books: DraftKings, FanDuel, BetMGM, Caesars Each book's fetch is independent — one failure does not block the others. Results are concatenated across all books that respond successfully. """ from __future__ import annotations import json import logging import re import threading import time from typing import Any import pandas as pd import requests from data.market_provider_base import MarketProviderBase from data.odds_name_map import map_odds_name_to_model_name _log = logging.getLogger(__name__) _HEADERS = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/120.0.0.0 Safari/537.36" ), "Accept": "application/json, text/plain, */*", "Accept-Language": "en-US,en;q=0.9", } _DK_BASE = "https://sportsbook-nash.draftkings.com/api/odds/v1/league/84240" _DK_HR_ROUTE = ( "https://sportsbook.draftkings.com/leagues/baseball/mlb" "?category=batter-props&subcategory=home-runs" ) _FD_BASE = "https://sbapi.il.sportsbook.fanduel.com/api" _FD_AK = "FhMFpcPWXMeyZxOx" _BETMGM_ACCESS_ID = "NmFjNjYzMmQtMmZlNS00MDQ3LWIzZjctNGMxMjhmNjNmNWVm" _BOOK_FETCHERS = { "draftkings": "_fetch_draftkings", "fanduel": "_fetch_fanduel", "betmgm": "_fetch_betmgm", "williamhill_us": "_fetch_caesars", } _HR_MARKET_KEYWORDS = ( "home run", "home_run", "homer", "to hit a home run", "to record a home run", "player to hit a home run", "player home runs", "anytime hr", "anytime home run", ) _ADAPTER_STATE_LOCK = threading.Lock() _ADAPTER_STATE: dict[str, dict[str, Any]] = {} _HARD_FAILURE_RETRY_SECONDS = 60 * 30 _SOFT_FAILURE_RETRY_SECONDS = 60 * 10 def _looks_like_hr_market(*values: Any) -> bool: haystack = " | ".join(str(value or "").strip().lower() for value in values if str(value or "").strip()) if not haystack: return False return any(keyword in haystack for keyword in _HR_MARKET_KEYWORDS) def _extract_american_odds(*values: Any) -> int | None: for value in values: if value is None: continue try: text = str(value).strip() if not text: continue return int(text.replace("+", "")) except (TypeError, ValueError): continue return None def _utc_epoch_seconds() -> float: return float(time.time()) def _utc_iso_now() -> str: return pd.Timestamp.utcnow().isoformat() def _book_display_name(book_key: str) -> str: labels = { "draftkings": "DraftKings", "fanduel": "FanDuel", "betmgm": "BetMGM", "williamhill_us": "Caesars", } return labels.get(str(book_key or "").strip().lower(), str(book_key or "").strip()) def _is_hard_failure(status: str, error_text: str) -> bool: haystack = f"{status} {error_text}".lower() hard_markers = ("404", "400", "tls", "ssl", "parse", "json", "not found") return any(marker in haystack for marker in hard_markers) def _get_adapter_state(book_key: str) -> dict[str, Any]: with _ADAPTER_STATE_LOCK: return dict(_ADAPTER_STATE.get(book_key, {})) def _set_adapter_state( book_key: str, *, status: str, error: str = "", rows_returned: int = 0, ) -> dict[str, Any]: now_epoch = _utc_epoch_seconds() now_iso = _utc_iso_now() existing = _get_adapter_state(book_key) failure_streak = int(existing.get("failure_streak") or 0) if status == "healthy": failure_streak = 0 retry_after_epoch = None else: failure_streak += 1 delay_seconds = ( _HARD_FAILURE_RETRY_SECONDS if _is_hard_failure(status, error) else _SOFT_FAILURE_RETRY_SECONDS ) retry_after_epoch = now_epoch + delay_seconds state = { "adapter_status": status, "adapter_error": str(error or ""), "adapter_rows_returned": int(rows_returned or 0), "last_attempted_at": now_iso, "retry_after": ( pd.Timestamp.fromtimestamp(retry_after_epoch, tz="UTC").isoformat() if retry_after_epoch is not None else "" ), "failure_streak": failure_streak, "_retry_after_epoch": retry_after_epoch, } with _ADAPTER_STATE_LOCK: _ADAPTER_STATE[book_key] = state return dict(state) def _throttled_adapter_state(book_key: str) -> dict[str, Any] | None: state = _get_adapter_state(book_key) retry_after_epoch = state.get("_retry_after_epoch") if retry_after_epoch is None: return None if float(retry_after_epoch) <= _utc_epoch_seconds(): return None throttled = dict(state) throttled["adapter_status"] = "throttled" return throttled def _make_row( provider_name: str, event_id: str, commence_time: str, away_team: str, home_team: str, sportsbook: str, sportsbook_key: str, player_name_raw: str, odds_american: int, line: float = 0.5, selection_label: str | None = None, ) -> dict[str, Any]: return { "provider": provider_name, "event_id": event_id, "commence_time": commence_time, "away_team": away_team, "home_team": home_team, "sportsbook": sportsbook, "sportsbook_key": sportsbook_key, "market_key": "batter_home_runs", "market": "hr", "player_name_raw": player_name_raw, "selection_label": selection_label, "player_name": map_odds_name_to_model_name(player_name_raw), "odds_american": odds_american, "line": line, } class ScrapeFallbackProvider(MarketProviderBase): provider_name = "scrape_fallback" def fetch_live_prop_odds( self, game_context, sportsbooks=None, markets=None, ) -> pd.DataFrame: return pd.DataFrame() def fetch_all_upcoming_hr_props(self, sportsbooks=None, markets=None) -> pd.DataFrame: result, _ = self.fetch_all_upcoming_hr_props_with_meta( sportsbooks=sportsbooks, markets=markets, ) return result def fetch_all_upcoming_hr_props_with_meta( self, sportsbooks=None, markets=None, ) -> tuple[pd.DataFrame, dict[str, Any]]: del markets requested_books = [ str(book or "").strip().lower() for book in (sportsbooks or list(_BOOK_FETCHERS.keys())) ] frames = [] adapter_status_by_book: dict[str, str] = {} adapter_error_by_book: dict[str, str] = {} adapter_rows_by_book: dict[str, int] = {} adapter_last_attempted_at_by_book: dict[str, str] = {} adapter_retry_after_by_book: dict[str, str] = {} for book_key in requested_books: fetch_name = _BOOK_FETCHERS.get(book_key) if not fetch_name: continue throttled_state = _throttled_adapter_state(book_key) if throttled_state is not None: adapter_status_by_book[book_key] = str(throttled_state.get("adapter_status") or "throttled") adapter_error_by_book[book_key] = str(throttled_state.get("adapter_error") or "") adapter_rows_by_book[book_key] = int(throttled_state.get("adapter_rows_returned") or 0) adapter_last_attempted_at_by_book[book_key] = str(throttled_state.get("last_attempted_at") or "") adapter_retry_after_by_book[book_key] = str(throttled_state.get("retry_after") or "") _log.warning( "[scrape_fallback] %s (%s) throttled until %s", fetch_name, book_key, adapter_retry_after_by_book[book_key] or "unknown", ) continue fetch_fn = getattr(self, fetch_name) try: df = fetch_fn() state = _set_adapter_state( book_key, status="healthy" if not df.empty else "empty_result", rows_returned=len(df), ) if not df.empty: frames.append(df) _log.warning( "[scrape_fallback] %s (%s) returned %d rows", fetch_fn.__name__, book_key, len(df), ) else: _log.warning( "[scrape_fallback] %s (%s) returned 0 rows", fetch_fn.__name__, book_key, ) except Exception as exc: state = _set_adapter_state( book_key, status=exc.__class__.__name__.lower() or "error", error=str(exc), rows_returned=0, ) _log.warning( "[scrape_fallback] %s (%s) failed: %s", fetch_fn.__name__, book_key, exc, ) adapter_status_by_book[book_key] = str(state.get("adapter_status") or "") adapter_error_by_book[book_key] = str(state.get("adapter_error") or "") adapter_rows_by_book[book_key] = int(state.get("adapter_rows_returned") or 0) adapter_last_attempted_at_by_book[book_key] = str(state.get("last_attempted_at") or "") adapter_retry_after_by_book[book_key] = str(state.get("retry_after") or "") result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame() _log.warning( "[scrape_fallback] SUMMARY requested_books=%s total_rows=%d", requested_books, len(result), ) return result, { "adapter_status_by_book": adapter_status_by_book, "adapter_error_by_book": adapter_error_by_book, "adapter_rows_by_book": adapter_rows_by_book, "adapter_last_attempted_at_by_book": adapter_last_attempted_at_by_book, "adapter_retry_after_by_book": adapter_retry_after_by_book, } # --------------------------------------------------------------------------- # DraftKings # --------------------------------------------------------------------------- def _fetch_draftkings(self) -> pd.DataFrame: headers = dict(_HEADERS) headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" r = requests.get(_DK_HR_ROUTE, headers=headers, timeout=20) _log.warning("[dk_scrape] HTTP %s route=%s", r.status_code, _DK_HR_ROUTE) r.raise_for_status() return self._parse_dk_route_html(r.text) def _extract_dk_json_candidates(self, html_text: str) -> list[Any]: if not html_text: return [] patterns = [ r']*id="__NEXT_DATA__"[^>]*>\s*(.*?)\s*', r']*>\s*window\.__NEXT_DATA__\s*=\s*(\{.*?\})\s*;?\s*', r']*>\s*window\.__PRELOADED_STATE__\s*=\s*(\{.*?\})\s*;?\s*', r']*type="application/json"[^>]*>\s*(.*?)\s*', ] candidates: list[Any] = [] for pattern in patterns: for match in re.finditer(pattern, html_text, flags=re.IGNORECASE | re.DOTALL): blob = str(match.group(1) or "").strip() if not blob or blob[:1] not in {"{", "["}: continue try: parsed = json.loads(blob) except Exception: continue candidates.append(parsed) return candidates def _walk_json(self, payload: Any): if isinstance(payload, dict): yield payload for value in payload.values(): yield from self._walk_json(value) elif isinstance(payload, list): for value in payload: yield from self._walk_json(value) def _parse_dk_route_html(self, html_text: str) -> pd.DataFrame: rows: list[dict[str, Any]] = [] for candidate in self._extract_dk_json_candidates(html_text): for node in self._walk_json(candidate): event_group = None if isinstance(node, dict) and isinstance(node.get("eventGroup"), dict): event_group = node["eventGroup"] elif isinstance(node, dict) and any( key in node for key in ("offerCategories", "offerSubcategoryDescriptors") ): event_group = node if not isinstance(event_group, dict): continue parsed = self._parse_dk({"eventGroup": event_group}) if not parsed.empty: rows.extend(parsed.to_dict("records")) if not rows: return pd.DataFrame() deduped = pd.DataFrame(rows).drop_duplicates( subset=[ "event_id", "player_name", "sportsbook_key", "market_key", "selection_label", "line", "odds_american", ], keep="first", ) return deduped.reset_index(drop=True) def _parse_dk(self, data: dict) -> pd.DataFrame: rows: list[dict[str, Any]] = [] event_group = data.get("eventGroup", {}) for offer_cat in event_group.get("offerCategories", []): for sub_desc in offer_cat.get("offerSubcategoryDescriptors", []): for offer_group in sub_desc.get("offerGroups", []): event_id = str(offer_group.get("eventId", "") or "") event_desc = str(offer_group.get("eventDescription", "") or "") parts = [p.strip() for p in event_desc.replace(" vs ", " @ ").split(" @ ")] away_team = parts[0] if len(parts) >= 2 else "" home_team = parts[1] if len(parts) >= 2 else "" commence_time = str(offer_group.get("startDate", "") or "") for offer_list in offer_group.get("offers", []): for offer in (offer_list if isinstance(offer_list, list) else [offer_list]): for outcome in offer.get("outcomes", []): player_name_raw = str( offer.get("label", "") or outcome.get("label", "") or outcome.get("participant", "") or outcome.get("name", "") or "" ).strip() if not player_name_raw: continue price = _extract_american_odds( outcome.get("oddsAmerican"), outcome.get("odds_american"), outcome.get("price"), ) if price is None: continue rows.append(_make_row( self.provider_name, event_id, commence_time, away_team, home_team, "DraftKings", "draftkings", player_name_raw, price, selection_label=str(outcome.get("label", "") or outcome.get("name", "") or "").strip() or None, )) return pd.DataFrame(rows) # --------------------------------------------------------------------------- # FanDuel # --------------------------------------------------------------------------- def _fetch_fanduel(self) -> pd.DataFrame: url = ( f"{_FD_BASE}/content-managed-page" f"?page=SPORT_LEAGUE&countryCode=US®ionCode=IL" f"&channel=BASEBALL&lang=en-US&_ak={_FD_AK}" ) r = requests.get(url, headers=_HEADERS, timeout=20) _log.warning("[fd_scrape] HTTP %s", r.status_code) r.raise_for_status() return self._parse_fd(r.json()) def _parse_fd(self, data: dict) -> pd.DataFrame: rows: list[dict[str, Any]] = [] attachments = data.get("attachments", {}) events = attachments.get("events", {}) markets = attachments.get("markets", {}) for _market_id, market in markets.items(): market_type = str(market.get("marketType", "") or "") market_name = str(market.get("marketName", "") or market.get("name", "") or market.get("title", "") or "") if not _looks_like_hr_market(market_type, market_name): continue event_id = str(market.get("eventId", "") or "") event = events.get(str(event_id), {}) away_team = str( event.get("awayTeam", {}).get("name", "") or event.get("awayTeamName", "") or "" ) home_team = str( event.get("homeTeam", {}).get("name", "") or event.get("homeTeamName", "") or "" ) commence_time = str(event.get("openDate", "") or "") for runner in market.get("runners", []): player_name_raw = str( runner.get("runnerName", "") or runner.get("runnerTitle", "") or runner.get("name", "") or "" ).strip() if not player_name_raw: continue price = _extract_american_odds( ( runner.get("winRunnerOdds", {}) .get("americanDisplayOdds", {}) .get("americanOdds", "") ), runner.get("price"), runner.get("odds"), ) if price is None: continue rows.append(_make_row( self.provider_name, event_id, commence_time, away_team, home_team, "FanDuel", "fanduel", player_name_raw, price, selection_label=str(runner.get("result", "") or runner.get("name", "") or "").strip() or None, )) return pd.DataFrame(rows) # --------------------------------------------------------------------------- # BetMGM # --------------------------------------------------------------------------- def _fetch_betmgm(self) -> pd.DataFrame: url = ( "https://sports.nj.betmgm.com/en/sports/api/v2/leagues/baseball-mlb/events" f"?lang=en-us&x-bwin-accessid={_BETMGM_ACCESS_ID}" ) r = requests.get(url, headers=_HEADERS, timeout=20) _log.warning("[betmgm_scrape] HTTP %s", r.status_code) r.raise_for_status() return self._parse_betmgm(r.json()) def _parse_betmgm(self, data: dict | list) -> pd.DataFrame: rows: list[dict[str, Any]] = [] events = ( data if isinstance(data, list) else data.get("result", {}).get("dataList", data.get("events", [])) ) for event in events: event_id = str(event.get("id", "") or "") name_obj = event.get("name", {}) name = str(name_obj.get("value", "") if isinstance(name_obj, dict) else name_obj or "") parts = [p.strip() for p in name.replace(" vs ", " @ ").split(" @ ")] away_team = parts[0] if len(parts) >= 2 else "" home_team = parts[1] if len(parts) >= 2 else "" commence_time = str(event.get("startDate", "") or "") for fixture in event.get("markets", []): mkt_name_obj = fixture.get("name", {}) mkt_name = str( mkt_name_obj.get("value", "") if isinstance(mkt_name_obj, dict) else mkt_name_obj or "" ) if not _looks_like_hr_market(mkt_name, fixture.get("type"), fixture.get("key")): continue for selection in fixture.get("selections", []): sel_name_obj = selection.get("name", {}) player_name_raw = str( sel_name_obj.get("value", "") if isinstance(sel_name_obj, dict) else sel_name_obj or "" ).strip() if not player_name_raw: continue price = _extract_american_odds( selection.get("price", {}).get("americanOdds"), selection.get("americanOdds"), selection.get("price"), ) if price is None: continue rows.append(_make_row( self.provider_name, event_id, commence_time, away_team, home_team, "BetMGM", "betmgm", player_name_raw, price, selection_label=str(selection.get("result", "") or "").strip() or None, )) return pd.DataFrame(rows) # --------------------------------------------------------------------------- # Caesars # --------------------------------------------------------------------------- def _fetch_caesars(self) -> pd.DataFrame: url = ( "https://api.levelmgr.caesarssportsbook.com/api/v1" "/leagues/baseball-mlb/player-props/home-run" ) r = requests.get(url, headers=_HEADERS, timeout=20) _log.warning("[caesars_scrape] HTTP %s", r.status_code) r.raise_for_status() return self._parse_caesars(r.json()) def _parse_caesars(self, data: dict | list) -> pd.DataFrame: rows: list[dict[str, Any]] = [] items = ( data if isinstance(data, list) else data.get("data", data.get("events", data.get("items", []))) ) for item in items: event_id = str(item.get("eventId", item.get("id", "")) or "") away_team = str(item.get("awayTeamName", item.get("away_team", "")) or "") home_team = str(item.get("homeTeamName", item.get("home_team", "")) or "") commence_time = str(item.get("eventDate", item.get("startTime", "")) or "") selections = item.get("participants", item.get("props", item.get("selections", []))) for prop in selections: if not _looks_like_hr_market( item.get("marketName"), item.get("name"), prop.get("marketName"), prop.get("name"), ): continue player_name_raw = str( prop.get("name", prop.get("participantName", prop.get("playerName", ""))) or "" ).strip() if not player_name_raw: continue price = _extract_american_odds( prop.get("odds", {}).get("american") if isinstance(prop.get("odds"), dict) else None, prop.get("americanOdds"), prop.get("price"), ) if price is None: continue rows.append(_make_row( self.provider_name, event_id, commence_time, away_team, home_team, "Caesars", "williamhill_us", player_name_raw, price, selection_label=str(prop.get("result", "") or prop.get("selection", "") or "").strip() or None, )) return pd.DataFrame(rows)