Spaces:
Running
Running
"""
data/provider_scrape.py

Fallback scraper for HR props when The Odds API hasn't yet indexed player props.
Hits each book's semi-public endpoints directly using requests only (no browser).
FanDuel, BetMGM and Caesars are read from JSON responses; DraftKings is read from
JSON embedded in its HTML route page.

Books: DraftKings, FanDuel, BetMGM, Caesars

Each book's fetch is independent — one failure does not block the others.
Results are concatenated across all books that respond successfully.
"""
| from __future__ import annotations | |
| import json | |
| import logging | |
| import re | |
| import threading | |
| import time | |
| from typing import Any | |
| import pandas as pd | |
| import requests | |
| from data.market_provider_base import MarketProviderBase | |
| from data.odds_name_map import map_odds_name_to_model_name | |
# Module-level logger shared by every adapter in this file.
_log = logging.getLogger(__name__)

# Browser-like default headers sent with every outbound request.
_HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/120.0.0.0 Safari/537.36",
        )
    ),
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-US,en;q=0.9",
}

# DraftKings endpoints.
_DK_BASE = "https://sportsbook-nash.draftkings.com/api/odds/v1/league/84240"
_DK_HR_ROUTE = (
    "https://sportsbook.draftkings.com/leagues/baseball/mlb"
    "?category=batter-props&subcategory=home-runs"
)

# FanDuel API base and the `_ak` query parameter value sent with it.
_FD_BASE = "https://sbapi.il.sportsbook.fanduel.com/api"
_FD_AK = "FhMFpcPWXMeyZxOx"

# Value sent as BetMGM's `x-bwin-accessid` query parameter.
_BETMGM_ACCESS_ID = "NmFjNjYzMmQtMmZlNS00MDQ3LWIzZjctNGMxMjhmNjNmNWVm"

# Sportsbook key -> name of the provider method that fetches that book.
# Insertion order is the default fetch order when no books are requested.
_BOOK_FETCHERS = {
    "draftkings": "_fetch_draftkings",
    "fanduel": "_fetch_fanduel",
    "betmgm": "_fetch_betmgm",
    "williamhill_us": "_fetch_caesars",
}

# Lower-case substrings that identify a home-run market in a label.
_HR_MARKET_KEYWORDS = (
    "home run",
    "home_run",
    "homer",
    "to hit a home run",
    "to record a home run",
    "player to hit a home run",
    "player home runs",
    "anytime hr",
    "anytime home run",
)

# Per-book adapter health, guarded by the lock below.
_ADAPTER_STATE_LOCK = threading.Lock()
_ADAPTER_STATE: dict[str, dict[str, Any]] = {}

# Back-off windows (seconds) applied after a failed or empty fetch.
_HARD_FAILURE_RETRY_SECONDS = 30 * 60
_SOFT_FAILURE_RETRY_SECONDS = 10 * 60
def _looks_like_hr_market(*values: Any) -> bool:
    """Return True when any of *values* mentions a home-run market keyword.

    Each value is stringified, stripped and lower-cased; falsy or blank values
    are ignored. The remaining fragments are joined and searched for any entry
    of ``_HR_MARKET_KEYWORDS``.
    """
    fragments = []
    for value in values:
        text = str(value or "").strip()
        if text:
            fragments.append(text.lower())
    if not fragments:
        return False
    joined = " | ".join(fragments)
    for keyword in _HR_MARKET_KEYWORDS:
        if keyword in joined:
            return True
    return False
def _extract_american_odds(*values: Any) -> int | None:
    """Return the first candidate in *values* that parses as American odds.

    Candidates are tried in order; ``None``, blank and unparseable values are
    skipped. Accepted spellings:

    * integers and integer strings, with an optional leading ``+`` ("+150");
    * a Unicode minus sign (U+2212) in place of ASCII ``-``;
    * integral floats or float strings (``150.0``, ``"-110.0"``);
    * the even-money labels "even", "evens" and "ev" (returned as ``100``).

    Non-integral numbers such as ``"2.5"`` are rejected, because they are not
    American odds.

    Returns:
        The parsed odds as an ``int``, or ``None`` when no candidate parses.
    """
    for value in values:
        if value is None:
            continue
        try:
            text = str(value).strip()
        except (TypeError, ValueError):
            continue
        if not text:
            continue
        if text.lower() in ("even", "evens", "ev"):
            return 100
        # int() rejects the typographic minus, so normalise it first.
        cleaned = text.replace("\u2212", "-").replace("+", "")
        try:
            return int(cleaned)
        except ValueError:
            pass
        try:
            number = float(cleaned)
        except ValueError:
            continue
        # is_integer() is False for nan/inf, so those are skipped here too.
        if number.is_integer():
            return int(number)
    return None
def _utc_epoch_seconds() -> float:
    """Return the current time as float seconds since the Unix epoch."""
    now = time.time()
    return float(now)
def _utc_iso_now() -> str:
    """Return the current UTC time as an ISO-8601 string with a UTC offset.

    Uses ``Timestamp.now(tz="UTC")`` rather than ``Timestamp.utcnow()``, which
    newer pandas releases deprecate; both yield a timezone-aware UTC timestamp.
    """
    return pd.Timestamp.now(tz="UTC").isoformat()
def _book_display_name(book_key: str) -> str:
    """Map a sportsbook key to its display label.

    The lookup is case-insensitive and ignores surrounding whitespace. Unknown
    keys are returned stripped but otherwise unchanged; falsy keys give "".
    """
    cleaned = str(book_key or "").strip()
    known = {
        "draftkings": "DraftKings",
        "fanduel": "FanDuel",
        "betmgm": "BetMGM",
        "williamhill_us": "Caesars",
    }
    lookup = cleaned.lower()
    if lookup in known:
        return known[lookup]
    return cleaned
def _is_hard_failure(status: str, error_text: str) -> bool:
    """Return True when the status/error text contains a hard-failure marker.

    The two arguments are combined and lower-cased, then searched for each
    marker as a plain substring.
    """
    combined = f"{status} {error_text}".lower()
    for marker in ("404", "400", "tls", "ssl", "parse", "json", "not found"):
        if marker in combined:
            return True
    return False
def _get_adapter_state(book_key: str) -> dict[str, Any]:
    """Return a shallow copy of the stored adapter state for *book_key*.

    An empty dict is returned when nothing has been recorded for the book. The
    copy is taken while holding ``_ADAPTER_STATE_LOCK``.
    """
    with _ADAPTER_STATE_LOCK:
        stored = _ADAPTER_STATE.get(book_key, {})
        snapshot = dict(stored)
    return snapshot
def _set_adapter_state(
    book_key: str,
    *,
    status: str,
    error: str = "",
    rows_returned: int = 0,
) -> dict[str, Any]:
    """Record the outcome of a fetch attempt for *book_key* and return it.

    A ``"healthy"`` status resets the failure streak and clears the retry
    deadline. Any other status (including ``"empty_result"``) increments the
    streak and sets a retry deadline: ``_HARD_FAILURE_RETRY_SECONDS`` when
    ``_is_hard_failure`` matches, otherwise ``_SOFT_FAILURE_RETRY_SECONDS``.

    The streak is read and the new state written under one acquisition of
    ``_ADAPTER_STATE_LOCK``, so concurrent updates for the same book cannot
    lose an increment.

    Args:
        book_key: Sportsbook key the state belongs to.
        status: Outcome label; ``"healthy"`` means success.
        error: Error text associated with a failure, if any.
        rows_returned: Number of rows the fetch produced.

    Returns:
        A copy of the state dict that was stored.
    """
    now_epoch = _utc_epoch_seconds()
    now_iso = _utc_iso_now()
    is_healthy = status == "healthy"

    if is_healthy:
        retry_after_epoch = None
        retry_after_iso = ""
    else:
        delay_seconds = (
            _HARD_FAILURE_RETRY_SECONDS
            if _is_hard_failure(status, error)
            else _SOFT_FAILURE_RETRY_SECONDS
        )
        retry_after_epoch = now_epoch + delay_seconds
        retry_after_iso = pd.Timestamp.fromtimestamp(
            retry_after_epoch, tz="UTC"
        ).isoformat()

    state = {
        "adapter_status": status,
        "adapter_error": str(error or ""),
        "adapter_rows_returned": int(rows_returned or 0),
        "last_attempted_at": now_iso,
        "retry_after": retry_after_iso,
        "failure_streak": 0,
        "_retry_after_epoch": retry_after_epoch,
    }

    # The lock is not re-entrant, so read the dict directly here instead of
    # calling _get_adapter_state().
    with _ADAPTER_STATE_LOCK:
        if not is_healthy:
            previous = _ADAPTER_STATE.get(book_key) or {}
            state["failure_streak"] = int(previous.get("failure_streak") or 0) + 1
        _ADAPTER_STATE[book_key] = state
        return dict(state)
def _throttled_adapter_state(book_key: str) -> dict[str, Any] | None:
    """Return the book's state marked "throttled" while its back-off is active.

    Returns ``None`` when no retry deadline is recorded or the deadline has
    already passed; otherwise a copy of the stored state whose
    ``adapter_status`` is overwritten with ``"throttled"``.
    """
    snapshot = _get_adapter_state(book_key)
    deadline = snapshot.get("_retry_after_epoch")
    if deadline is None or float(deadline) <= _utc_epoch_seconds():
        return None
    return {**snapshot, "adapter_status": "throttled"}
def _make_row(
    provider_name: str,
    event_id: str,
    commence_time: str,
    away_team: str,
    home_team: str,
    sportsbook: str,
    sportsbook_key: str,
    player_name_raw: str,
    odds_american: int,
    line: float = 0.5,
    selection_label: str | None = None,
) -> dict[str, Any]:
    """Build one output row for a home-run prop.

    The market columns are fixed (``market_key="batter_home_runs"``,
    ``market="hr"``). ``player_name`` is derived from *player_name_raw* via
    ``map_odds_name_to_model_name``; every other column is passed through.
    """
    row: dict[str, Any] = dict(
        provider=provider_name,
        event_id=event_id,
        commence_time=commence_time,
        away_team=away_team,
        home_team=home_team,
        sportsbook=sportsbook,
        sportsbook_key=sportsbook_key,
        market_key="batter_home_runs",
        market="hr",
        player_name_raw=player_name_raw,
        selection_label=selection_label,
    )
    row["player_name"] = map_odds_name_to_model_name(player_name_raw)
    row["odds_american"] = odds_american
    row["line"] = line
    return row
class ScrapeFallbackProvider(MarketProviderBase):
    """Fallback home-run prop provider that queries each sportsbook directly.

    Each supported book has a ``_fetch_*`` method that performs one HTTP GET
    and a ``_parse_*`` method that turns the payload into rows built by
    ``_make_row``. Fetches are isolated from each other, and per-book health
    is tracked through the module-level adapter state helpers so that a
    failing book is skipped until its back-off expires.

    The payload shapes handled by the parsers are whatever the parsers below
    look for; malformed or unexpected nodes are skipped rather than raised.
    """

    provider_name = "scrape_fallback"

    # Timeout (seconds) applied to every outbound request.
    _REQUEST_TIMEOUT_SECONDS = 20

    # <script> patterns that may embed DraftKings page state as JSON.
    _DK_SCRIPT_PATTERNS = tuple(
        re.compile(pattern, flags=re.IGNORECASE | re.DOTALL)
        for pattern in (
            r'<script[^>]*id="__NEXT_DATA__"[^>]*>\s*(.*?)\s*</script>',
            r'<script[^>]*>\s*window\.__NEXT_DATA__\s*=\s*(\{.*?\})\s*;?\s*</script>',
            r'<script[^>]*>\s*window\.__PRELOADED_STATE__\s*=\s*(\{.*?\})\s*;?\s*</script>',
            r'<script[^>]*type="application/json"[^>]*>\s*(.*?)\s*</script>',
        )
    )

    # Columns that identify a unique DraftKings row when de-duplicating.
    _DK_DEDUPE_COLUMNS = [
        "event_id",
        "player_name",
        "sportsbook_key",
        "market_key",
        "selection_label",
        "line",
        "odds_american",
    ]

    # ---------------------------------------------------------------------------
    # Small payload helpers
    # ---------------------------------------------------------------------------
    @staticmethod
    def _as_dict(value: Any) -> dict:
        """Return *value* if it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _dict_items(value: Any) -> list[dict]:
        """Return the dict elements of *value* when it is a list, else []."""
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, dict)]

    @staticmethod
    def _first_present(mapping: dict, *keys: str) -> Any:
        """Return the first truthy ``mapping[key]`` among *keys*, else ""."""
        for key in keys:
            value = mapping.get(key)
            if value:
                return value
        return ""

    @staticmethod
    def _localized_text(value: Any) -> str:
        """Return text from a plain value or a ``{"value": ...}`` wrapper."""
        if isinstance(value, dict):
            value = value.get("value")
        # `or ""` keeps a null value from becoming the literal string "None".
        return str(value or "")

    @staticmethod
    def _split_matchup(description: str) -> tuple[str, str]:
        """Split "Away @ Home" / "Away vs Home" into (away, home).

        Returns ("", "") when the text does not contain a separator.
        """
        parts = [part.strip() for part in description.replace(" vs ", " @ ").split(" @ ")]
        if len(parts) >= 2:
            return parts[0], parts[1]
        return "", ""

    @staticmethod
    def _record_adapter_state(
        meta: dict[str, Any],
        book_key: str,
        state: dict[str, Any],
        default_status: str = "",
    ) -> None:
        """Copy one book's adapter state into the per-book metadata maps."""
        meta["adapter_status_by_book"][book_key] = str(
            state.get("adapter_status") or default_status
        )
        meta["adapter_error_by_book"][book_key] = str(state.get("adapter_error") or "")
        meta["adapter_rows_by_book"][book_key] = int(state.get("adapter_rows_returned") or 0)
        meta["adapter_last_attempted_at_by_book"][book_key] = str(
            state.get("last_attempted_at") or ""
        )
        meta["adapter_retry_after_by_book"][book_key] = str(state.get("retry_after") or "")

    # ---------------------------------------------------------------------------
    # Public interface
    # ---------------------------------------------------------------------------
    def fetch_live_prop_odds(
        self,
        game_context,
        sportsbooks=None,
        markets=None,
    ) -> pd.DataFrame:
        """Return an empty frame; this provider performs no per-game lookup."""
        return pd.DataFrame()

    def fetch_all_upcoming_hr_props(self, sportsbooks=None, markets=None) -> pd.DataFrame:
        """Return the rows from ``fetch_all_upcoming_hr_props_with_meta`` only."""
        result, _ = self.fetch_all_upcoming_hr_props_with_meta(
            sportsbooks=sportsbooks,
            markets=markets,
        )
        return result

    def fetch_all_upcoming_hr_props_with_meta(
        self,
        sportsbooks=None,
        markets=None,
    ) -> tuple[pd.DataFrame, dict[str, Any]]:
        """Fetch HR props from every requested book and report adapter health.

        Args:
            sportsbooks: Iterable of sportsbook keys, or a single key string.
                Falsy means every key in ``_BOOK_FETCHERS``. Keys are
                normalised (stripped, lower-cased) and de-duplicated in order;
                keys without a fetcher are skipped.
            markets: Ignored.

        Returns:
            ``(rows, meta)`` where ``rows`` concatenates every non-empty book
            result and ``meta`` maps each attempted or throttled book to its
            status, error text, row count, last-attempt time and retry time.
        """
        del markets

        # A bare string would otherwise be iterated character by character.
        if isinstance(sportsbooks, str):
            sportsbooks = [sportsbooks] if sportsbooks.strip() else None
        # dict.fromkeys de-duplicates while keeping order, so no book is
        # fetched twice in a single call.
        requested_books = list(
            dict.fromkeys(
                str(book or "").strip().lower()
                for book in (sportsbooks or list(_BOOK_FETCHERS.keys()))
            )
        )

        frames = []
        meta: dict[str, Any] = {
            "adapter_status_by_book": {},
            "adapter_error_by_book": {},
            "adapter_rows_by_book": {},
            "adapter_last_attempted_at_by_book": {},
            "adapter_retry_after_by_book": {},
        }

        for book_key in requested_books:
            fetch_name = _BOOK_FETCHERS.get(book_key)
            if not fetch_name:
                continue

            throttled_state = _throttled_adapter_state(book_key)
            if throttled_state is not None:
                self._record_adapter_state(
                    meta, book_key, throttled_state, default_status="throttled"
                )
                _log.warning(
                    "[scrape_fallback] %s (%s) throttled until %s",
                    fetch_name,
                    book_key,
                    meta["adapter_retry_after_by_book"][book_key] or "unknown",
                )
                continue

            fetch_fn = getattr(self, fetch_name)
            try:
                df = fetch_fn()
                state = _set_adapter_state(
                    book_key,
                    status="healthy" if not df.empty else "empty_result",
                    rows_returned=len(df),
                )
                if not df.empty:
                    frames.append(df)
                    _log.warning(
                        "[scrape_fallback] %s (%s) returned %d rows",
                        fetch_name,
                        book_key,
                        len(df),
                    )
                else:
                    _log.warning(
                        "[scrape_fallback] %s (%s) returned 0 rows",
                        fetch_name,
                        book_key,
                    )
            except Exception as exc:
                # Deliberately broad: one book failing must not block the rest.
                state = _set_adapter_state(
                    book_key,
                    status=exc.__class__.__name__.lower() or "error",
                    error=str(exc),
                    rows_returned=0,
                )
                _log.warning(
                    "[scrape_fallback] %s (%s) failed: %s",
                    fetch_name,
                    book_key,
                    exc,
                )
            self._record_adapter_state(meta, book_key, state)

        result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        _log.warning(
            "[scrape_fallback] SUMMARY requested_books=%s total_rows=%d",
            requested_books,
            len(result),
        )
        return result, meta

    # ---------------------------------------------------------------------------
    # DraftKings
    # ---------------------------------------------------------------------------
    def _fetch_draftkings(self) -> pd.DataFrame:
        """GET the DraftKings HR route page and parse its embedded JSON.

        Raises:
            requests.HTTPError: On a non-success HTTP status.
        """
        headers = {
            **_HEADERS,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        }
        response = requests.get(
            _DK_HR_ROUTE, headers=headers, timeout=self._REQUEST_TIMEOUT_SECONDS
        )
        _log.warning("[dk_scrape] HTTP %s route=%s", response.status_code, _DK_HR_ROUTE)
        response.raise_for_status()
        return self._parse_dk_route_html(response.text)

    def _extract_dk_json_candidates(self, html_text: str) -> list[Any]:
        """Return every JSON document found in the page's <script> tags.

        Blobs that do not start with ``{`` or ``[`` or that fail to decode are
        skipped. The patterns overlap, so identical blobs are parsed once.
        """
        if not html_text:
            return []
        candidates: list[Any] = []
        seen_blobs: set[str] = set()
        for pattern in self._DK_SCRIPT_PATTERNS:
            for match in pattern.finditer(html_text):
                blob = str(match.group(1) or "").strip()
                if not blob or blob[:1] not in {"{", "["}:
                    continue
                if blob in seen_blobs:
                    continue
                seen_blobs.add(blob)
                try:
                    parsed = json.loads(blob)
                except (ValueError, RecursionError):
                    continue
                candidates.append(parsed)
        return candidates

    def _walk_json(self, payload: Any):
        """Yield every dict nested anywhere inside *payload*, parents first.

        Implemented with an explicit stack, so deeply nested payloads cannot
        exhaust the interpreter's recursion limit.
        """
        stack = [payload]
        while stack:
            current = stack.pop()
            if isinstance(current, dict):
                yield current
                # Reversed so children pop off the stack in original order.
                stack.extend(reversed(list(current.values())))
            elif isinstance(current, list):
                stack.extend(reversed(current))

    def _parse_dk_route_html(self, html_text: str) -> pd.DataFrame:
        """Parse DraftKings rows out of the route page HTML.

        Every embedded dict that has an ``eventGroup`` dict, or that carries
        an ``offerCategories`` / ``offerSubcategoryDescriptors`` key itself,
        is treated as an event group. Nested groups can be visited more than
        once, so rows are de-duplicated before returning.
        """
        rows: list[dict[str, Any]] = []
        for candidate in self._extract_dk_json_candidates(html_text):
            for node in self._walk_json(candidate):
                event_group = node.get("eventGroup")
                if not isinstance(event_group, dict):
                    if "offerCategories" in node or "offerSubcategoryDescriptors" in node:
                        event_group = node
                    else:
                        continue
                rows.extend(self._dk_rows(event_group))
        if not rows:
            return pd.DataFrame()
        deduped = pd.DataFrame(rows).drop_duplicates(
            subset=self._DK_DEDUPE_COLUMNS,
            keep="first",
        )
        return deduped.reset_index(drop=True)

    def _parse_dk(self, data: dict) -> pd.DataFrame:
        """Return a frame of rows for the ``eventGroup`` found in *data*."""
        event_group = self._as_dict(self._as_dict(data).get("eventGroup"))
        return pd.DataFrame(self._dk_rows(event_group))

    def _dk_rows(self, event_group: dict) -> list[dict[str, Any]]:
        """Build row dicts from one DraftKings event-group dict.

        NOTE(review): unlike the other parsers, no home-run keyword filter is
        applied here; every priced outcome under the group becomes a row.
        Confirm the fetched route only ever carries HR offers.
        """
        rows: list[dict[str, Any]] = []
        for offer_cat in self._dict_items(event_group.get("offerCategories")):
            for sub_desc in self._dict_items(offer_cat.get("offerSubcategoryDescriptors")):
                for offer_group in self._dict_items(sub_desc.get("offerGroups")):
                    event_id = str(offer_group.get("eventId", "") or "")
                    event_desc = str(offer_group.get("eventDescription", "") or "")
                    away_team, home_team = self._split_matchup(event_desc)
                    commence_time = str(offer_group.get("startDate", "") or "")

                    raw_offers = offer_group.get("offers")
                    if not isinstance(raw_offers, list):
                        continue
                    for offer_entry in raw_offers:
                        # Entries may be a single offer or a list of offers.
                        offers = offer_entry if isinstance(offer_entry, list) else [offer_entry]
                        for offer in self._dict_items(offers):
                            for outcome in self._dict_items(offer.get("outcomes")):
                                player_name_raw = str(
                                    offer.get("label", "")
                                    or outcome.get("label", "")
                                    or outcome.get("participant", "")
                                    or outcome.get("name", "")
                                    or ""
                                ).strip()
                                if not player_name_raw:
                                    continue
                                price = _extract_american_odds(
                                    outcome.get("oddsAmerican"),
                                    outcome.get("odds_american"),
                                    outcome.get("price"),
                                )
                                if price is None:
                                    continue
                                selection_label = str(
                                    outcome.get("label", "") or outcome.get("name", "") or ""
                                ).strip()
                                rows.append(_make_row(
                                    self.provider_name, event_id, commence_time,
                                    away_team, home_team,
                                    "DraftKings", "draftkings",
                                    player_name_raw, price,
                                    selection_label=selection_label or None,
                                ))
        return rows

    # ---------------------------------------------------------------------------
    # FanDuel
    # ---------------------------------------------------------------------------
    def _fetch_fanduel(self) -> pd.DataFrame:
        """GET FanDuel's content-managed baseball page and parse it.

        Raises:
            requests.HTTPError: On a non-success HTTP status.
        """
        # Passing the query as `params` lets requests build the string, which
        # keeps "&regionCode" from being mangled into an HTML entity.
        params = {
            "page": "SPORT_LEAGUE",
            "countryCode": "US",
            "regionCode": "IL",
            "channel": "BASEBALL",
            "lang": "en-US",
            "_ak": _FD_AK,
        }
        response = requests.get(
            f"{_FD_BASE}/content-managed-page",
            params=params,
            headers=_HEADERS,
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )
        _log.warning("[fd_scrape] HTTP %s", response.status_code)
        response.raise_for_status()
        return self._parse_fd(response.json())

    def _parse_fd(self, data: dict) -> pd.DataFrame:
        """Build rows from the ``attachments`` section of a FanDuel payload.

        Only markets whose type or name looks like a home-run market are
        kept. Runners without a name or parseable odds are skipped.
        """
        rows: list[dict[str, Any]] = []
        attachments = self._as_dict(self._as_dict(data).get("attachments"))
        events = self._as_dict(attachments.get("events"))
        markets = self._as_dict(attachments.get("markets"))

        for market in markets.values():
            if not isinstance(market, dict):
                continue
            market_type = str(market.get("marketType", "") or "")
            market_name = str(
                market.get("marketName", "")
                or market.get("name", "")
                or market.get("title", "")
                or ""
            )
            if not _looks_like_hr_market(market_type, market_name):
                continue

            event_id = str(market.get("eventId", "") or "")
            event = self._as_dict(events.get(event_id))
            away_team = str(
                self._as_dict(event.get("awayTeam")).get("name", "")
                or event.get("awayTeamName", "")
                or ""
            )
            home_team = str(
                self._as_dict(event.get("homeTeam")).get("name", "")
                or event.get("homeTeamName", "")
                or ""
            )
            commence_time = str(event.get("openDate", "") or "")

            for runner in self._dict_items(market.get("runners")):
                player_name_raw = str(
                    runner.get("runnerName", "")
                    or runner.get("runnerTitle", "")
                    or runner.get("name", "")
                    or ""
                ).strip()
                if not player_name_raw:
                    continue
                display_odds = self._as_dict(
                    self._as_dict(runner.get("winRunnerOdds")).get("americanDisplayOdds")
                )
                price = _extract_american_odds(
                    display_odds.get("americanOdds"),
                    runner.get("price"),
                    runner.get("odds"),
                )
                if price is None:
                    continue
                selection_label = str(
                    runner.get("result", "") or runner.get("name", "") or ""
                ).strip()
                rows.append(_make_row(
                    self.provider_name, event_id, commence_time,
                    away_team, home_team,
                    "FanDuel", "fanduel",
                    player_name_raw, price,
                    selection_label=selection_label or None,
                ))
        return pd.DataFrame(rows)

    # ---------------------------------------------------------------------------
    # BetMGM
    # ---------------------------------------------------------------------------
    def _fetch_betmgm(self) -> pd.DataFrame:
        """GET BetMGM's MLB events endpoint and parse it.

        Raises:
            requests.HTTPError: On a non-success HTTP status.
        """
        response = requests.get(
            "https://sports.nj.betmgm.com/en/sports/api/v2/leagues/baseball-mlb/events",
            params={"lang": "en-us", "x-bwin-accessid": _BETMGM_ACCESS_ID},
            headers=_HEADERS,
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )
        _log.warning("[betmgm_scrape] HTTP %s", response.status_code)
        response.raise_for_status()
        return self._parse_betmgm(response.json())

    def _parse_betmgm(self, data: dict | list) -> pd.DataFrame:
        """Build rows from a BetMGM events payload.

        *data* may be a list of events, or a dict holding them under
        ``result.dataList`` or ``events``. Names may be plain strings or
        ``{"value": ...}`` objects. Only markets that look like home-run
        markets are kept.
        """
        rows: list[dict[str, Any]] = []
        if isinstance(data, list):
            events = data
        else:
            payload = self._as_dict(data)
            result = self._as_dict(payload.get("result"))
            events = result.get("dataList", payload.get("events"))

        for event in self._dict_items(events):
            event_id = str(event.get("id", "") or "")
            away_team, home_team = self._split_matchup(
                self._localized_text(event.get("name"))
            )
            commence_time = str(event.get("startDate", "") or "")

            for fixture in self._dict_items(event.get("markets")):
                market_name = self._localized_text(fixture.get("name"))
                if not _looks_like_hr_market(
                    market_name, fixture.get("type"), fixture.get("key")
                ):
                    continue
                for selection in self._dict_items(fixture.get("selections")):
                    player_name_raw = self._localized_text(selection.get("name")).strip()
                    if not player_name_raw:
                        continue
                    # "price" may be an object holding americanOdds or a bare
                    # number; only call .get on it when it is a dict.
                    price_field = selection.get("price")
                    price = _extract_american_odds(
                        self._as_dict(price_field).get("americanOdds"),
                        selection.get("americanOdds"),
                        None if isinstance(price_field, dict) else price_field,
                    )
                    if price is None:
                        continue
                    selection_label = str(selection.get("result", "") or "").strip()
                    rows.append(_make_row(
                        self.provider_name, event_id, commence_time,
                        away_team, home_team,
                        "BetMGM", "betmgm",
                        player_name_raw, price,
                        selection_label=selection_label or None,
                    ))
        return pd.DataFrame(rows)

    # ---------------------------------------------------------------------------
    # Caesars
    # ---------------------------------------------------------------------------
    def _fetch_caesars(self) -> pd.DataFrame:
        """GET the Caesars home-run player-props endpoint and parse it.

        Raises:
            requests.HTTPError: On a non-success HTTP status.
        """
        url = (
            "https://api.levelmgr.caesarssportsbook.com/api/v1"
            "/leagues/baseball-mlb/player-props/home-run"
        )
        response = requests.get(
            url, headers=_HEADERS, timeout=self._REQUEST_TIMEOUT_SECONDS
        )
        _log.warning("[caesars_scrape] HTTP %s", response.status_code)
        response.raise_for_status()
        return self._parse_caesars(response.json())

    def _parse_caesars(self, data: dict | list) -> pd.DataFrame:
        """Build rows from a Caesars player-props payload.

        *data* may be a list of items, or a dict holding them under ``data``,
        ``events`` or ``items``. Each item's selections are read from
        ``participants``, ``props`` or ``selections``. A selection is kept
        only when the item or selection labels look like a home-run market.
        """
        rows: list[dict[str, Any]] = []
        if isinstance(data, list):
            items = data
        else:
            payload = self._as_dict(data)
            items = payload.get("data", payload.get("events", payload.get("items")))

        for item in self._dict_items(items):
            event_id = str(self._first_present(item, "eventId", "id"))
            away_team = str(self._first_present(item, "awayTeamName", "away_team"))
            home_team = str(self._first_present(item, "homeTeamName", "home_team"))
            commence_time = str(self._first_present(item, "eventDate", "startTime"))
            selections = item.get(
                "participants", item.get("props", item.get("selections"))
            )

            for prop in self._dict_items(selections):
                if not _looks_like_hr_market(
                    item.get("marketName"),
                    item.get("name"),
                    prop.get("marketName"),
                    prop.get("name"),
                ):
                    continue
                # Fall through to the alternate keys when "name" is present
                # but empty, instead of dropping the selection.
                player_name_raw = str(
                    self._first_present(prop, "name", "participantName", "playerName")
                ).strip()
                if not player_name_raw:
                    continue
                odds_field = prop.get("odds")
                price = _extract_american_odds(
                    odds_field.get("american") if isinstance(odds_field, dict) else None,
                    prop.get("americanOdds"),
                    prop.get("price"),
                )
                if price is None:
                    continue
                selection_label = str(
                    prop.get("result", "") or prop.get("selection", "") or ""
                ).strip()
                rows.append(_make_row(
                    self.provider_name, event_id, commence_time,
                    away_team, home_team,
                    "Caesars", "williamhill_us",
                    player_name_raw, price,
                    selection_label=selection_label or None,
                ))
        return pd.DataFrame(rows)