Syntrex committed on
Commit
aa6a5bc
·
verified ·
1 Parent(s): 3aa81d5

Update data/schedule.py

Browse files
Files changed (1) hide show
  1. data/schedule.py +156 -74
data/schedule.py CHANGED
@@ -1,78 +1,160 @@
1
  from __future__ import annotations
2
 
3
- import os
4
-
5
- APP_TITLE = "WBC Analytics Assistant"
6
- REFRESH_TTL_SECONDS = 30
7
-
8
- DUCKDB_PATH = "data/wbc.duckdb"
9
-
10
- DEFAULT_EDGE_THRESHOLD = 0.05
11
- DEFAULT_CONFIDENCE_THRESHOLD = 0.70
12
-
13
- # Official / public sources
14
- WBC_SCHEDULE_PAGE_URL = "https://www.mlb.com/world-baseball-classic/schedule"
15
- WBC_HOME_URL = "https://www.mlb.com/world-baseball-classic"
16
- WBC_STATS_URL = "https://www.mlb.com/world-baseball-classic/stats"
17
- WBC_STANDINGS_URL = "https://www.mlb.com/world-baseball-classic/standings"
18
-
19
- # Baseball Savant WBC Statcast
20
- WBC_STATCAST_SEARCH_URL = "https://baseballsavant.mlb.com/statcast-search-world-baseball-classic"
21
- STATCAST_SEARCH_URL = "https://baseballsavant.mlb.com/statcast_search/csv"
22
-
23
- # MLB enrichment / fallback
24
- MLB_SCHEDULE_URL = "https://statsapi.mlb.com/api/v1/schedule"
25
- MLB_TEAMS_URL = "https://statsapi.mlb.com/api/v1/teams"
26
-
27
- # Weather
28
- OPENWEATHER_URL = "https://api.openweathermap.org/data/2.5/weather"
29
-
30
- # Odds
31
- ODDS_SPORT_KEY = "baseball_mlb"
32
- ODDS_BASE_URL = "https://api.the-odds-api.com/v4"
33
- ODDS_REGIONS = "us"
34
- ODDS_FEATURED_MARKETS = "h2h,spreads,totals"
35
- ODDS_FORMAT = "american"
36
-
37
- ODDS_API_KEY = os.getenv("ODDS_API_KEY", "")
38
- OPENWEATHER_API_KEY = os.getenv("OPENWEATHER_API_KEY", "")
39
-
40
- SUPPORTED_BOOKS = [
41
- "DraftKings",
42
- "FanDuel",
43
- "BetMGM",
44
- "Caesars",
45
- "bet365",
46
- "Pinnacle",
47
- ]
48
-
49
- WBC_2026_VENUES = {
50
- "Hiram Bithorn Stadium": {"lat": 18.3982, "lon": -66.0600, "city": "San Juan"},
51
- "Daikin Park": {"lat": 29.7573, "lon": -95.3555, "city": "Houston"},
52
- "Tokyo Dome": {"lat": 35.7056, "lon": 139.7519, "city": "Tokyo"},
53
- "loanDepot park": {"lat": 25.7781, "lon": -80.2197, "city": "Miami"},
54
  }
55
 
56
- WBC_TEAMS = [
57
- "Australia",
58
- "Brazil",
59
- "Canada",
60
- "China",
61
- "Chinese Taipei",
62
- "Colombia",
63
- "Cuba",
64
- "Czech Republic",
65
- "Dominican Republic",
66
- "Great Britain",
67
- "Israel",
68
- "Italy",
69
- "Japan",
70
- "Korea",
71
- "Mexico",
72
- "Netherlands",
73
- "Nicaragua",
74
- "Panama",
75
- "Puerto Rico",
76
- "United States",
77
- "Venezuela",
78
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
+ import json
4
+ import re
5
+ from datetime import datetime
6
+ from typing import Any
7
+
8
+ import pandas as pd
9
+ import requests
10
+
11
+ from config.settings import WBC_SCHEDULE_PAGE_URL
12
+
13
+
14
# Request headers passed to ``requests.get`` when fetching the schedule page.
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept-Language": "en-US,en;q=0.9",
}
18
 
19
+
20
+ def _safe_text(value: Any) -> str:
21
+ if value is None:
22
+ return ""
23
+ return str(value).strip()
24
+
25
+
26
+ def _extract_next_data_json(html: str) -> dict[str, Any] | None:
27
+ patterns = [
28
+ r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>',
29
+ r'window\.__PRELOADED_STATE__\s*=\s*({.+?});',
30
+ ]
31
+
32
+ for pattern in patterns:
33
+ match = re.search(pattern, html, flags=re.DOTALL)
34
+ if match:
35
+ raw = match.group(1)
36
+ try:
37
+ return json.loads(raw)
38
+ except json.JSONDecodeError:
39
+ continue
40
+ return None
41
+
42
+
43
def _rows_from_embedded_json(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Collect game rows from an embedded JSON payload of unknown shape.

    The payload is walked recursively. Any dict that yields both a home and an
    away team name is recorded as a game; the walk then continues into that
    dict's values, so nested game-like dicts are recorded as well.

    Team names are looked up under ``homeTeam``/``home_team``/``home`` (and the
    ``away`` equivalents). A dict value contributes its ``"name"`` entry.

    Args:
        payload: Decoded JSON (dicts, lists and scalars).

    Returns:
        One dict per detected game with the keys ``fetched_at``, ``game_id``,
        ``game_date``, ``status``, ``away_team``, ``home_team``,
        ``away_score``, ``home_score`` and ``venue``.
    """
    # Local import: the module itself only imports ``datetime`` from datetime.
    from datetime import timezone

    # ``datetime.utcnow()`` is deprecated; build the same naive-UTC value from
    # an aware "now", once, so every row of a batch shares one timestamp.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None)
    rows: list[dict[str, Any]] = []

    def team_name(node: dict[str, Any], side: str) -> str:
        """Return the ``side`` ("home"/"away") team name from ``node``, or ""."""
        for key in (f"{side}Team", f"{side}_team", side):
            value = node.get(key)
            if isinstance(value, dict):
                value = value.get("name")
            # Skip missing values and shapes that cannot be a name (nested
            # containers, boolean flags) instead of crashing or emitting a repr.
            if not value or isinstance(value, (dict, list, bool)):
                continue
            text = _safe_text(value)
            if text:
                return text
        return ""

    def walk(obj: Any) -> None:
        if isinstance(obj, dict):
            home_team = team_name(obj, "home")
            away_team = team_name(obj, "away")

            if home_team and away_team:
                venue = obj.get("venue") or obj.get("venueName") or ""
                if isinstance(venue, dict):
                    # NOTE(review): assumes a dict-valued venue carries "name".
                    venue = venue.get("name") or ""
                status = obj.get("status") or obj.get("detailedState") or ""
                if isinstance(status, dict):
                    # NOTE(review): assumes a dict-valued status carries
                    # "detailedState" — confirm against the real payload.
                    status = status.get("detailedState") or ""
                game_id = obj.get("id") or obj.get("gamePk") or obj.get("game_id") or ""
                game_date = (
                    obj.get("gameDate")
                    or obj.get("date")
                    or obj.get("commence_time")
                    or ""
                )

                rows.append(
                    {
                        "fetched_at": fetched_at,
                        "game_id": _safe_text(game_id),
                        "game_date": _safe_text(game_date),
                        "status": _safe_text(status),
                        "away_team": away_team,
                        "home_team": home_team,
                        # Scores are passed through untouched so 0 stays 0
                        # and a missing score stays None.
                        "away_score": obj.get("awayScore"),
                        "home_score": obj.get("homeScore"),
                        "venue": _safe_text(venue),
                    }
                )

            for value in obj.values():
                walk(value)

        elif isinstance(obj, list):
            for item in obj:
                walk(item)

    walk(payload)
    return rows
99
+
100
+
101
+ def _rows_from_schedule_text(html: str) -> list[dict[str, Any]]:
102
+ rows: list[dict[str, Any]] = []
103
+
104
+ lines = [line.strip() for line in html.splitlines() if line.strip()]
105
+ joined = "\n".join(lines)
106
+
107
+ matches = re.findall(r"([A-Z][A-Za-z .]+)\s+vs\.\s+([A-Z][A-Za-z .]+)", joined)
108
+ seen = set()
109
+
110
+ for away_team, home_team in matches:
111
+ key = (away_team, home_team)
112
+ if key in seen:
113
+ continue
114
+ seen.add(key)
115
+ rows.append(
116
+ {
117
+ "fetched_at": datetime.utcnow(),
118
+ "game_id": "",
119
+ "game_date": "",
120
+ "status": "",
121
+ "away_team": away_team.strip(),
122
+ "home_team": home_team.strip(),
123
+ "away_score": None,
124
+ "home_score": None,
125
+ "venue": "",
126
+ }
127
+ )
128
+
129
+ return rows
130
+
131
+
132
def fetch_wbc_schedule() -> pd.DataFrame:
    """Download the WBC schedule page and return its games as a DataFrame.

    Games are read from the JSON embedded in the page when available; if that
    yields nothing, the page text is scanned for ``"A vs. B"`` pairs instead.
    Rows are de-duplicated and ordered by parsed game date, then away team,
    then home team.

    Returns:
        A DataFrame with the columns ``fetched_at``, ``game_id``,
        ``game_date``, ``status``, ``away_team``, ``home_team``,
        ``away_score``, ``home_score`` and ``venue``. When no games are found
        the frame is empty but still carries these columns.

    Raises:
        Whatever ``requests.get`` or ``Response.raise_for_status`` raises on a
        transport failure or an error HTTP status.
    """
    columns = [
        "fetched_at",
        "game_id",
        "game_date",
        "status",
        "away_team",
        "home_team",
        "away_score",
        "home_score",
        "venue",
    ]

    response = requests.get(WBC_SCHEDULE_PAGE_URL, headers=HEADERS, timeout=30)
    response.raise_for_status()
    html = response.text

    payload = _extract_next_data_json(html)
    rows: list[dict[str, Any]] = []
    if payload is not None:
        rows = _rows_from_embedded_json(payload)
    if not rows:
        rows = _rows_from_schedule_text(html)

    if not rows:
        # Keep the schema on the empty path so callers can select columns
        # without special-casing "no games found".
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows).drop_duplicates(
        subset=["game_id", "away_team", "home_team", "game_date"]
    )

    try:
        # ``utc=True`` puts tz-aware and naive dates on one comparable axis;
        # unparseable dates become NaT and sort last.
        sort_key = pd.to_datetime(df["game_date"], errors="coerce", utc=True)
        # ``assign`` returns a new frame, so a failure here cannot leak the
        # temporary sort column into the returned data.
        df = (
            df.assign(game_date_sort=sort_key)
            .sort_values(["game_date_sort", "away_team", "home_team"])
            .drop(columns=["game_date_sort"])
        )
    except (TypeError, ValueError):
        # Ordering is best-effort: fall back to the extraction order.
        pass

    return df.reset_index(drop=True)