Syntrex committed on
Commit
c369231
·
verified ·
1 Parent(s): 46e00bf

Update data/schedule.py

Browse files
Files changed (1) hide show
  1. data/schedule.py +45 -100
data/schedule.py CHANGED
@@ -39,46 +39,9 @@ TEAM_MAP = {
39
  }
40
 
41
  TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}
42
-
43
- IGNORE_PREFIXES = (
44
- "Tickets",
45
- "Schedule",
46
- "Scores",
47
- "Stats",
48
- "Standings",
49
- "Bracket",
50
- "Teams",
51
- "Watch",
52
- "News",
53
- "Venues",
54
- "Experiences",
55
- "History",
56
- "Shop",
57
- "MLB.com",
58
- "Lang",
59
- "Official Info",
60
- "About MLB",
61
- "Team Information",
62
- "Official Rules",
63
- "Replay Review Regulations",
64
- "Umpires",
65
- "Advertise with Us",
66
- "Press Releases",
67
- "Accessibility Information",
68
- "Help/Contact Us",
69
- "MLB App FAQs",
70
- "MLB.TV Help Center",
71
- "Shop Help",
72
- "Careers Home",
73
- "Terms of Use",
74
- "Privacy Policy",
75
- "Legal Notices",
76
- )
77
-
78
- ABBR_RE = re.compile(r"^[A-Z]{3}$")
79
- FINAL_RE = re.compile(r"^([A-Z]{3})\s+(\d+),\s+([A-Z]{3})\s+(\d+)$")
80
  TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
81
- DATE_HEADING_RE = re.compile(r"^(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)$", re.I)
 
82
 
83
 
84
  def _strip_html_to_lines(html: str) -> list[str]:
@@ -89,23 +52,18 @@ def _strip_html_to_lines(html: str) -> list[str]:
89
  text = re.sub(r"\r", "\n", text)
90
  text = re.sub(r"\n+", "\n", text)
91
 
92
- raw_lines = [line.strip() for line in text.split("\n")]
93
- lines: list[str] = []
94
 
 
95
  for line in raw_lines:
96
- if not line:
97
- continue
98
  if line.startswith("Image:"):
99
  continue
100
  if line.startswith("calendar-") or line.startswith("schedule-tickets-") or line.startswith("search-"):
101
  continue
102
- if any(line.startswith(prefix) for prefix in IGNORE_PREFIXES):
103
- continue
104
- lines.append(line)
105
 
106
- # remove consecutive duplicates
107
  deduped: list[str] = []
108
- for line in lines:
109
  if not deduped or deduped[-1] != line:
110
  deduped.append(line)
111
 
@@ -116,18 +74,36 @@ def _full_team(abbr: str) -> str:
116
  return TEAM_MAP.get(abbr, abbr)
117
 
118
 
119
- def _parse_schedule_lines(lines: list[str], date_str: str) -> pd.DataFrame:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  rows: list[dict[str, Any]] = []
 
121
  i = 0
 
122
 
123
  while i < len(lines):
124
  line = lines[i]
125
 
126
  if ABBR_RE.match(line):
127
  away_abbr = line
128
-
129
- # skip duplicated team abbreviation if present
130
  j = i + 1
 
131
  while j < len(lines) and lines[j] == away_abbr:
132
  j += 1
133
 
@@ -147,32 +123,21 @@ def _parse_schedule_lines(lines: list[str], date_str: str) -> pd.DataFrame:
147
  j += 1
148
 
149
  status = ""
150
- away_score = None
151
- home_score = None
152
- game_time = ""
153
  tv = ""
154
 
155
  if j < len(lines):
156
  token = lines[j]
157
 
158
- final_match = FINAL_RE.match(token)
159
- if final_match:
160
- a1, s1, a2, s2 = final_match.groups()
161
- if {a1, a2} == {away_abbr, home_abbr}:
162
- status = "Final"
163
- if a1 == away_abbr:
164
- away_score = int(s1)
165
- home_score = int(s2)
166
- else:
167
- away_score = int(s2)
168
- home_score = int(s1)
169
- j += 1
170
- elif token == "LIVE":
171
  status = "Live"
172
  j += 1
 
 
 
173
  elif TIME_RE.match(token):
174
  status = "Scheduled"
175
- game_time = token
176
  j += 1
177
  elif token.startswith("Preview"):
178
  status = "Preview"
@@ -182,19 +147,27 @@ def _parse_schedule_lines(lines: list[str], date_str: str) -> pd.DataFrame:
182
  tv = lines[j]
183
  j += 1
184
 
 
 
 
185
  rows.append(
186
  {
187
  "fetched_at": datetime.utcnow(),
188
  "game_id": f"{date_str}:{away_abbr}:{home_abbr}",
189
  "game_date": date_str,
 
190
  "status": status,
191
  "away_team": _full_team(away_abbr),
192
  "home_team": _full_team(home_abbr),
193
- "away_score": away_score,
194
- "home_score": home_score,
 
 
 
 
195
  "venue": "",
196
  "tv": tv,
197
- "start_time_et": game_time,
198
  }
199
  )
200
 
@@ -203,32 +176,4 @@ def _parse_schedule_lines(lines: list[str], date_str: str) -> pd.DataFrame:
203
 
204
  i += 1
205
 
206
- return pd.DataFrame(rows)
207
-
208
-
209
- def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
210
- url = SCHEDULE_URL_TEMPLATE.format(date_str=date_str)
211
- response = requests.get(url, headers=HEADERS, timeout=30)
212
- response.raise_for_status()
213
-
214
- lines = _strip_html_to_lines(response.text)
215
- df = _parse_schedule_lines(lines, date_str=date_str)
216
-
217
- if df.empty:
218
- return pd.DataFrame(
219
- columns=[
220
- "fetched_at",
221
- "game_id",
222
- "game_date",
223
- "status",
224
- "away_team",
225
- "home_team",
226
- "away_score",
227
- "home_score",
228
- "venue",
229
- "tv",
230
- "start_time_et",
231
- ]
232
- )
233
-
234
- return df.sort_values(["game_date", "away_team", "home_team"]).reset_index(drop=True)
 
39
  }
40
 
41
  TV_MARKERS = {"FS1", "FS2", "FOX", "Tubi"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  TIME_RE = re.compile(r"^\d{1,2}:\d{2}\s+[AP]M\s+ET$")
43
+ ABBR_RE = re.compile(r"^[A-Z]{3}$")
44
+ GAME_PK_RE = re.compile(r"/gameday/(\d+)")
45
 
46
 
47
  def _strip_html_to_lines(html: str) -> list[str]:
 
52
  text = re.sub(r"\r", "\n", text)
53
  text = re.sub(r"\n+", "\n", text)
54
 
55
+ raw_lines = [line.strip() for line in text.split("\n") if line.strip()]
 
56
 
57
+ cleaned: list[str] = []
58
  for line in raw_lines:
 
 
59
  if line.startswith("Image:"):
60
  continue
61
  if line.startswith("calendar-") or line.startswith("schedule-tickets-") or line.startswith("search-"):
62
  continue
63
+ cleaned.append(line)
 
 
64
 
 
65
  deduped: list[str] = []
66
+ for line in cleaned:
67
  if not deduped or deduped[-1] != line:
68
  deduped.append(line)
69
 
 
74
  return TEAM_MAP.get(abbr, abbr)
75
 
76
 
77
+ def _extract_game_pks(html: str) -> list[str]:
78
+ found = GAME_PK_RE.findall(html)
79
+ seen = []
80
+ for pk in found:
81
+ if pk not in seen:
82
+ seen.append(pk)
83
+ return seen
84
+
85
+
86
+ def fetch_schedule_for_date(date_str: str) -> pd.DataFrame:
87
+ url = SCHEDULE_URL_TEMPLATE.format(date_str=date_str)
88
+ response = requests.get(url, headers=HEADERS, timeout=30)
89
+ response.raise_for_status()
90
+
91
+ html = response.text
92
+ lines = _strip_html_to_lines(html)
93
+ game_pks = _extract_game_pks(html)
94
+
95
  rows: list[dict[str, Any]] = []
96
+
97
  i = 0
98
+ game_pk_index = 0
99
 
100
  while i < len(lines):
101
  line = lines[i]
102
 
103
  if ABBR_RE.match(line):
104
  away_abbr = line
 
 
105
  j = i + 1
106
+
107
  while j < len(lines) and lines[j] == away_abbr:
108
  j += 1
109
 
 
123
  j += 1
124
 
125
  status = ""
126
+ start_time_et = ""
 
 
127
  tv = ""
128
 
129
  if j < len(lines):
130
  token = lines[j]
131
 
132
+ if token == "LIVE":
 
 
 
 
 
 
 
 
 
 
 
 
133
  status = "Live"
134
  j += 1
135
+ elif token.startswith("Final"):
136
+ status = token
137
+ j += 1
138
  elif TIME_RE.match(token):
139
  status = "Scheduled"
140
+ start_time_et = token
141
  j += 1
142
  elif token.startswith("Preview"):
143
  status = "Preview"
 
147
  tv = lines[j]
148
  j += 1
149
 
150
+ game_pk = game_pks[game_pk_index] if game_pk_index < len(game_pks) else ""
151
+ game_pk_index += 1
152
+
153
  rows.append(
154
  {
155
  "fetched_at": datetime.utcnow(),
156
  "game_id": f"{date_str}:{away_abbr}:{home_abbr}",
157
  "game_date": date_str,
158
+ "game_pk": game_pk,
159
  "status": status,
160
  "away_team": _full_team(away_abbr),
161
  "home_team": _full_team(home_abbr),
162
+ "away_score": None,
163
+ "home_score": None,
164
+ "away_hits": None,
165
+ "home_hits": None,
166
+ "away_errors": None,
167
+ "home_errors": None,
168
  "venue": "",
169
  "tv": tv,
170
+ "start_time_et": start_time_et,
171
  }
172
  )
173
 
 
176
 
177
  i += 1
178
 
179
+ return pd.DataFrame(rows)