Codex committed on
Commit
fa535ea
·
1 Parent(s): 09a8cd2

Fix Circa strikeout supplemental parsing

Browse files
src/market-scanner.js CHANGED
@@ -1087,6 +1087,84 @@ function extractNoisyStrikeoutEntries(text, section) {
1087
  return entries;
1088
  }
1089
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1090
  function normalizeCombinedBinaryLaneOdds(oddsTokens, section) {
1091
  if (!Array.isArray(oddsTokens) || oddsTokens.length < 2) {
1092
  return { yesOdds: null, noOdds: null };
@@ -1696,12 +1774,8 @@ export function parseCircaOcrText(text) {
1696
  }
1697
  }
1698
 
1699
- const hasStrikeouts = entries.some((entry) => entry.marketType === 'pitcher_strikeouts_generic');
1700
- if (!hasStrikeouts && /TOTAL STRIKEOUTS|PITCHER STRIKEOUTS/i.test(String(text ?? ''))) {
1701
- const strikeoutsSnippet = extractCircaSectionSnippet(text, /TOTAL STRIKEOUTS|PITCHER STRIKEOUTS/i, 8);
1702
- if (strikeoutsSnippet) {
1703
- entries.push(...extractNoisyStrikeoutEntries(strikeoutsSnippet, detectCircaSection('TOTAL STRIKEOUTS')));
1704
- }
1705
  }
1706
 
1707
  return dedupeBy(entries.filter(isLikelyValidCircaEntry), (entry) => `${entry.marketKey}|${entry.book}`);
 
1087
  return entries;
1088
  }
1089
 
/**
 * Parses over/under entries from the strikeout column ("lane") of a
 * multi-column snippet.
 *
 * Finds the first line matching TOTAL STRIKEOUTS / PITCHER STRIKEOUTS, splits
 * it with splitCircaLayoutColumns to learn which segment holds that header,
 * then feeds the matching segment of each following line to
 * extractCircaOverUnderTableEntriesV2.
 *
 * @param {*} text - Snippet text; coerced with String(), null/undefined become ''.
 * @param {*} section - Section value passed through to the table parser and
 *   stored on every per-lane state object.
 * @returns {Array} Entries collected from the table parser; an empty array
 *   when no header line, or no header segment, matches.
 */
1090
+ function extractStrikeoutLaneEntriesFromSnippet(text, section) {
// Normalize CRLF to LF, split into lines, and strip trailing whitespace from each line.
1091
+ const rawLines = String(text ?? '')
1092
+ .replace(/\r\n/g, '\n')
1093
+ .split('\n')
1094
+ .map((line) => line.replace(/\s+$/g, ''));
1095
+
1096
+ const headerIndex = rawLines.findIndex((line) => /TOTAL STRIKEOUTS|PITCHER STRIKEOUTS/i.test(line));
1097
+ if (headerIndex === -1) {
1098
+ return [];
1099
+ }
1100
+
// Locate which column segment of the header line carries the strikeout title.
1101
+ const headerSegments = splitCircaLayoutColumns(rawLines[headerIndex]);
1102
+ const strikeoutLaneIndex = headerSegments.findIndex((segment) => /TOTAL STRIKEOUTS|PITCHER STRIKEOUTS/i.test(segment.text));
1103
+ if (strikeoutLaneIndex === -1) {
1104
+ return [];
1105
+ }
1106
+
1107
+ const states = new Map();
1108
+ const entries = [];
1109
+
// Only lines after the header are scanned; blank lines are skipped.
1110
+ for (const rawLine of rawLines.slice(headerIndex + 1)) {
1111
+ if (!rawLine.trim()) {
1112
+ continue;
1113
+ }
1114
+
// Stop at the first line detectCircaSection recognizes, unless it is another strikeout header.
1115
+ if (detectCircaSection(rawLine) && !/TOTAL STRIKEOUTS|PITCHER STRIKEOUTS/i.test(rawLine)) {
1116
+ break;
1117
+ }
1118
+
1119
+ const segments = splitCircaLayoutColumns(rawLine);
1120
+ if (segments.length === 0) {
1121
+ continue;
1122
+ }
1123
+
// Prefer the segment whose index equals the header's lane position; otherwise use the last segment.
// NOTE(review): assumes segment.index is comparable to the header segment's array position —
// confirm against splitCircaLayoutColumns.
1124
+ const preferredSegment = segments.find((segment) => segment.index === strikeoutLaneIndex)
1125
+ ?? segments.at(-1);
1126
+ if (!preferredSegment?.text) {
1127
+ continue;
1128
+ }
1129
+
// One parser state object per segment index, looked up again on later lines.
// Presumably the table parser mutates it to carry pending fields between lines — verify.
1130
+ const stateKey = `strikeout-snippet-${preferredSegment.index}`;
1131
+ const state = states.get(stateKey) ?? {
1132
+ section,
1133
+ pendingPlayer: null,
1134
+ pendingTeam: null,
1135
+ pendingLineValue: null,
1136
+ pendingOverOdds: null,
1137
+ pendingUnderOdds: null,
1138
+ };
1139
+ state.section = section;
1140
+ entries.push(...extractCircaOverUnderTableEntriesV2(preferredSegment.text, section, state));
1141
+ states.set(stateKey, state);
1142
+ }
1143
+
// Call the parser once more per state with empty text; presumably this flushes a pending row — verify.
1144
+ for (const [stateKey, state] of states.entries()) {
1145
+ entries.push(...extractCircaOverUnderTableEntriesV2('', section, state));
1146
+ states.set(stateKey, state);
1147
+ }
1148
+
1149
+ return entries;
1150
+ }
1151
+
/**
 * Collects strikeout entries from the strikeout section of `text` using two
 * extractors and returns the de-duplicated union.
 *
 * @param {*} text - Source text handed to extractCircaSectionSnippet.
 * @returns {Array} Entries from extractStrikeoutLaneEntriesFromSnippet followed
 *   by those from extractNoisyStrikeoutEntries, filtered with
 *   isLikelyValidCircaEntry and de-duplicated on `marketKey|book`; an empty
 *   array when either the snippet or the section lookup is falsy.
 */
1152
+ export function extractSupplementalStrikeoutEntries(text) {
1153
+ const section = detectCircaSection('TOTAL STRIKEOUTS');
// NOTE(review): the third argument (40) is presumably a line/window limit for the snippet — confirm
// against extractCircaSectionSnippet.
1154
+ const snippet = extractCircaSectionSnippet(text, /TOTAL STRIKEOUTS|PITCHER STRIKEOUTS/i, 40);
1155
+ if (!snippet || !section) {
1156
+ return [];
1157
+ }
1158
+
// Lane-based results are listed first, ahead of the noisy-row results, before de-duplication.
1159
+ return dedupeBy(
1160
+ [
1161
+ ...extractStrikeoutLaneEntriesFromSnippet(snippet, section),
1162
+ ...extractNoisyStrikeoutEntries(snippet, section),
1163
+ ].filter(isLikelyValidCircaEntry),
1164
+ (entry) => `${entry.marketKey}|${entry.book}`,
1165
+ );
1166
+ }
1167
+
1168
  function normalizeCombinedBinaryLaneOdds(oddsTokens, section) {
1169
  if (!Array.isArray(oddsTokens) || oddsTokens.length < 2) {
1170
  return { yesOdds: null, noOdds: null };
 
1774
  }
1775
  }
1776
 
1777
+ if (/TOTAL STRIKEOUTS|PITCHER STRIKEOUTS/i.test(String(text ?? ''))) {
1778
+ entries.push(...extractSupplementalStrikeoutEntries(text));
 
 
 
 
1779
  }
1780
 
1781
  return dedupeBy(entries.filter(isLikelyValidCircaEntry), (entry) => `${entry.marketKey}|${entry.book}`);
test/market-scanner.test.js CHANGED
@@ -6,6 +6,7 @@ import {
6
  buildMarketKey,
7
  extractCircaPdfFromArchiveBuffer,
8
  extractCircaFileCandidatesFromHtml,
 
9
  fetchOddsApiEntries,
10
  matchesCircaMlbFilename,
11
  normalizeOddsApiEntries,
@@ -252,6 +253,42 @@ test('recovers noisy live-style strikeout rows', () => {
252
  assert.ok(entries.some((entry) => entry.marketType === 'pitcher_strikeouts_generic' && entry.playerName === 'LOGAN GILBERT' && entry.side === 'over' && entry.lineValue === 6.5));
253
  });
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  test('ranks discrepancy, width, and circa alerts', () => {
256
  const entries = [
257
  {
 
6
  buildMarketKey,
7
  extractCircaPdfFromArchiveBuffer,
8
  extractCircaFileCandidatesFromHtml,
9
+ extractSupplementalStrikeoutEntries,
10
  fetchOddsApiEntries,
11
  matchesCircaMlbFilename,
12
  normalizeOddsApiEntries,
 
253
  assert.ok(entries.some((entry) => entry.marketType === 'pitcher_strikeouts_generic' && entry.playerName === 'LOGAN GILBERT' && entry.side === 'over' && entry.lineValue === 6.5));
254
  });
255
 
// Fixture: a header line holding both "Team Totals" and "Total Strikeouts", followed by rows that
// carry a team-total column and a pitcher column side by side. The assertions check that the
// over entries for the three pitchers come back with the expected line value and odds string.
256
+ test('extracts strikeout lane from mixed Team Totals header', () => {
257
+ const entries = extractSupplementalStrikeoutEntries([
258
+ 'Team Totals Total Strikeouts',
259
+ '86601 PADRES 3½ -130 85001 GERMAN MARQUEZ (SD) 3½ +105',
260
+ '86602 3½ +110 85002 3½ -125',
261
+ '86603 PIRATES 4½ +105 85003 BUBBA CHANDLER (PIT) 5½ +135',
262
+ '86604 4½ -125 85004 5½ -155',
263
+ '86611 NATIONALS 3½ -140 85027 JOE RYAN (MIL) 6½ +105',
264
+ '86612 3½ +120 85028 6½ -125',
265
+ ].join('\n'));
266
+
267
+ assert.ok(entries.some((entry) => entry.marketType === 'pitcher_strikeouts_generic' && entry.playerName === 'GERMAN MARQUEZ' && entry.side === 'over' && entry.lineValue === 3.5 && entry.oddsInput === '+105'));
268
+ assert.ok(entries.some((entry) => entry.marketType === 'pitcher_strikeouts_generic' && entry.playerName === 'BUBBA CHANDLER' && entry.side === 'over' && entry.lineValue === 5.5 && entry.oddsInput === '+135'));
269
+ assert.ok(entries.some((entry) => entry.marketType === 'pitcher_strikeouts_generic' && entry.playerName === 'JOE RYAN' && entry.side === 'over' && entry.lineValue === 6.5 && entry.oddsInput === '+105'));
270
+ });
271
+
// Fixture: the same mixed header followed by garbled rows and two clean rows (86611/86612), run
// through parseCircaOcrText. The assertions require the clean JOE RYAN over, a 3.5 over whose
// player name contains "GERMAN", and at least six strikeout over entries in total.
272
+ test('supplements strikeouts even when generic parsing already found some rows', () => {
273
+ const entries = parseCircaOcrText([
274
+ 'Team Totals Total Strikeouts',
275
+ 'asst 0 [86627 "10 {ssn ns [85029 PE',
276
+ 'asé02 PADRES 34150 [sez TVINS 3% 110 [saonz GERMAN MARUQUEZ. (SD) 3% "15% | saay LOGAN GILBERT (SEA) 6%',
277
+ '86603 ios [86679 "ro [65003 "ras [85031 E',
278
+ 'sens PIRATES 4% 110% [Seen MARINERS 3% 110 |3aops BUBBA CHANDLER (PIT) 5% 1 5037 2ACOB DEGRON (TEX) 6% 15%',
279
+ '8665 "io [86st "is [esos "an (85033 E',
280
+ 'ss606 REDS 3% in [sess RANGERS 3% ie [gaone BRANDON WILLIAMSON (CIN) 4% *1i0 (270 JAMESON TAILLON (CHC) 341%',
281
+ 'ee MARLINS 4% 1 [4 cups 3 11055007 JANSON JUNK. (IA) ® = SHANE NCCLANAHAN (TB) 5% 110',
282
+ '86611 NATIONALS 3½ -140 85027 JOE RYAN (MIL) 6½ +105',
283
+ '86612 3½ +120 85028 6½ -125',
284
+ ].join('\n'));
285
+
// Only over-side strikeout entries are inspected below.
286
+ const strikeoutOvers = entries.filter((entry) => entry.marketType === 'pitcher_strikeouts_generic' && entry.side === 'over');
287
+ assert.ok(strikeoutOvers.some((entry) => entry.playerName === 'JOE RYAN' && entry.lineValue === 6.5 && entry.oddsInput === '+105'));
288
+ assert.ok(strikeoutOvers.some((entry) => entry.playerName.includes('GERMAN') && entry.lineValue === 3.5));
289
+ assert.ok(strikeoutOvers.length >= 6);
290
+ });
291
+
292
  test('ranks discrepancy, width, and circa alerts', () => {
293
  const entries = [
294
  {