Codex committed on
Commit ·
fa535ea
1
Parent(s): 09a8cd2
Fix Circa strikeout supplemental parsing
Browse files
- src/market-scanner.js +80 -6
- test/market-scanner.test.js +37 -0
src/market-scanner.js
CHANGED
|
@@ -1087,6 +1087,84 @@ function extractNoisyStrikeoutEntries(text, section) {
|
|
| 1087 |
return entries;
|
| 1088 |
}
|
| 1089 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1090 |
function normalizeCombinedBinaryLaneOdds(oddsTokens, section) {
|
| 1091 |
if (!Array.isArray(oddsTokens) || oddsTokens.length < 2) {
|
| 1092 |
return { yesOdds: null, noOdds: null };
|
|
@@ -1696,12 +1774,8 @@ export function parseCircaOcrText(text) {
|
|
| 1696 |
}
|
| 1697 |
}
|
| 1698 |
|
| 1699 |
-
|
| 1700 |
-
|
| 1701 |
-
const strikeoutsSnippet = extractCircaSectionSnippet(text, /TOTAL STRIKEOUTS|PITCHER STRIKEOUTS/i, 8);
|
| 1702 |
-
if (strikeoutsSnippet) {
|
| 1703 |
-
entries.push(...extractNoisyStrikeoutEntries(strikeoutsSnippet, detectCircaSection('TOTAL STRIKEOUTS')));
|
| 1704 |
-
}
|
| 1705 |
}
|
| 1706 |
|
| 1707 |
return dedupeBy(entries.filter(isLikelyValidCircaEntry), (entry) => `${entry.marketKey}|${entry.book}`);
|
|
|
|
| 1087 |
return entries;
|
| 1088 |
}
|
| 1089 |
|
| 1090 |
+
/**
 * Pulls strikeout over/under entries out of a multi-column text snippet.
 *
 * The first line mentioning "TOTAL STRIKEOUTS" / "PITCHER STRIKEOUTS" is
 * treated as the header. That header is split into column segments to learn
 * which segment carries the strikeout title; every following non-blank line is
 * split the same way and the matching segment (or, failing that, the last
 * segment on the line) is handed to `extractCircaOverUnderTableEntriesV2`
 * together with a per-lane state object.
 *
 * @param {*} text - Raw snippet text; nullish values are treated as ''.
 * @param {*} section - Section descriptor forwarded unchanged to the row parser.
 * @returns {Array} Entries produced by the row parser, in encounter order;
 *   empty when no strikeout header line or header segment is found.
 */
function extractStrikeoutLaneEntriesFromSnippet(text, section) {
  const strikeoutHeaderPattern = /TOTAL STRIKEOUTS|PITCHER STRIKEOUTS/i;

  const normalizedText = String(text ?? '').replace(/\r\n/g, '\n');
  const lines = normalizedText.split('\n').map((line) => line.replace(/\s+$/g, ''));

  const headerLineAt = lines.findIndex((line) => strikeoutHeaderPattern.test(line));
  if (headerLineAt === -1) {
    return [];
  }

  // Locate the column segment of the header line that holds the strikeout title.
  const laneIndex = splitCircaLayoutColumns(lines[headerLineAt])
    .findIndex((segment) => strikeoutHeaderPattern.test(segment.text));
  if (laneIndex === -1) {
    return [];
  }

  const laneStates = new Map();
  const collected = [];

  for (const bodyLine of lines.slice(headerLineAt + 1)) {
    if (!bodyLine.trim()) {
      continue;
    }

    // A line recognised as some other section header ends the strikeout table.
    if (detectCircaSection(bodyLine) && !strikeoutHeaderPattern.test(bodyLine)) {
      break;
    }

    const columns = splitCircaLayoutColumns(bodyLine);
    if (columns.length === 0) {
      continue;
    }

    // Prefer the segment whose `index` equals the header lane; otherwise fall
    // back to the last segment on the line.
    const laneSegment = columns.find((segment) => segment.index === laneIndex) ?? columns.at(-1);
    if (!laneSegment?.text) {
      continue;
    }

    const laneKey = `strikeout-snippet-${laneSegment.index}`;
    let laneState = laneStates.get(laneKey);
    if (laneState == null) {
      laneState = {
        section,
        pendingPlayer: null,
        pendingTeam: null,
        pendingLineValue: null,
        pendingOverOdds: null,
        pendingUnderOdds: null,
      };
      laneStates.set(laneKey, laneState);
    }
    laneState.section = section;

    collected.push(...extractCircaOverUnderTableEntriesV2(laneSegment.text, section, laneState));
  }

  // One last call per lane with empty text — presumably lets the row parser
  // emit anything still pending in the lane state; confirm against
  // extractCircaOverUnderTableEntriesV2.
  for (const laneState of laneStates.values()) {
    collected.push(...extractCircaOverUnderTableEntriesV2('', section, laneState));
  }

  return collected;
}
|
| 1151 |
+
|
| 1152 |
+
/**
 * Runs both strikeout extractors over the strikeout snippet of `text` and
 * returns their combined, validated, de-duplicated entries.
 *
 * The snippet is obtained from `extractCircaSectionSnippet` using the strikeout
 * header pattern and a limit argument of 40. Lane-based results are listed
 * before noisy-row results ahead of de-duplication on `marketKey|book`.
 *
 * @param {*} text - Raw text to search for a strikeout section.
 * @returns {Array} De-duplicated entries, or an empty array when either the
 *   snippet or the section descriptor is falsy.
 */
export function extractSupplementalStrikeoutEntries(text) {
  const section = detectCircaSection('TOTAL STRIKEOUTS');
  const snippet = extractCircaSectionSnippet(text, /TOTAL STRIKEOUTS|PITCHER STRIKEOUTS/i, 40);
  if (!(snippet && section)) {
    return [];
  }

  const laneEntries = extractStrikeoutLaneEntriesFromSnippet(snippet, section);
  const noisyEntries = extractNoisyStrikeoutEntries(snippet, section);
  const validEntries = [...laneEntries, ...noisyEntries].filter(isLikelyValidCircaEntry);
  const keyOf = (entry) => `${entry.marketKey}|${entry.book}`;

  return dedupeBy(validEntries, keyOf);
}
|
| 1167 |
+
|
| 1168 |
function normalizeCombinedBinaryLaneOdds(oddsTokens, section) {
|
| 1169 |
if (!Array.isArray(oddsTokens) || oddsTokens.length < 2) {
|
| 1170 |
return { yesOdds: null, noOdds: null };
|
|
|
|
| 1774 |
}
|
| 1775 |
}
|
| 1776 |
|
| 1777 |
+
if (/TOTAL STRIKEOUTS|PITCHER STRIKEOUTS/i.test(String(text ?? ''))) {
|
| 1778 |
+
entries.push(...extractSupplementalStrikeoutEntries(text));
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1779 |
}
|
| 1780 |
|
| 1781 |
return dedupeBy(entries.filter(isLikelyValidCircaEntry), (entry) => `${entry.marketKey}|${entry.book}`);
|
test/market-scanner.test.js
CHANGED
|
@@ -6,6 +6,7 @@ import {
|
|
| 6 |
buildMarketKey,
|
| 7 |
extractCircaPdfFromArchiveBuffer,
|
| 8 |
extractCircaFileCandidatesFromHtml,
|
|
|
|
| 9 |
fetchOddsApiEntries,
|
| 10 |
matchesCircaMlbFilename,
|
| 11 |
normalizeOddsApiEntries,
|
|
@@ -252,6 +253,42 @@ test('recovers noisy live-style strikeout rows', () => {
|
|
| 252 |
assert.ok(entries.some((entry) => entry.marketType === 'pitcher_strikeouts_generic' && entry.playerName === 'LOGAN GILBERT' && entry.side === 'over' && entry.lineValue === 6.5));
|
| 253 |
});
|
| 254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
test('ranks discrepancy, width, and circa alerts', () => {
|
| 256 |
const entries = [
|
| 257 |
{
|
|
|
|
| 6 |
buildMarketKey,
|
| 7 |
extractCircaPdfFromArchiveBuffer,
|
| 8 |
extractCircaFileCandidatesFromHtml,
|
| 9 |
+
extractSupplementalStrikeoutEntries,
|
| 10 |
fetchOddsApiEntries,
|
| 11 |
matchesCircaMlbFilename,
|
| 12 |
normalizeOddsApiEntries,
|
|
|
|
| 253 |
assert.ok(entries.some((entry) => entry.marketType === 'pitcher_strikeouts_generic' && entry.playerName === 'LOGAN GILBERT' && entry.side === 'over' && entry.lineValue === 6.5));
|
| 254 |
});
|
| 255 |
|
| 256 |
+
// Clean two-column layout: team totals on the left, pitcher strikeouts on the
// right. Only the over rows of the strikeout lane are asserted.
test('extracts strikeout lane from mixed Team Totals header', () => {
  const sheetLines = [
    'Team Totals Total Strikeouts',
    '86601 PADRES 3½ -130 85001 GERMAN MARQUEZ (SD) 3½ +105',
    '86602 3½ +110 85002 3½ -125',
    '86603 PIRATES 4½ +105 85003 BUBBA CHANDLER (PIT) 5½ +135',
    '86604 4½ -125 85004 5½ -155',
    '86611 NATIONALS 3½ -140 85027 JOE RYAN (MIL) 6½ +105',
    '86612 3½ +120 85028 6½ -125',
  ];
  const entries = extractSupplementalStrikeoutEntries(sheetLines.join('\n'));

  // True when an over entry for the given pitcher, line and odds was parsed.
  const hasStrikeoutOver = (playerName, lineValue, oddsInput) =>
    entries.some(
      (entry) =>
        entry.marketType === 'pitcher_strikeouts_generic' &&
        entry.playerName === playerName &&
        entry.side === 'over' &&
        entry.lineValue === lineValue &&
        entry.oddsInput === oddsInput,
    );

  assert.ok(hasStrikeoutOver('GERMAN MARQUEZ', 3.5, '+105'));
  assert.ok(hasStrikeoutOver('BUBBA CHANDLER', 5.5, '+135'));
  assert.ok(hasStrikeoutOver('JOE RYAN', 6.5, '+105'));
});
|
| 271 |
+
|
| 272 |
+
// Mixes heavily garbled OCR rows with two clean rows and checks that the full
// parser still yields the clean strikeout row, a GERMAN... row at 3.5, and at
// least six strikeout overs in total.
test('supplements strikeouts even when generic parsing already found some rows', () => {
  const ocrLines = [
    'Team Totals Total Strikeouts',
    'asst 0 [86627 "10 {ssn ns [85029 PE',
    'asé02 PADRES 34150 [sez TVINS 3% 110 [saonz GERMAN MARUQUEZ. (SD) 3% "15% | saay LOGAN GILBERT (SEA) 6%',
    '86603 ios [86679 "ro [65003 "ras [85031 E',
    'sens PIRATES 4% 110% [Seen MARINERS 3% 110 |3aops BUBBA CHANDLER (PIT) 5% 1 5037 2ACOB DEGRON (TEX) 6% 15%',
    '8665 "io [86st "is [esos "an (85033 E',
    'ss606 REDS 3% in [sess RANGERS 3% ie [gaone BRANDON WILLIAMSON (CIN) 4% *1i0 (270 JAMESON TAILLON (CHC) 341%',
    'ee MARLINS 4% 1 [4 cups 3 11055007 JANSON JUNK. (IA) ® = SHANE NCCLANAHAN (TB) 5% 110',
    '86611 NATIONALS 3½ -140 85027 JOE RYAN (MIL) 6½ +105',
    '86612 3½ +120 85028 6½ -125',
  ];
  const entries = parseCircaOcrText(ocrLines.join('\n'));

  const isStrikeoutOver = (entry) =>
    entry.marketType === 'pitcher_strikeouts_generic' && entry.side === 'over';
  const strikeoutOvers = entries.filter(isStrikeoutOver);

  const isJoeRyanRow = (entry) =>
    entry.playerName === 'JOE RYAN' && entry.lineValue === 6.5 && entry.oddsInput === '+105';
  const isGermanRow = (entry) =>
    entry.playerName.includes('GERMAN') && entry.lineValue === 3.5;

  assert.ok(strikeoutOvers.some(isJoeRyanRow));
  assert.ok(strikeoutOvers.some(isGermanRow));
  assert.ok(strikeoutOvers.length >= 6);
});
|
| 291 |
+
|
| 292 |
test('ranks discrepancy, width, and circa alerts', () => {
|
| 293 |
const entries = [
|
| 294 |
{
|