Codex committed on
Commit ·
262b7bd
1
Parent(s): fa535ea
Supplement thin Circa strikeouts with raster OCR
Browse files - src/market-scanner.js +18 -1
src/market-scanner.js
CHANGED
|
@@ -2419,7 +2419,24 @@ export async function fetchCircaEntries(config) {
|
|
| 2419 |
|
| 2420 |
const sourceFile = await discoverCircaFileFromSharedFolder(config);
|
| 2421 |
const text = await extractPdfText(sourceFile.buffer);
|
| 2422 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2423 |
const normalizedEntries = await normalizeCircaEntriesWithFanGraphs(parsedEntries);
|
| 2424 |
return {
|
| 2425 |
fileName: sourceFile.fileName,
|
|
|
|
| 2419 |
|
| 2420 |
const sourceFile = await discoverCircaFileFromSharedFolder(config);
|
| 2421 |
const text = await extractPdfText(sourceFile.buffer);
|
| 2422 |
+
let parsedEntries = parseCircaOcrText(text);
|
| 2423 |
+
const strikeoutOverCount = parsedEntries.filter(
|
| 2424 |
+
(entry) => entry.marketType === 'pitcher_strikeouts_generic' && entry.side === 'over'
|
| 2425 |
+
).length;
|
| 2426 |
+
|
| 2427 |
+
if (strikeoutOverCount > 0 && strikeoutOverCount < 10) {
|
| 2428 |
+
const rasterText = await extractPdfTextViaPdftoppm(sourceFile.buffer);
|
| 2429 |
+
if (rasterText.trim().length > 20) {
|
| 2430 |
+
parsedEntries = dedupeBy(
|
| 2431 |
+
[
|
| 2432 |
+
...parsedEntries,
|
| 2433 |
+
...extractSupplementalStrikeoutEntries(rasterText),
|
| 2434 |
+
].filter(isLikelyValidCircaEntry),
|
| 2435 |
+
(entry) => `${entry.marketKey}|${entry.book}`,
|
| 2436 |
+
);
|
| 2437 |
+
}
|
| 2438 |
+
}
|
| 2439 |
+
|
| 2440 |
const normalizedEntries = await normalizeCircaEntriesWithFanGraphs(parsedEntries);
|
| 2441 |
return {
|
| 2442 |
fileName: sourceFile.fileName,
|