Codex committed on
Commit
262b7bd
·
1 Parent(s): fa535ea

Supplement thin Circa strikeouts with raster OCR

Browse files
Files changed (1) hide show
  1. src/market-scanner.js +18 -1
src/market-scanner.js CHANGED
@@ -2419,7 +2419,24 @@ export async function fetchCircaEntries(config) {
2419
 
2420
  const sourceFile = await discoverCircaFileFromSharedFolder(config);
2421
  const text = await extractPdfText(sourceFile.buffer);
2422
- const parsedEntries = parseCircaOcrText(text);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2423
  const normalizedEntries = await normalizeCircaEntriesWithFanGraphs(parsedEntries);
2424
  return {
2425
  fileName: sourceFile.fileName,
 
2419
 
2420
  const sourceFile = await discoverCircaFileFromSharedFolder(config);
2421
  const text = await extractPdfText(sourceFile.buffer);
2422
+ let parsedEntries = parseCircaOcrText(text);
2423
+ const strikeoutOverCount = parsedEntries.filter(
2424
+ (entry) => entry.marketType === 'pitcher_strikeouts_generic' && entry.side === 'over'
2425
+ ).length;
2426
+
2427
+ if (strikeoutOverCount > 0 && strikeoutOverCount < 10) {
2428
+ const rasterText = await extractPdfTextViaPdftoppm(sourceFile.buffer);
2429
+ if (rasterText.trim().length > 20) {
2430
+ parsedEntries = dedupeBy(
2431
+ [
2432
+ ...parsedEntries,
2433
+ ...extractSupplementalStrikeoutEntries(rasterText),
2434
+ ].filter(isLikelyValidCircaEntry),
2435
+ (entry) => `${entry.marketKey}|${entry.book}`,
2436
+ );
2437
+ }
2438
+ }
2439
+
2440
  const normalizedEntries = await normalizeCircaEntriesWithFanGraphs(parsedEntries);
2441
  return {
2442
  fileName: sourceFile.fileName,