alm7640 Claude Sonnet 4.6 committed on
Commit
0ea5501
·
1 Parent(s): 3b813b2

fix: improve PDF parser to filter summary pages and reduce false positives

Browse files

- Extract _SUMMARY_PAGE_SIGNALS and _FAKE_MERCHANT_SIGNALS as module-level constants
- Run Strategy 2 page-by-page to skip summary/overview pages (prevents balance totals from appearing as transactions)
- Add _FAKE_MERCHANT_SIGNALS guard to filter summary row false positives in Strategy 2
- Run Strategy 3 independently and let it win when it finds more transactions (fixes BofA PDFs)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. parser.py +65 -26
parser.py CHANGED
@@ -67,6 +67,30 @@ def _parse_date(val) -> Optional[datetime]:
67
  return None
68
 
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # ─────────────────────────────────────────────────────────────────────────────
71
  # Format-specific parsers
72
  # ─────────────────────────────────────────────────────────────────────────────
@@ -232,35 +256,54 @@ def _parse_pdf(file_bytes: bytes, filename: str) -> pd.DataFrame:
232
  pass
233
 
234
  # ── Strategy 2: Full-date regex (MM/DD/YYYY or YYYY-MM-DD etc.) ──────
 
235
  if not rows and full_text:
236
  pattern = re.compile(
237
  r"(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})\s+"
238
  r"([A-Za-z][^\d\n]{3,50})\s+"
239
  r"\$?([\d,]+\.\d{2})"
240
  )
241
- for match in pattern.finditer(full_text):
242
- date_str, desc, amt_str = match.groups()
243
- date = _parse_date(date_str)
244
- amt = _clean_amount(amt_str)
245
- merchant_raw = desc.strip()
246
 
247
- if date is None or amt is None or amt <= 0:
248
- continue
249
- if _looks_like_payment(merchant_raw, amt):
 
250
  continue
251
 
252
- rows.append({
253
- "date": date,
254
- "raw_merchant": merchant_raw,
255
- "merchant": normalize_merchant(merchant_raw),
256
- "amount": amt,
257
- "source_file": filename,
258
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
  # ── Strategy 3: Two-date MM/DD format — Bank of America and similar ──
261
  # Format: MM/DD MM/DD DESCRIPTION REF(4) ACCT(4) AMOUNT
262
  # Dates have no year; infer from statement period in the header text.
263
- if not rows and full_text:
 
 
 
264
  # Extract closing month/year from text like "December 13 - January 12, 2025"
265
  closing_year = datetime.now().year
266
  closing_month = datetime.now().month
@@ -281,15 +324,6 @@ def _parse_pdf(file_bytes: bytes, filename: str) -> pd.DataFrame:
281
  re.MULTILINE,
282
  )
283
 
284
- # Summary-page indicators: pages with these phrases are overview/totals pages,
285
- # not transaction listing pages. Skip them to avoid pulling in summary rows.
286
- _SUMMARY_PAGE_SIGNALS = re.compile(
287
- r"(account\s+summary|statement\s+summary|previous\s+balance"
288
- r"|new\s+balance|credit\s+limit|minimum\s+payment\s+due"
289
- r"|opening/closing\s+date|payment\s+information)",
290
- re.IGNORECASE,
291
- )
292
-
293
  try:
294
  with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
295
  pages_text = [page.extract_text() or "" for page in pdf.pages]
@@ -319,7 +353,7 @@ def _parse_pdf(file_bytes: bytes, filename: str) -> pd.DataFrame:
319
  if _looks_like_payment(merchant_raw, amt):
320
  continue
321
 
322
- rows.append({
323
  "date": date,
324
  "raw_merchant": merchant_raw,
325
  "merchant": normalize_merchant(merchant_raw),
@@ -327,6 +361,11 @@ def _parse_pdf(file_bytes: bytes, filename: str) -> pd.DataFrame:
327
  "source_file": filename,
328
  })
329
 
 
 
 
 
 
330
  return pd.DataFrame(rows) if rows else pd.DataFrame()
331
 
332
 
 
67
  return None
68
 
69
 
70
+ # ─────────────────────────────────────────────────────────────────────────────
71
+ # Summary-page filter — shared across ALL PDF strategies
72
+ # Pages containing these phrases are overview/totals pages, not transaction
73
+ # listing pages. Skip them entirely to avoid pulling in summary rows.
74
+ # ─────────────────────────────────────────────────────────────────────────────
75
+
76
+ _SUMMARY_PAGE_SIGNALS = re.compile(
77
+ r"(account\s+summary|statement\s+summary|previous\s+balance"
78
+ r"|new\s+balance\s+total|credit\s+limit|minimum\s+payment\s+due"
79
+ r"|opening/closing\s+date|payment\s+information"
80
+ r"|total\s+credit\s+line|statement\s+closing\s+date)",
81
+ re.IGNORECASE,
82
+ )
83
+
84
+ # Merchant strings that look like statement summary rows, not real merchants.
85
+ # Used to filter false positives from Strategy 2 regex matches.
86
+ _FAKE_MERCHANT_SIGNALS = re.compile(
87
+ r"^(new balance|previous balance|minimum payment|payment due"
88
+ r"|total credit|interest charge|fees charged|purchases and adj"
89
+ r"|payments and other|statement closing|days in billing)",
90
+ re.IGNORECASE,
91
+ )
92
+
93
+
94
  # ─────────────────────────────────────────────────────────────────────────────
95
  # Format-specific parsers
96
  # ─────────────────────────────────────────────────────────────────────────────
 
256
  pass
257
 
258
  # ── Strategy 2: Full-date regex (MM/DD/YYYY or YYYY-MM-DD etc.) ──────
259
+ # Runs page-by-page (not on full_text) so summary pages can be skipped.
260
  if not rows and full_text:
261
  pattern = re.compile(
262
  r"(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})\s+"
263
  r"([A-Za-z][^\d\n]{3,50})\s+"
264
  r"\$?([\d,]+\.\d{2})"
265
  )
266
+ try:
267
+ with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
268
+ pages_text_s2 = [page.extract_text() or "" for page in pdf.pages]
269
+ except Exception:
270
+ pages_text_s2 = [full_text]
271
 
272
+ for page_text in pages_text_s2:
273
+ # Skip summary/overview pages — they contain balance totals that
274
+ # look like transactions but aren't (e.g. "New Balance Total $4,814")
275
+ if _SUMMARY_PAGE_SIGNALS.search(page_text):
276
  continue
277
 
278
+ for match in pattern.finditer(page_text):
279
+ date_str, desc, amt_str = match.groups()
280
+ date = _parse_date(date_str)
281
+ amt = _clean_amount(amt_str)
282
+ merchant_raw = desc.strip()
283
+
284
+ if date is None or amt is None or amt <= 0:
285
+ continue
286
+ if _looks_like_payment(merchant_raw, amt):
287
+ continue
288
+ # Guard against summary row false positives
289
+ if _FAKE_MERCHANT_SIGNALS.search(merchant_raw):
290
+ continue
291
+
292
+ rows.append({
293
+ "date": date,
294
+ "raw_merchant": merchant_raw,
295
+ "merchant": normalize_merchant(merchant_raw),
296
+ "amount": amt,
297
+ "source_file": filename,
298
+ })
299
 
300
  # ── Strategy 3: Two-date MM/DD format — Bank of America and similar ──
301
  # Format: MM/DD MM/DD DESCRIPTION REF(4) ACCT(4) AMOUNT
302
  # Dates have no year; infer from statement period in the header text.
303
+ # NOTE: This runs after Strategy 2, but independently of the row count.
304
+ # BofA PDFs will always match here; if we get more hits than rows, use these.
305
+ s3_rows = []
306
+ if full_text:
307
  # Extract closing month/year from text like "December 13 - January 12, 2025"
308
  closing_year = datetime.now().year
309
  closing_month = datetime.now().month
 
324
  re.MULTILINE,
325
  )
326
 
 
 
 
 
 
 
 
 
 
327
  try:
328
  with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
329
  pages_text = [page.extract_text() or "" for page in pdf.pages]
 
353
  if _looks_like_payment(merchant_raw, amt):
354
  continue
355
 
356
+ s3_rows.append({
357
  "date": date,
358
  "raw_merchant": merchant_raw,
359
  "merchant": normalize_merchant(merchant_raw),
 
361
  "source_file": filename,
362
  })
363
 
364
+ # Strategy 3 wins if it found more transactions than earlier strategies
365
+ # (BofA PDFs always have many transactions; Strategy 2 false positives are few)
366
+ if len(s3_rows) > len(rows):
367
+ rows = s3_rows
368
+
369
  return pd.DataFrame(rows) if rows else pd.DataFrame()
370
 
371