alm7640 Claude Sonnet 4.6 committed on
Commit
e583877
·
1 Parent(s): d783e45

Fix BoA PDF parser to skip summary pages in Strategy 3

Browse files

Switch from running the two-date regex over the full concatenated document
to processing each page individually. Pages containing summary-page signals
(previous balance, new balance, account summary, credit limit, etc.) are
skipped so that page-1 overview blocks don't get mistaken for transaction
line items. Falls back to full-text if pdfplumber re-open fails.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. parser.py +43 -23
parser.py CHANGED
@@ -280,32 +280,52 @@ def _parse_pdf(file_bytes: bytes, filename: str) -> pd.DataFrame:
280
  r"^(\d{2}/\d{2})\s+\d{2}/\d{2}\s+(.+?)\s+\d{4}\s+\d{4}\s+([\d,]+\.\d{2})\s*$",
281
  re.MULTILINE,
282
  )
283
- for match in boa_pattern.finditer(full_text):
284
- date_str, desc, amt_str = match.groups()
285
- try:
286
- month, day = map(int, date_str.split("/"))
287
- # If transaction month is later in the year than the closing month,
288
- # it belongs to the prior year (e.g. Dec txn in a Jan-closing statement)
289
- year = closing_year - 1 if month > closing_month else closing_year
290
- date = datetime(year, month, day)
291
- except (ValueError, OverflowError):
292
- continue
293
 
294
- amt = _clean_amount(amt_str)
295
- merchant_raw = desc.strip()
 
 
 
 
 
 
296
 
297
- if amt is None or amt <= 0:
298
- continue
299
- if _looks_like_payment(merchant_raw, amt):
300
- continue
 
301
 
302
- rows.append({
303
- "date": date,
304
- "raw_merchant": merchant_raw,
305
- "merchant": normalize_merchant(merchant_raw),
306
- "amount": amt,
307
- "source_file": filename,
308
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
  return pd.DataFrame(rows) if rows else pd.DataFrame()
311
 
 
280
  r"^(\d{2}/\d{2})\s+\d{2}/\d{2}\s+(.+?)\s+\d{4}\s+\d{4}\s+([\d,]+\.\d{2})\s*$",
281
  re.MULTILINE,
282
  )
 
 
 
 
 
 
 
 
 
 
283
 
284
+ # Summary-page indicators: pages with these phrases are overview/totals pages,
285
+ # not transaction listing pages. Skip them to avoid pulling in summary rows.
286
+ _SUMMARY_PAGE_SIGNALS = re.compile(
287
+ r"(account\s+summary|statement\s+summary|previous\s+balance"
288
+ r"|new\s+balance|credit\s+limit|minimum\s+payment\s+due"
289
+ r"|opening/closing\s+date|payment\s+information)",
290
+ re.IGNORECASE,
291
+ )
292
 
293
+ try:
294
+ with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
295
+ pages_text = [page.extract_text() or "" for page in pdf.pages]
296
+ except Exception:
297
+ pages_text = [full_text] # fallback: treat whole doc as one page
298
 
299
+ for page_text in pages_text:
300
+ if _SUMMARY_PAGE_SIGNALS.search(page_text):
301
+ continue # skip summary/overview pages
302
+
303
+ for match in boa_pattern.finditer(page_text):
304
+ date_str, desc, amt_str = match.groups()
305
+ try:
306
+ month, day = map(int, date_str.split("/"))
307
+ # If transaction month is later in the year than the closing month,
308
+ # it belongs to the prior year (e.g. Dec txn in a Jan-closing statement)
309
+ year = closing_year - 1 if month > closing_month else closing_year
310
+ date = datetime(year, month, day)
311
+ except (ValueError, OverflowError):
312
+ continue
313
+
314
+ amt = _clean_amount(amt_str)
315
+ merchant_raw = desc.strip()
316
+
317
+ if amt is None or amt <= 0:
318
+ continue
319
+ if _looks_like_payment(merchant_raw, amt):
320
+ continue
321
+
322
+ rows.append({
323
+ "date": date,
324
+ "raw_merchant": merchant_raw,
325
+ "merchant": normalize_merchant(merchant_raw),
326
+ "amount": amt,
327
+ "source_file": filename,
328
+ })
329
 
330
  return pd.DataFrame(rows) if rows else pd.DataFrame()
331