Fix BoA PDF parser to skip summary pages in Strategy 3
Browse files
Switch from running the two-date regex over the full concatenated document
to processing each page individually. Pages containing summary-page signals
(previous balance, new balance, account summary, credit limit, etc.) are
skipped so that page-1 overview blocks don't get mistaken for transaction
line items. If re-opening the PDF with pdfplumber fails, falls back to
treating the full concatenated text as a single page.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
parser.py
CHANGED
|
@@ -280,32 +280,52 @@ def _parse_pdf(file_bytes: bytes, filename: str) -> pd.DataFrame:
|
|
| 280 |
r"^(\d{2}/\d{2})\s+\d{2}/\d{2}\s+(.+?)\s+\d{4}\s+\d{4}\s+([\d,]+\.\d{2})\s*$",
|
| 281 |
re.MULTILINE,
|
| 282 |
)
|
| 283 |
-
for match in boa_pattern.finditer(full_text):
|
| 284 |
-
date_str, desc, amt_str = match.groups()
|
| 285 |
-
try:
|
| 286 |
-
month, day = map(int, date_str.split("/"))
|
| 287 |
-
# If transaction month is later in the year than the closing month,
|
| 288 |
-
# it belongs to the prior year (e.g. Dec txn in a Jan-closing statement)
|
| 289 |
-
year = closing_year - 1 if month > closing_month else closing_year
|
| 290 |
-
date = datetime(year, month, day)
|
| 291 |
-
except (ValueError, OverflowError):
|
| 292 |
-
continue
|
| 293 |
|
| 294 |
-
|
| 295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
|
|
|
| 301 |
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
return pd.DataFrame(rows) if rows else pd.DataFrame()
|
| 311 |
|
|
|
|
| 280 |
r"^(\d{2}/\d{2})\s+\d{2}/\d{2}\s+(.+?)\s+\d{4}\s+\d{4}\s+([\d,]+\.\d{2})\s*$",
|
| 281 |
re.MULTILINE,
|
| 282 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
|
| 284 |
+
# Summary-page indicators: pages with these phrases are overview/totals pages,
|
| 285 |
+
# not transaction listing pages. Skip them to avoid pulling in summary rows.
|
| 286 |
+
_SUMMARY_PAGE_SIGNALS = re.compile(
|
| 287 |
+
r"(account\s+summary|statement\s+summary|previous\s+balance"
|
| 288 |
+
r"|new\s+balance|credit\s+limit|minimum\s+payment\s+due"
|
| 289 |
+
r"|opening/closing\s+date|payment\s+information)",
|
| 290 |
+
re.IGNORECASE,
|
| 291 |
+
)
|
| 292 |
|
| 293 |
+
try:
|
| 294 |
+
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
|
| 295 |
+
pages_text = [page.extract_text() or "" for page in pdf.pages]
|
| 296 |
+
except Exception:
|
| 297 |
+
pages_text = [full_text] # fallback: treat whole doc as one page
|
| 298 |
|
| 299 |
+
for page_text in pages_text:
|
| 300 |
+
if _SUMMARY_PAGE_SIGNALS.search(page_text):
|
| 301 |
+
continue # skip summary/overview pages
|
| 302 |
+
|
| 303 |
+
for match in boa_pattern.finditer(page_text):
|
| 304 |
+
date_str, desc, amt_str = match.groups()
|
| 305 |
+
try:
|
| 306 |
+
month, day = map(int, date_str.split("/"))
|
| 307 |
+
# If transaction month is later in the year than the closing month,
|
| 308 |
+
# it belongs to the prior year (e.g. Dec txn in a Jan-closing statement)
|
| 309 |
+
year = closing_year - 1 if month > closing_month else closing_year
|
| 310 |
+
date = datetime(year, month, day)
|
| 311 |
+
except (ValueError, OverflowError):
|
| 312 |
+
continue
|
| 313 |
+
|
| 314 |
+
amt = _clean_amount(amt_str)
|
| 315 |
+
merchant_raw = desc.strip()
|
| 316 |
+
|
| 317 |
+
if amt is None or amt <= 0:
|
| 318 |
+
continue
|
| 319 |
+
if _looks_like_payment(merchant_raw, amt):
|
| 320 |
+
continue
|
| 321 |
+
|
| 322 |
+
rows.append({
|
| 323 |
+
"date": date,
|
| 324 |
+
"raw_merchant": merchant_raw,
|
| 325 |
+
"merchant": normalize_merchant(merchant_raw),
|
| 326 |
+
"amount": amt,
|
| 327 |
+
"source_file": filename,
|
| 328 |
+
})
|
| 329 |
|
| 330 |
return pd.DataFrame(rows) if rows else pd.DataFrame()
|
| 331 |
|