Bank-Scrubber / src /extractor /balance_extractor.py
Aryan Jain
bank scrubber streamlit application
4e71548
import asyncio
import re
from typing import List, Dict, Any, Optional, Tuple
from src.extractor.account_extractor import AccountExtractor
class BalanceExtractor:
    """Async extractor for the beginning and ending balances of a statement.

    Each input line is scanned for balance keywords. The first amount that
    follows a keyword on the same line is taken as the balance. When the
    keyword line carries no amount, the words on the next two lines whose
    horizontal extent falls inside the keyword's bounding box are
    concatenated and searched for an amount instead.
    """

    # Lowercase phrases labelling the opening balance. They are matched as
    # plain substrings, in the listed order, against the lowercased line.
    _PREVIOUS_KEYWORDS = (
        "previous balance", "starting balance", "beginning balance",
        "balance last statement", "balance previous statement", "last statement",
        "beginning statement", "previous statement", "starting",
    )

    # Lowercase phrases labelling the closing balance (same matching rules).
    # NOTE(review): substring matching means "ending" also hits words such as
    # "pending"; left as-is because callers may rely on the current matching.
    _ENDING_KEYWORDS = (
        "ending balance", "current balance", "balance this statement",
        "balance ending statement", "this statement", "ending statement", "ending",
    )

    # Amount: optional leading minus, optional integer part (plain digits or
    # comma-grouped in twos/threes), a mandatory decimal point followed by one
    # or two digits, and an optional trailing minus.
    _AMOUNT_PATTERN = re.compile(
        r'-?(?:\d{1,3}(?:,\d{2}){1,}(?:,\d{3})?|\d{1,3}(?:,\d{3})+|\d+)?\.\d{1,2}-?'
    )

    # Slack added on both sides of the keyword's horizontal extent when
    # deciding whether a word on a following line sits "below" the keyword.
    _BBOX_TOLERANCE = 0.1

    # How many lines after the keyword line are inspected by the bbox fallback.
    _LOOKAHEAD_LINES = 2

    def __init__(self):
        self.amount_pattern = self._AMOUNT_PATTERN
        self.account_extractor = AccountExtractor()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # Nothing to release; present so the class works with ``async with``.
        pass

    async def extract_balances(self, object_line: List[Dict]) -> Tuple[Optional[str], Optional[str]]:
        """Extract beginning and ending balances from line data.

        Args:
            object_line: Line records in reading order. Each record must have
                a ``'line'`` string. ``'words'`` (a list of dicts with
                ``'word'`` and a four-element ``'bbox'``) is read only when the
                bounding-box fallback is needed.

        Returns:
            ``(beginning_balance, ending_balance)``. Each entry is the matched
            amount text with commas removed, or ``None`` when nothing matched.
        """
        # Inside a coroutine a loop is always running; get_running_loop() is
        # the non-deprecated way to obtain it.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._scan_balances, object_line)

    def _scan_balances(self, object_line: List[Dict]) -> Tuple[Optional[str], Optional[str]]:
        """Walk the lines once, keeping the first beginning and ending balance found."""
        beginning_balance = None
        ending_balance = None

        for idx, line_obj in enumerate(object_line):
            line_lower = line_obj['line'].lower()

            if not beginning_balance:
                beginning_balance = self._find_balance(
                    self._PREVIOUS_KEYWORDS, object_line, idx, line_lower
                )
            if not ending_balance:
                ending_balance = self._find_balance(
                    self._ENDING_KEYWORDS, object_line, idx, line_lower
                )

            # Both values are final once set, so the remaining lines cannot
            # change the result.
            if beginning_balance and ending_balance:
                break

        return beginning_balance, ending_balance

    def _find_balance(self, keywords, object_line: List[Dict], idx: int, line_lower: str) -> Optional[str]:
        """Return the balance labelled by any of ``keywords`` on line ``idx``, or ``None``."""
        for keyword in keywords:
            position = line_lower.find(keyword)
            if position == -1:
                continue

            # Slice the *lowercased* text: str.lower() can change the string's
            # length (e.g. 'İ' becomes two code points), so an offset computed
            # on it is not valid for the original line. Amount characters
            # (digits, ',', '.', '-') are unaffected by lowercasing.
            after_keyword = line_lower[position + len(keyword):]
            match = self.amount_pattern.search(after_keyword)
            if match:
                return self._normalize(match)

            # No amount on the keyword line: look at the words positioned
            # under the keyword on the following lines.
            # match_keyword_bbox is defined on AccountExtractor; a truthy
            # result is unpacked below as a four-element box.
            keyword_bbox = self.account_extractor.match_keyword_bbox(
                keyword, object_line[idx]["words"]
            )
            if keyword_bbox:
                following = object_line[idx + 1:idx + 1 + self._LOOKAHEAD_LINES]
                balance = self._amount_below(keyword_bbox, following)
                if balance:
                    return balance

        return None

    def _amount_below(self, keyword_bbox, following_lines: List[Dict]) -> Optional[str]:
        """Return the first amount built from words lying within the keyword's horizontal span."""
        x_min, _, x_max, _ = keyword_bbox
        left = x_min - self._BBOX_TOLERANCE
        right = x_max + self._BBOX_TOLERANCE

        for next_line in following_lines:
            parts = []
            for w in next_line.get("words", []):
                wx_min, _, wx_max, _ = w["bbox"]
                if wx_min >= left and wx_max <= right:
                    parts.append(w["word"])
            # Words are joined without separators so an amount split across
            # several word boxes is reassembled before matching.
            match = self.amount_pattern.search("".join(parts))
            if match:
                return self._normalize(match)

        return None

    @staticmethod
    def _normalize(match) -> str:
        """Return the matched amount text with thousands separators removed."""
        return match.group().replace(",", "")