import asyncio
import re
from typing import List, Dict, Any, Optional, Tuple

from src.extractor.account_extractor import AccountExtractor


class BalanceExtractor:
    """Async balance extractor for extracting balance information.

    Scans line objects for "beginning balance" / "ending balance" style
    keywords and pulls out the monetary amount that belongs to each one.
    An amount is taken from the text following the keyword on the same
    line; when none is there, the words positioned underneath the keyword
    on the next few lines are tried instead.
    """

    # Keywords are tried in order, so the more specific phrases come before
    # the generic catch-alls ("starting", "ending").
    _PREVIOUS_KEYWORDS: Tuple[str, ...] = (
        "previous balance",
        "starting balance",
        "beginning balance",
        "balance last statement",
        "balance previous statement",
        "last statement",
        "beginning statement",
        "previous statement",
        "starting",
    )
    _ENDING_KEYWORDS: Tuple[str, ...] = (
        "ending balance",
        "current balance",
        "balance this statement",
        "balance ending statement",
        "this statement",
        "ending statement",
        "ending",
    )

    # Slack added on both sides of the keyword's horizontal extent when
    # deciding whether a word on a following line sits underneath it.
    _BBOX_TOLERANCE = 0.1

    # How many lines after the keyword line are inspected for an amount.
    _LOOKAHEAD_LINES = 2

    def __init__(self):
        # Matches amounts that carry a decimal part, with optional thousands
        # (or lakh-style) comma grouping and an optional leading or trailing
        # minus sign, e.g. "1,234.56", "12,34,567.8", ".50", "100.00-".
        self.amount_pattern = re.compile(r'-?(?:\d{1,3}(?:,\d{2}){1,}(?:,\d{3})?|\d{1,3}(?:,\d{3})+|\d+)?\.\d{1,2}-?')
        self.account_extractor = AccountExtractor()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # Nothing to release; exceptions raised inside the `async with`
        # body propagate because this returns None.
        pass

    async def extract_balances(self, object_line: List[Dict]) -> Tuple[Optional[str], Optional[str]]:
        """Extract beginning and ending balances from line data.

        Args:
            object_line: Line objects in reading order. Each is a dict with
                a "line" string and a "words" list; every word is a dict
                with a "word" string and a 4-item "bbox" whose first and
                third items are used as its horizontal extent.

        Returns:
            A ``(beginning_balance, ending_balance)`` tuple. Each entry is
            the matched amount with commas removed (any minus sign is kept
            where it was found), or ``None`` when no amount was located.
        """
        # The scan is synchronous, so it is pushed to the default executor
        # to keep the event loop free while it runs.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._scan_balances, object_line)

    def _scan_balances(self, object_line: List[Dict]) -> Tuple[Optional[str], Optional[str]]:
        """Walk the lines once, keeping the first amount found for each balance."""
        if not object_line:
            return None, None

        beginning_balance: Optional[str] = None
        ending_balance: Optional[str] = None

        for idx in range(len(object_line)):
            if beginning_balance is None:
                beginning_balance = self._find_balance(object_line, idx, self._PREVIOUS_KEYWORDS)
            if ending_balance is None:
                ending_balance = self._find_balance(object_line, idx, self._ENDING_KEYWORDS)
            if beginning_balance is not None and ending_balance is not None:
                # Both values are settled; later lines cannot change them.
                break

        return beginning_balance, ending_balance

    def _find_balance(self, object_line: List[Dict], idx: int, keywords: Tuple[str, ...]) -> Optional[str]:
        """Return the amount tied to the first productive keyword on line ``idx``."""
        line_lower = (object_line[idx].get("line") or "").lower()

        for keyword in keywords:
            position = line_lower.find(keyword)
            if position == -1:
                continue

            # Slice the lowercased text rather than the original: the offset
            # was computed on it, and lowercasing never alters the digits,
            # commas, dots or minus signs the amount pattern looks for.
            amount = self._first_amount(line_lower[position + len(keyword):])
            if amount is None:
                amount = self._amount_below_keyword(object_line, idx, keyword)
            if amount is not None:
                return amount

        return None

    def _amount_below_keyword(self, object_line: List[Dict], idx: int, keyword: str) -> Optional[str]:
        """Look for an amount among the words lying underneath ``keyword``."""
        # NOTE(review): match_keyword_bbox is defined on AccountExtractor; it
        # is treated here as returning a 4-item box for the keyword, or a
        # falsy value when the keyword cannot be located among the words.
        keyword_bbox = self.account_extractor.match_keyword_bbox(
            keyword, object_line[idx].get("words", [])
        )
        if not keyword_bbox:
            return None

        x_min, _, x_max, _ = keyword_bbox
        left_limit = x_min - self._BBOX_TOLERANCE
        right_limit = x_max + self._BBOX_TOLERANCE

        first_candidate = idx + 1
        for next_line in object_line[first_candidate:first_candidate + self._LOOKAHEAD_LINES]:
            # Glue together every word that fits inside the keyword's
            # horizontal span, so an amount split into several words is
            # still matched as one piece of text.
            text_below = "".join(
                word["word"]
                for word in next_line.get("words", [])
                if self._within_span(word["bbox"], left_limit, right_limit)
            )
            amount = self._first_amount(text_below)
            if amount is not None:
                return amount

        return None

    @staticmethod
    def _within_span(bbox: Any, left_limit: float, right_limit: float) -> bool:
        """Tell whether a word's horizontal extent fits between the limits."""
        word_x_min, _, word_x_max, _ = bbox
        return word_x_min >= left_limit and word_x_max <= right_limit

    def _first_amount(self, text: str) -> Optional[str]:
        """Return the first amount in ``text`` with commas stripped, else None."""
        match = self.amount_pattern.search(text)
        if match is None:
            return None
        return match.group().replace(",", "")