Spaces:
Runtime error
Runtime error
| import asyncio | |
| import re | |
| from typing import List, Dict, Any, Optional, Tuple | |
| from src.extractor.account_extractor import AccountExtractor | |
class BalanceExtractor:
    """Async balance extractor for extracting balance information.

    Works on a list of line objects. Each line object is a dict with a
    ``'line'`` string and (optionally) a ``'words'`` list whose items are
    dicts carrying a ``'word'`` string and a 4-element ``'bbox'``; elements
    0 and 2 of a bbox are used as the horizontal extent (x_min, x_max).

    For every line the extractor looks for a balance keyword and takes the
    first amount that follows it on the same line. If the line has the keyword
    but no amount after it, the amount is looked up on the next lines, using
    only the words that sit horizontally inside the keyword's bounding box.
    """

    # Keywords are tried in this order; the first one that yields an amount
    # (on the line itself or beneath it) decides the balance.
    _PREVIOUS_KEYWORDS = (
        "previous balance", "starting balance", "beginning balance",
        "balance last statement", "balance previous statement", "last statement",
        "beginning statement", "previous statement", "starting",
    )
    _ENDING_KEYWORDS = (
        "ending balance", "current balance", "balance this statement",
        "balance ending statement", "this statement", "ending statement", "ending",
    )

    # Slack allowed on each side when testing whether a word lies within the
    # keyword's horizontal extent.
    _BBOX_TOLERANCE = 0.1

    # Number of lines after the keyword line that are inspected for an amount
    # placed underneath the keyword.
    _LOOKAHEAD_LINES = 2

    def __init__(self):
        # Amount: optional leading '-', optional integer part (plain digits or
        # comma-grouped, e.g. 1,234 or 1,23,456), a mandatory decimal point
        # with one or two digits, and an optional trailing '-'.
        self.amount_pattern = re.compile(r'-?(?:\d{1,3}(?:,\d{2}){1,}(?:,\d{3})?|\d{1,3}(?:,\d{3})+|\d+)?\.\d{1,2}-?')
        self.account_extractor = AccountExtractor()

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # Nothing to release; returning None lets any exception propagate.
        pass

    async def extract_balances(self, object_line: List[Dict]) -> Tuple[Optional[str], Optional[str]]:
        """Extract beginning and ending balances from line data.

        Args:
            object_line: Line objects in document order (see class docstring
                for the expected dict layout).

        Returns:
            ``(beginning_balance, ending_balance)``. Each entry is the matched
            amount text with commas removed (any leading/trailing ``-`` is
            kept), or ``None`` when no amount was found.

        Raises:
            KeyError: If an inspected line object has no ``'line'`` entry.
        """
        def _extract_balances():
            beginning_balance = None
            ending_balance = None
            for idx, line_obj in enumerate(object_line):
                if beginning_balance is not None and ending_balance is not None:
                    break  # Both found; the remaining lines cannot change the result.
                line_lower = line_obj['line'].lower()
                if beginning_balance is None:
                    beginning_balance = self._find_balance(
                        object_line, idx, line_lower, self._PREVIOUS_KEYWORDS
                    )
                # The same line is still checked for an ending balance even
                # when it has just supplied the beginning balance.
                if ending_balance is None:
                    ending_balance = self._find_balance(
                        object_line, idx, line_lower, self._ENDING_KEYWORDS
                    )
            return beginning_balance, ending_balance

        # The scan is synchronous, so it is handed to the loop's default executor.
        return await asyncio.get_running_loop().run_in_executor(None, _extract_balances)

    def _find_balance(self, object_line, idx, line_lower, keywords) -> Optional[str]:
        """Return the amount tied to the first productive keyword on line ``idx``, else None."""
        for keyword in keywords:
            position = line_lower.find(keyword)
            if position == -1:
                continue
            # Search the lowercased text: the offset was computed on it, and
            # lower() may change the string length for some characters, so the
            # offset is not valid for the original string. Amounts consist of
            # digits and punctuation only and are unaffected by lowercasing.
            match = self.amount_pattern.search(line_lower[position + len(keyword):])
            if match:
                return match.group().replace(",", "")
            amount = self._amount_below_keyword(object_line, idx, keyword)
            if amount is not None:
                return amount
        return None

    def _amount_below_keyword(self, object_line, idx, keyword) -> Optional[str]:
        """Return an amount found beneath ``keyword`` on the following lines, else None."""
        keyword_bbox = self.account_extractor.match_keyword_bbox(
            keyword, object_line[idx].get("words", [])
        )
        if not keyword_bbox:
            return None
        x_min, _, x_max, _ = keyword_bbox
        left = x_min - self._BBOX_TOLERANCE
        right = x_max + self._BBOX_TOLERANCE
        for next_line in object_line[idx + 1:idx + 1 + self._LOOKAHEAD_LINES]:
            # Glue together (without separators) the words lying fully inside
            # the keyword's horizontal range, then look for an amount in that text.
            candidate = "".join(
                w["word"]
                for w in next_line.get("words", [])
                if w["bbox"][0] >= left and w["bbox"][2] <= right
            )
            match = self.amount_pattern.search(candidate)
            if match:
                return match.group().replace(",", "")
        return None